Add payment line parser and fix OCR override from payment_line

- Add MachineCodeParser for Swedish invoice payment line parsing - Fix OCR Reference extraction by normalizing account number spaces - Add cross-validation tests for pipeline and field_extractor - Update UI layout for compact upload and full-width results Key changes: - machine_code_parser.py: Handle spaces in Bankgiro numbers (e.g. "78 2 1 713") - pipeline.py: OCR and Amount override from payment_line, BG/PG comparison only - field_extractor.py: Improved invoice number normalization - app.py: Responsive UI layout changes Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-21 21:47:02 +01:00
parent e9460e9f34
commit 4ea4bc96d4
33 changed files with 7530 additions and 562 deletions
--- a/src/matcher/field_matcher.py
+++ b/src/matcher/field_matcher.py
@@ -14,11 +14,11 @@ from functools import cached_property
 _DATE_PATTERN = re.compile(r'(\d{4})-(\d{2})-(\d{2})')
 _WHITESPACE_PATTERN = re.compile(r'\s+')
 _NON_DIGIT_PATTERN = re.compile(r'\D')
-_DASH_PATTERN = re.compile(r'[\u2013\u2014\u2212]')  # en-dash, em-dash, minus sign
+_DASH_PATTERN = re.compile(r'[\u2013\u2014\u2212\u00b7]')  # en-dash, em-dash, minus sign, middle dot


 def _normalize_dashes(text: str) -> str:
-    """Normalize different dash types to standard hyphen-minus (ASCII 45)."""
+    """Normalize different dash types and middle dots to standard hyphen-minus (ASCII 45)."""
    return _DASH_PATTERN.sub('-', text)


@@ -195,7 +195,13 @@ class FieldMatcher:
            List of Match objects sorted by score (descending)
        """
        matches = []
-        page_tokens = [t for t in tokens if t.page_no == page_no]
+        # Filter tokens by page and exclude hidden metadata tokens
+        # Hidden tokens often have bbox with y < 0 or y > page_height
+        # These are typically PDF metadata stored as invisible text
+        page_tokens = [
+            t for t in tokens
+            if t.page_no == page_no and t.bbox[1] >= 0 and t.bbox[3] > t.bbox[1]
+        ]

        # Build spatial index for efficient nearby token lookup (O(n) -> O(1))
        self._token_index = TokenIndex(page_tokens, grid_size=self.context_radius)
@@ -373,41 +379,74 @@ class FieldMatcher:
        if field_name not in supported_fields:
            return matches

+        # Fields where spaces/dashes should be ignored during matching
+        # (e.g., org number "55 65 74-6624" should match "5565746624")
+        ignore_spaces_fields = ('supplier_organisation_number', 'Bankgiro', 'Plusgiro', 'supplier_accounts')
+
        for token in tokens:
            token_text = token.text.strip()
            # Normalize different dash types to hyphen-minus for matching
            token_text_normalized = _normalize_dashes(token_text)

+            # For certain fields, also try matching with spaces/dashes removed
+            if field_name in ignore_spaces_fields:
+                token_text_compact = token_text_normalized.replace(' ', '').replace('-', '')
+                value_compact = value.replace(' ', '').replace('-', '')
+            else:
+                token_text_compact = None
+                value_compact = None
+
            # Skip if token is the same length as value (would be exact match)
            if len(token_text_normalized) <= len(value):
                continue

            # Check if value appears as substring (using normalized text)
            # Try case-sensitive first, then case-insensitive
+            idx = None
+            case_sensitive_match = True
+            used_compact = False
+
            if value in token_text_normalized:
                idx = token_text_normalized.find(value)
-                case_sensitive_match = True
            elif value.lower() in token_text_normalized.lower():
                idx = token_text_normalized.lower().find(value.lower())
                case_sensitive_match = False
-            else:
+            elif token_text_compact and value_compact in token_text_compact:
+                # Try compact matching (spaces/dashes removed)
+                idx = token_text_compact.find(value_compact)
+                used_compact = True
+            elif token_text_compact and value_compact.lower() in token_text_compact.lower():
+                idx = token_text_compact.lower().find(value_compact.lower())
+                case_sensitive_match = False
+                used_compact = True
+
+            if idx is None:
                continue

-            # Verify it's a proper boundary match (not part of a larger number)
-            # Check character before (if exists)
-            if idx > 0:
-                char_before = token_text_normalized[idx - 1]
-                # Must be non-digit (allow : space - etc)
-                if char_before.isdigit():
+            # For compact matching, boundary check is simpler (just check it's 10 consecutive digits)
+            if used_compact:
+                # Verify proper boundary in compact text
+                if idx > 0 and token_text_compact[idx - 1].isdigit():
                    continue
+                end_idx = idx + len(value_compact)
+                if end_idx < len(token_text_compact) and token_text_compact[end_idx].isdigit():
+                    continue
+            else:
+                # Verify it's a proper boundary match (not part of a larger number)
+                # Check character before (if exists)
+                if idx > 0:
+                    char_before = token_text_normalized[idx - 1]
+                    # Must be non-digit (allow : space - etc)
+                    if char_before.isdigit():
+                        continue

-            # Check character after (if exists)
-            end_idx = idx + len(value)
-            if end_idx < len(token_text_normalized):
-                char_after = token_text_normalized[end_idx]
-                # Must be non-digit
-                if char_after.isdigit():
-                    continue
+                # Check character after (if exists)
+                end_idx = idx + len(value)
+                if end_idx < len(token_text_normalized):
+                    char_after = token_text_normalized[end_idx]
+                    # Must be non-digit
+                    if char_after.isdigit():
+                        continue

            # Found valid substring match
            context_keywords, context_boost = self._find_context_keywords(
@@ -678,15 +717,44 @@ class FieldMatcher:
        min_height = min(token1.bbox[3] - token1.bbox[1], token2.bbox[3] - token2.bbox[1])
        return y_overlap > min_height * 0.5

-    def _parse_amount(self, text: str) -> float | None:
+    def _parse_amount(self, text: str | int | float) -> float | None:
        """Try to parse text as a monetary amount."""
-        # Remove currency and spaces
-        text = re.sub(r'[SEK|kr|:-]', '', text, flags=re.IGNORECASE)
+        # Convert to string first
+        text = str(text)
+
+        # First, handle Swedish öre format: "239 00" means 239.00 (239 kr 00 öre)
+        # Pattern: digits + space + exactly 2 digits at end
+        ore_match = re.match(r'^(\d+)\s+(\d{2})$', text.strip())
+        if ore_match:
+            kronor = ore_match.group(1)
+            ore = ore_match.group(2)
+            try:
+                return float(f"{kronor}.{ore}")
+            except ValueError:
+                pass
+
+        # Remove everything after and including parentheses (e.g., "(inkl. moms)")
+        text = re.sub(r'\s*\(.*\)', '', text)
+
+        # Remove currency symbols and common suffixes (including trailing dots from "kr.")
+        text = re.sub(r'\b(SEK|kr|kronor|öre)\b\.?', '', text, flags=re.IGNORECASE)
+        text = re.sub(r'[:-]', '', text)
+
+        # Remove spaces (thousand separators) but be careful with öre format
        text = text.replace(' ', '').replace('\xa0', '')

-        # Try comma as decimal separator
-        if ',' in text and '.' not in text:
-            text = text.replace(',', '.')
+        # Handle comma as decimal separator
+        # Swedish format: "500,00" means 500.00
+        # Need to handle cases like "500,00." (after removing "kr.")
+        if ',' in text:
+            # Remove any trailing dots first (from "kr." removal)
+            text = text.rstrip('.')
+            # Now replace comma with dot
+            if '.' not in text:
+                text = text.replace(',', '.')
+
+        # Remove any remaining non-numeric characters except dot
+        text = re.sub(r'[^\d.]', '', text)

        try:
            return float(text)
--- a/src/matcher/test_field_matcher.py
+++ b/src/matcher/test_field_matcher.py
@@ -0,0 +1,896 @@
+"""
+Tests for the Field Matching Module.
+
+Tests cover all matcher functions in src/matcher/field_matcher.py
+
+Usage:
+    pytest src/matcher/test_field_matcher.py -v -o 'addopts='
+"""
+
+import pytest
+from dataclasses import dataclass
+from src.matcher.field_matcher import (
+    FieldMatcher,
+    Match,
+    TokenIndex,
+    CONTEXT_KEYWORDS,
+    _normalize_dashes,
+    find_field_matches,
+)
+
+
+@dataclass
+class MockToken:
+    """Mock token for testing."""
+    text: str
+    bbox: tuple[float, float, float, float]
+    page_no: int = 0
+
+
+class TestNormalizeDashes:
+    """Tests for _normalize_dashes function."""
+
+    def test_normalize_en_dash(self):
+        """Should normalize en-dash to hyphen."""
+        assert _normalize_dashes("123\u2013456") == "123-456"
+
+    def test_normalize_em_dash(self):
+        """Should normalize em-dash to hyphen."""
+        assert _normalize_dashes("123\u2014456") == "123-456"
+
+    def test_normalize_minus_sign(self):
+        """Should normalize minus sign to hyphen."""
+        assert _normalize_dashes("123\u2212456") == "123-456"
+
+    def test_normalize_middle_dot(self):
+        """Should normalize middle dot to hyphen."""
+        assert _normalize_dashes("123\u00b7456") == "123-456"
+
+    def test_normal_hyphen_unchanged(self):
+        """Should keep normal hyphen unchanged."""
+        assert _normalize_dashes("123-456") == "123-456"
+
+
+class TestTokenIndex:
+    """Tests for TokenIndex class."""
+
+    def test_build_index(self):
+        """Should build spatial index from tokens."""
+        tokens = [
+            MockToken("hello", (0, 0, 50, 20)),
+            MockToken("world", (60, 0, 110, 20)),
+        ]
+        index = TokenIndex(tokens)
+        assert len(index.tokens) == 2
+
+    def test_get_center(self):
+        """Should return correct center coordinates."""
+        token = MockToken("test", (0, 0, 100, 50))
+        tokens = [token]
+        index = TokenIndex(tokens)
+        center = index.get_center(token)
+        assert center == (50.0, 25.0)
+
+    def test_get_text_lower(self):
+        """Should return lowercase text."""
+        token = MockToken("HELLO World", (0, 0, 100, 20))
+        tokens = [token]
+        index = TokenIndex(tokens)
+        assert index.get_text_lower(token) == "hello world"
+
+    def test_find_nearby_within_radius(self):
+        """Should find tokens within radius."""
+        token1 = MockToken("hello", (0, 0, 50, 20))
+        token2 = MockToken("world", (60, 0, 110, 20))  # 60px away
+        token3 = MockToken("far", (500, 0, 550, 20))  # 500px away
+        tokens = [token1, token2, token3]
+        index = TokenIndex(tokens)
+
+        nearby = index.find_nearby(token1, radius=100)
+        assert len(nearby) == 1
+        assert nearby[0].text == "world"
+
+    def test_find_nearby_excludes_self(self):
+        """Should not include the target token itself."""
+        token1 = MockToken("hello", (0, 0, 50, 20))
+        token2 = MockToken("world", (60, 0, 110, 20))
+        tokens = [token1, token2]
+        index = TokenIndex(tokens)
+
+        nearby = index.find_nearby(token1, radius=100)
+        assert token1 not in nearby
+
+    def test_find_nearby_empty_when_none_in_range(self):
+        """Should return empty list when no tokens in range."""
+        token1 = MockToken("hello", (0, 0, 50, 20))
+        token2 = MockToken("far", (500, 0, 550, 20))
+        tokens = [token1, token2]
+        index = TokenIndex(tokens)
+
+        nearby = index.find_nearby(token1, radius=50)
+        assert len(nearby) == 0
+
+
+class TestMatch:
+    """Tests for Match dataclass."""
+
+    def test_match_creation(self):
+        """Should create Match with all fields."""
+        match = Match(
+            field="InvoiceNumber",
+            value="12345",
+            bbox=(0, 0, 100, 20),
+            page_no=0,
+            score=0.95,
+            matched_text="12345",
+            context_keywords=["fakturanr"]
+        )
+        assert match.field == "InvoiceNumber"
+        assert match.value == "12345"
+        assert match.score == 0.95
+
+    def test_to_yolo_format(self):
+        """Should convert to YOLO annotation format."""
+        match = Match(
+            field="Amount",
+            value="100",
+            bbox=(100, 200, 200, 250),  # x0, y0, x1, y1
+            page_no=0,
+            score=1.0,
+            matched_text="100",
+            context_keywords=[]
+        )
+        # Image: 1000x1000
+        yolo = match.to_yolo_format(1000, 1000, class_id=5)
+
+        # Expected: center_x=150, center_y=225, width=100, height=50
+        # Normalized: x_center=0.15, y_center=0.225, w=0.1, h=0.05
+        assert yolo.startswith("5 ")
+        parts = yolo.split()
+        assert len(parts) == 5
+        assert float(parts[1]) == pytest.approx(0.15, rel=1e-4)
+        assert float(parts[2]) == pytest.approx(0.225, rel=1e-4)
+        assert float(parts[3]) == pytest.approx(0.1, rel=1e-4)
+        assert float(parts[4]) == pytest.approx(0.05, rel=1e-4)
+
+
+class TestFieldMatcher:
+    """Tests for FieldMatcher class."""
+
+    def test_init_defaults(self):
+        """Should initialize with default values."""
+        matcher = FieldMatcher()
+        assert matcher.context_radius == 200.0
+        assert matcher.min_score_threshold == 0.5
+
+    def test_init_custom_params(self):
+        """Should initialize with custom parameters."""
+        matcher = FieldMatcher(context_radius=300.0, min_score_threshold=0.7)
+        assert matcher.context_radius == 300.0
+        assert matcher.min_score_threshold == 0.7
+
+
+class TestFieldMatcherExactMatch:
+    """Tests for exact matching."""
+
+    def test_exact_match_full_score(self):
+        """Should find exact match with full score."""
+        matcher = FieldMatcher()
+        tokens = [MockToken("12345", (0, 0, 50, 20))]
+
+        matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"])
+
+        assert len(matches) >= 1
+        assert matches[0].score == 1.0
+        assert matches[0].matched_text == "12345"
+
+    def test_case_insensitive_match(self):
+        """Should find case-insensitive match with lower score."""
+        matcher = FieldMatcher()
+        tokens = [MockToken("HELLO", (0, 0, 50, 20))]
+
+        matches = matcher.find_matches(tokens, "InvoiceNumber", ["hello"])
+
+        assert len(matches) >= 1
+        assert matches[0].score == 0.95
+
+    def test_digits_only_match(self):
+        """Should match by digits only for numeric fields."""
+        matcher = FieldMatcher()
+        tokens = [MockToken("INV-12345", (0, 0, 80, 20))]
+
+        matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"])
+
+        assert len(matches) >= 1
+        assert matches[0].score == 0.9
+
+    def test_no_match_when_different(self):
+        """Should return empty when no match found."""
+        matcher = FieldMatcher(min_score_threshold=0.8)
+        tokens = [MockToken("99999", (0, 0, 50, 20))]
+
+        matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"])
+
+        assert len(matches) == 0
+
+
+class TestFieldMatcherContextKeywords:
+    """Tests for context keyword boosting."""
+
+    def test_context_boost_with_nearby_keyword(self):
+        """Should boost score when context keyword is nearby."""
+        matcher = FieldMatcher(context_radius=200)
+        tokens = [
+            MockToken("fakturanr", (0, 0, 80, 20)),  # Context keyword
+            MockToken("12345", (100, 0, 150, 20)),   # Value
+        ]
+
+        matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"])
+
+        assert len(matches) >= 1
+        # Score should be boosted above 1.0 (capped at 1.0)
+        assert matches[0].score == 1.0
+        assert "fakturanr" in matches[0].context_keywords
+
+    def test_no_boost_when_keyword_far_away(self):
+        """Should not boost when keyword is too far."""
+        matcher = FieldMatcher(context_radius=50)
+        tokens = [
+            MockToken("fakturanr", (0, 0, 80, 20)),   # Context keyword
+            MockToken("12345", (500, 0, 550, 20)),   # Value - far away
+        ]
+
+        matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"])
+
+        assert len(matches) >= 1
+        assert "fakturanr" not in matches[0].context_keywords
+
+
+class TestFieldMatcherConcatenatedMatch:
+    """Tests for concatenated token matching."""
+
+    def test_concatenate_adjacent_tokens(self):
+        """Should match value split across adjacent tokens."""
+        matcher = FieldMatcher()
+        tokens = [
+            MockToken("123", (0, 0, 30, 20)),
+            MockToken("456", (35, 0, 65, 20)),  # Adjacent, same line
+        ]
+
+        matches = matcher.find_matches(tokens, "InvoiceNumber", ["123456"])
+
+        assert len(matches) >= 1
+        assert "123456" in matches[0].matched_text or matches[0].value == "123456"
+
+    def test_no_concatenate_when_gap_too_large(self):
+        """Should not concatenate when gap is too large."""
+        matcher = FieldMatcher()
+        tokens = [
+            MockToken("123", (0, 0, 30, 20)),
+            MockToken("456", (100, 0, 130, 20)),  # Gap > 50px
+        ]
+
+        # This might still match if exact matches work differently
+        matches = matcher.find_matches(tokens, "InvoiceNumber", ["123456"])
+        # No concatenated match expected (only from exact/substring)
+        concat_matches = [m for m in matches if "123456" in m.matched_text]
+        # May or may not find depending on strategy
+
+
+class TestFieldMatcherSubstringMatch:
+    """Tests for substring matching."""
+
+    def test_substring_match_in_longer_text(self):
+        """Should find value as substring in longer token."""
+        matcher = FieldMatcher()
+        tokens = [MockToken("Fakturanummer: 12345", (0, 0, 150, 20))]
+
+        matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"])
+
+        assert len(matches) >= 1
+        # Substring match should have lower score
+        substring_match = [m for m in matches if "12345" in m.matched_text]
+        assert len(substring_match) >= 1
+
+    def test_no_substring_match_when_part_of_larger_number(self):
+        """Should not match when value is part of a larger number."""
+        matcher = FieldMatcher(min_score_threshold=0.6)
+        tokens = [MockToken("123456789", (0, 0, 100, 20))]
+
+        matches = matcher.find_matches(tokens, "InvoiceNumber", ["456"])
+
+        # Should not match because 456 is embedded in larger number
+        assert len(matches) == 0
+
+
+class TestFieldMatcherFuzzyMatch:
+    """Tests for fuzzy amount matching."""
+
+    def test_fuzzy_amount_match(self):
+        """Should match amounts that are numerically equal."""
+        matcher = FieldMatcher()
+        tokens = [MockToken("1234,56", (0, 0, 70, 20))]
+
+        matches = matcher.find_matches(tokens, "Amount", ["1234.56"])
+
+        assert len(matches) >= 1
+
+    def test_fuzzy_amount_with_different_formats(self):
+        """Should match amounts in different formats."""
+        matcher = FieldMatcher()
+        tokens = [MockToken("1 234,56", (0, 0, 80, 20))]
+
+        matches = matcher.find_matches(tokens, "Amount", ["1234,56"])
+
+        assert len(matches) >= 1
+
+
+class TestFieldMatcherParseAmount:
+    """Tests for _parse_amount method."""
+
+    def test_parse_simple_integer(self):
+        """Should parse simple integer."""
+        matcher = FieldMatcher()
+        assert matcher._parse_amount("100") == 100.0
+
+    def test_parse_decimal_with_dot(self):
+        """Should parse decimal with dot."""
+        matcher = FieldMatcher()
+        assert matcher._parse_amount("100.50") == 100.50
+
+    def test_parse_decimal_with_comma(self):
+        """Should parse decimal with comma (European format)."""
+        matcher = FieldMatcher()
+        assert matcher._parse_amount("100,50") == 100.50
+
+    def test_parse_with_thousand_separator(self):
+        """Should parse with thousand separator."""
+        matcher = FieldMatcher()
+        assert matcher._parse_amount("1 234,56") == 1234.56
+
+    def test_parse_with_currency_suffix(self):
+        """Should parse and remove currency suffix."""
+        matcher = FieldMatcher()
+        assert matcher._parse_amount("100 SEK") == 100.0
+        assert matcher._parse_amount("100 kr") == 100.0
+
+    def test_parse_swedish_ore_format(self):
+        """Should parse Swedish öre format (kronor space öre)."""
+        matcher = FieldMatcher()
+        assert matcher._parse_amount("239 00") == 239.00
+        assert matcher._parse_amount("1234 50") == 1234.50
+
+    def test_parse_invalid_returns_none(self):
+        """Should return None for invalid input."""
+        matcher = FieldMatcher()
+        assert matcher._parse_amount("abc") is None
+        assert matcher._parse_amount("") is None
+
+
+class TestFieldMatcherTokensOnSameLine:
+    """Tests for _tokens_on_same_line method."""
+
+    def test_same_line_tokens(self):
+        """Should detect tokens on same line."""
+        matcher = FieldMatcher()
+        token1 = MockToken("hello", (0, 10, 50, 30))
+        token2 = MockToken("world", (60, 12, 110, 28))  # Slight y variation
+
+        assert matcher._tokens_on_same_line(token1, token2) is True
+
+    def test_different_line_tokens(self):
+        """Should detect tokens on different lines."""
+        matcher = FieldMatcher()
+        token1 = MockToken("hello", (0, 10, 50, 30))
+        token2 = MockToken("world", (0, 50, 50, 70))  # Different y
+
+        assert matcher._tokens_on_same_line(token1, token2) is False
+
+
+class TestFieldMatcherBboxOverlap:
+    """Tests for _bbox_overlap method."""
+
+    def test_full_overlap(self):
+        """Should return 1.0 for identical bboxes."""
+        matcher = FieldMatcher()
+        bbox = (0, 0, 100, 50)
+        assert matcher._bbox_overlap(bbox, bbox) == 1.0
+
+    def test_partial_overlap(self):
+        """Should calculate partial overlap correctly."""
+        matcher = FieldMatcher()
+        bbox1 = (0, 0, 100, 100)
+        bbox2 = (50, 50, 150, 150)  # 50% overlap on each axis
+
+        overlap = matcher._bbox_overlap(bbox1, bbox2)
+        # Intersection: 50x50=2500, Union: 10000+10000-2500=17500
+        # IoU = 2500/17500 ≈ 0.143
+        assert 0.1 < overlap < 0.2
+
+    def test_no_overlap(self):
+        """Should return 0.0 for non-overlapping bboxes."""
+        matcher = FieldMatcher()
+        bbox1 = (0, 0, 50, 50)
+        bbox2 = (100, 100, 150, 150)
+
+        assert matcher._bbox_overlap(bbox1, bbox2) == 0.0
+
+
+class TestFieldMatcherDeduplication:
+    """Tests for match deduplication."""
+
+    def test_deduplicate_overlapping_matches(self):
+        """Should keep only highest scoring match for overlapping bboxes."""
+        matcher = FieldMatcher()
+        tokens = [
+            MockToken("12345", (0, 0, 50, 20)),
+        ]
+
+        # Find matches with multiple values that could match same token
+        matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345", "12345"])
+
+        # Should deduplicate to single match
+        assert len(matches) == 1
+
+
+class TestFieldMatcherFlexibleDateMatch:
+    """Tests for flexible date matching."""
+
+    def test_flexible_date_same_month(self):
+        """Should match dates in same year-month when exact match fails."""
+        matcher = FieldMatcher()
+        tokens = [
+            MockToken("2025-01-15", (0, 0, 80, 20)),  # Slightly different day
+        ]
+
+        # Search for different day in same month
+        matches = matcher.find_matches(
+            tokens, "InvoiceDate", ["2025-01-10"]
+        )
+
+        # Should find flexible match (lower score)
+        # Note: This depends on exact match failing first
+        # If exact match works, flexible won't be tried
+
+
+class TestFieldMatcherPageFiltering:
+    """Tests for page number filtering."""
+
+    def test_filters_by_page_number(self):
+        """Should only match tokens on specified page."""
+        matcher = FieldMatcher()
+        tokens = [
+            MockToken("12345", (0, 0, 50, 20), page_no=0),
+            MockToken("12345", (0, 0, 50, 20), page_no=1),
+        ]
+
+        matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"], page_no=0)
+
+        assert all(m.page_no == 0 for m in matches)
+
+    def test_excludes_hidden_tokens(self):
+        """Should exclude tokens with negative y coordinates (metadata)."""
+        matcher = FieldMatcher()
+        tokens = [
+            MockToken("12345", (0, -100, 50, -80), page_no=0),  # Hidden metadata
+            MockToken("67890", (0, 0, 50, 20), page_no=0),      # Visible
+        ]
+
+        matches = matcher.find_matches(tokens, "InvoiceNumber", ["12345"], page_no=0)
+
+        # Should not match the hidden token
+        assert len(matches) == 0
+
+
+class TestContextKeywordsMapping:
+    """Tests for CONTEXT_KEYWORDS constant."""
+
+    def test_all_fields_have_keywords(self):
+        """Should have keywords for all expected fields."""
+        expected_fields = [
+            "InvoiceNumber",
+            "InvoiceDate",
+            "InvoiceDueDate",
+            "OCR",
+            "Bankgiro",
+            "Plusgiro",
+            "Amount",
+            "supplier_organisation_number",
+            "supplier_accounts",
+        ]
+        for field in expected_fields:
+            assert field in CONTEXT_KEYWORDS
+            assert len(CONTEXT_KEYWORDS[field]) > 0
+
+    def test_keywords_are_lowercase(self):
+        """All keywords should be lowercase."""
+        for field, keywords in CONTEXT_KEYWORDS.items():
+            for kw in keywords:
+                assert kw == kw.lower(), f"Keyword '{kw}' in {field} should be lowercase"
+
+
+class TestFindFieldMatches:
+    """Tests for find_field_matches convenience function."""
+
+    def test_finds_multiple_fields(self):
+        """Should find matches for multiple fields."""
+        tokens = [
+            MockToken("12345", (0, 0, 50, 20)),
+            MockToken("100,00", (0, 30, 60, 50)),
+        ]
+        field_values = {
+            "InvoiceNumber": "12345",
+            "Amount": "100",
+        }
+
+        results = find_field_matches(tokens, field_values)
+
+        assert "InvoiceNumber" in results
+        assert "Amount" in results
+        assert len(results["InvoiceNumber"]) >= 1
+        assert len(results["Amount"]) >= 1
+
+    def test_skips_empty_values(self):
+        """Should skip fields with None or empty values."""
+        tokens = [MockToken("12345", (0, 0, 50, 20))]
+        field_values = {
+            "InvoiceNumber": "12345",
+            "Amount": None,
+            "OCR": "",
+        }
+
+        results = find_field_matches(tokens, field_values)
+
+        assert "InvoiceNumber" in results
+        assert "Amount" not in results
+        assert "OCR" not in results
+
+
+class TestSubstringMatchEdgeCases:
+    """Additional edge case tests for substring matching."""
+
+    def test_unsupported_field_returns_empty(self):
+        """Should return empty for unsupported field types."""
+        # Line 380: field_name not in supported_fields
+        matcher = FieldMatcher()
+        tokens = [MockToken("Faktura: 12345", (0, 0, 100, 20))]
+
+        # Message is not a supported field for substring matching
+        matches = matcher._find_substring_matches(tokens, "12345", "Message")
+        assert len(matches) == 0
+
+    def test_case_insensitive_substring_match(self):
+        """Should find case-insensitive substring match."""
+        # Line 397-398: case-insensitive substring matching
+        matcher = FieldMatcher()
+        # Use token without inline keyword to isolate case-insensitive behavior
+        tokens = [MockToken("REF: ABC123", (0, 0, 100, 20))]
+
+        matches = matcher._find_substring_matches(tokens, "abc123", "InvoiceNumber")
+
+        assert len(matches) >= 1
+        # Case-insensitive base score is 0.70 (vs 0.75 for case-sensitive)
+        # Score may have context boost but base should be lower
+        assert matches[0].score <= 0.80  # 0.70 base + possible small boost
+
+    def test_substring_with_digit_before(self):
+        """Should not match when digit appears before value."""
+        # Line 407-408: char_before.isdigit() continue
+        matcher = FieldMatcher()
+        tokens = [MockToken("9912345", (0, 0, 60, 20))]
+
+        matches = matcher._find_substring_matches(tokens, "12345", "InvoiceNumber")
+        assert len(matches) == 0
+
+    def test_substring_with_digit_after(self):
+        """Should not match when digit appears after value."""
+        # Line 413-416: char_after.isdigit() continue
+        matcher = FieldMatcher()
+        tokens = [MockToken("12345678", (0, 0, 70, 20))]
+
+        matches = matcher._find_substring_matches(tokens, "12345", "InvoiceNumber")
+        assert len(matches) == 0
+
+    def test_substring_with_inline_keyword(self):
+        """Should boost score when keyword is in same token."""
+        matcher = FieldMatcher()
+        tokens = [MockToken("Fakturanr: 12345", (0, 0, 100, 20))]
+
+        matches = matcher._find_substring_matches(tokens, "12345", "InvoiceNumber")
+
+        assert len(matches) >= 1
+        # Should have inline keyword boost
+        assert "fakturanr" in matches[0].context_keywords
+
+
+class TestFlexibleDateMatchEdgeCases:
+    """Additional edge case tests for flexible date matching."""
+
+    def test_no_valid_date_in_normalized_values(self):
+        """Should return empty when no valid date in normalized values."""
+        # Line 520-521, 524: target_date parsing failures
+        matcher = FieldMatcher()
+        tokens = [MockToken("2025-01-15", (0, 0, 80, 20))]
+
+        # Pass non-date values
+        matches = matcher._find_flexible_date_matches(
+            tokens, ["not-a-date", "also-not-date"], "InvoiceDate"
+        )
+        assert len(matches) == 0
+
+    def test_no_date_tokens_found(self):
+        """Should return empty when no date tokens in document."""
+        # Line 571-572: no date_candidates
+        matcher = FieldMatcher()
+        tokens = [MockToken("Hello World", (0, 0, 80, 20))]
+
+        matches = matcher._find_flexible_date_matches(
+            tokens, ["2025-01-15"], "InvoiceDate"
+        )
+        assert len(matches) == 0
+
+    def test_flexible_date_within_7_days(self):
+        """Should score higher for dates within 7 days."""
+        # Line 582-583: days_diff <= 7
+        matcher = FieldMatcher(min_score_threshold=0.5)
+        tokens = [
+            MockToken("2025-01-18", (0, 0, 80, 20)),  # 3 days from target
+        ]
+
+        matches = matcher._find_flexible_date_matches(
+            tokens, ["2025-01-15"], "InvoiceDate"
+        )
+
+        assert len(matches) >= 1
+        assert matches[0].score >= 0.75
+
+    def test_flexible_date_within_3_days(self):
+        """Should score highest for dates within 3 days."""
+        # Line 584-585: days_diff <= 3
+        matcher = FieldMatcher(min_score_threshold=0.5)
+        tokens = [
+            MockToken("2025-01-17", (0, 0, 80, 20)),  # 2 days from target
+        ]
+
+        matches = matcher._find_flexible_date_matches(
+            tokens, ["2025-01-15"], "InvoiceDate"
+        )
+
+        assert len(matches) >= 1
+        assert matches[0].score >= 0.8
+
+    def test_flexible_date_within_14_days_different_month(self):
+        """Should match dates within 14 days even in different month."""
+        # Line 587-588: days_diff <= 14, different year-month
+        matcher = FieldMatcher(min_score_threshold=0.5)
+        tokens = [
+            MockToken("2025-02-05", (0, 0, 80, 20)),  # 10 days from Jan 26
+        ]
+
+        matches = matcher._find_flexible_date_matches(
+            tokens, ["2025-01-26"], "InvoiceDate"
+        )
+
+        assert len(matches) >= 1
+
+    def test_flexible_date_within_30_days(self):
+        """Should match dates within 30 days with lower score."""
+        # Line 589-590: days_diff <= 30
+        matcher = FieldMatcher(min_score_threshold=0.5)
+        tokens = [
+            MockToken("2025-02-10", (0, 0, 80, 20)),  # 25 days from target
+        ]
+
+        matches = matcher._find_flexible_date_matches(
+            tokens, ["2025-01-16"], "InvoiceDate"
+        )
+
+        assert len(matches) >= 1
+        assert matches[0].score >= 0.55
+
+    def test_flexible_date_far_apart_without_context(self):
+        """Should skip dates too far apart without context keywords."""
+        # Line 591-595: > 30 days, no context
+        matcher = FieldMatcher(min_score_threshold=0.5)
+        tokens = [
+            MockToken("2025-06-15", (0, 0, 80, 20)),  # Many months from target
+        ]
+
+        matches = matcher._find_flexible_date_matches(
+            tokens, ["2025-01-15"], "InvoiceDate"
+        )
+
+        # Should be empty - too far apart and no context
+        assert len(matches) == 0
+
+    def test_flexible_date_far_with_context(self):
+        """Should match distant dates if context keywords present."""
+        # Line 592-595: > 30 days but has context
+        matcher = FieldMatcher(min_score_threshold=0.5, context_radius=200)
+        tokens = [
+            MockToken("fakturadatum", (0, 0, 80, 20)),  # Context keyword
+            MockToken("2025-06-15", (90, 0, 170, 20)),  # Distant date
+        ]
+
+        matches = matcher._find_flexible_date_matches(
+            tokens, ["2025-01-15"], "InvoiceDate"
+        )
+
+        # May match due to context keyword
+        # (depends on how context is detected in flexible match)
+
+    def test_flexible_date_boost_with_context(self):
+        """Should boost flexible date score with context keywords."""
+        # Line 598, 602-603: context_boost applied
+        matcher = FieldMatcher(min_score_threshold=0.5, context_radius=200)
+        tokens = [
+            MockToken("fakturadatum", (0, 0, 80, 20)),
+            MockToken("2025-01-18", (90, 0, 170, 20)),  # 3 days from target
+        ]
+
+        matches = matcher._find_flexible_date_matches(
+            tokens, ["2025-01-15"], "InvoiceDate"
+        )
+
+        if len(matches) > 0:
+            assert len(matches[0].context_keywords) >= 0
+
+
+class TestContextKeywordFallback:
+    """Tests for context keyword lookup fallback (no spatial index)."""
+
+    def test_fallback_context_lookup_without_index(self):
+        """Should find context using O(n) scan when no index available."""
+        # Line 650-673: fallback context lookup
+        matcher = FieldMatcher(context_radius=200)
+        # Don't use find_matches which builds index, call internal method directly
+
+        tokens = [
+            MockToken("fakturanr", (0, 0, 80, 20)),
+            MockToken("12345", (100, 0, 150, 20)),
+        ]
+
+        # _token_index is None, so fallback is used
+        keywords, boost = matcher._find_context_keywords(tokens, tokens[1], "InvoiceNumber")
+
+        assert "fakturanr" in keywords
+        assert boost > 0
+
+    def test_context_lookup_skips_self(self):
+        """Should skip the target token itself in fallback search."""
+        # Line 656-657: token is target_token continue
+        matcher = FieldMatcher(context_radius=200)
+        matcher._token_index = None  # Force fallback
+
+        token = MockToken("fakturanr 12345", (0, 0, 150, 20))
+        tokens = [token]
+
+        keywords, boost = matcher._find_context_keywords(tokens, token, "InvoiceNumber")
+
+        # Token contains keyword but is the target - should still find if keyword in token
+        # Actually this tests that it doesn't error when target is in list
+
+
+class TestFieldWithoutContextKeywords:
+    """Tests for fields without defined context keywords."""
+
+    def test_field_without_keywords_returns_empty(self):
+        """Should return empty keywords for fields not in CONTEXT_KEYWORDS."""
+        # Line 633-635: keywords empty, return early
+        matcher = FieldMatcher()
+        matcher._token_index = None
+
+        tokens = [MockToken("hello", (0, 0, 50, 20))]
+
+        # customer_number is not in CONTEXT_KEYWORDS
+        keywords, boost = matcher._find_context_keywords(tokens, tokens[0], "UnknownField")
+
+        assert keywords == []
+        assert boost == 0.0
+
+
+class TestParseAmountEdgeCases:
+    """Additional edge case tests for _parse_amount."""
+
+    def test_parse_amount_with_parentheses(self):
+        """Should remove parenthesized text like (inkl. moms)."""
+        matcher = FieldMatcher()
+        result = matcher._parse_amount("100 (inkl. moms)")
+        assert result == 100.0
+
+    def test_parse_amount_with_kronor_suffix(self):
+        """Should handle 'kronor' suffix."""
+        matcher = FieldMatcher()
+        result = matcher._parse_amount("100 kronor")
+        assert result == 100.0
+
+    def test_parse_amount_numeric_input(self):
+        """Should handle numeric input (int/float)."""
+        matcher = FieldMatcher()
+        assert matcher._parse_amount(100) == 100.0
+        assert matcher._parse_amount(100.5) == 100.5
+
+
+class TestFuzzyMatchExceptionHandling:
+    """Tests for exception handling in fuzzy matching."""
+
+    def test_fuzzy_match_with_unparseable_token(self):
+        """Should handle tokens that can't be parsed as amounts."""
+        # Line 481-482: except clause in fuzzy matching
+        matcher = FieldMatcher()
+        # Create a token that will cause parse issues
+        tokens = [MockToken("abc xyz", (0, 0, 50, 20))]
+
+        # This should not raise, just return empty matches
+        matches = matcher._find_fuzzy_matches(tokens, "100", "Amount")
+        assert len(matches) == 0
+
+    def test_fuzzy_match_exception_in_context_lookup(self):
+        """Should catch exceptions during fuzzy match processing."""
+        # Line 481-482: general exception handler
+        from unittest.mock import patch, MagicMock
+
+        matcher = FieldMatcher()
+        tokens = [MockToken("100", (0, 0, 50, 20))]
+
+        # Mock _find_context_keywords to raise an exception
+        with patch.object(matcher, '_find_context_keywords', side_effect=RuntimeError("Test error")):
+            # Should not raise, exception should be caught
+            matches = matcher._find_fuzzy_matches(tokens, "100", "Amount")
+            # Should return empty due to exception
+            assert len(matches) == 0
+
+
+class TestFlexibleDateInvalidDateParsing:
+    """Tests for invalid date parsing in flexible date matching."""
+
+    def test_invalid_date_in_normalized_values(self):
+        """Should handle invalid dates in normalized values gracefully."""
+        # Line 520-521: ValueError continue in target date parsing
+        matcher = FieldMatcher()
+        tokens = [MockToken("2025-01-15", (0, 0, 80, 20))]
+
+        # Pass an invalid date that matches the pattern but is not a valid date
+        # e.g., 2025-13-45 matches pattern but month 13 is invalid
+        matches = matcher._find_flexible_date_matches(
+            tokens, ["2025-13-45"], "InvoiceDate"
+        )
+        # Should return empty as no valid target date could be parsed
+        assert len(matches) == 0
+
+    def test_invalid_date_token_in_document(self):
+        """Should skip invalid date-like tokens in document."""
+        # Line 568-569: ValueError continue in date token parsing
+        matcher = FieldMatcher(min_score_threshold=0.5)
+        tokens = [
+            MockToken("2025-99-99", (0, 0, 80, 20)),  # Invalid date in doc
+            MockToken("2025-01-18", (0, 50, 80, 70)), # Valid date
+        ]
+
+        matches = matcher._find_flexible_date_matches(
+            tokens, ["2025-01-15"], "InvoiceDate"
+        )
+
+        # Should only match the valid date
+        assert len(matches) >= 1
+        assert matches[0].value == "2025-01-18"
+
+    def test_flexible_date_with_inline_keyword(self):
+        """Should detect inline keywords in date tokens."""
+        # Line 555: inline_keywords append
+        matcher = FieldMatcher(min_score_threshold=0.5)
+        tokens = [
+            MockToken("Fakturadatum: 2025-01-18", (0, 0, 150, 20)),
+        ]
+
+        matches = matcher._find_flexible_date_matches(
+            tokens, ["2025-01-15"], "InvoiceDate"
+        )
+
+        # Should find match with inline keyword
+        assert len(matches) >= 1
+        assert "fakturadatum" in matches[0].context_keywords
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])