Re-structure the project.

This commit is contained in:
Yaojia Wang
2026-01-25 15:21:11 +01:00
parent 8fd61ea928
commit e599424a92
80 changed files with 10672 additions and 1584 deletions

View File

@@ -0,0 +1 @@
# Strategy tests

View File

@@ -0,0 +1,69 @@
"""
Tests for the ExactMatcher strategy.

Usage:
    pytest tests/matcher/strategies/test_exact_matcher.py -v
"""
import pytest
from dataclasses import dataclass
from src.matcher.strategies.exact_matcher import ExactMatcher
@dataclass
class MockToken:
    """Minimal stand-in for a document token, used as matcher input in tests."""

    # Text content of the token.
    text: str
    # Bounding box as four floats. The tests pass values such as
    # (100, 100, 200, 120) -- presumably (x0, y0, x1, y1); confirm against
    # the real token type.
    bbox: tuple[float, float, float, float]
    # Page number of the token; defaults to 0 when not given.
    page_no: int = 0
class TestExactMatcher:
    """Behavioral tests for ``ExactMatcher.find_matches``.

    Each test passes a small list of ``MockToken`` objects, a value to look
    for and the field name ``'InvoiceNumber'`` to the matcher, then checks
    how many matches come back and, where relevant, their score.
    """

    @pytest.fixture
    def matcher(self):
        """Provide an ``ExactMatcher`` built with ``context_radius=200.0``."""
        return ExactMatcher(context_radius=200.0)

    def test_exact_match(self, matcher):
        """A token whose text equals the value yields one match scored 1.0."""
        tokens = [
            MockToken('100017500321', (100, 100, 200, 120)),
        ]

        matches = matcher.find_matches(tokens, '100017500321', 'InvoiceNumber')

        assert len(matches) == 1
        assert matches[0].score == 1.0
        assert matches[0].matched_text == '100017500321'

    def test_case_insensitive_match(self, matcher):
        """A value differing from the token only in letter case scores 0.9."""
        tokens = [
            MockToken('INV-12345', (100, 100, 200, 120)),
        ]

        matches = matcher.find_matches(tokens, 'inv-12345', 'InvoiceNumber')

        assert len(matches) == 1
        # Without a token_index the case-insensitive path is not taken; the
        # match comes from the digits-only comparison, hence 0.9.
        assert matches[0].score == 0.9

    def test_digits_only_match(self, matcher):
        """A digits-only match for a numeric field scores 0.9."""
        tokens = [
            MockToken('INV-12345', (100, 100, 200, 120)),
        ]

        matches = matcher.find_matches(tokens, '12345', 'InvoiceNumber')

        assert len(matches) == 1
        assert matches[0].score == 0.9

    def test_no_match(self, matcher):
        """A value that matches no token yields an empty result."""
        tokens = [
            MockToken('100017500321', (100, 100, 200, 120)),
        ]

        matches = matcher.find_matches(tokens, '999999', 'InvoiceNumber')

        assert len(matches) == 0

    def test_empty_tokens(self, matcher):
        """An empty token list yields an empty result."""
        matches = matcher.find_matches([], '100017500321', 'InvoiceNumber')

        assert len(matches) == 0