Re-structure the project.

New file: src/matcher/context.py (92 lines).
"""
Context keywords for field matching.
"""

from .models import TokenLike
from .token_index import TokenIndex


# Context keywords for each field type (Swedish invoice terms).
# Maps a field name to the lowercase label strings that, when found near a
# candidate token, raise confidence that the token belongs to that field.
# NOTE(review): lookups downstream appear to be substring-based, so very
# short entries such as 'nr', 'bg', and 'pg' may over-match — confirm
# against find_context_keywords usage.
CONTEXT_KEYWORDS = {
    # Invoice identifier labels ('nr' alone is a generic fallback).
    'InvoiceNumber': ['fakturanr', 'fakturanummer', 'invoice', 'inv.nr', 'inv nr', 'nr'],
    # Issue-date labels.
    'InvoiceDate': ['fakturadatum', 'datum', 'date', 'utfärdad', 'utskriftsdatum', 'dokumentdatum'],
    # Due-date labels ("pay no later than" phrasings included).
    'InvoiceDueDate': ['förfallodatum', 'förfaller', 'due date', 'betalas senast', 'att betala senast',
                       'förfallodag', 'oss tillhanda senast', 'senast'],
    # Swedish OCR payment-reference labels.
    'OCR': ['ocr', 'referens', 'betalningsreferens', 'ref'],
    # Bankgiro account labels.
    'Bankgiro': ['bankgiro', 'bg', 'bg-nr', 'bg nr'],
    # Plusgiro account labels.
    'Plusgiro': ['plusgiro', 'pg', 'pg-nr', 'pg nr'],
    # Total-amount labels, including currency markers.
    'Amount': ['att betala', 'summa', 'total', 'belopp', 'amount', 'totalt', 'att erlägga', 'sek', 'kr'],
    # Organisation / VAT registration number labels.
    'supplier_organisation_number': ['organisationsnummer', 'org.nr', 'org nr', 'orgnr', 'org.nummer',
                                     'momsreg', 'momsnr', 'moms nr', 'vat', 'corporate id'],
    # Supplier account / customer number labels.
    'supplier_accounts': ['konto', 'kontonr', 'konto nr', 'account', 'klientnr', 'kundnr'],
}


def find_context_keywords(
    tokens: list[TokenLike],
    target_token: TokenLike,
    field_name: str,
    context_radius: float,
    token_index: TokenIndex | None = None
) -> tuple[list[str], float]:
    """Locate field-specific context keywords near *target_token*.

    When a spatial index is supplied, neighbour lookup is delegated to it
    (O(1) average); otherwise every token is compared against the target
    by center-to-center Euclidean distance.

    Args:
        tokens: All tokens on the page.
        target_token: Candidate token whose surroundings are inspected.
        field_name: Field whose CONTEXT_KEYWORDS entry applies.
        context_radius: Maximum center distance, in pixels.
        token_index: Optional spatial index for fast neighbour lookup.

    Returns:
        Tuple of (keywords_found, boost), where boost adds 0.10 per hit
        capped at 0.25. A keyword is recorded once per nearby token that
        contains it, so the list may hold repeats.
    """
    keywords = CONTEXT_KEYWORDS.get(field_name, [])
    if not keywords:
        return [], 0.0

    hits: list[str] = []

    if token_index:
        # Indexed path: the index supplies neighbours and cached lowercase text.
        # NOTE(review): unlike the fallback scan below, this path does not skip
        # target_token itself — confirm that find_nearby() excludes it.
        for neighbour in token_index.find_nearby(target_token, context_radius):
            text = token_index.get_text_lower(neighbour)
            hits.extend(kw for kw in keywords if kw in text)
    else:
        # Linear fallback: O(n) distance check against every other token.
        def center(tok):
            # Midpoint of the token's (x0, y0, x1, y1) bounding box.
            box = tok.bbox
            return (box[0] + box[2]) / 2, (box[1] + box[3]) / 2

        tx, ty = center(target_token)
        for tok in tokens:
            if tok is target_token:
                continue
            cx, cy = center(tok)
            if ((tx - cx) ** 2 + (ty - cy) ** 2) ** 0.5 <= context_radius:
                text = tok.text.lower()
                hits.extend(kw for kw in keywords if kw in text)

    # Each hit adds 0.10, capped so context alone never dominates the score.
    return hits, min(0.25, len(hits) * 0.10)