code issue fix

This commit is contained in:
Yaojia Wang
2026-01-17 18:55:46 +01:00
parent 510890d18c
commit e9460e9f34
9 changed files with 729 additions and 57 deletions

View File

@@ -219,7 +219,7 @@ class FieldMatcher:
# Note: Amount is excluded because short numbers like "451" can incorrectly match
# in OCR payment lines or other unrelated text
if field_name in ('InvoiceDate', 'InvoiceDueDate', 'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro',
'supplier_organisation_number', 'supplier_accounts'):
'supplier_organisation_number', 'supplier_accounts', 'customer_number'):
substring_matches = self._find_substring_matches(page_tokens, value, field_name)
matches.extend(substring_matches)
@@ -369,7 +369,7 @@ class FieldMatcher:
# Supported fields for substring matching
supported_fields = ('InvoiceDate', 'InvoiceDueDate', 'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro', 'Amount',
'supplier_organisation_number', 'supplier_accounts')
'supplier_organisation_number', 'supplier_accounts', 'customer_number')
if field_name not in supported_fields:
return matches
@@ -383,49 +383,59 @@ class FieldMatcher:
continue
# Check if value appears as substring (using normalized text)
# Try case-sensitive first, then case-insensitive
if value in token_text_normalized:
# Verify it's a proper boundary match (not part of a larger number)
idx = token_text_normalized.find(value)
case_sensitive_match = True
elif value.lower() in token_text_normalized.lower():
idx = token_text_normalized.lower().find(value.lower())
case_sensitive_match = False
else:
continue
# Check character before (if exists)
if idx > 0:
char_before = token_text_normalized[idx - 1]
# Must not be a digit; any other character (':', space, '-', letters, ...) is accepted
if char_before.isdigit():
continue
# Verify it's a proper boundary match (not part of a larger number)
# Check character before (if exists)
if idx > 0:
char_before = token_text_normalized[idx - 1]
# Must not be a digit; any other character (':', space, '-', letters, ...) is accepted
if char_before.isdigit():
continue
# Check character after (if exists)
end_idx = idx + len(value)
if end_idx < len(token_text_normalized):
char_after = token_text_normalized[end_idx]
# Must be non-digit
if char_after.isdigit():
continue
# Check character after (if exists)
end_idx = idx + len(value)
if end_idx < len(token_text_normalized):
char_after = token_text_normalized[end_idx]
# Must be non-digit
if char_after.isdigit():
continue
# Found valid substring match
context_keywords, context_boost = self._find_context_keywords(
tokens, token, field_name
)
# Found valid substring match
context_keywords, context_boost = self._find_context_keywords(
tokens, token, field_name
)
# Check if context keyword is in the same token (like "Fakturadatum:")
token_lower = token_text.lower()
inline_context = []
for keyword in CONTEXT_KEYWORDS.get(field_name, []):
if keyword in token_lower:
inline_context.append(keyword)
# Check if context keyword is in the same token (like "Fakturadatum:")
token_lower = token_text.lower()
inline_context = []
for keyword in CONTEXT_KEYWORDS.get(field_name, []):
if keyword in token_lower:
inline_context.append(keyword)
# Boost score if keyword is inline
inline_boost = 0.1 if inline_context else 0
# Boost score if keyword is inline
inline_boost = 0.1 if inline_context else 0
matches.append(Match(
field=field_name,
value=value,
bbox=token.bbox, # Use full token bbox
page_no=token.page_no,
score=min(1.0, 0.75 + context_boost + inline_boost), # Lower than exact match
matched_text=token_text,
context_keywords=context_keywords + inline_context
))
# Lower score for case-insensitive match
base_score = 0.75 if case_sensitive_match else 0.70
matches.append(Match(
field=field_name,
value=value,
bbox=token.bbox, # Use full token bbox
page_no=token.page_no,
score=min(1.0, base_score + context_boost + inline_boost),
matched_text=token_text,
context_keywords=context_keywords + inline_context
))
return matches