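Fix field matching: add customer_number to substring matching and fall back to case-insensitive matches with a lower score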
This commit is contained in:
@@ -219,7 +219,7 @@ class FieldMatcher:
|
||||
# Note: Amount is excluded because short numbers like "451" can incorrectly match
|
||||
# in OCR payment lines or other unrelated text
|
||||
if field_name in ('InvoiceDate', 'InvoiceDueDate', 'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro',
|
||||
'supplier_organisation_number', 'supplier_accounts'):
|
||||
'supplier_organisation_number', 'supplier_accounts', 'customer_number'):
|
||||
substring_matches = self._find_substring_matches(page_tokens, value, field_name)
|
||||
matches.extend(substring_matches)
|
||||
|
||||
@@ -369,7 +369,7 @@ class FieldMatcher:
|
||||
|
||||
# Supported fields for substring matching
|
||||
supported_fields = ('InvoiceDate', 'InvoiceDueDate', 'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro', 'Amount',
|
||||
'supplier_organisation_number', 'supplier_accounts')
|
||||
'supplier_organisation_number', 'supplier_accounts', 'customer_number')
|
||||
if field_name not in supported_fields:
|
||||
return matches
|
||||
|
||||
@@ -383,49 +383,59 @@ class FieldMatcher:
|
||||
continue
|
||||
|
||||
# Check if value appears as substring (using normalized text)
|
||||
# Try case-sensitive first, then case-insensitive
|
||||
if value in token_text_normalized:
|
||||
# Verify it's a proper boundary match (not part of a larger number)
|
||||
idx = token_text_normalized.find(value)
|
||||
case_sensitive_match = True
|
||||
elif value.lower() in token_text_normalized.lower():
|
||||
idx = token_text_normalized.lower().find(value.lower())
|
||||
case_sensitive_match = False
|
||||
else:
|
||||
continue
|
||||
|
||||
# Check character before (if exists)
|
||||
if idx > 0:
|
||||
char_before = token_text_normalized[idx - 1]
|
||||
# Must be non-digit (allow : space - etc)
|
||||
if char_before.isdigit():
|
||||
continue
|
||||
# Verify it's a proper boundary match (not part of a larger number)
|
||||
# Check character before (if exists)
|
||||
if idx > 0:
|
||||
char_before = token_text_normalized[idx - 1]
|
||||
# Must be non-digit (allow : space - etc)
|
||||
if char_before.isdigit():
|
||||
continue
|
||||
|
||||
# Check character after (if exists)
|
||||
end_idx = idx + len(value)
|
||||
if end_idx < len(token_text_normalized):
|
||||
char_after = token_text_normalized[end_idx]
|
||||
# Must be non-digit
|
||||
if char_after.isdigit():
|
||||
continue
|
||||
# Check character after (if exists)
|
||||
end_idx = idx + len(value)
|
||||
if end_idx < len(token_text_normalized):
|
||||
char_after = token_text_normalized[end_idx]
|
||||
# Must be non-digit
|
||||
if char_after.isdigit():
|
||||
continue
|
||||
|
||||
# Found valid substring match
|
||||
context_keywords, context_boost = self._find_context_keywords(
|
||||
tokens, token, field_name
|
||||
)
|
||||
# Found valid substring match
|
||||
context_keywords, context_boost = self._find_context_keywords(
|
||||
tokens, token, field_name
|
||||
)
|
||||
|
||||
# Check if context keyword is in the same token (like "Fakturadatum:")
|
||||
token_lower = token_text.lower()
|
||||
inline_context = []
|
||||
for keyword in CONTEXT_KEYWORDS.get(field_name, []):
|
||||
if keyword in token_lower:
|
||||
inline_context.append(keyword)
|
||||
# Check if context keyword is in the same token (like "Fakturadatum:")
|
||||
token_lower = token_text.lower()
|
||||
inline_context = []
|
||||
for keyword in CONTEXT_KEYWORDS.get(field_name, []):
|
||||
if keyword in token_lower:
|
||||
inline_context.append(keyword)
|
||||
|
||||
# Boost score if keyword is inline
|
||||
inline_boost = 0.1 if inline_context else 0
|
||||
# Boost score if keyword is inline
|
||||
inline_boost = 0.1 if inline_context else 0
|
||||
|
||||
matches.append(Match(
|
||||
field=field_name,
|
||||
value=value,
|
||||
bbox=token.bbox, # Use full token bbox
|
||||
page_no=token.page_no,
|
||||
score=min(1.0, 0.75 + context_boost + inline_boost), # Lower than exact match
|
||||
matched_text=token_text,
|
||||
context_keywords=context_keywords + inline_context
|
||||
))
|
||||
# Lower score for case-insensitive match
|
||||
base_score = 0.75 if case_sensitive_match else 0.70
|
||||
|
||||
matches.append(Match(
|
||||
field=field_name,
|
||||
value=value,
|
||||
bbox=token.bbox, # Use full token bbox
|
||||
page_no=token.page_no,
|
||||
score=min(1.0, base_score + context_boost + inline_boost),
|
||||
matched_text=token_text,
|
||||
context_keywords=context_keywords + inline_context
|
||||
))
|
||||
|
||||
return matches
|
||||
|
||||
|
||||
Reference in New Issue
Block a user