WIP
This commit is contained in:
@@ -14,6 +14,12 @@ from functools import cached_property
|
||||
# Matches an ISO-style date "YYYY-MM-DD"; the three groups capture the
# 4-digit, 2-digit and 2-digit components in that order.
_DATE_PATTERN = re.compile(r'(\d{4})-(\d{2})-(\d{2})')
|
||||
# Matches a run of one or more whitespace characters.
_WHITESPACE_PATTERN = re.compile(r'\s+')
|
||||
# Matches any single non-digit character; substituting it with '' leaves
# only the digits of a value.
_NON_DIGIT_PATTERN = re.compile(r'\D')
|
||||
_DASH_PATTERN = re.compile(r'[\u2013\u2014\u2212]') # en-dash, em-dash, minus sign
|
||||
|
||||
|
||||
def _normalize_dashes(text: str) -> str:
|
||||
"""Normalize different dash types to standard hyphen-minus (ASCII 45)."""
|
||||
return _DASH_PATTERN.sub('-', text)
|
||||
|
||||
|
||||
class TokenLike(Protocol):
|
||||
@@ -143,6 +149,9 @@ CONTEXT_KEYWORDS = {
|
||||
'Bankgiro': ['bankgiro', 'bg', 'bg-nr', 'bg nr'],
|
||||
'Plusgiro': ['plusgiro', 'pg', 'pg-nr', 'pg nr'],
|
||||
'Amount': ['att betala', 'summa', 'total', 'belopp', 'amount', 'totalt', 'att erlägga', 'sek', 'kr'],
|
||||
'supplier_organisation_number': ['organisationsnummer', 'org.nr', 'org nr', 'orgnr', 'org.nummer',
|
||||
'momsreg', 'momsnr', 'moms nr', 'vat', 'corporate id'],
|
||||
'supplier_accounts': ['konto', 'kontonr', 'konto nr', 'account', 'klientnr', 'kundnr'],
|
||||
}
|
||||
|
||||
|
||||
@@ -207,7 +216,10 @@ class FieldMatcher:
|
||||
|
||||
# Strategy 4: Substring match (for values embedded in longer text)
|
||||
# e.g., "Fakturanummer: 2465027205" should match OCR value "2465027205"
|
||||
if field_name in ('InvoiceDate', 'InvoiceDueDate', 'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro', 'Amount'):
|
||||
# Note: Amount is excluded because short numbers like "451" can incorrectly match
|
||||
# in OCR payment lines or other unrelated text
|
||||
if field_name in ('InvoiceDate', 'InvoiceDueDate', 'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro',
|
||||
'supplier_organisation_number', 'supplier_accounts'):
|
||||
substring_matches = self._find_substring_matches(page_tokens, value, field_name)
|
||||
matches.extend(substring_matches)
|
||||
|
||||
@@ -237,7 +249,8 @@ class FieldMatcher:
|
||||
"""Find tokens that exactly match the value."""
|
||||
matches = []
|
||||
value_lower = value.lower()
|
||||
value_digits = _NON_DIGIT_PATTERN.sub('', value) if field_name in ('InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro') else None
|
||||
value_digits = _NON_DIGIT_PATTERN.sub('', value) if field_name in ('InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro',
|
||||
'supplier_organisation_number', 'supplier_accounts') else None
|
||||
|
||||
for token in tokens:
|
||||
token_text = token.text.strip()
|
||||
@@ -355,33 +368,36 @@ class FieldMatcher:
|
||||
matches = []
|
||||
|
||||
# Supported fields for substring matching
|
||||
supported_fields = ('InvoiceDate', 'InvoiceDueDate', 'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro', 'Amount')
|
||||
supported_fields = ('InvoiceDate', 'InvoiceDueDate', 'InvoiceNumber', 'OCR', 'Bankgiro', 'Plusgiro', 'Amount',
|
||||
'supplier_organisation_number', 'supplier_accounts')
|
||||
if field_name not in supported_fields:
|
||||
return matches
|
||||
|
||||
for token in tokens:
|
||||
token_text = token.text.strip()
|
||||
# Normalize different dash types to hyphen-minus for matching
|
||||
token_text_normalized = _normalize_dashes(token_text)
|
||||
|
||||
# Skip if token is the same length as value (would be exact match)
|
||||
if len(token_text) <= len(value):
|
||||
if len(token_text_normalized) <= len(value):
|
||||
continue
|
||||
|
||||
# Check if value appears as substring
|
||||
if value in token_text:
|
||||
# Check if value appears as substring (using normalized text)
|
||||
if value in token_text_normalized:
|
||||
# Verify it's a proper boundary match (not part of a larger number)
|
||||
idx = token_text.find(value)
|
||||
idx = token_text_normalized.find(value)
|
||||
|
||||
# Check character before (if exists)
|
||||
if idx > 0:
|
||||
char_before = token_text[idx - 1]
|
||||
char_before = token_text_normalized[idx - 1]
|
||||
# Must be non-digit (allow : space - etc)
|
||||
if char_before.isdigit():
|
||||
continue
|
||||
|
||||
# Check character after (if exists)
|
||||
end_idx = idx + len(value)
|
||||
if end_idx < len(token_text):
|
||||
char_after = token_text[end_idx]
|
||||
if end_idx < len(token_text_normalized):
|
||||
char_after = token_text_normalized[end_idx]
|
||||
# Must be non-digit
|
||||
if char_after.isdigit():
|
||||
continue
|
||||
|
||||
Reference in New Issue
Block a user