Add payment line parser and fix OCR override from payment_line
- Add MachineCodeParser for Swedish invoice payment line parsing
- Fix OCR Reference extraction by normalizing account number spaces
- Add cross-validation tests for pipeline and field_extractor
- Update UI layout for compact upload and full-width results

Key changes:
- machine_code_parser.py: Handle spaces in Bankgiro numbers (e.g. "78 2 1 713")
- pipeline.py: OCR and Amount override from payment_line, BG/PG comparison only
- field_extractor.py: Improved invoice number normalization
- app.py: Responsive UI layout changes

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -43,8 +43,8 @@ class FieldNormalizer:
|
||||
# Remove zero-width characters
|
||||
text = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text)
|
||||
# Normalize different dash types to standard hyphen-minus (ASCII 45)
|
||||
# en-dash (–, U+2013), em-dash (—, U+2014), minus sign (−, U+2212)
|
||||
text = re.sub(r'[\u2013\u2014\u2212]', '-', text)
|
||||
# en-dash (–, U+2013), em-dash (—, U+2014), minus sign (−, U+2212), middle dot (·, U+00B7)
|
||||
text = re.sub(r'[\u2013\u2014\u2212\u00b7]', '-', text)
|
||||
# Normalize whitespace
|
||||
text = ' '.join(text.split())
|
||||
return text.strip()
|
||||
@@ -571,6 +571,15 @@ class FieldNormalizer:
|
||||
# Short year with dot separator (e.g., 02.01.26)
|
||||
eu_dot_short = parsed_date.strftime('%d.%m.%y')
|
||||
|
||||
# Short year with slash separator (e.g., 20/10/24) - DD/MM/YY format
|
||||
eu_slash_short = parsed_date.strftime('%d/%m/%y')
|
||||
|
||||
# Short year with hyphen separator (e.g., 23-11-01) - common in Swedish invoices
|
||||
yy_mm_dd_short = parsed_date.strftime('%y-%m-%d')
|
||||
|
||||
# Middle dot separator (OCR sometimes reads hyphens as middle dots)
|
||||
iso_middot = parsed_date.strftime('%Y·%m·%d')
|
||||
|
||||
# Spaced formats (e.g., "2026 01 12", "26 01 12")
|
||||
spaced_full = parsed_date.strftime('%Y %m %d')
|
||||
spaced_short = parsed_date.strftime('%y %m %d')
|
||||
@@ -581,10 +590,23 @@ class FieldNormalizer:
|
||||
swedish_format_full = f"{parsed_date.day} {month_full} {parsed_date.year}"
|
||||
swedish_format_abbrev = f"{parsed_date.day} {month_abbrev} {parsed_date.year}"
|
||||
|
||||
# Swedish month abbreviation with hyphen (e.g., "30-OKT-24", "30-okt-24")
|
||||
month_abbrev_upper = month_abbrev.upper()
|
||||
swedish_hyphen_short = f"{parsed_date.day:02d}-{month_abbrev_upper}-{parsed_date.strftime('%y')}"
|
||||
swedish_hyphen_short_lower = f"{parsed_date.day:02d}-{month_abbrev}-{parsed_date.strftime('%y')}"
|
||||
# Also without leading zero on day
|
||||
swedish_hyphen_short_no_zero = f"{parsed_date.day}-{month_abbrev_upper}-{parsed_date.strftime('%y')}"
|
||||
|
||||
# Swedish month abbreviation with short year in different format (e.g., "SEP-24", "30 SEP 24")
|
||||
month_year_only = f"{month_abbrev_upper}-{parsed_date.strftime('%y')}"
|
||||
swedish_spaced = f"{parsed_date.day:02d} {month_abbrev_upper} {parsed_date.strftime('%y')}"
|
||||
|
||||
variants.extend([
|
||||
iso, eu_slash, us_slash, eu_dot, iso_dot, compact, compact_short,
|
||||
eu_dot_short, spaced_full, spaced_short,
|
||||
swedish_format_full, swedish_format_abbrev
|
||||
eu_dot_short, eu_slash_short, yy_mm_dd_short, iso_middot, spaced_full, spaced_short,
|
||||
swedish_format_full, swedish_format_abbrev,
|
||||
swedish_hyphen_short, swedish_hyphen_short_lower, swedish_hyphen_short_no_zero,
|
||||
month_year_only, swedish_spaced
|
||||
])
|
||||
|
||||
return list(set(v for v in variants if v))
|
||||
|
||||
Reference in New Issue
Block a user