This commit is contained in:
Yaojia Wang
2026-01-22 22:03:24 +01:00
parent 4ea4bc96d4
commit 8fd61ea928
19 changed files with 4069 additions and 226 deletions

View File

@@ -42,6 +42,7 @@ from dataclasses import dataclass, field
from typing import Optional
from src.pdf.extractor import Token as TextToken
from src.utils.validators import FieldValidators
@dataclass
@@ -484,21 +485,42 @@ class MachineCodeParser:
def format_account(account_digits: str) -> tuple[str, str]:
    """Format an account number and decide whether it is Bankgiro or Plusgiro.

    An explicit Plusgiro context (the enclosing ``is_plusgiro_context`` flag)
    always wins. Without it, both ``FieldValidators`` checks are run on the
    raw digits and Plusgiro is chosen only when it is the sole candidate
    that validates; every other outcome falls back to Bankgiro.

    Args:
        account_digits: The account number without separators. The caller
            is expected to pass digits only; that is not verified here.

    Returns:
        ``(formatted_account, account_type)`` where ``account_type`` is
        ``'plusgiro'`` or ``'bankgiro'``. An empty input is returned
        unformatted instead of raising ``IndexError``.
    """
    if not account_digits:
        # Nothing to split into body/check digit; report the type implied
        # by the context and leave the (empty) value untouched.
        return account_digits, ('plusgiro' if is_plusgiro_context else 'bankgiro')

    # Plusgiro layout: all leading digits, a dash, then the check digit.
    pg_formatted = f"{account_digits[:-1]}-{account_digits[-1]}"

    if is_plusgiro_context:
        # Context keywords explicitly indicate Plusgiro; skip validation.
        return pg_formatted, 'plusgiro'

    # No explicit context: let the check-digit validators arbitrate.
    pg_valid = FieldValidators.is_valid_plusgiro(account_digits)
    bg_valid = FieldValidators.is_valid_bankgiro(account_digits)

    if pg_valid and not bg_valid:
        return pg_formatted, 'plusgiro'

    # Bankgiro validates, or both/neither validate: default to Bankgiro
    # (more common in payment lines).
    # Bankgiro layout: XXX-XXXX (7 digits) or XXXX-XXXX (8 digits);
    # any other length is passed through unformatted.
    if len(account_digits) == 7:
        bg_formatted = f"{account_digits[:3]}-{account_digits[3:]}"
    elif len(account_digits) == 8:
        bg_formatted = f"{account_digits[:4]}-{account_digits[4:]}"
    else:
        bg_formatted = account_digits
    return bg_formatted, 'bankgiro'
# Try primary pattern
match = self.PAYMENT_LINE_PATTERN.search(raw_line)