- Add MachineCodeParser for Swedish invoice payment line parsing - Fix OCR Reference extraction by normalizing account number spaces - Add cross-validation tests for pipeline and field_extractor - Update UI layout for compact upload and full-width results Key changes: - machine_code_parser.py: Handle spaces in Bankgiro numbers (e.g. "78 2 1 713") - pipeline.py: OCR and Amount override from payment_line, BG/PG comparison only - field_extractor.py: Improved invoice number normalization - app.py: Responsive UI layout changes Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
898 lines
33 KiB
Python
898 lines
33 KiB
Python
"""
|
|
Machine Code Line Parser for Swedish Invoices
|
|
|
|
Parses the bottom machine-readable payment line to extract:
|
|
- OCR reference number (10-25 digits)
|
|
- Amount (payment amount in SEK)
|
|
- Bankgiro account number (XXX-XXXX or XXXX-XXXX format)
|
|
- Plusgiro account number (XXXXXXX-X format)
|
|
|
|
The machine code line is typically found at the bottom of Swedish invoices,
|
|
in the payment slip (Inbetalningskort) section. It contains machine-readable
|
|
data for automated payment processing.
|
|
|
|
## Swedish Payment Line Standard Format
|
|
|
|
The standard machine-readable payment line follows this structure:
|
|
|
|
# <OCR> # <Kronor> <Öre> <Type> > <Bankgiro>#<Control>#
|
|
|
|
Example:
|
|
# 31130954410 # 315 00 2 > 8983025#14#
|
|
|
|
Components:
|
|
- `#` - Start delimiter
|
|
- `31130954410` - OCR number (with Mod 10 check digit)
|
|
- `#` - Separator
|
|
- `315 00` - Amount: 315 SEK and 00 öre (315.00 SEK)
|
|
- `2` - Payment type / record type
|
|
- `>` - Points to recipient info
|
|
- `8983025` - Bankgiro number
|
|
- `#14#` - End marker with control code
|
|
|
|
Legacy patterns also supported:
|
|
- OCR: 8120000849965361 (10-25 consecutive digits)
|
|
- Bankgiro: 5393-9484 or 53939484
|
|
- Plusgiro: 1234567-8
|
|
- Amount: 1234,56 or 1234.56 (with decimal separator)
|
|
"""
|
|
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from typing import Optional
|
|
|
|
from src.pdf.extractor import Token as TextToken
|
|
|
|
|
|
@dataclass
class MachineCodeResult:
    """Container for the fields recovered from a machine-readable payment line.

    Every extracted field defaults to ``None`` (nothing found); ``confidence``
    defaults to ``0.0`` and ``source_tokens`` to an empty list.
    """

    ocr: Optional[str] = None
    amount: Optional[str] = None
    bankgiro: Optional[str] = None
    plusgiro: Optional[str] = None
    confidence: float = 0.0
    source_tokens: list[TextToken] = field(default_factory=list)
    raw_line: str = ""
    # Region bounding box in PDF coordinates (x0, y0, x1, y1)
    region_bbox: Optional[tuple[float, float, float, float]] = None

    def to_dict(self) -> dict:
        """Return the serializable fields as a plain dictionary.

        ``source_tokens`` is intentionally left out of the output.
        """
        exported = (
            'ocr',
            'amount',
            'bankgiro',
            'plusgiro',
            'confidence',
            'raw_line',
            'region_bbox',
        )
        return {name: getattr(self, name) for name in exported}

    def get_region_bbox(self) -> Optional[tuple[float, float, float, float]]:
        """
        Get the bounding box of the payment slip region.

        An explicitly stored ``region_bbox`` wins; otherwise the box is the
        union of the bounding boxes of ``source_tokens``.

        Returns:
            Tuple (x0, y0, x1, y1) in PDF coordinates, or None if no region detected
        """
        if self.region_bbox:
            return self.region_bbox

        if not self.source_tokens:
            return None

        # Union of all token boxes: smallest top-left, largest bottom-right.
        boxes = [token.bbox for token in self.source_tokens]
        left = min(box[0] for box in boxes)
        top = min(box[1] for box in boxes)
        right = max(box[2] for box in boxes)
        bottom = max(box[3] for box in boxes)
        return (left, top, right, bottom)
|
|
|
|
|
|
class MachineCodeParser:
    """
    Parser for machine-readable payment lines on Swedish invoices.

    The parser focuses on the bottom region of the invoice where
    the payment slip (Inbetalningskort) is typically located. It first
    tries to recognise a complete payment line with a single regex match
    and, when that fails, extracts each field independently.
    """

    # Payment slip detection keywords (Swedish)
    PAYMENT_SLIP_KEYWORDS = [
        'inbetalning', 'girering', 'avi', 'betalning',
        'plusgiro', 'postgiro', 'bankgiro', 'bankgirot',
        'betalningsavsändare', 'betalningsmottagare',
        'maskinellt', 'ändringar',  # "DEN AVLÄSES MASKINELLT"
    ]

    # Patterns for field extraction
    # OCR: 10-25 consecutive digits (may have spaces or # at end)
    OCR_PATTERN = re.compile(r'(?<!\d)(\d{10,25})(?!\d)')

    # Bankgiro: XXX-XXXX or XXXX-XXXX (7-8 digits with optional dash)
    BANKGIRO_PATTERN = re.compile(r'\b(\d{3,4}[-\s]?\d{4})\b')

    # Plusgiro: XXXXXXX-X (7-8 digits with dash before last digit)
    PLUSGIRO_PATTERN = re.compile(r'\b(\d{6,7}[-\s]?\d)\b')

    # Amount: digits with comma or dot as decimal separator
    # Supports formats: 1234,56 | 1234.56 | 1 234,56 | 1.234,56
    # The integer part is either grouped in thousands (at least one group
    # separator) or a plain digit run; the plain alternative is what lets
    # ungrouped amounts of four or more digits such as "1234,56" match.
    AMOUNT_PATTERN = re.compile(
        r'\b((?:\d{1,3}(?:[\s\.\xa0]\d{3})+|\d+)[,\.]\d{2})\b'
    )

    # Alternative amount pattern for integers (no decimal)
    AMOUNT_INTEGER_PATTERN = re.compile(r'\b(\d{2,6})\b')

    # Standard Swedish payment line pattern
    # Format: # <OCR> # <Kronor> <Öre> <Type> > <Bankgiro/Plusgiro>#<Control>#
    # Example: # 31130954410 # 315 00 2 > 8983025#14#
    # This pattern captures both Bankgiro and Plusgiro accounts
    PAYMENT_LINE_PATTERN = re.compile(
        r'#\s*'              # Start delimiter
        r'(\d{5,25})\s*'     # OCR number (capture group 1)
        r'#\s*'              # Separator
        r'(\d{1,7})\s+'      # Kronor (capture group 2)
        r'(\d{2})\s+'        # Öre (capture group 3)
        r'(\d)\s*'           # Type (capture group 4)
        r'>\s*'              # Direction marker
        r'(\d{5,10})'        # Bankgiro/Plusgiro (capture group 5)
        r'(?:#\d{1,3}#)?'    # Optional end marker
    )

    # Alternative pattern with different spacing
    PAYMENT_LINE_PATTERN_ALT = re.compile(
        r'#?\s*'             # Optional start delimiter
        r'(\d{8,25})\s*'     # OCR number
        r'#?\s*'             # Optional separator
        r'(\d{1,7})\s+'      # Kronor
        r'(\d{2})\s+'        # Öre
        r'\d\s*'             # Type
        r'>?\s*'             # Optional direction marker
        r'(\d{5,10})'        # Bankgiro
    )

    # Reverse format pattern (Bankgiro first, then OCR)
    # Format: <Bankgiro>#<Control># <Kronor> <Öre> <Type> > <OCR> #
    # Example: 53241469#41# 2428 00 1 > 4388595300 #
    PAYMENT_LINE_PATTERN_REVERSE = re.compile(
        r'(\d{7,8})'         # Bankgiro (capture group 1)
        r'#\d{1,3}#\s+'      # Control marker
        r'(\d{1,7})\s+'      # Kronor (capture group 2)
        r'(\d{2})\s+'        # Öre (capture group 3)
        r'\d\s*'             # Type
        r'>\s*'              # Direction marker
        r'(\d{5,25})'        # OCR number (capture group 4)
    )

    # --- private helper patterns -------------------------------------
    # "-D" directly after a digit run: the tail of a Plusgiro number.
    _PLUSGIRO_TAIL = re.compile(r'-\d(?!\d)')
    # Control marker such as "#14#".
    _CONTROL_MARKER = re.compile(r'#\d+#')
    # Whitespace that sits between two digits.
    _DIGIT_GAP = re.compile(r'(?<=\d)\s+(?=\d)')
    # A token that is nothing but a 1-6 digit integer.
    _PURE_INTEGER = re.compile(r'^(\d{1,6})$')

    def __init__(self, bottom_region_ratio: float = 0.35):
        """
        Initialize the parser.

        Args:
            bottom_region_ratio: Fraction of page height to consider as bottom region.
                Default 0.35 means bottom 35% of the page.
        """
        self.bottom_region_ratio = bottom_region_ratio

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _union_bbox(tokens) -> tuple[float, float, float, float]:
        """Return the smallest box (x0, y0, x1, y1) enclosing all token boxes."""
        return (
            min(t.bbox[0] for t in tokens),
            min(t.bbox[1] for t in tokens),
            max(t.bbox[2] for t in tokens),
            max(t.bbox[3] for t in tokens),
        )

    @staticmethod
    def _is_plusgiro_only(context: str) -> bool:
        """True when lower-cased *context* mentions Plusgiro/Postgiro but not Bankgiro."""
        return (
            ('plusgiro' in context or 'postgiro' in context)
            and 'bankgiro' not in context
        )

    @staticmethod
    def _format_amount(kronor: str, ore: str) -> str:
        """Join kronor and öre as "K,ÖÖ"; whole-krona amounts drop the öre part."""
        return kronor if ore == "00" else f"{kronor},{ore}"

    @staticmethod
    def _is_machine_code_text(text: str) -> bool:
        """True when *text* has a digit, '#' or '>' (payment-line material)."""
        return bool(re.search(r'\d', text)) or '#' in text or '>' in text

    def _tokens_near_y(self, tokens, ref_y: float, tolerance: float) -> list:
        """Collect payment-line-like tokens whose y0 lies within *tolerance* of *ref_y*."""
        return [
            t for t in tokens
            if abs(t.bbox[1] - ref_y) < tolerance
            and self._is_machine_code_text(t.text)
        ]

    # ------------------------------------------------------------------
    # Public entry point
    # ------------------------------------------------------------------

    def parse(
        self,
        tokens: list[TextToken],
        page_height: float,
        page_width: float | None = None,
    ) -> MachineCodeResult:
        """
        Parse machine code from tokens.

        Args:
            tokens: List of text tokens from OCR or text extraction
            page_height: Height of the page in points
            page_width: Width of the page in points (optional, currently unused)

        Returns:
            MachineCodeResult with extracted fields. An empty result is
            returned when there are no tokens in the bottom region.
        """
        if not tokens:
            return MachineCodeResult()

        # Keep only tokens whose top edge lies in the bottom region, ordered
        # top-to-bottom and then left-to-right.
        bottom_y_threshold = page_height * (1 - self.bottom_region_ratio)
        bottom_tokens = sorted(
            (t for t in tokens if t.bbox[1] >= bottom_y_threshold),
            key=lambda t: (t.bbox[1], t.bbox[0]),
        )
        if not bottom_tokens:
            return MachineCodeResult()

        raw_line = ' '.join(t.text for t in bottom_tokens)

        # Try standard payment line format first and find the matching tokens.
        standard_result, matched_tokens = self._parse_standard_payment_line_with_tokens(
            raw_line, bottom_tokens
        )
        if standard_result and matched_tokens:
            # Bounding box covers only the tokens that carry the machine code.
            return MachineCodeResult(
                ocr=standard_result.get('ocr'),
                amount=standard_result.get('amount'),
                bankgiro=standard_result.get('bankgiro'),
                plusgiro=standard_result.get('plusgiro'),
                confidence=0.95,
                source_tokens=matched_tokens,
                raw_line=raw_line,
                region_bbox=self._union_bbox(matched_tokens),
            )

        # Fall back to individual field extraction.
        lowered = raw_line.lower()
        has_payment_keywords = any(
            kw in lowered for kw in self.PAYMENT_SLIP_KEYWORDS
        )

        result = MachineCodeResult(
            source_tokens=bottom_tokens,
            raw_line=raw_line,
        )
        result.ocr = self._extract_ocr(bottom_tokens)
        result.bankgiro = self._extract_bankgiro(bottom_tokens)
        # Plusgiro is only looked for when no Bankgiro was found.
        if not result.bankgiro:
            result.plusgiro = self._extract_plusgiro(bottom_tokens)
        result.amount = self._extract_amount(bottom_tokens)
        result.confidence = self._calculate_confidence(
            result, has_payment_keywords
        )

        # Narrow the region to the tokens that contain the extracted values.
        matched_tokens = self._find_tokens_with_values(bottom_tokens, result)
        if matched_tokens:
            result.region_bbox = self._union_bbox(matched_tokens)
            result.source_tokens = matched_tokens

        return result

    def _find_tokens_with_values(
        self,
        tokens: list[TextToken],
        result: MachineCodeResult
    ) -> list[TextToken]:
        """Find tokens that contain the extracted values (OCR, Amount, Bankgiro, Plusgiro).

        Each value is searched both as extracted and reduced to its digits,
        against the token text with spaces and '#' removed and against the
        token's digits only.
        """
        needles = []
        if result.ocr:
            needles.append(result.ocr)
        for value in (result.amount, result.bankgiro, result.plusgiro):
            if value:
                needles.append(re.sub(r'\D', '', value))
                needles.append(value)
        # An empty needle would be "contained" in every token; drop it.
        needles = [needle for needle in needles if needle]

        matched = []
        for token in tokens:
            text = token.text.replace(' ', '').replace('#', '')
            text_digits = re.sub(r'\D', '', token.text)
            hit = any(v in text or v in text_digits for v in needles)
            if hit and token not in matched:
                matched.append(token)

        return matched

    def _find_machine_code_line_tokens(
        self,
        tokens: list[TextToken]
    ) -> list[TextToken]:
        """
        Find tokens that belong to the machine code line using pure regex patterns.

        The machine code line typically contains:
        - Control markers like #14#, #41#
        - Direction marker >
        - Account numbers with # suffix

        Returns:
            List of tokens belonging to the machine code line
        """
        # Locate the reference y-coordinate. The first token with a control
        # marker (#14#, 47304035#14#) wins outright; otherwise the first token
        # with a direction marker '>' is used.
        ref_y = None
        for token in tokens:
            text = token.text
            if self._CONTROL_MARKER.search(text):
                ref_y = token.bbox[1]
                break
            if '>' in text and ref_y is None:
                ref_y = token.bbox[1]

        if ref_y is None:
            return []

        # Use a very small tolerance first because Swedish invoices often have
        # duplicate machine code lines (upper and lower part of payment slip).
        machine_code_tokens = self._tokens_near_y(tokens, ref_y, 3)

        # With very few hits, widen the band to catch tokens of the same
        # logical line that sit at slightly different y values.
        if len(machine_code_tokens) < 3:
            machine_code_tokens = self._tokens_near_y(tokens, ref_y, 10)

        return machine_code_tokens

    def _parse_standard_payment_line_with_tokens(
        self,
        raw_line: str,
        tokens: list[TextToken]
    ) -> tuple[Optional[dict], list[TextToken]]:
        """
        Parse standard Swedish payment line format and find matching tokens.

        Uses pure regex to identify the machine code line, then finds tokens
        that are part of that line based on their position.

        Format: # <OCR> # <Kronor> <Öre> <Type> > <Bankgiro/Plusgiro>#<Control>#
        Example: # 31130954410 # 315 00 2 > 8983025#14#

        Returns:
            Tuple of (parsed_dict, matched_tokens) or (None, []). When no
            machine code tokens are located, the dict parsed from raw_line
            (possibly None) is returned together with an empty token list.
        """
        machine_code_tokens = self._find_machine_code_line_tokens(tokens)

        if not machine_code_tokens:
            # Fall back to regex on raw_line
            return self._parse_standard_payment_line(raw_line, raw_line), []

        # Rebuild the line left-to-right, skipping tokens that start less
        # than 5 points after the previous kept token ends (treated as
        # duplicate OCR results; the first one is kept).
        deduped_tokens = []
        last_x = -100
        for t in sorted(machine_code_tokens, key=lambda t: t.bbox[0]):
            if t.bbox[0] - last_x < 5:
                continue
            deduped_tokens.append(t)
            last_x = t.bbox[2]  # Use end x for next comparison

        mc_line = ' '.join(t.text for t in deduped_tokens)

        # Try to parse this line, using raw_line for context detection
        parsed = self._parse_standard_payment_line(mc_line, raw_line)
        if parsed:
            return parsed, deduped_tokens

        # If machine code line parsing failed, try the full raw_line
        parsed = self._parse_standard_payment_line(raw_line, raw_line)
        if parsed:
            return parsed, machine_code_tokens

        return None, []

    def _parse_standard_payment_line(
        self,
        raw_line: str,
        context_line: str | None = None
    ) -> Optional[dict]:
        """
        Parse standard Swedish payment line format.

        Format: # <OCR> # <Kronor> <Öre> <Type> > <Bankgiro/Plusgiro>#<Control>#
        Example: # 31130954410 # 315 00 2 > 8983025#14#

        Args:
            raw_line: The line to parse (may be just the machine code tokens)
            context_line: Optional full line for context detection (e.g., to find "plusgiro" keywords)

        Returns:
            Dict with 'ocr', 'amount', and 'bankgiro' or 'plusgiro' if matched, None otherwise
        """
        # Use context_line for detecting Plusgiro/Bankgiro, fall back to raw_line
        is_plusgiro_context = self._is_plusgiro_only(
            (context_line or raw_line).lower()
        )

        # Remove spaces between digits after the '>' marker so that an
        # account split up like "78 2 1 713" reads as "7821713".
        if '>' in raw_line:
            head, tail = raw_line.split('>', 1)
            raw_line = head + '>' + self._DIGIT_GAP.sub('', tail)

        # (pattern, OCR group, kronor group, öre group, account group),
        # tried in order: primary, alternative, reverse.
        attempts = (
            (self.PAYMENT_LINE_PATTERN, 1, 2, 3, 5),
            (self.PAYMENT_LINE_PATTERN_ALT, 1, 2, 3, 4),
            (self.PAYMENT_LINE_PATTERN_REVERSE, 4, 2, 3, 1),
        )
        for pattern, ocr_group, kronor_group, ore_group, account_group in attempts:
            match = pattern.search(raw_line)
            if not match:
                continue

            account_digits = match.group(account_group)
            if is_plusgiro_context:
                # Plusgiro format: XXXXXXX-X
                account_type = 'plusgiro'
                formatted_account = f"{account_digits[:-1]}-{account_digits[-1]}"
            else:
                # Bankgiro format: XXX-XXXX or XXXX-XXXX; other lengths are
                # passed through without a dash.
                account_type = 'bankgiro'
                if len(account_digits) == 7:
                    formatted_account = f"{account_digits[:3]}-{account_digits[3:]}"
                elif len(account_digits) == 8:
                    formatted_account = f"{account_digits[:4]}-{account_digits[4:]}"
                else:
                    formatted_account = account_digits

            return {
                'ocr': match.group(ocr_group),
                'amount': self._format_amount(
                    match.group(kronor_group), match.group(ore_group)
                ),
                account_type: formatted_account,
            }

        return None

    def _extract_ocr(self, tokens: list[TextToken]) -> Optional[str]:
        """Extract OCR reference number.

        Returns the longest 10-25 digit run (first one on ties) that does not
        start or end with a Bankgiro-looking number found among the tokens,
        or None when there is no such run.
        """
        # Collect bankgiro-like numbers, plus the same digits followed by one
        # check digit or a doubled digit, so they can be excluded below.
        bankgiro_digits = set()
        for token in tokens:
            for bg in self.BANKGIRO_PATTERN.findall(token.text.strip()):
                digits = re.sub(r'\D', '', bg)
                bankgiro_digits.add(digits)
                for i in range(10):
                    bankgiro_digits.add(digits + str(i))
                    bankgiro_digits.add(digits + str(i) + str(i))
        excluded = tuple(bg for bg in bankgiro_digits if len(bg) >= 7)

        candidates = []
        for token in tokens:
            # Remove spaces and the '#' delimiters before searching.
            text = token.text.replace(' ', '').replace('#', '').strip()
            for match in self.OCR_PATTERN.findall(text):
                # Skip anything that looks like a bankgiro number with extra
                # digits before or after it. startswith/endswith return
                # False for an empty tuple.
                if match.startswith(excluded) or match.endswith(excluded):
                    continue
                candidates.append(match)

        if not candidates:
            return None

        # Prefer longer sequences (more likely to be OCR)
        return max(candidates, key=len)

    def _extract_bankgiro(self, tokens: list[TextToken]) -> Optional[str]:
        """Extract Bankgiro account number.

        Bankgiro format: XXX-XXXX or XXXX-XXXX (dash in middle)
        NOT Plusgiro: XXXXXXX-X (dash before last digit)

        Returns the first Bankgiro-shaped number, normalised with a dash, or
        None when the context mentions only Plusgiro/Postgiro or nothing
        matches.
        """
        context_text = ' '.join(t.text.lower() for t in tokens)

        # If clearly Plusgiro context, don't extract as Bankgiro
        if self._is_plusgiro_only(context_text):
            return None

        for token in tokens:
            text = token.text.strip()
            for found in self.BANKGIRO_PATTERN.finditer(text):
                candidate = found.group(1)

                # A bare digit run directly followed by "-D" is a Plusgiro
                # number such as 1234567-8. The regex match itself stops
                # before the dash, so the text after the match is inspected.
                if candidate.isdigit() and self._PLUSGIRO_TAIL.match(text, found.end()):
                    continue

                # Normalize: remove separators, insert the dash.
                digits = re.sub(r'\D', '', candidate)
                if len(digits) == 7:
                    return f"{digits[:3]}-{digits[3:]}"
                if len(digits) == 8:
                    return f"{digits[:4]}-{digits[4:]}"

        return None

    def _extract_plusgiro(self, tokens: list[TextToken]) -> Optional[str]:
        """Extract Plusgiro account number.

        Returns the first 7-8 digit Plusgiro-shaped number, normalised with a
        dash before the last digit, or None when nothing matches.
        """
        for token in tokens:
            for match in self.PLUSGIRO_PATTERN.findall(token.text.strip()):
                # Normalize: remove separators, ensure dash before last digit
                digits = re.sub(r'\D', '', match)
                if 7 <= len(digits) <= 8:
                    return f"{digits[:-1]}-{digits[-1]}"

        return None

    def _integer_amounts_near(self, tokens, is_label) -> list:
        """Collect (text, value) for pure-integer tokens close to a label token.

        For every token whose lower-cased, stripped text satisfies *is_label*,
        tokens from 5 positions before up to 4 positions after are inspected.
        """
        found = []
        for i, token in enumerate(tokens):
            if not is_label(token.text.strip().lower()):
                continue
            for j in range(max(0, i - 5), min(len(tokens), i + 5)):
                int_match = self._PURE_INTEGER.match(tokens[j].text.strip())
                if int_match:
                    value = int(int_match.group(1))
                    if 0 < value < 1000000:
                        found.append((str(value), float(value)))
        return found

    def _extract_amount(self, tokens: list[TextToken]) -> Optional[str]:
        """Extract payment amount.

        Decimal amounts are tried first and returned as "<digits>,<öre>"
        with thousand separators removed. If none are found, pure-integer
        tokens near a "kronor"/"kr" label are used, then those near an "öre"
        label. The largest candidate wins (likely the total).
        """
        candidates = []

        for token in tokens:
            for match in self.AMOUNT_PATTERN.findall(token.text.strip()):
                # The last three characters are always "<sep>dd"; everything
                # before is the integer part, possibly with space/dot
                # thousand separators that are dropped here.
                compact = re.sub(r'[\s\xa0]', '', match)
                whole = re.sub(r'\D', '', compact[:-3])
                ore = compact[-2:]
                value = float(f"{whole}.{ore}")
                if 0 < value < 1000000:  # Reasonable amount range
                    candidates.append((f"{whole},{ore}", value))

        # If no decimal amounts found, look for integers near a "Kronor" label.
        if not candidates:
            candidates = self._integer_amounts_near(
                tokens,
                lambda text: 'kronor' in text or 'kr' == text or text.endswith(' kr'),
            )

        # Also try to find amounts near "öre" label (Swedish cents)
        if not candidates:
            candidates = self._integer_amounts_near(
                tokens, lambda text: 'öre' in text
            )

        if not candidates:
            return None

        # Prefer larger amounts - likely total (first one on ties).
        return max(candidates, key=lambda item: item[1])[0]

    def _calculate_confidence(
        self,
        result: MachineCodeResult,
        has_payment_keywords: bool
    ) -> float:
        """Calculate confidence score for the extraction.

        Adds 0.3 for payment keywords, 0.25 for an OCR number (plus 0.1 when
        it is 15-17 digits long), 0.2 for any account number and 0.15 for an
        amount; the sum is capped at 1.0.
        """
        confidence = 0.0

        # Base confidence from payment keywords
        if has_payment_keywords:
            confidence += 0.3

        # Points for each extracted field
        if result.ocr:
            confidence += 0.25
            # Bonus for typical OCR length (15-17 digits)
            if 15 <= len(result.ocr) <= 17:
                confidence += 0.1

        if result.bankgiro or result.plusgiro:
            confidence += 0.2

        if result.amount:
            confidence += 0.15

        return min(confidence, 1.0)

    @staticmethod
    def _amount_to_float(text: str) -> float:
        """Parse an amount string with optional spaces and comma decimal; may raise ValueError."""
        return float(
            text.replace(' ', '').replace(',', '.').replace('\xa0', '')
        )

    def cross_validate(
        self,
        machine_result: MachineCodeResult,
        csv_values: dict[str, str],
    ) -> dict[str, dict]:
        """
        Cross-validate machine code extraction with CSV ground truth.

        Args:
            machine_result: Result from parse()
            csv_values: Dict of field values from CSV
                (keys: 'ocr', 'amount', 'bankgiro', 'plusgiro').
                Missing keys and None values are treated as empty.

        Returns:
            Dict with validation results for each field:
            {
                'ocr': {
                    'machine': '123456789',
                    'csv': '123456789',
                    'match': True,
                    'use_machine': False,  # CSV has value
                },
                ...
            }
            Amount entries additionally get 'rounding_diff': True when the
            values only agree within 1 unit.
        """
        from src.normalize import normalize_field

        results = {}

        field_mapping = [
            ('ocr', 'OCR', machine_result.ocr),
            ('amount', 'Amount', machine_result.amount),
            ('bankgiro', 'Bankgiro', machine_result.bankgiro),
            ('plusgiro', 'Plusgiro', machine_result.plusgiro),
        ]

        for field_key, normalizer_name, machine_value in field_mapping:
            # `or ''` keeps a None value in the CSV dict from crashing .strip().
            csv_value = str(csv_values.get(field_key) or '').strip()

            result_entry = {
                'machine': machine_value,
                'csv': csv_value if csv_value else None,
                'match': False,
                'use_machine': False,
            }

            if machine_value and csv_value:
                # Both have values - they match when any normalised variant overlaps.
                machine_variants = normalize_field(normalizer_name, machine_value)
                csv_variants = normalize_field(normalizer_name, csv_value)
                result_entry['match'] = bool(
                    set(machine_variants) & set(csv_variants)
                )

                # Special handling for amounts - allow rounding differences
                if not result_entry['match'] and field_key == 'amount':
                    try:
                        machine_float = self._amount_to_float(machine_value)
                        csv_float = self._amount_to_float(csv_value)
                    except ValueError:
                        # Unparseable amount: keep the non-match verdict.
                        pass
                    else:
                        # Allow 1 unit difference (rounding)
                        if abs(machine_float - csv_float) <= 1.0:
                            result_entry['match'] = True
                            result_entry['rounding_diff'] = True

            elif machine_value and not csv_value:
                # CSV is missing, use machine value
                result_entry['use_machine'] = True

            results[field_key] = result_entry

        return results
|
|
|
|
|
|
def parse_machine_code(
    tokens: list[TextToken],
    page_height: float,
    page_width: float | None = None,
    bottom_ratio: float = 0.35,
) -> MachineCodeResult:
    """
    One-shot helper: build a MachineCodeParser and run it on the tokens.

    Args:
        tokens: List of text tokens
        page_height: Page height in points
        page_width: Page width in points (optional)
        bottom_ratio: Fraction of page to consider as bottom region

    Returns:
        MachineCodeResult with extracted fields
    """
    return MachineCodeParser(bottom_region_ratio=bottom_ratio).parse(
        tokens, page_height, page_width
    )
|