Files
invoice-master-poc-v2/src/ocr/machine_code_parser.py
Yaojia Wang 4ea4bc96d4 Add payment line parser and fix OCR override from payment_line
- Add MachineCodeParser for Swedish invoice payment line parsing
- Fix OCR Reference extraction by normalizing account number spaces
- Add cross-validation tests for pipeline and field_extractor
- Update UI layout for compact upload and full-width results

Key changes:
- machine_code_parser.py: Handle spaces in Bankgiro numbers (e.g. "78 2 1 713")
- pipeline.py: OCR and Amount override from payment_line, BG/PG comparison only
- field_extractor.py: Improved invoice number normalization
- app.py: Responsive UI layout changes

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-21 21:47:02 +01:00

898 lines
33 KiB
Python

"""
Machine Code Line Parser for Swedish Invoices
Parses the bottom machine-readable payment line to extract:
- OCR reference number (10-25 digits)
- Amount (payment amount in SEK)
- Bankgiro account number (XXX-XXXX or XXXX-XXXX format)
- Plusgiro account number (XXXXXXX-X format)
The machine code line is typically found at the bottom of Swedish invoices,
in the payment slip (Inbetalningskort) section. It contains machine-readable
data for automated payment processing.
## Swedish Payment Line Standard Format
The standard machine-readable payment line follows this structure:
# <OCR> # <Kronor> <Öre> <Type> > <Bankgiro>#<Control>#
Example:
# 31130954410 # 315 00 2 > 8983025#14#
Components:
- `#` - Start delimiter
- `31130954410` - OCR number (with Mod 10 check digit)
- `#` - Separator
- `315 00` - Amount: 315 SEK and 00 öre (315.00 SEK)
- `2` - Payment type / record type
- `>` - Points to recipient info
- `8983025` - Bankgiro number
- `#14#` - End marker with control code
Legacy patterns also supported:
- OCR: 8120000849965361 (10-25 consecutive digits)
- Bankgiro: 5393-9484 or 53939484
- Plusgiro: 1234567-8
- Amount: 1234,56 or 1234.56 (with decimal separator)
"""
import re
from dataclasses import dataclass, field
from typing import Optional
from src.pdf.extractor import Token as TextToken
@dataclass
class MachineCodeResult:
    """Values read from a payment line, together with where they came from."""

    # Extracted payment fields; each one stays None until a value is found.
    ocr: Optional[str] = None
    amount: Optional[str] = None
    bankgiro: Optional[str] = None
    plusgiro: Optional[str] = None
    # Score assigned by the parser (0.0 when nothing was extracted).
    confidence: float = 0.0
    # Tokens the extracted values were taken from.
    source_tokens: list[TextToken] = field(default_factory=list)
    # Space-joined text of the tokens that were examined.
    raw_line: str = ""
    # Region bounding box in PDF coordinates (x0, y0, x1, y1)
    region_bbox: Optional[tuple[float, float, float, float]] = None

    def to_dict(self) -> dict:
        """Return the serializable fields as a dict (source_tokens is left out)."""
        exported = (
            'ocr', 'amount', 'bankgiro', 'plusgiro',
            'confidence', 'raw_line', 'region_bbox',
        )
        return {name: getattr(self, name) for name in exported}

    def get_region_bbox(self) -> Optional[tuple[float, float, float, float]]:
        """
        Return the bounding box of the payment slip region.

        An explicitly stored ``region_bbox`` wins; otherwise the box is the
        union of the ``source_tokens`` boxes.

        Returns:
            Tuple (x0, y0, x1, y1), or None when there is neither a stored
            box nor any source token.
        """
        if self.region_bbox:
            return self.region_bbox
        if not self.source_tokens:
            return None
        boxes = [token.bbox for token in self.source_tokens]
        left = min(box[0] for box in boxes)
        top = min(box[1] for box in boxes)
        right = max(box[2] for box in boxes)
        bottom = max(box[3] for box in boxes)
        return (left, top, right, bottom)
class MachineCodeParser:
    """
    Parser for machine-readable payment lines on Swedish invoices.

    The parser focuses on the bottom region of the invoice where
    the payment slip (Inbetalningskort) is typically located.
    """

    # Payment slip detection keywords (Swedish); matched as lowercase substrings
    PAYMENT_SLIP_KEYWORDS = [
        'inbetalning', 'girering', 'avi', 'betalning',
        'plusgiro', 'postgiro', 'bankgiro', 'bankgirot',
        'betalningsavsändare', 'betalningsmottagare',
        'maskinellt', 'ändringar',  # "DEN AVLÄSES MASKINELLT"
    ]

    # Patterns for field extraction
    # OCR: a run of exactly 10-25 digits that is not part of a longer digit run
    OCR_PATTERN = re.compile(r'(?<!\d)(\d{10,25})(?!\d)')
    # Bankgiro: XXX-XXXX or XXXX-XXXX (7-8 digits, optional dash or whitespace)
    BANKGIRO_PATTERN = re.compile(r'\b(\d{3,4}[-\s]?\d{4})\b')
    # Plusgiro: 6-7 digits, optional dash or whitespace, then one final digit
    PLUSGIRO_PATTERN = re.compile(r'\b(\d{6,7}[-\s]?\d)\b')
    # Amount: digits with comma or dot as decimal separator
    # Supports formats: 1234,56 | 1234.56 | 1 234,56 | 1.234,56
    # The first alternative handles grouped thousands; the second handles an
    # ungrouped integer part of any length (a \b-anchored \d{1,3} alone can
    # never match "1234,56").
    AMOUNT_PATTERN = re.compile(
        r'\b(\d{1,3}(?:[\s\.\xa0]\d{3})+[,\.]\d{2}|\d+[,\.]\d{2})\b'
    )
    # Alternative amount pattern for integers (no decimal)
    AMOUNT_INTEGER_PATTERN = re.compile(r'\b(\d{2,6})\b')

    # Standard Swedish payment line pattern
    # Format: # <OCR> # <Kronor> <Öre> <Type> > <Bankgiro/Plusgiro>#<Control>#
    # Example: # 31130954410 # 315 00 2 > 8983025#14#
    # This pattern captures both Bankgiro and Plusgiro accounts
    PAYMENT_LINE_PATTERN = re.compile(
        r'#\s*'               # Start delimiter
        r'(\d{5,25})\s*'      # OCR number (capture group 1)
        r'#\s*'               # Separator
        r'(\d{1,7})\s+'       # Kronor (capture group 2)
        r'(\d{2})\s+'         # Öre (capture group 3)
        r'(\d)\s*'            # Type (capture group 4)
        r'>\s*'               # Direction marker
        r'(\d{5,10})'         # Bankgiro/Plusgiro (capture group 5)
        r'(?:#\d{1,3}#)?'     # Optional end marker
    )

    # Alternative pattern with different spacing; every delimiter is optional
    PAYMENT_LINE_PATTERN_ALT = re.compile(
        r'#?\s*'              # Optional start delimiter
        r'(\d{8,25})\s*'      # OCR number (capture group 1)
        r'#?\s*'              # Optional separator
        r'(\d{1,7})\s+'       # Kronor (capture group 2)
        r'(\d{2})\s+'         # Öre (capture group 3)
        r'\d\s*'              # Type (not captured)
        r'>?\s*'              # Optional direction marker
        r'(\d{5,10})'         # Bankgiro/Plusgiro (capture group 4)
    )

    # Reverse format pattern (Bankgiro first, then OCR)
    # Format: <Bankgiro>#<Control># <Kronor> <Öre> <Type> > <OCR> #
    # Example: 53241469#41# 2428 00 1 > 4388595300 #
    PAYMENT_LINE_PATTERN_REVERSE = re.compile(
        r'(\d{7,8})'          # Bankgiro (capture group 1)
        r'#\d{1,3}#\s+'       # Control marker
        r'(\d{1,7})\s+'       # Kronor (capture group 2)
        r'(\d{2})\s+'         # Öre (capture group 3)
        r'\d\s*'              # Type (not captured)
        r'>\s*'               # Direction marker
        r'(\d{5,25})'         # OCR number (capture group 4)
    )
def __init__(self, bottom_region_ratio: float = 0.35):
"""
Initialize the parser.
Args:
bottom_region_ratio: Fraction of page height to consider as bottom region.
Default 0.35 means bottom 35% of the page.
"""
self.bottom_region_ratio = bottom_region_ratio
def parse(
    self,
    tokens: list[TextToken],
    page_height: float,
    page_width: float | None = None,
) -> MachineCodeResult:
    """
    Extract payment-line fields from the bottom region of a page.

    The structured payment line is tried first; when it is not found, each
    field is extracted on its own and a confidence score is computed.

    Args:
        tokens: List of text tokens from OCR or text extraction
        page_height: Height of the page in points
        page_width: Width of the page in points (accepted but not used here)

    Returns:
        MachineCodeResult with extracted fields; an empty result when there
        are no tokens or none fall in the bottom region.
    """
    if not tokens:
        return MachineCodeResult()

    def enclosing_box(group):
        """Smallest (x0, y0, x1, y1) box covering every token in a non-empty group."""
        return (
            min(t.bbox[0] for t in group),
            min(t.bbox[1] for t in group),
            max(t.bbox[2] for t in group),
            max(t.bbox[3] for t in group),
        )

    # A token is "in the bottom region" when its y0 is at or past the
    # threshold, i.e. larger y values are treated as lower on the page.
    cutoff_y = page_height * (1 - self.bottom_region_ratio)
    region = [t for t in tokens if t.bbox[1] >= cutoff_y]
    if not region:
        return MachineCodeResult()

    # Reading order: top to bottom, then left to right
    region.sort(key=lambda t: (t.bbox[1], t.bbox[0]))
    raw_line = ' '.join(t.text for t in region)

    # Payment-slip vocabulary only feeds the fallback confidence score
    lowered = raw_line.lower()
    has_payment_keywords = any(
        keyword in lowered for keyword in self.PAYMENT_SLIP_KEYWORDS
    )

    # Preferred path: the full structured payment line
    line_fields, line_tokens = self._parse_standard_payment_line_with_tokens(
        raw_line, region
    )
    if line_fields and line_tokens:
        # The bbox covers only the tokens that make up the machine code line
        return MachineCodeResult(
            ocr=line_fields.get('ocr'),
            amount=line_fields.get('amount'),
            bankgiro=line_fields.get('bankgiro'),
            plusgiro=line_fields.get('plusgiro'),
            confidence=0.95,
            source_tokens=line_tokens,
            raw_line=raw_line,
            region_bbox=enclosing_box(line_tokens),
        )

    # Fallback: extract each field independently from the whole region
    result = MachineCodeResult(source_tokens=region, raw_line=raw_line)
    result.ocr = self._extract_ocr(region)
    result.bankgiro = self._extract_bankgiro(region)
    if not result.bankgiro:
        # Plusgiro is only looked for when no Bankgiro was found
        result.plusgiro = self._extract_plusgiro(region)
    result.amount = self._extract_amount(region)
    result.confidence = self._calculate_confidence(result, has_payment_keywords)

    # Narrow the provenance to the tokens that contain the extracted values
    value_tokens = self._find_tokens_with_values(region, result)
    if value_tokens:
        result.region_bbox = enclosing_box(value_tokens)
        result.source_tokens = value_tokens
    return result
def _find_tokens_with_values(
self,
tokens: list[TextToken],
result: MachineCodeResult
) -> list[TextToken]:
"""Find tokens that contain the extracted values (OCR, Amount, Bankgiro)."""
matched = []
values_to_find = []
if result.ocr:
values_to_find.append(result.ocr)
if result.amount:
# Amount might be just digits
amount_digits = re.sub(r'\D', '', result.amount)
values_to_find.append(amount_digits)
values_to_find.append(result.amount)
if result.bankgiro:
# Bankgiro might have dash or not
bg_digits = re.sub(r'\D', '', result.bankgiro)
values_to_find.append(bg_digits)
values_to_find.append(result.bankgiro)
if result.plusgiro:
pg_digits = re.sub(r'\D', '', result.plusgiro)
values_to_find.append(pg_digits)
values_to_find.append(result.plusgiro)
for token in tokens:
text = token.text.replace(' ', '').replace('#', '')
text_digits = re.sub(r'\D', '', token.text)
for value in values_to_find:
if value in text or value in text_digits:
if token not in matched:
matched.append(token)
break
return matched
def _find_machine_code_line_tokens(
self,
tokens: list[TextToken]
) -> list[TextToken]:
"""
Find tokens that belong to the machine code line using pure regex patterns.
The machine code line typically contains:
- Control markers like #14#, #41#
- Direction marker >
- Account numbers with # suffix
Returns:
List of tokens belonging to the machine code line
"""
# Find tokens with characteristic machine code patterns
ref_y = None
# First, find the reference y-coordinate from tokens with machine code patterns
for token in tokens:
text = token.text
# Check if token contains machine code patterns
# Priority 1: Control marker like #14#, 47304035#14#
has_control_marker = bool(re.search(r'#\d+#', text))
# Priority 2: Direction marker >
has_direction = '>' in text
if has_control_marker:
# This is very likely part of the machine code line
ref_y = token.bbox[1]
break
elif has_direction and ref_y is None:
# Direction marker is also a good indicator
ref_y = token.bbox[1]
if ref_y is None:
return []
# Collect all tokens on the same line (within 3 points of ref_y)
# Use very small tolerance because Swedish invoices often have duplicate
# machine code lines (upper and lower part of payment slip)
y_tolerance = 3
machine_code_tokens = []
for token in tokens:
if abs(token.bbox[1] - ref_y) < y_tolerance:
text = token.text
# Include token if it contains:
# - Digits (OCR, amount, account numbers)
# - # symbol (delimiters, control markers)
# - > symbol (direction marker)
if (re.search(r'\d', text) or '#' in text or '>' in text):
machine_code_tokens.append(token)
# If we found very few tokens, try to expand to nearby y values
# that might be part of the same logical line
if len(machine_code_tokens) < 3:
y_tolerance = 10
machine_code_tokens = []
for token in tokens:
if abs(token.bbox[1] - ref_y) < y_tolerance:
text = token.text
if (re.search(r'\d', text) or '#' in text or '>' in text):
machine_code_tokens.append(token)
return machine_code_tokens
def _parse_standard_payment_line_with_tokens(
self,
raw_line: str,
tokens: list[TextToken]
) -> tuple[Optional[dict], list[TextToken]]:
"""
Parse standard Swedish payment line format and find matching tokens.
Uses pure regex to identify the machine code line, then finds tokens
that are part of that line based on their position.
Format: # <OCR> # <Kronor> <Öre> <Type> > <Bankgiro/Plusgiro>#<Control>#
Example: # 31130954410 # 315 00 2 > 8983025#14#
Returns:
Tuple of (parsed_dict, matched_tokens) or (None, [])
"""
# First find the machine code line tokens using pattern matching
machine_code_tokens = self._find_machine_code_line_tokens(tokens)
if not machine_code_tokens:
# Fall back to regex on raw_line
parsed = self._parse_standard_payment_line(raw_line, raw_line)
return parsed, []
# Build a line from just the machine code tokens (sorted by x position)
# Group tokens by approximate x position to handle duplicate OCR results
mc_tokens_sorted = sorted(machine_code_tokens, key=lambda t: t.bbox[0])
# Deduplicate tokens at similar x positions (keep the first one)
deduped_tokens = []
last_x = -100
for t in mc_tokens_sorted:
# Skip tokens that are too close to the previous one (likely duplicates)
if t.bbox[0] - last_x < 5:
continue
deduped_tokens.append(t)
last_x = t.bbox[2] # Use end x for next comparison
mc_line = ' '.join(t.text for t in deduped_tokens)
# Try to parse this line, using raw_line for context detection
parsed = self._parse_standard_payment_line(mc_line, raw_line)
if parsed:
return parsed, deduped_tokens
# If machine code line parsing failed, try the full raw_line
parsed = self._parse_standard_payment_line(raw_line, raw_line)
if parsed:
return parsed, machine_code_tokens
return None, []
def _parse_standard_payment_line(
self,
raw_line: str,
context_line: str | None = None
) -> Optional[dict]:
"""
Parse standard Swedish payment line format.
Format: # <OCR> # <Kronor> <Öre> <Type> > <Bankgiro/Plusgiro>#<Control>#
Example: # 31130954410 # 315 00 2 > 8983025#14#
Args:
raw_line: The line to parse (may be just the machine code tokens)
context_line: Optional full line for context detection (e.g., to find "plusgiro" keywords)
Returns:
Dict with 'ocr', 'amount', and 'bankgiro' or 'plusgiro' if matched, None otherwise
"""
# Use context_line for detecting Plusgiro/Bankgiro, fall back to raw_line
context = (context_line or raw_line).lower()
is_plusgiro_context = (
('plusgiro' in context or 'postgiro' in context or 'plusgirokonto' in context)
and 'bankgiro' not in context
)
# Preprocess: remove spaces in the account number part (after >)
# This handles cases like "78 2 1 713" -> "7821713"
def normalize_account_spaces(line: str) -> str:
"""Remove spaces in account number portion after > marker."""
if '>' in line:
parts = line.split('>', 1)
# After >, remove spaces between digits (but keep # markers)
after_arrow = parts[1]
# Extract digits and # markers, remove spaces between digits
normalized = re.sub(r'(\d)\s+(\d)', r'\1\2', after_arrow)
# May need multiple passes for sequences like "78 2 1 713"
while re.search(r'(\d)\s+(\d)', normalized):
normalized = re.sub(r'(\d)\s+(\d)', r'\1\2', normalized)
return parts[0] + '>' + normalized
return line
raw_line = normalize_account_spaces(raw_line)
def format_account(account_digits: str) -> tuple[str, str]:
"""Format account and determine type (bankgiro or plusgiro).
Returns: (formatted_account, account_type)
"""
if is_plusgiro_context:
# Plusgiro format: XXXXXXX-X
formatted = f"{account_digits[:-1]}-{account_digits[-1]}"
return formatted, 'plusgiro'
else:
# Bankgiro format: XXX-XXXX or XXXX-XXXX
if len(account_digits) == 7:
formatted = f"{account_digits[:3]}-{account_digits[3:]}"
elif len(account_digits) == 8:
formatted = f"{account_digits[:4]}-{account_digits[4:]}"
else:
formatted = account_digits
return formatted, 'bankgiro'
# Try primary pattern
match = self.PAYMENT_LINE_PATTERN.search(raw_line)
if match:
ocr = match.group(1)
kronor = match.group(2)
ore = match.group(3)
account_digits = match.group(5)
# Format amount: combine kronor and öre
amount = f"{kronor},{ore}" if ore != "00" else kronor
formatted_account, account_type = format_account(account_digits)
return {
'ocr': ocr,
'amount': amount,
account_type: formatted_account,
}
# Try alternative pattern
match = self.PAYMENT_LINE_PATTERN_ALT.search(raw_line)
if match:
ocr = match.group(1)
kronor = match.group(2)
ore = match.group(3)
account_digits = match.group(4)
amount = f"{kronor},{ore}" if ore != "00" else kronor
formatted_account, account_type = format_account(account_digits)
return {
'ocr': ocr,
'amount': amount,
account_type: formatted_account,
}
# Try reverse pattern (Account first, then OCR)
match = self.PAYMENT_LINE_PATTERN_REVERSE.search(raw_line)
if match:
account_digits = match.group(1)
kronor = match.group(2)
ore = match.group(3)
ocr = match.group(4)
amount = f"{kronor},{ore}" if ore != "00" else kronor
formatted_account, account_type = format_account(account_digits)
return {
'ocr': ocr,
'amount': amount,
account_type: formatted_account,
}
return None
def _extract_ocr(self, tokens: list[TextToken]) -> Optional[str]:
"""Extract OCR reference number."""
candidates = []
# First, collect all bankgiro-like patterns to exclude
bankgiro_digits = set()
for token in tokens:
text = token.text.strip()
bg_matches = self.BANKGIRO_PATTERN.findall(text)
for bg in bg_matches:
digits = re.sub(r'\D', '', bg)
bankgiro_digits.add(digits)
# Also add with potential check digits (common pattern)
for i in range(10):
bankgiro_digits.add(digits + str(i))
bankgiro_digits.add(digits + str(i) + str(i))
for token in tokens:
# Remove spaces and common suffixes
text = token.text.replace(' ', '').replace('#', '').strip()
# Find all digit sequences
matches = self.OCR_PATTERN.findall(text)
for match in matches:
# OCR numbers are typically 10-25 digits
if 10 <= len(match) <= 25:
# Skip if this looks like a bankgiro number with check digit
is_bankgiro_variant = any(
match.startswith(bg) or match.endswith(bg)
for bg in bankgiro_digits if len(bg) >= 7
)
# Also check if it's exactly bankgiro with 2-3 extra digits
for bg in bankgiro_digits:
if len(bg) >= 7 and (
match == bg or
(len(match) - len(bg) <= 3 and match.startswith(bg))
):
is_bankgiro_variant = True
break
if not is_bankgiro_variant:
candidates.append((match, len(match), token))
if not candidates:
return None
# Prefer longer sequences (more likely to be OCR)
candidates.sort(key=lambda x: x[1], reverse=True)
return candidates[0][0]
def _extract_bankgiro(self, tokens: list[TextToken]) -> Optional[str]:
"""Extract Bankgiro account number.
Bankgiro format: XXX-XXXX or XXXX-XXXX (dash in middle)
NOT Plusgiro: XXXXXXX-X (dash before last digit)
"""
candidates = []
context_text = ' '.join(t.text.lower() for t in tokens)
# Check if this is clearly a Plusgiro context (not Bankgiro)
is_plusgiro_only_context = (
('plusgiro' in context_text or 'postgiro' in context_text or 'plusgirokonto' in context_text)
and 'bankgiro' not in context_text
)
# If clearly Plusgiro context, don't extract as Bankgiro
if is_plusgiro_only_context:
return None
for token in tokens:
text = token.text.strip()
# Look for Bankgiro pattern
matches = self.BANKGIRO_PATTERN.findall(text)
for match in matches:
# Check if this looks like Plusgiro format (dash before last digit)
# Plusgiro: 1234567-8 (dash at position -2)
if '-' in match:
parts = match.replace(' ', '').split('-')
if len(parts) == 2 and len(parts[1]) == 1:
# This is Plusgiro format, skip
continue
# Normalize: remove spaces, ensure dash
digits = re.sub(r'\D', '', match)
if len(digits) == 7:
normalized = f"{digits[:3]}-{digits[3:]}"
elif len(digits) == 8:
normalized = f"{digits[:4]}-{digits[4:]}"
else:
continue
# Check if "bankgiro" or "bg" appears nearby
is_bankgiro_context = (
'bankgiro' in context_text or
'bg:' in context_text or
'bg ' in context_text
)
candidates.append((normalized, is_bankgiro_context, token))
if not candidates:
return None
# Prefer matches with bankgiro context
candidates.sort(key=lambda x: (x[1], 1), reverse=True)
return candidates[0][0]
def _extract_plusgiro(self, tokens: list[TextToken]) -> Optional[str]:
"""Extract Plusgiro account number."""
candidates = []
for token in tokens:
text = token.text.strip()
matches = self.PLUSGIRO_PATTERN.findall(text)
for match in matches:
# Normalize: remove spaces, ensure dash before last digit
digits = re.sub(r'\D', '', match)
if 7 <= len(digits) <= 8:
normalized = f"{digits[:-1]}-{digits[-1]}"
# Check context
context_text = ' '.join(t.text.lower() for t in tokens)
is_plusgiro_context = (
'plusgiro' in context_text or
'postgiro' in context_text or
'pg:' in context_text or
'pg ' in context_text
)
candidates.append((normalized, is_plusgiro_context, token))
if not candidates:
return None
candidates.sort(key=lambda x: (x[1], 1), reverse=True)
return candidates[0][0]
def _extract_amount(self, tokens: list[TextToken]) -> Optional[str]:
"""Extract payment amount."""
candidates = []
for token in tokens:
text = token.text.strip()
# Try decimal amount pattern first
matches = self.AMOUNT_PATTERN.findall(text)
for match in matches:
# Normalize: remove thousand separators, use comma as decimal
normalized = match.replace(' ', '').replace('\xa0', '')
# Convert dot thousand separator to none, keep comma decimal
if '.' in normalized and ',' in normalized:
# Format like 1.234,56 -> 1234,56
normalized = normalized.replace('.', '')
elif '.' in normalized:
# Could be 1234.56 -> 1234,56
parts = normalized.split('.')
if len(parts) == 2 and len(parts[1]) == 2:
normalized = f"{parts[0]},{parts[1]}"
# Parse to verify it's a valid amount
try:
value = float(normalized.replace(',', '.'))
if 0 < value < 1000000: # Reasonable amount range
candidates.append((normalized, value, token))
except ValueError:
continue
# If no decimal amounts found, try integer amounts
# Look for "Kronor" label nearby and extract integer
if not candidates:
for i, token in enumerate(tokens):
text = token.text.strip().lower()
if 'kronor' in text or 'kr' == text or text.endswith(' kr'):
# Look at nearby tokens for amounts (wider range)
for j in range(max(0, i - 5), min(len(tokens), i + 5)):
nearby_text = tokens[j].text.strip()
# Match pure integer (1-6 digits)
int_match = re.match(r'^(\d{1,6})$', nearby_text)
if int_match:
value = int(int_match.group(1))
if 0 < value < 1000000:
candidates.append((str(value), float(value), tokens[j]))
# Also try to find amounts near "öre" label (Swedish cents)
if not candidates:
for i, token in enumerate(tokens):
text = token.text.strip().lower()
if 'öre' in text:
# Look at nearby tokens for amounts
for j in range(max(0, i - 5), min(len(tokens), i + 5)):
nearby_text = tokens[j].text.strip()
int_match = re.match(r'^(\d{1,6})$', nearby_text)
if int_match:
value = int(int_match.group(1))
if 0 < value < 1000000:
candidates.append((str(value), float(value), tokens[j]))
if not candidates:
return None
# Sort by value (prefer larger amounts - likely total)
candidates.sort(key=lambda x: x[1], reverse=True)
return candidates[0][0]
def _calculate_confidence(
self,
result: MachineCodeResult,
has_payment_keywords: bool
) -> float:
"""Calculate confidence score for the extraction."""
confidence = 0.0
# Base confidence from payment keywords
if has_payment_keywords:
confidence += 0.3
# Points for each extracted field
if result.ocr:
confidence += 0.25
# Bonus for typical OCR length (15-17 digits)
if 15 <= len(result.ocr) <= 17:
confidence += 0.1
if result.bankgiro or result.plusgiro:
confidence += 0.2
if result.amount:
confidence += 0.15
return min(confidence, 1.0)
def cross_validate(
    self,
    machine_result: MachineCodeResult,
    csv_values: dict[str, str],
) -> dict[str, dict]:
    """
    Compare the machine code extraction with CSV ground truth, field by field.

    Args:
        machine_result: Result from parse()
        csv_values: Dict of field values from CSV
            (keys: 'ocr', 'amount', 'bankgiro', 'plusgiro')

    Returns:
        Dict keyed by field name. Each entry holds:
            'machine':     the machine-extracted value (may be None)
            'csv':         the stripped CSV value, or None when empty/missing
            'match':       True when both values exist and their normalized
                           variants overlap
            'use_machine': True when only the machine value exists
        An 'amount' entry additionally gets 'rounding_diff': True when the
        variants did not overlap but the two numbers differ by at most 1.0.
    """
    from src.normalize import normalize_field

    def as_number(text):
        """Parse an amount string, accepting a comma decimal separator and spaces."""
        return float(text.replace(' ', '').replace(',', '.').replace('\xa0', ''))

    # (result key, normalizer field name, machine-extracted value)
    extracted = (
        ('ocr', 'OCR', machine_result.ocr),
        ('amount', 'Amount', machine_result.amount),
        ('bankgiro', 'Bankgiro', machine_result.bankgiro),
        ('plusgiro', 'Plusgiro', machine_result.plusgiro),
    )

    report = {}
    for field_key, normalizer_name, machine_value in extracted:
        csv_value = csv_values.get(field_key, '').strip()
        entry = {
            'machine': machine_value,
            'csv': csv_value if csv_value else None,
            'match': False,
            'use_machine': False,
        }

        if machine_value and not csv_value:
            # CSV is missing, use machine value
            entry['use_machine'] = True
        elif machine_value and csv_value:
            # Both sides have a value: they match if any normalized variant
            # is shared
            machine_variants = normalize_field(normalizer_name, machine_value)
            csv_variants = normalize_field(normalizer_name, csv_value)
            entry['match'] = bool(set(machine_variants) & set(csv_variants))

            if field_key == 'amount' and not entry['match']:
                # Amounts may legitimately differ by rounding: allow 1 unit
                try:
                    difference = abs(as_number(machine_value) - as_number(csv_value))
                except ValueError:
                    # Unparseable amount: leave the entry as a mismatch
                    pass
                else:
                    if difference <= 1.0:
                        entry['match'] = True
                        entry['rounding_diff'] = True

        report[field_key] = entry
    return report
def parse_machine_code(
    tokens: list[TextToken],
    page_height: float,
    page_width: float | None = None,
    bottom_ratio: float = 0.35,
) -> MachineCodeResult:
    """
    Parse the payment line from a page's tokens without creating a parser first.

    Args:
        tokens: List of text tokens
        page_height: Page height in points
        page_width: Page width in points (optional)
        bottom_ratio: Fraction of page to consider as bottom region

    Returns:
        MachineCodeResult with extracted fields
    """
    return MachineCodeParser(bottom_region_ratio=bottom_ratio).parse(
        tokens, page_height, page_width
    )