""" Machine Code Line Parser for Swedish Invoices Parses the bottom machine-readable payment line to extract: - OCR reference number (10-25 digits) - Amount (payment amount in SEK) - Bankgiro account number (XXX-XXXX or XXXX-XXXX format) - Plusgiro account number (XXXXXXX-X format) The machine code line is typically found at the bottom of Swedish invoices, in the payment slip (Inbetalningskort) section. It contains machine-readable data for automated payment processing. ## Swedish Payment Line Standard Format The standard machine-readable payment line follows this structure: # # <Öre> > ## Example: # 31130954410 # 315 00 2 > 8983025#14# Components: - `#` - Start delimiter - `31130954410` - OCR number (with Mod 10 check digit) - `#` - Separator - `315 00` - Amount: 315 SEK and 00 öre (315.00 SEK) - `2` - Payment type / record type - `>` - Points to recipient info - `8983025` - Bankgiro number - `#14#` - End marker with control code Legacy patterns also supported: - OCR: 8120000849965361 (10-25 consecutive digits) - Bankgiro: 5393-9484 or 53939484 - Plusgiro: 1234567-8 - Amount: 1234,56 or 1234.56 (with decimal separator) """ import re from dataclasses import dataclass, field from typing import Optional from src.pdf.extractor import Token as TextToken @dataclass class MachineCodeResult: """Result of machine code parsing.""" ocr: Optional[str] = None amount: Optional[str] = None bankgiro: Optional[str] = None plusgiro: Optional[str] = None confidence: float = 0.0 source_tokens: list[TextToken] = field(default_factory=list) raw_line: str = "" # Region bounding box in PDF coordinates (x0, y0, x1, y1) region_bbox: Optional[tuple[float, float, float, float]] = None def to_dict(self) -> dict: """Convert to dictionary for serialization.""" return { 'ocr': self.ocr, 'amount': self.amount, 'bankgiro': self.bankgiro, 'plusgiro': self.plusgiro, 'confidence': self.confidence, 'raw_line': self.raw_line, 'region_bbox': self.region_bbox, } def get_region_bbox(self) -> Optional[tuple[float, float, float, float]]: """ Get the bounding box of the payment slip region. Returns: Tuple (x0, y0, x1, y1) in PDF coordinates, or None if no region detected """ if self.region_bbox: return self.region_bbox if not self.source_tokens: return None # Calculate bbox from source tokens x0 = min(t.bbox[0] for t in self.source_tokens) y0 = min(t.bbox[1] for t in self.source_tokens) x1 = max(t.bbox[2] for t in self.source_tokens) y1 = max(t.bbox[3] for t in self.source_tokens) return (x0, y0, x1, y1) class MachineCodeParser: """ Parser for machine-readable payment lines on Swedish invoices. The parser focuses on the bottom region of the invoice where the payment slip (Inbetalningskort) is typically located. """ # Payment slip detection keywords (Swedish) PAYMENT_SLIP_KEYWORDS = [ 'inbetalning', 'girering', 'avi', 'betalning', 'plusgiro', 'postgiro', 'bankgiro', 'bankgirot', 'betalningsavsändare', 'betalningsmottagare', 'maskinellt', 'ändringar', # "DEN AVLÄSES MASKINELLT" ] # Patterns for field extraction # OCR: 10-25 consecutive digits (may have spaces or # at end) OCR_PATTERN = re.compile(r'(? 

    # Standard Swedish payment line:
    # # <OCR> # <Kronor> <Öre> <Typ> > <Bankgiro>#<Kontroll>#
    # Example: # 31130954410 # 315 00 2 > 8983025#14#
    # This pattern captures both Bankgiro and Plusgiro accounts
    PAYMENT_LINE_PATTERN = re.compile(
        r'#\s*'             # Start delimiter
        r'(\d{5,25})\s*'    # OCR number (capture group 1)
        r'#\s*'             # Separator
        r'(\d{1,7})\s+'     # Kronor (capture group 2)
        r'(\d{2})\s+'       # Öre (capture group 3)
        r'(\d)\s*'          # Type (capture group 4)
        r'>\s*'             # Direction marker
        r'(\d{5,10})'       # Bankgiro/Plusgiro (capture group 5)
        r'(?:#\d{1,3}#)?'   # Optional end marker
    )

    # Alternative pattern with different spacing
    PAYMENT_LINE_PATTERN_ALT = re.compile(
        r'#?\s*'            # Optional start delimiter
        r'(\d{8,25})\s*'    # OCR number
        r'#?\s*'            # Optional separator
        r'(\d{1,7})\s+'     # Kronor
        r'(\d{2})\s+'       # Öre
        r'\d\s*'            # Type
        r'>?\s*'            # Optional direction marker
        r'(\d{5,10})'       # Bankgiro
    )

    # Reverse format pattern (Bankgiro first, then OCR)
    # Format: <Bankgiro>#<Kontroll># <Kronor> <Öre> <Typ> > <OCR> #
    # Example: 53241469#41# 2428 00 1 > 4388595300 #
    PAYMENT_LINE_PATTERN_REVERSE = re.compile(
        r'(\d{7,8})'        # Bankgiro (capture group 1)
        r'#\d{1,3}#\s+'     # Control marker
        r'(\d{1,7})\s+'     # Kronor (capture group 2)
        r'(\d{2})\s+'       # Öre (capture group 3)
        r'\d\s*'            # Type
        r'>\s*'             # Direction marker
        r'(\d{5,25})'       # OCR number (capture group 4)
    )

    def __init__(self, bottom_region_ratio: float = 0.35):
        """
        Initialize the parser.

        Args:
            bottom_region_ratio: Fraction of page height to consider as the
                bottom region. Default 0.35 means the bottom 35% of the page.
        """
        self.bottom_region_ratio = bottom_region_ratio

    def parse(
        self,
        tokens: list[TextToken],
        page_height: float,
        page_width: float | None = None,
    ) -> MachineCodeResult:
        """
        Parse machine code from tokens.

        Args:
            tokens: List of text tokens from OCR or text extraction
            page_height: Height of the page in points
            page_width: Width of the page in points (optional)

        Returns:
            MachineCodeResult with extracted fields
        """
        if not tokens:
            return MachineCodeResult()

        # Filter to bottom region tokens
        bottom_y_threshold = page_height * (1 - self.bottom_region_ratio)
        bottom_tokens = [
            t for t in tokens
            if t.bbox[1] >= bottom_y_threshold  # y0 >= threshold
        ]

        if not bottom_tokens:
            return MachineCodeResult()

        # Sort by y position (top to bottom) then x (left to right)
        bottom_tokens.sort(key=lambda t: (t.bbox[1], t.bbox[0]))

        # Check if this looks like a payment slip region
        combined_text = ' '.join(t.text for t in bottom_tokens).lower()
        has_payment_keywords = any(
            kw in combined_text for kw in self.PAYMENT_SLIP_KEYWORDS
        )

        # Build raw line from bottom tokens
        raw_line = ' '.join(t.text for t in bottom_tokens)

        # Try standard payment line format first and find the matching tokens
        standard_result, matched_tokens = self._parse_standard_payment_line_with_tokens(
            raw_line, bottom_tokens
        )
        if standard_result and matched_tokens:
            # Calculate bbox only from tokens that contain the machine code
            x0 = min(t.bbox[0] for t in matched_tokens)
            y0 = min(t.bbox[1] for t in matched_tokens)
            x1 = max(t.bbox[2] for t in matched_tokens)
            y1 = max(t.bbox[3] for t in matched_tokens)
            region_bbox = (x0, y0, x1, y1)

            result = MachineCodeResult(
                ocr=standard_result.get('ocr'),
                amount=standard_result.get('amount'),
                bankgiro=standard_result.get('bankgiro'),
                plusgiro=standard_result.get('plusgiro'),
                confidence=0.95,
                source_tokens=matched_tokens,
                raw_line=raw_line,
                region_bbox=region_bbox,
            )
            return result

        # Fall back to individual field extraction
        result = MachineCodeResult(
            source_tokens=bottom_tokens,
            raw_line=raw_line,
        )

        # Extract OCR number (longest digit sequence 10-25 digits)
        result.ocr = self._extract_ocr(bottom_tokens)
        # Extract Bankgiro
        result.bankgiro = self._extract_bankgiro(bottom_tokens)

        # Extract Plusgiro (if no Bankgiro found)
        if not result.bankgiro:
            result.plusgiro = self._extract_plusgiro(bottom_tokens)

        # Extract Amount
        result.amount = self._extract_amount(bottom_tokens)

        # Calculate confidence
        result.confidence = self._calculate_confidence(
            result, has_payment_keywords
        )

        # For fallback extraction, compute bbox from tokens that contain
        # the extracted values
        matched_tokens = self._find_tokens_with_values(bottom_tokens, result)
        if matched_tokens:
            x0 = min(t.bbox[0] for t in matched_tokens)
            y0 = min(t.bbox[1] for t in matched_tokens)
            x1 = max(t.bbox[2] for t in matched_tokens)
            y1 = max(t.bbox[3] for t in matched_tokens)
            result.region_bbox = (x0, y0, x1, y1)
            result.source_tokens = matched_tokens

        return result

    def _find_tokens_with_values(
        self, tokens: list[TextToken], result: MachineCodeResult
    ) -> list[TextToken]:
        """Find tokens that contain the extracted values (OCR, Amount, Bankgiro)."""
        matched = []
        values_to_find = []

        if result.ocr:
            values_to_find.append(result.ocr)
        if result.amount:
            # Amount might be just digits
            amount_digits = re.sub(r'\D', '', result.amount)
            values_to_find.append(amount_digits)
            values_to_find.append(result.amount)
        if result.bankgiro:
            # Bankgiro might have a dash or not
            bg_digits = re.sub(r'\D', '', result.bankgiro)
            values_to_find.append(bg_digits)
            values_to_find.append(result.bankgiro)
        if result.plusgiro:
            pg_digits = re.sub(r'\D', '', result.plusgiro)
            values_to_find.append(pg_digits)
            values_to_find.append(result.plusgiro)

        for token in tokens:
            text = token.text.replace(' ', '').replace('#', '')
            text_digits = re.sub(r'\D', '', token.text)
            for value in values_to_find:
                if value in text or value in text_digits:
                    if token not in matched:
                        matched.append(token)
                    break

        return matched
    def _find_machine_code_line_tokens(
        self, tokens: list[TextToken]
    ) -> list[TextToken]:
        """
        Find tokens that belong to the machine code line using pure regex patterns.

        The machine code line typically contains:
        - Control markers like #14#, #41#
        - Direction marker >
        - Account numbers with # suffix

        Returns:
            List of tokens belonging to the machine code line
        """
        # Find tokens with characteristic machine code patterns
        ref_y = None

        # First, find the reference y-coordinate from tokens with machine code patterns
        for token in tokens:
            text = token.text

            # Check if token contains machine code patterns
            # Priority 1: Control marker like #14#, 47304035#14#
            has_control_marker = bool(re.search(r'#\d+#', text))
            # Priority 2: Direction marker >
            has_direction = '>' in text

            if has_control_marker:
                # This is very likely part of the machine code line
                ref_y = token.bbox[1]
                break
            elif has_direction and ref_y is None:
                # Direction marker is also a good indicator
                ref_y = token.bbox[1]

        if ref_y is None:
            return []

        # Collect all tokens on the same line (within 3 points of ref_y)
        # Use very small tolerance because Swedish invoices often have duplicate
        # machine code lines (upper and lower part of payment slip)
        y_tolerance = 3
        machine_code_tokens = []
        for token in tokens:
            if abs(token.bbox[1] - ref_y) < y_tolerance:
                text = token.text
                # Include token if it contains:
                # - Digits (OCR, amount, account numbers)
                # - # symbol (delimiters, control markers)
                # - > symbol (direction marker)
                if (re.search(r'\d', text) or '#' in text or '>' in text):
                    machine_code_tokens.append(token)

        # If we found very few tokens, try to expand to nearby y values
        # that might be part of the same logical line
        if len(machine_code_tokens) < 3:
            y_tolerance = 10
            machine_code_tokens = []
            for token in tokens:
                if abs(token.bbox[1] - ref_y) < y_tolerance:
                    text = token.text
                    if (re.search(r'\d', text) or '#' in text or '>' in text):
                        machine_code_tokens.append(token)

        return machine_code_tokens
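
    # Illustrative walk-through of the selection logic above, on a
    # hypothetical token stream:
    #   '#', '31130954410', '#', '315', '00', '2', '>', '8983025#14#'
    # all sharing one baseline. The '#14#' control marker in the last token
    # fixes ref_y, and every token on that baseline containing digits, '#'
    # or '>' is returned as part of the machine code line.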
    def _parse_standard_payment_line_with_tokens(
        self, raw_line: str, tokens: list[TextToken]
    ) -> tuple[Optional[dict], list[TextToken]]:
        """
        Parse standard Swedish payment line format and find matching tokens.

        Uses pure regex to identify the machine code line, then finds tokens
        that are part of that line based on their position.

        Format:  # <OCR> # <Kronor> <Öre> <Typ> > <Bankgiro>#<Kontroll>#
        Example: # 31130954410 # 315 00 2 > 8983025#14#

        Returns:
            Tuple of (parsed_dict, matched_tokens) or (None, [])
        """
        # First find the machine code line tokens using pattern matching
        machine_code_tokens = self._find_machine_code_line_tokens(tokens)

        if not machine_code_tokens:
            # Fall back to regex on raw_line
            parsed = self._parse_standard_payment_line(raw_line, raw_line)
            return parsed, []

        # Build a line from just the machine code tokens (sorted by x position)
        # Group tokens by approximate x position to handle duplicate OCR results
        mc_tokens_sorted = sorted(machine_code_tokens, key=lambda t: t.bbox[0])

        # Deduplicate tokens at similar x positions (keep the first one)
        deduped_tokens = []
        last_x = -100
        for t in mc_tokens_sorted:
            # Skip tokens that are too close to the previous one (likely duplicates)
            if t.bbox[0] - last_x < 5:
                continue
            deduped_tokens.append(t)
            last_x = t.bbox[2]  # Use end x for next comparison

        mc_line = ' '.join(t.text for t in deduped_tokens)

        # Try to parse this line, using raw_line for context detection
        parsed = self._parse_standard_payment_line(mc_line, raw_line)
        if parsed:
            return parsed, deduped_tokens

        # If machine code line parsing failed, try the full raw_line
        parsed = self._parse_standard_payment_line(raw_line, raw_line)
        if parsed:
            return parsed, machine_code_tokens

        return None, []

    def _parse_standard_payment_line(
        self, raw_line: str, context_line: str | None = None
    ) -> Optional[dict]:
        """
        Parse standard Swedish payment line format.

        Format:  # <OCR> # <Kronor> <Öre> <Typ> > <Bankgiro>#<Kontroll>#
        Example: # 31130954410 # 315 00 2 > 8983025#14#

        Args:
            raw_line: The line to parse (may be just the machine code tokens)
            context_line: Optional full line for context detection
                (e.g., to find "plusgiro" keywords)

        Returns:
            Dict with 'ocr', 'amount', and 'bankgiro' or 'plusgiro' if matched,
            None otherwise
        """
        # Use context_line for detecting Plusgiro/Bankgiro, fall back to raw_line
        context = (context_line or raw_line).lower()
        is_plusgiro_context = (
            ('plusgiro' in context or 'postgiro' in context
             or 'plusgirokonto' in context)
            and 'bankgiro' not in context
        )

        # Preprocess: remove spaces in the account number part (after >)
        # This handles cases like "78 2 1 713" -> "7821713"
        def normalize_account_spaces(line: str) -> str:
            """Remove spaces in account number portion after > marker."""
            if '>' in line:
                parts = line.split('>', 1)
                # After >, remove spaces between digits (but keep # markers)
                after_arrow = parts[1]
                # Extract digits and # markers, remove spaces between digits
                normalized = re.sub(r'(\d)\s+(\d)', r'\1\2', after_arrow)
                # May need multiple passes for sequences like "78 2 1 713"
                while re.search(r'(\d)\s+(\d)', normalized):
                    normalized = re.sub(r'(\d)\s+(\d)', r'\1\2', normalized)
                return parts[0] + '>' + normalized
            return line

        raw_line = normalize_account_spaces(raw_line)
        def format_account(account_digits: str) -> tuple[str, str]:
            """Format account and determine type (bankgiro or plusgiro).

            Returns:
                (formatted_account, account_type)
            """
            if is_plusgiro_context:
                # Plusgiro format: XXXXXXX-X
                formatted = f"{account_digits[:-1]}-{account_digits[-1]}"
                return formatted, 'plusgiro'
            else:
                # Bankgiro format: XXX-XXXX or XXXX-XXXX
                if len(account_digits) == 7:
                    formatted = f"{account_digits[:3]}-{account_digits[3:]}"
                elif len(account_digits) == 8:
                    formatted = f"{account_digits[:4]}-{account_digits[4:]}"
                else:
                    formatted = account_digits
                return formatted, 'bankgiro'

        # Try primary pattern
        match = self.PAYMENT_LINE_PATTERN.search(raw_line)
        if match:
            ocr = match.group(1)
            kronor = match.group(2)
            ore = match.group(3)
            account_digits = match.group(5)

            # Format amount: combine kronor and öre
            amount = f"{kronor},{ore}" if ore != "00" else kronor
            formatted_account, account_type = format_account(account_digits)

            return {
                'ocr': ocr,
                'amount': amount,
                account_type: formatted_account,
            }

        # Try alternative pattern
        match = self.PAYMENT_LINE_PATTERN_ALT.search(raw_line)
        if match:
            ocr = match.group(1)
            kronor = match.group(2)
            ore = match.group(3)
            account_digits = match.group(4)

            amount = f"{kronor},{ore}" if ore != "00" else kronor
            formatted_account, account_type = format_account(account_digits)

            return {
                'ocr': ocr,
                'amount': amount,
                account_type: formatted_account,
            }

        # Try reverse pattern (Account first, then OCR)
        match = self.PAYMENT_LINE_PATTERN_REVERSE.search(raw_line)
        if match:
            account_digits = match.group(1)
            kronor = match.group(2)
            ore = match.group(3)
            ocr = match.group(4)

            amount = f"{kronor},{ore}" if ore != "00" else kronor
            formatted_account, account_type = format_account(account_digits)

            return {
                'ocr': ocr,
                'amount': amount,
                account_type: formatted_account,
            }

        return None

    def _extract_ocr(self, tokens: list[TextToken]) -> Optional[str]:
        """Extract OCR reference number."""
        candidates = []

        # First, collect all bankgiro-like patterns to exclude
        bankgiro_digits = set()
        for token in tokens:
            text = token.text.strip()
            bg_matches = self.BANKGIRO_PATTERN.findall(text)
            for bg in bg_matches:
                digits = re.sub(r'\D', '', bg)
                bankgiro_digits.add(digits)
                # Also add with potential check digits (common pattern)
                for i in range(10):
                    bankgiro_digits.add(digits + str(i))
                    bankgiro_digits.add(digits + str(i) + str(i))

        for token in tokens:
            # Remove spaces and common suffixes
            text = token.text.replace(' ', '').replace('#', '').strip()

            # Find all digit sequences
            matches = self.OCR_PATTERN.findall(text)
            for match in matches:
                # OCR numbers are typically 10-25 digits
                if 10 <= len(match) <= 25:
                    # Skip if this looks like a bankgiro number with check digit
                    is_bankgiro_variant = any(
                        match.startswith(bg) or match.endswith(bg)
                        for bg in bankgiro_digits
                        if len(bg) >= 7
                    )
                    # Also check if it's exactly bankgiro with 2-3 extra digits
                    for bg in bankgiro_digits:
                        if len(bg) >= 7 and (
                            match == bg
                            or (len(match) - len(bg) <= 3 and match.startswith(bg))
                        ):
                            is_bankgiro_variant = True
                            break

                    if not is_bankgiro_variant:
                        candidates.append((match, len(match), token))

        if not candidates:
            return None

        # Prefer longer sequences (more likely to be OCR)
        candidates.sort(key=lambda x: x[1], reverse=True)
        return candidates[0][0]
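
    # Hypothetical example of the exclusion above: if the slip shows
    # Bankgiro 5393-9484, a 10+ digit run that merely extends it with check
    # digits (e.g. '5393948455') is rejected rather than proposed as an OCR
    # reference number.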
    def _extract_bankgiro(self, tokens: list[TextToken]) -> Optional[str]:
        """Extract Bankgiro account number.

        Bankgiro format: XXX-XXXX or XXXX-XXXX (dash in middle)
        NOT Plusgiro:    XXXXXXX-X (dash before last digit)
        """
        candidates = []
        context_text = ' '.join(t.text.lower() for t in tokens)

        # Check if this is clearly a Plusgiro context (not Bankgiro)
        is_plusgiro_only_context = (
            ('plusgiro' in context_text or 'postgiro' in context_text
             or 'plusgirokonto' in context_text)
            and 'bankgiro' not in context_text
        )

        # If clearly Plusgiro context, don't extract as Bankgiro
        if is_plusgiro_only_context:
            return None

        for token in tokens:
            text = token.text.strip()

            # Look for Bankgiro pattern
            matches = self.BANKGIRO_PATTERN.findall(text)
            for match in matches:
                # Check if this looks like Plusgiro format (dash before last digit)
                # Plusgiro: 1234567-8 (dash at position -2)
                if '-' in match:
                    parts = match.replace(' ', '').split('-')
                    if len(parts) == 2 and len(parts[1]) == 1:
                        # This is Plusgiro format, skip
                        continue

                # Normalize: remove spaces, ensure dash
                digits = re.sub(r'\D', '', match)
                if len(digits) == 7:
                    normalized = f"{digits[:3]}-{digits[3:]}"
                elif len(digits) == 8:
                    normalized = f"{digits[:4]}-{digits[4:]}"
                else:
                    continue

                # Check if "bankgiro" or "bg" appears nearby
                is_bankgiro_context = (
                    'bankgiro' in context_text
                    or 'bg:' in context_text
                    or 'bg ' in context_text
                )
                candidates.append((normalized, is_bankgiro_context, token))

        if not candidates:
            return None

        # Prefer matches with bankgiro context
        candidates.sort(key=lambda x: (x[1], 1), reverse=True)
        return candidates[0][0]

    def _extract_plusgiro(self, tokens: list[TextToken]) -> Optional[str]:
        """Extract Plusgiro account number."""
        candidates = []

        for token in tokens:
            text = token.text.strip()

            matches = self.PLUSGIRO_PATTERN.findall(text)
            for match in matches:
                # Normalize: remove spaces, ensure dash before last digit
                digits = re.sub(r'\D', '', match)
                if 7 <= len(digits) <= 8:
                    normalized = f"{digits[:-1]}-{digits[-1]}"

                    # Check context
                    context_text = ' '.join(t.text.lower() for t in tokens)
                    is_plusgiro_context = (
                        'plusgiro' in context_text
                        or 'postgiro' in context_text
                        or 'pg:' in context_text
                        or 'pg ' in context_text
                    )
                    candidates.append((normalized, is_plusgiro_context, token))

        if not candidates:
            return None

        candidates.sort(key=lambda x: (x[1], 1), reverse=True)
        return candidates[0][0]
    def _extract_amount(self, tokens: list[TextToken]) -> Optional[str]:
        """Extract payment amount."""
        candidates = []

        for token in tokens:
            text = token.text.strip()

            # Try decimal amount pattern first
            matches = self.AMOUNT_PATTERN.findall(text)
            for match in matches:
                # Normalize: remove thousand separators, use comma as decimal
                normalized = match.replace(' ', '').replace('\xa0', '')
                # Convert dot thousand separator to none, keep comma decimal
                if '.' in normalized and ',' in normalized:
                    # Format like 1.234,56 -> 1234,56
                    normalized = normalized.replace('.', '')
                elif '.' in normalized:
                    # Could be 1234.56 -> 1234,56
                    parts = normalized.split('.')
                    if len(parts) == 2 and len(parts[1]) == 2:
                        normalized = f"{parts[0]},{parts[1]}"

                # Parse to verify it's a valid amount
                try:
                    value = float(normalized.replace(',', '.'))
                    if 0 < value < 1000000:  # Reasonable amount range
                        candidates.append((normalized, value, token))
                except ValueError:
                    continue

        # If no decimal amounts found, try integer amounts
        # Look for "Kronor" label nearby and extract integer
        if not candidates:
            for i, token in enumerate(tokens):
                text = token.text.strip().lower()
                if 'kronor' in text or 'kr' == text or text.endswith(' kr'):
                    # Look at nearby tokens for amounts (wider range)
                    for j in range(max(0, i - 5), min(len(tokens), i + 5)):
                        nearby_text = tokens[j].text.strip()
                        # Match pure integer (1-6 digits)
                        int_match = re.match(r'^(\d{1,6})$', nearby_text)
                        if int_match:
                            value = int(int_match.group(1))
                            if 0 < value < 1000000:
                                candidates.append((str(value), float(value), tokens[j]))

        # Also try to find amounts near "öre" label (Swedish cents)
        if not candidates:
            for i, token in enumerate(tokens):
                text = token.text.strip().lower()
                if 'öre' in text:
                    # Look at nearby tokens for amounts
                    for j in range(max(0, i - 5), min(len(tokens), i + 5)):
                        nearby_text = tokens[j].text.strip()
                        int_match = re.match(r'^(\d{1,6})$', nearby_text)
                        if int_match:
                            value = int(int_match.group(1))
                            if 0 < value < 1000000:
                                candidates.append((str(value), float(value), tokens[j]))

        if not candidates:
            return None

        # Sort by value (prefer larger amounts - likely total)
        candidates.sort(key=lambda x: x[1], reverse=True)
        return candidates[0][0]

    def _calculate_confidence(
        self, result: MachineCodeResult, has_payment_keywords: bool
    ) -> float:
        """Calculate confidence score for the extraction."""
        confidence = 0.0

        # Base confidence from payment keywords
        if has_payment_keywords:
            confidence += 0.3

        # Points for each extracted field
        if result.ocr:
            confidence += 0.25
            # Bonus for typical OCR length (15-17 digits)
            if 15 <= len(result.ocr) <= 17:
                confidence += 0.1

        if result.bankgiro or result.plusgiro:
            confidence += 0.2

        if result.amount:
            confidence += 0.15

        return min(confidence, 1.0)
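
    # Worked example of the scoring above, for a hypothetical fallback
    # extraction: payment keywords present (0.3) + 16-digit OCR (0.25 + 0.1
    # length bonus) + bankgiro (0.2) + amount (0.15) = 1.0, capped at 1.0.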
} """ from src.normalize import normalize_field results = {} field_mapping = [ ('ocr', 'OCR', machine_result.ocr), ('amount', 'Amount', machine_result.amount), ('bankgiro', 'Bankgiro', machine_result.bankgiro), ('plusgiro', 'Plusgiro', machine_result.plusgiro), ] for field_key, normalizer_name, machine_value in field_mapping: csv_value = csv_values.get(field_key, '').strip() result_entry = { 'machine': machine_value, 'csv': csv_value if csv_value else None, 'match': False, 'use_machine': False, } if machine_value and csv_value: # Both have values - check if they match machine_variants = normalize_field(normalizer_name, machine_value) csv_variants = normalize_field(normalizer_name, csv_value) # Check for any overlap result_entry['match'] = bool( set(machine_variants) & set(csv_variants) ) # Special handling for amounts - allow rounding differences if not result_entry['match'] and field_key == 'amount': try: # Parse both values as floats machine_float = float( machine_value.replace(' ', '') .replace(',', '.').replace('\xa0', '') ) csv_float = float( csv_value.replace(' ', '') .replace(',', '.').replace('\xa0', '') ) # Allow 1 unit difference (rounding) if abs(machine_float - csv_float) <= 1.0: result_entry['match'] = True result_entry['rounding_diff'] = True except ValueError: pass elif machine_value and not csv_value: # CSV is missing, use machine value result_entry['use_machine'] = True results[field_key] = result_entry return results def parse_machine_code( tokens: list[TextToken], page_height: float, page_width: float | None = None, bottom_ratio: float = 0.35, ) -> MachineCodeResult: """ Convenience function to parse machine code from tokens. Args: tokens: List of text tokens page_height: Page height in points page_width: Page width in points (optional) bottom_ratio: Fraction of page to consider as bottom region Returns: MachineCodeResult with extracted fields """ parser = MachineCodeParser(bottom_region_ratio=bottom_ratio) return parser.parse(tokens, page_height, page_width)