# invoice-master-poc-v2/src/ocr/machine_code_parser.py

"""
Machine Code Line Parser for Swedish Invoices

Parses the bottom machine-readable payment line to extract:
- OCR reference number (10-25 digits)
- Amount (payment amount in SEK)
- Bankgiro account number (XXX-XXXX or XXXX-XXXX format)
- Plusgiro account number (XXXXXXX-X format)

The machine code line is typically found at the bottom of Swedish invoices,
in the payment slip (Inbetalningskort) section. It contains machine-readable
data for automated payment processing.

## Swedish Payment Line Standard Format

The standard machine-readable payment line follows this structure:

    # <OCR> # <Kronor> <Öre> <Type> > <Bankgiro>#<Control>#

Example:
    # 31130954410 # 315 00 2 > 8983025#14#

Components:
- `#` - Start delimiter
- `31130954410` - OCR number (with Mod 10 check digit)
- `#` - Separator
- `315 00` - Amount: 315 SEK and 00 öre (315.00 SEK)
- `2` - Payment type / record type
- `>` - Points to recipient info
- `8983025` - Bankgiro number
- `#14#` - End marker with control code

Legacy patterns also supported:
- OCR: 8120000849965361 (10-25 consecutive digits)
- Bankgiro: 5393-9484 or 53939484
- Plusgiro: 1234567-8
- Amount: 1234,56 or 1234.56 (with decimal separator)
"""

import re
from dataclasses import dataclass, field
from typing import Optional

from src.pdf.extractor import Token as TextToken


@dataclass
class MachineCodeResult:
    """Container for the fields read from a machine-readable payment line."""
    ocr: Optional[str] = None
    amount: Optional[str] = None
    bankgiro: Optional[str] = None
    plusgiro: Optional[str] = None
    confidence: float = 0.0
    source_tokens: list[TextToken] = field(default_factory=list)
    raw_line: str = ""
    # Region bounding box in PDF coordinates (x0, y0, x1, y1)
    region_bbox: Optional[tuple[float, float, float, float]] = None

    def to_dict(self) -> dict:
        """
        Return the serializable fields as a plain dictionary.

        ``source_tokens`` is deliberately left out; every other field is
        copied as-is.
        """
        exported = (
            'ocr',
            'amount',
            'bankgiro',
            'plusgiro',
            'confidence',
            'raw_line',
            'region_bbox',
        )
        return {name: getattr(self, name) for name in exported}

    def get_region_bbox(self) -> Optional[tuple[float, float, float, float]]:
        """
        Return the bounding box of the payment slip region.

        An explicitly stored ``region_bbox`` wins. Otherwise the box is the
        union of the ``bbox`` of every source token.

        Returns:
            Tuple (x0, y0, x1, y1), or None when there is neither a stored
            box nor any source token to derive one from.
        """
        if self.region_bbox:
            return self.region_bbox

        if not self.source_tokens:
            return None

        # Union of all token boxes: smallest top-left, largest bottom-right.
        boxes = [token.bbox for token in self.source_tokens]
        left = min(box[0] for box in boxes)
        top = min(box[1] for box in boxes)
        right = max(box[2] for box in boxes)
        bottom = max(box[3] for box in boxes)
        return (left, top, right, bottom)


class MachineCodeParser:
    """
    Parser for machine-readable payment lines on Swedish invoices.

    The parser focuses on the bottom region of the invoice where
    the payment slip (Inbetalningskort) is typically located.
    """

    # Payment slip detection keywords (Swedish)
    PAYMENT_SLIP_KEYWORDS = [
        'inbetalning', 'girering', 'avi', 'betalning',
        'plusgiro', 'postgiro', 'bankgiro', 'bankgirot',
        'betalningsavsändare', 'betalningsmottagare',
        'maskinellt', 'ändringar',  # "DEN AVLÄSES MASKINELLT"
    ]

    # Patterns for field extraction
    # OCR: a run of 10-25 digits that is not part of a longer digit run
    OCR_PATTERN = re.compile(r'(?<!\d)(\d{10,25})(?!\d)')

    # Bankgiro: XXX-XXXX or XXXX-XXXX (7-8 digits, dash or whitespace optional)
    BANKGIRO_PATTERN = re.compile(r'\b(\d{3,4}[-\s]?\d{4})\b')

    # Plusgiro: XXXXXX-X or XXXXXXX-X (7-8 digits, optional dash or
    # whitespace before the last digit)
    PLUSGIRO_PATTERN = re.compile(r'\b(\d{6,7}[-\s]?\d)\b')

    # Amount: digits with comma or dot as decimal separator and two decimals
    # Supports formats: 1234,56 | 1234.56 | 1 234,56 | 1.234,56
    # The integer part is either grouped in thousands (1-3 digits followed by
    # one or more separator+3-digit groups) or a plain digit run. The plain
    # alternative is required: a grouped-only pattern cannot match "1234,56".
    AMOUNT_PATTERN = re.compile(
        r'\b((?:\d{1,3}(?:[\s\.\xa0]\d{3})+|\d+)[,\.]\d{2})\b'
    )

    # Alternative amount pattern for integers (no decimal)
    AMOUNT_INTEGER_PATTERN = re.compile(r'\b(\d{2,6})\b')

    # Standard Swedish payment line pattern
    # Format: # <OCR> # <Kronor> <Öre> <Type> > <Bankgiro/Plusgiro>#<Control>#
    # Example: # 31130954410 # 315 00 2 > 8983025#14#
    # This pattern captures both Bankgiro and Plusgiro accounts
    PAYMENT_LINE_PATTERN = re.compile(
        r'#\s*'                          # Start delimiter
        r'(\d{5,25})\s*'                 # OCR number (capture group 1)
        r'#\s*'                          # Separator
        r'(\d{1,7})\s+'                  # Kronor (capture group 2)
        r'(\d{2})\s+'                    # Öre (capture group 3)
        r'(\d)\s*'                       # Type (capture group 4)
        r'>\s*'                          # Direction marker
        r'(\d{5,10})'                    # Bankgiro/Plusgiro (capture group 5)
        r'(?:#\d{1,3}#)?'               # Optional end marker
    )

    # Alternative pattern with different spacing; every delimiter is optional
    PAYMENT_LINE_PATTERN_ALT = re.compile(
        r'#?\s*'                         # Optional start delimiter
        r'(\d{8,25})\s*'                 # OCR number (capture group 1)
        r'#?\s*'                         # Optional separator
        r'(\d{1,7})\s+'                  # Kronor (capture group 2)
        r'(\d{2})\s+'                    # Öre (capture group 3)
        r'\d\s*'                         # Type (not captured)
        r'>?\s*'                         # Optional direction marker
        r'(\d{5,10})'                    # Bankgiro (capture group 4)
    )

    # Reverse format pattern (Bankgiro first, then OCR)
    # Format: <Bankgiro>#<Control># <Kronor> <Öre> <Type> > <OCR> #
    # Example: 53241469#41# 2428 00 1 > 4388595300 #
    PAYMENT_LINE_PATTERN_REVERSE = re.compile(
        r'(\d{7,8})'                     # Bankgiro (capture group 1)
        r'#\d{1,3}#\s+'                  # Control marker
        r'(\d{1,7})\s+'                  # Kronor (capture group 2)
        r'(\d{2})\s+'                    # Öre (capture group 3)
        r'\d\s*'                         # Type
        r'>\s*'                          # Direction marker
        r'(\d{5,25})'                    # OCR number (capture group 4)
    )

    def __init__(self, bottom_region_ratio: float = 0.35):
        """
        Initialize the parser.

        Args:
            bottom_region_ratio: Fraction of page height to consider as bottom region.
                                 Default 0.35 means bottom 35% of the page.
        """
        self.bottom_region_ratio = bottom_region_ratio

    def parse(
        self,
        tokens: list[TextToken],
        page_height: float,
        page_width: float | None = None,
    ) -> MachineCodeResult:
        """
        Parse machine code from tokens.

        Args:
            tokens: List of text tokens from OCR or text extraction
            page_height: Height of the page in points
            page_width: Width of the page in points (optional)

        Returns:
            MachineCodeResult with extracted fields
        """
        if not tokens:
            return MachineCodeResult()

        # Filter to bottom region tokens
        bottom_y_threshold = page_height * (1 - self.bottom_region_ratio)
        bottom_tokens = [
            t for t in tokens
            if t.bbox[1] >= bottom_y_threshold  # y0 >= threshold
        ]

        if not bottom_tokens:
            return MachineCodeResult()

        # Sort by y position (top to bottom) then x (left to right)
        bottom_tokens.sort(key=lambda t: (t.bbox[1], t.bbox[0]))

        # Check if this looks like a payment slip region
        combined_text = ' '.join(t.text for t in bottom_tokens).lower()
        has_payment_keywords = any(
            kw in combined_text for kw in self.PAYMENT_SLIP_KEYWORDS
        )

        # Build raw line from bottom tokens
        raw_line = ' '.join(t.text for t in bottom_tokens)

        # Try standard payment line format first and find the matching tokens
        standard_result, matched_tokens = self._parse_standard_payment_line_with_tokens(
            raw_line, bottom_tokens
        )

        if standard_result and matched_tokens:
            # Calculate bbox only from tokens that contain the machine code
            x0 = min(t.bbox[0] for t in matched_tokens)
            y0 = min(t.bbox[1] for t in matched_tokens)
            x1 = max(t.bbox[2] for t in matched_tokens)
            y1 = max(t.bbox[3] for t in matched_tokens)
            region_bbox = (x0, y0, x1, y1)

            result = MachineCodeResult(
                ocr=standard_result.get('ocr'),
                amount=standard_result.get('amount'),
                bankgiro=standard_result.get('bankgiro'),
                plusgiro=standard_result.get('plusgiro'),
                confidence=0.95,
                source_tokens=matched_tokens,
                raw_line=raw_line,
                region_bbox=region_bbox,
            )
            return result

        # Fall back to individual field extraction
        result = MachineCodeResult(
            source_tokens=bottom_tokens,
            raw_line=raw_line,
        )

        # Extract OCR number (longest digit sequence 10-25 digits)
        result.ocr = self._extract_ocr(bottom_tokens)

        # Extract Bankgiro
        result.bankgiro = self._extract_bankgiro(bottom_tokens)

        # Extract Plusgiro (if no Bankgiro found)
        if not result.bankgiro:
            result.plusgiro = self._extract_plusgiro(bottom_tokens)

        # Extract Amount
        result.amount = self._extract_amount(bottom_tokens)

        # Calculate confidence
        result.confidence = self._calculate_confidence(
            result, has_payment_keywords
        )

        # For fallback extraction, compute bbox from tokens that contain the extracted values
        matched_tokens = self._find_tokens_with_values(bottom_tokens, result)
        if matched_tokens:
            x0 = min(t.bbox[0] for t in matched_tokens)
            y0 = min(t.bbox[1] for t in matched_tokens)
            x1 = max(t.bbox[2] for t in matched_tokens)
            y1 = max(t.bbox[3] for t in matched_tokens)
            result.region_bbox = (x0, y0, x1, y1)
            result.source_tokens = matched_tokens

        return result

    def _find_tokens_with_values(
        self,
        tokens: list[TextToken],
        result: MachineCodeResult
    ) -> list[TextToken]:
        """Find tokens that contain the extracted values (OCR, Amount, Bankgiro)."""
        matched = []
        values_to_find = []

        if result.ocr:
            values_to_find.append(result.ocr)
        if result.amount:
            # Amount might be just digits
            amount_digits = re.sub(r'\D', '', result.amount)
            values_to_find.append(amount_digits)
            values_to_find.append(result.amount)
        if result.bankgiro:
            # Bankgiro might have dash or not
            bg_digits = re.sub(r'\D', '', result.bankgiro)
            values_to_find.append(bg_digits)
            values_to_find.append(result.bankgiro)
        if result.plusgiro:
            pg_digits = re.sub(r'\D', '', result.plusgiro)
            values_to_find.append(pg_digits)
            values_to_find.append(result.plusgiro)

        for token in tokens:
            text = token.text.replace(' ', '').replace('#', '')
            text_digits = re.sub(r'\D', '', token.text)

            for value in values_to_find:
                if value in text or value in text_digits:
                    if token not in matched:
                        matched.append(token)
                    break

        return matched

    def _find_machine_code_line_tokens(
        self,
        tokens: list[TextToken]
    ) -> list[TextToken]:
        """
        Find tokens that belong to the machine code line using pure regex patterns.

        The machine code line typically contains:
        - Control markers like #14#, #41#
        - Direction marker >
        - Account numbers with # suffix

        Returns:
            List of tokens belonging to the machine code line
        """
        # Find tokens with characteristic machine code patterns
        ref_y = None

        # First, find the reference y-coordinate from tokens with machine code patterns
        for token in tokens:
            text = token.text

            # Check if token contains machine code patterns
            # Priority 1: Control marker like #14#, 47304035#14#
            has_control_marker = bool(re.search(r'#\d+#', text))
            # Priority 2: Direction marker >
            has_direction = '>' in text

            if has_control_marker:
                # This is very likely part of the machine code line
                ref_y = token.bbox[1]
                break
            elif has_direction and ref_y is None:
                # Direction marker is also a good indicator
                ref_y = token.bbox[1]

        if ref_y is None:
            return []

        # Collect all tokens on the same line (within 3 points of ref_y)
        # Use very small tolerance because Swedish invoices often have duplicate
        # machine code lines (upper and lower part of payment slip)
        y_tolerance = 3
        machine_code_tokens = []
        for token in tokens:
            if abs(token.bbox[1] - ref_y) < y_tolerance:
                text = token.text
                # Include token if it contains:
                # - Digits (OCR, amount, account numbers)
                # - # symbol (delimiters, control markers)
                # - > symbol (direction marker)
                if (re.search(r'\d', text) or '#' in text or '>' in text):
                    machine_code_tokens.append(token)

        # If we found very few tokens, try to expand to nearby y values
        # that might be part of the same logical line
        if len(machine_code_tokens) < 3:
            y_tolerance = 10
            machine_code_tokens = []
            for token in tokens:
                if abs(token.bbox[1] - ref_y) < y_tolerance:
                    text = token.text
                    if (re.search(r'\d', text) or '#' in text or '>' in text):
                        machine_code_tokens.append(token)

        return machine_code_tokens

    def _parse_standard_payment_line_with_tokens(
        self,
        raw_line: str,
        tokens: list[TextToken]
    ) -> tuple[Optional[dict], list[TextToken]]:
        """
        Parse standard Swedish payment line format and find matching tokens.

        Uses pure regex to identify the machine code line, then finds tokens
        that are part of that line based on their position.

        Format: # <OCR> # <Kronor> <Öre> <Type> > <Bankgiro/Plusgiro>#<Control>#
        Example: # 31130954410 # 315 00 2 > 8983025#14#

        Returns:
            Tuple of (parsed_dict, matched_tokens) or (None, [])
        """
        # First find the machine code line tokens using pattern matching
        machine_code_tokens = self._find_machine_code_line_tokens(tokens)

        if not machine_code_tokens:
            # Fall back to regex on raw_line
            parsed = self._parse_standard_payment_line(raw_line, raw_line)
            return parsed, []

        # Build a line from just the machine code tokens (sorted by x position)
        # Group tokens by approximate x position to handle duplicate OCR results
        mc_tokens_sorted = sorted(machine_code_tokens, key=lambda t: t.bbox[0])

        # Deduplicate tokens at similar x positions (keep the first one)
        deduped_tokens = []
        last_x = -100
        for t in mc_tokens_sorted:
            # Skip tokens that are too close to the previous one (likely duplicates)
            if t.bbox[0] - last_x < 5:
                continue
            deduped_tokens.append(t)
            last_x = t.bbox[2]  # Use end x for next comparison

        mc_line = ' '.join(t.text for t in deduped_tokens)

        # Try to parse this line, using raw_line for context detection
        parsed = self._parse_standard_payment_line(mc_line, raw_line)
        if parsed:
            return parsed, deduped_tokens

        # If machine code line parsing failed, try the full raw_line
        parsed = self._parse_standard_payment_line(raw_line, raw_line)
        if parsed:
            return parsed, machine_code_tokens

        return None, []

    def _parse_standard_payment_line(
        self,
        raw_line: str,
        context_line: str | None = None
    ) -> Optional[dict]:
        """
        Parse standard Swedish payment line format.

        Format: # <OCR> # <Kronor> <Öre> <Type> > <Bankgiro/Plusgiro>#<Control>#
        Example: # 31130954410 # 315 00 2 > 8983025#14#

        Args:
            raw_line: The line to parse (may be just the machine code tokens)
            context_line: Optional full line for context detection (e.g., to find "plusgiro" keywords)

        Returns:
            Dict with 'ocr', 'amount', and 'bankgiro' or 'plusgiro' if matched, None otherwise
        """
        # Use context_line for detecting Plusgiro/Bankgiro, fall back to raw_line
        context = (context_line or raw_line).lower()
        is_plusgiro_context = (
            ('plusgiro' in context or 'postgiro' in context or 'plusgirokonto' in context)
            and 'bankgiro' not in context
        )

        # Preprocess: remove spaces in the account number part (after >)
        # This handles cases like "78 2 1 713" -> "7821713"
        def normalize_account_spaces(line: str) -> str:
            """Remove spaces in account number portion after > marker."""
            if '>' in line:
                parts = line.split('>', 1)
                # After >, remove spaces between digits (but keep # markers)
                after_arrow = parts[1]
                # Extract digits and # markers, remove spaces between digits
                normalized = re.sub(r'(\d)\s+(\d)', r'\1\2', after_arrow)
                # May need multiple passes for sequences like "78 2 1 713"
                while re.search(r'(\d)\s+(\d)', normalized):
                    normalized = re.sub(r'(\d)\s+(\d)', r'\1\2', normalized)
                return parts[0] + '>' + normalized
            return line

        raw_line = normalize_account_spaces(raw_line)

        def format_account(account_digits: str) -> tuple[str, str]:
            """Format account and determine type (bankgiro or plusgiro).

            Returns: (formatted_account, account_type)
            """
            if is_plusgiro_context:
                # Plusgiro format: XXXXXXX-X
                formatted = f"{account_digits[:-1]}-{account_digits[-1]}"
                return formatted, 'plusgiro'
            else:
                # Bankgiro format: XXX-XXXX or XXXX-XXXX
                if len(account_digits) == 7:
                    formatted = f"{account_digits[:3]}-{account_digits[3:]}"
                elif len(account_digits) == 8:
                    formatted = f"{account_digits[:4]}-{account_digits[4:]}"
                else:
                    formatted = account_digits
                return formatted, 'bankgiro'

        # Try primary pattern
        match = self.PAYMENT_LINE_PATTERN.search(raw_line)
        if match:
            ocr = match.group(1)
            kronor = match.group(2)
            ore = match.group(3)
            account_digits = match.group(5)

            # Format amount: combine kronor and öre
            amount = f"{kronor},{ore}" if ore != "00" else kronor

            formatted_account, account_type = format_account(account_digits)

            return {
                'ocr': ocr,
                'amount': amount,
                account_type: formatted_account,
            }

        # Try alternative pattern
        match = self.PAYMENT_LINE_PATTERN_ALT.search(raw_line)
        if match:
            ocr = match.group(1)
            kronor = match.group(2)
            ore = match.group(3)
            account_digits = match.group(4)

            amount = f"{kronor},{ore}" if ore != "00" else kronor

            formatted_account, account_type = format_account(account_digits)

            return {
                'ocr': ocr,
                'amount': amount,
                account_type: formatted_account,
            }

        # Try reverse pattern (Account first, then OCR)
        match = self.PAYMENT_LINE_PATTERN_REVERSE.search(raw_line)
        if match:
            account_digits = match.group(1)
            kronor = match.group(2)
            ore = match.group(3)
            ocr = match.group(4)

            amount = f"{kronor},{ore}" if ore != "00" else kronor

            formatted_account, account_type = format_account(account_digits)

            return {
                'ocr': ocr,
                'amount': amount,
                account_type: formatted_account,
            }

        return None

    def _extract_ocr(self, tokens: list[TextToken]) -> Optional[str]:
        """Extract OCR reference number."""
        candidates = []

        # First, collect all bankgiro-like patterns to exclude
        bankgiro_digits = set()
        for token in tokens:
            text = token.text.strip()
            bg_matches = self.BANKGIRO_PATTERN.findall(text)
            for bg in bg_matches:
                digits = re.sub(r'\D', '', bg)
                bankgiro_digits.add(digits)
                # Also add with potential check digits (common pattern)
                for i in range(10):
                    bankgiro_digits.add(digits + str(i))
                    bankgiro_digits.add(digits + str(i) + str(i))

        for token in tokens:
            # Remove spaces and common suffixes
            text = token.text.replace(' ', '').replace('#', '').strip()

            # Find all digit sequences
            matches = self.OCR_PATTERN.findall(text)
            for match in matches:
                # OCR numbers are typically 10-25 digits
                if 10 <= len(match) <= 25:
                    # Skip if this looks like a bankgiro number with check digit
                    is_bankgiro_variant = any(
                        match.startswith(bg) or match.endswith(bg)
                        for bg in bankgiro_digits if len(bg) >= 7
                    )

                    # Also check if it's exactly bankgiro with 2-3 extra digits
                    for bg in bankgiro_digits:
                        if len(bg) >= 7 and (
                            match == bg or
                            (len(match) - len(bg) <= 3 and match.startswith(bg))
                        ):
                            is_bankgiro_variant = True
                            break

                    if not is_bankgiro_variant:
                        candidates.append((match, len(match), token))

        if not candidates:
            return None

        # Prefer longer sequences (more likely to be OCR)
        candidates.sort(key=lambda x: x[1], reverse=True)
        return candidates[0][0]

    def _extract_bankgiro(self, tokens: list[TextToken]) -> Optional[str]:
        """Extract Bankgiro account number.

        Bankgiro format: XXX-XXXX or XXXX-XXXX (dash in middle)
        NOT Plusgiro: XXXXXXX-X (dash before last digit)
        """
        candidates = []
        context_text = ' '.join(t.text.lower() for t in tokens)

        # Check if this is clearly a Plusgiro context (not Bankgiro)
        is_plusgiro_only_context = (
            ('plusgiro' in context_text or 'postgiro' in context_text or 'plusgirokonto' in context_text)
            and 'bankgiro' not in context_text
        )

        # If clearly Plusgiro context, don't extract as Bankgiro
        if is_plusgiro_only_context:
            return None

        for token in tokens:
            text = token.text.strip()

            # Look for Bankgiro pattern
            matches = self.BANKGIRO_PATTERN.findall(text)
            for match in matches:
                # Check if this looks like Plusgiro format (dash before last digit)
                # Plusgiro: 1234567-8 (dash at position -2)
                if '-' in match:
                    parts = match.replace(' ', '').split('-')
                    if len(parts) == 2 and len(parts[1]) == 1:
                        # This is Plusgiro format, skip
                        continue

                # Normalize: remove spaces, ensure dash
                digits = re.sub(r'\D', '', match)
                if len(digits) == 7:
                    normalized = f"{digits[:3]}-{digits[3:]}"
                elif len(digits) == 8:
                    normalized = f"{digits[:4]}-{digits[4:]}"
                else:
                    continue

                # Check if "bankgiro" or "bg" appears nearby
                is_bankgiro_context = (
                    'bankgiro' in context_text or
                    'bg:' in context_text or
                    'bg ' in context_text
                )

                candidates.append((normalized, is_bankgiro_context, token))

        if not candidates:
            return None

        # Prefer matches with bankgiro context
        candidates.sort(key=lambda x: (x[1], 1), reverse=True)
        return candidates[0][0]

    def _extract_plusgiro(self, tokens: list[TextToken]) -> Optional[str]:
        """Extract Plusgiro account number."""
        candidates = []

        for token in tokens:
            text = token.text.strip()

            matches = self.PLUSGIRO_PATTERN.findall(text)
            for match in matches:
                # Normalize: remove spaces, ensure dash before last digit
                digits = re.sub(r'\D', '', match)
                if 7 <= len(digits) <= 8:
                    normalized = f"{digits[:-1]}-{digits[-1]}"

                    # Check context
                    context_text = ' '.join(t.text.lower() for t in tokens)
                    is_plusgiro_context = (
                        'plusgiro' in context_text or
                        'postgiro' in context_text or
                        'pg:' in context_text or
                        'pg ' in context_text
                    )

                    candidates.append((normalized, is_plusgiro_context, token))

        if not candidates:
            return None

        candidates.sort(key=lambda x: (x[1], 1), reverse=True)
        return candidates[0][0]

    def _extract_amount(self, tokens: list[TextToken]) -> Optional[str]:
        """Extract payment amount."""
        candidates = []

        for token in tokens:
            text = token.text.strip()

            # Try decimal amount pattern first
            matches = self.AMOUNT_PATTERN.findall(text)
            for match in matches:
                # Normalize: remove thousand separators, use comma as decimal
                normalized = match.replace(' ', '').replace('\xa0', '')
                # Convert dot thousand separator to none, keep comma decimal
                if '.' in normalized and ',' in normalized:
                    # Format like 1.234,56 -> 1234,56
                    normalized = normalized.replace('.', '')
                elif '.' in normalized:
                    # Could be 1234.56 -> 1234,56
                    parts = normalized.split('.')
                    if len(parts) == 2 and len(parts[1]) == 2:
                        normalized = f"{parts[0]},{parts[1]}"

                # Parse to verify it's a valid amount
                try:
                    value = float(normalized.replace(',', '.'))
                    if 0 < value < 1000000:  # Reasonable amount range
                        candidates.append((normalized, value, token))
                except ValueError:
                    continue

        # If no decimal amounts found, try integer amounts
        # Look for "Kronor" label nearby and extract integer
        if not candidates:
            for i, token in enumerate(tokens):
                text = token.text.strip().lower()
                if 'kronor' in text or 'kr' == text or text.endswith(' kr'):
                    # Look at nearby tokens for amounts (wider range)
                    for j in range(max(0, i - 5), min(len(tokens), i + 5)):
                        nearby_text = tokens[j].text.strip()
                        # Match pure integer (1-6 digits)
                        int_match = re.match(r'^(\d{1,6})$', nearby_text)
                        if int_match:
                            value = int(int_match.group(1))
                            if 0 < value < 1000000:
                                candidates.append((str(value), float(value), tokens[j]))

        # Also try to find amounts near "öre" label (Swedish cents)
        if not candidates:
            for i, token in enumerate(tokens):
                text = token.text.strip().lower()
                if 'öre' in text:
                    # Look at nearby tokens for amounts
                    for j in range(max(0, i - 5), min(len(tokens), i + 5)):
                        nearby_text = tokens[j].text.strip()
                        int_match = re.match(r'^(\d{1,6})$', nearby_text)
                        if int_match:
                            value = int(int_match.group(1))
                            if 0 < value < 1000000:
                                candidates.append((str(value), float(value), tokens[j]))

        if not candidates:
            return None

        # Sort by value (prefer larger amounts - likely total)
        candidates.sort(key=lambda x: x[1], reverse=True)
        return candidates[0][0]

    def _calculate_confidence(
        self,
        result: MachineCodeResult,
        has_payment_keywords: bool
    ) -> float:
        """Calculate confidence score for the extraction."""
        confidence = 0.0

        # Base confidence from payment keywords
        if has_payment_keywords:
            confidence += 0.3

        # Points for each extracted field
        if result.ocr:
            confidence += 0.25
            # Bonus for typical OCR length (15-17 digits)
            if 15 <= len(result.ocr) <= 17:
                confidence += 0.1

        if result.bankgiro or result.plusgiro:
            confidence += 0.2

        if result.amount:
            confidence += 0.15

        return min(confidence, 1.0)

    def cross_validate(
        self,
        machine_result: MachineCodeResult,
        csv_values: dict[str, str],
    ) -> dict[str, dict]:
        """
        Compare the machine code extraction against CSV ground truth.

        Args:
            machine_result: Result from parse()
            csv_values: Dict of field values from CSV
                        (keys: 'ocr', 'amount', 'bankgiro', 'plusgiro')

        Returns:
            Dict with one entry per field:
            {
                'ocr': {
                    'machine': '123456789',
                    'csv': '123456789',
                    'match': True,
                    'use_machine': False,  # CSV has value
                },
                ...
            }
            An amount entry additionally carries 'rounding_diff': True when
            the values only agree within the rounding tolerance.
        """
        from src.normalize import normalize_field

        def as_number(text: str) -> float:
            """Parse an amount string, accepting comma decimals and spaces."""
            return float(
                text.replace(' ', '').replace(',', '.').replace('\xa0', '')
            )

        # field key -> (normalizer name, machine-extracted value)
        machine_fields = {
            'ocr': ('OCR', machine_result.ocr),
            'amount': ('Amount', machine_result.amount),
            'bankgiro': ('Bankgiro', machine_result.bankgiro),
            'plusgiro': ('Plusgiro', machine_result.plusgiro),
        }

        report = {}
        for field_key, (normalizer_name, machine_value) in machine_fields.items():
            csv_value = csv_values.get(field_key, '').strip()

            entry = {
                'machine': machine_value,
                'csv': csv_value or None,
                'match': False,
                'use_machine': False,
            }

            if machine_value and csv_value:
                # Both sides present: they match if any normalized variant
                # is shared between them.
                machine_variants = set(normalize_field(normalizer_name, machine_value))
                csv_variants = set(normalize_field(normalizer_name, csv_value))
                entry['match'] = bool(machine_variants & csv_variants)

                # Amounts get a second chance: a difference of at most one
                # unit is treated as rounding.
                if field_key == 'amount' and not entry['match']:
                    try:
                        gap = abs(as_number(machine_value) - as_number(csv_value))
                    except ValueError:
                        gap = None
                    if gap is not None and gap <= 1.0:
                        entry['match'] = True
                        entry['rounding_diff'] = True
            elif machine_value:
                # CSV is missing, use machine value
                entry['use_machine'] = True

            report[field_key] = entry

        return report


def parse_machine_code(
    tokens: list[TextToken],
    page_height: float,
    page_width: float | None = None,
    bottom_ratio: float = 0.35,
) -> MachineCodeResult:
    """
    Parse machine code from tokens with a throwaway parser instance.

    Args:
        tokens: List of text tokens
        page_height: Page height in points
        page_width: Page width in points (optional)
        bottom_ratio: Fraction of page to consider as bottom region

    Returns:
        MachineCodeResult with extracted fields
    """
    return MachineCodeParser(bottom_region_ratio=bottom_ratio).parse(
        tokens, page_height, page_width
    )