re-structure

packages/backend/backend/pipeline/field_extractor.py (new file, +630 lines)
@@ -0,0 +1,630 @@
"""
|
||||
Field Extractor Module
|
||||
|
||||
Extracts and validates field values from detected regions.
|
||||
|
||||
This module is used during inference to extract values from OCR text.
|
||||
It uses shared utilities from shared.utils for text cleaning and validation.
|
||||
|
||||
Enhanced features:
|
||||
- Multi-source fusion with confidence weighting
|
||||
- Smart amount parsing with multiple strategies
|
||||
- Enhanced date format unification
|
||||
- OCR error correction integration
|
||||
|
||||
Refactored to use modular normalizers for each field type.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from collections import defaultdict
|
||||
import re
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
from shared.fields import CLASS_TO_FIELD
|
||||
from .yolo_detector import Detection
|
||||
|
||||
# Import shared utilities for text cleaning and validation
|
||||
from shared.utils.validators import FieldValidators
|
||||
from shared.utils.ocr_corrections import OCRCorrections
|
||||
|
||||
# Import new unified parsers
|
||||
from .payment_line_parser import PaymentLineParser
|
||||
from .customer_number_parser import CustomerNumberParser
|
||||
|
||||
# Import normalizers
|
||||
from .normalizers import (
|
||||
BaseNormalizer,
|
||||
NormalizationResult,
|
||||
create_normalizer_registry,
|
||||
EnhancedAmountNormalizer,
|
||||
EnhancedDateNormalizer,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class ExtractedField:
    """Represents an extracted field value."""
    field_name: str
    raw_text: str
    normalized_value: str | None
    confidence: float
    detection_confidence: float
    ocr_confidence: float
    bbox: tuple[float, float, float, float]
    page_no: int
    is_valid: bool = True
    validation_error: str | None = None
    # Multi-source fusion fields
    alternative_values: list[tuple[str, float]] = field(default_factory=list)  # [(value, confidence), ...]
    extraction_method: str = 'single'  # 'single', 'fused', or 'corrected'
    ocr_corrections_applied: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Convert to dictionary."""
        result = {
            'field_name': self.field_name,
            'value': self.normalized_value,
            'raw_text': self.raw_text,
            'confidence': self.confidence,
            'bbox': list(self.bbox),
            'page_no': self.page_no,
            'is_valid': self.is_valid,
            'validation_error': self.validation_error,
        }
        if self.alternative_values:
            result['alternatives'] = self.alternative_values
        if self.extraction_method != 'single':
            result['extraction_method'] = self.extraction_method
        return result
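
# A minimal sketch of what ExtractedField.to_dict() produces for a fused
# Amount field (all values below are illustrative, not from a real invoice):
#
#     {
#         'field_name': 'Amount',
#         'value': '15658.00',
#         'raw_text': '15 658,00',
#         'confidence': 0.91,
#         'bbox': [1000.0, 2000.0, 1500.0, 2100.0],
#         'page_no': 0,
#         'is_valid': True,
#         'validation_error': None,
#         'alternatives': [('1565.80', 0.42)],
#         'extraction_method': 'fused',
#     }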


class FieldExtractor:
    """Extracts field values from detected regions using OCR or PDF text."""

    def __init__(
        self,
        ocr_lang: str = 'en',
        use_gpu: bool = False,
        bbox_padding: float = 0.1,
        dpi: int = 300,
        use_enhanced_parsing: bool = False
    ):
        """
        Initialize field extractor.

        Args:
            ocr_lang: Language for OCR
            use_gpu: Whether to use GPU for OCR
            bbox_padding: Padding to add around bboxes (as fraction)
            dpi: DPI used for rendering (for coordinate conversion)
            use_enhanced_parsing: Whether to use enhanced normalizers
        """
        self.ocr_lang = ocr_lang
        self.use_gpu = use_gpu
        self.bbox_padding = bbox_padding
        self.dpi = dpi
        self._ocr_engine = None  # Lazy init
        self.use_enhanced_parsing = use_enhanced_parsing

        # Initialize new unified parsers
        self.payment_line_parser = PaymentLineParser()
        self.customer_number_parser = CustomerNumberParser()

        # Initialize normalizer registry
        self._normalizers = create_normalizer_registry(use_enhanced=use_enhanced_parsing)

    @property
    def ocr_engine(self):
        """Lazy-load OCR engine only when needed."""
        if self._ocr_engine is None:
            from shared.ocr import OCREngine
            self._ocr_engine = OCREngine(lang=self.ocr_lang)
        return self._ocr_engine
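
    # Usage sketch (illustrative; `detections` and `page_image` come from the
    # detection and rendering steps elsewhere in this pipeline):
    #
    #     extractor = FieldExtractor(dpi=300, use_enhanced_parsing=True)
    #     fields = extractor.extract_all_fields(detections, page_image)
    #
    # The OCR engine is built on first use, so runs that only read embedded
    # PDF text never pay its startup cost.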

    def extract_from_detection_with_pdf(
        self,
        detection: Detection,
        pdf_tokens: list,
        image_width: int,
        image_height: int
    ) -> ExtractedField:
        """
        Extract field value using PDF text tokens (faster and more accurate for text PDFs).

        Args:
            detection: Detection object with bbox in pixel coordinates
            pdf_tokens: List of Token objects from PDF text extraction
            image_width: Width of rendered image in pixels
            image_height: Height of rendered image in pixels

        Returns:
            ExtractedField object
        """
        # Convert detection bbox from pixels to PDF points
        scale = 72 / self.dpi  # points per pixel
        x0_pdf = detection.bbox[0] * scale
        y0_pdf = detection.bbox[1] * scale
        x1_pdf = detection.bbox[2] * scale
        y1_pdf = detection.bbox[3] * scale
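
        # Worked example (illustrative numbers): at the default 300 DPI,
        # scale = 72 / 300 = 0.24 points per pixel, so a pixel bbox of
        # (1000, 2000, 1500, 2100) maps to (240.0, 480.0, 360.0, 504.0)
        # in PDF points.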

        # Add padding in points
        pad = 3  # Small padding in points

        # Find tokens that overlap with detection bbox
        matching_tokens = []
        for token in pdf_tokens:
            if token.page_no != detection.page_no:
                continue
            tx0, ty0, tx1, ty1 = token.bbox
            # Check overlap
            if (tx0 < x1_pdf + pad and tx1 > x0_pdf - pad and
                    ty0 < y1_pdf + pad and ty1 > y0_pdf - pad):
                # Calculate overlap ratio to prioritize better matches
                overlap_x = min(tx1, x1_pdf) - max(tx0, x0_pdf)
                overlap_y = min(ty1, y1_pdf) - max(ty0, y0_pdf)
                if overlap_x > 0 and overlap_y > 0:
                    token_area = (tx1 - tx0) * (ty1 - ty0)
                    overlap_area = overlap_x * overlap_y
                    overlap_ratio = overlap_area / token_area if token_area > 0 else 0
                    matching_tokens.append((token, overlap_ratio))

        # Sort by overlap ratio and combine text
        matching_tokens.sort(key=lambda x: -x[1])
        raw_text = ' '.join(t[0].text for t in matching_tokens)
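
        # Example (illustrative numbers): a token spanning x 230-250 pt and
        # y 475-485 pt against a detection box of (240, 480, 360, 504) pt
        # overlaps in a 10 x 5 pt region; with a token area of 20 x 10 = 200
        # the ratio is 50 / 200 = 0.25, so fully contained tokens (ratio 1.0)
        # sort ahead of partial ones like this.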

        # Get field name
        field_name = CLASS_TO_FIELD.get(detection.class_name, detection.class_name)

        # Normalize and validate
        normalized_value, is_valid, validation_error = self._normalize_and_validate(
            field_name, raw_text
        )

        return ExtractedField(
            field_name=field_name,
            raw_text=raw_text,
            normalized_value=normalized_value,
            confidence=detection.confidence if normalized_value else detection.confidence * 0.5,
            detection_confidence=detection.confidence,
            ocr_confidence=1.0,  # Embedded PDF text needs no OCR, so treat it as fully reliable
            bbox=detection.bbox,
            page_no=detection.page_no,
            is_valid=is_valid,
            validation_error=validation_error
        )

    def extract_from_detection(
        self,
        detection: Detection,
        image: np.ndarray | Image.Image
    ) -> ExtractedField:
        """
        Extract field value from a detection region using OCR.

        Args:
            detection: Detection object
            image: Full page image

        Returns:
            ExtractedField object
        """
        if isinstance(image, Image.Image):
            image = np.array(image)

        # Get padded bbox
        h, w = image.shape[:2]
        bbox = detection.get_padded_bbox(self.bbox_padding, w, h)

        # Crop region
        x0, y0, x1, y1 = [int(v) for v in bbox]
        region = image[y0:y1, x0:x1]

        # Run OCR on region
        ocr_tokens = self.ocr_engine.extract_from_image(region)

        # Combine all OCR text
        raw_text = ' '.join(t.text for t in ocr_tokens)
        ocr_confidence = sum(t.confidence for t in ocr_tokens) / len(ocr_tokens) if ocr_tokens else 0.0

        # Get field name
        field_name = CLASS_TO_FIELD.get(detection.class_name, detection.class_name)

        # Normalize and validate
        normalized_value, is_valid, validation_error = self._normalize_and_validate(
            field_name, raw_text
        )

        # Combined confidence
        confidence = (detection.confidence + ocr_confidence) / 2 if ocr_tokens else detection.confidence * 0.5

        return ExtractedField(
            field_name=field_name,
            raw_text=raw_text,
            normalized_value=normalized_value,
            confidence=confidence,
            detection_confidence=detection.confidence,
            ocr_confidence=ocr_confidence,
            bbox=detection.bbox,
            page_no=detection.page_no,
            is_valid=is_valid,
            validation_error=validation_error
        )
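
    # The combined confidence above is the plain mean of detector and OCR
    # scores: a 0.90 detection with 0.80 average OCR confidence yields
    # (0.90 + 0.80) / 2 = 0.85, while a region producing no OCR tokens falls
    # back to 0.90 * 0.5 = 0.45.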

    def _normalize_and_validate(
        self,
        field_name: str,
        raw_text: str
    ) -> tuple[str | None, bool, str | None]:
        """
        Normalize and validate extracted text for a field.

        Uses modular normalizers for each field type.
        Delegates payment_line and customer_number to the unified parsers.

        Returns:
            (normalized_value, is_valid, validation_error)
        """
        text = raw_text.strip()

        if not text:
            return None, False, "Empty text"

        # Special handling for payment_line and customer_number (use unified parsers)
        if field_name == 'payment_line':
            return self._normalize_payment_line(text)

        if field_name == 'customer_number':
            return self._normalize_customer_number(text)

        # Use normalizer registry for other fields
        normalizer = self._normalizers.get(field_name)
        if normalizer:
            result = normalizer.normalize(text)
            return result.to_tuple()

        # Fallback for unknown fields
        return text, True, None
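
    # Dispatch sketch (illustrative): for a registered field such as 'Amount'
    # the registry returns a normalizer whose NormalizationResult.to_tuple()
    # yields e.g. ('15658.00', True, None); a field with no registered
    # normalizer passes through unchanged as (text, True, None).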

    def _normalize_payment_line(self, text: str) -> tuple[str | None, bool, str | None]:
        """
        Normalize payment line region text using the unified PaymentLineParser.

        Extracts the machine-readable payment line format from OCR text.
        Standard Swedish payment line format: # <OCR> # <Kronor> <Öre> <Type> > <Account>#<Check>#

        Examples:
            - "# 94228110015950070 # 15658 00 8 > 48666036#14#" -> includes amount 15658.00
            - "# 11000770600242 # 1200 00 5 > 3082963#41#" -> includes amount 1200.00

        Returns a normalized format preserving ALL components, including Amount.
        This allows downstream cross-validation to extract fields properly.
        """
        # Use unified payment line parser
        return self.payment_line_parser.format_for_field_extractor(
            self.payment_line_parser.parse(text)
        )
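
    # Reading the first docstring example against the format template:
    #   OCR reference : 94228110015950070
    #   amount        : 15658 kronor, 00 öre -> 15658.00
    #   type          : 8
    #   account       : 48666036 (giro), check digits 14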

    def _normalize_customer_number(self, text: str) -> tuple[str | None, bool, str | None]:
        """
        Normalize customer number text using the unified CustomerNumberParser.

        Supports various Swedish customer number formats:
        - With separators: 'JTY 576-3', 'EMM 256-6', 'FFL 019N', 'UMJ 436-R'
        - Compact (no separators): 'JTY5763', 'EMM2566', 'FFL019N'
        - Mixed with names: 'VIKSTRÖM, ELIAS CH FFL 01' -> extract 'FFL 01'
        - Address format: 'Umj 436-R Billo' -> extract 'UMJ 436-R'
        """
        return self.customer_number_parser.parse(text)

    def extract_all_fields(
        self,
        detections: list[Detection],
        image: np.ndarray | Image.Image
    ) -> list[ExtractedField]:
        """
        Extract fields from all detections.

        Args:
            detections: List of detections
            image: Full page image

        Returns:
            List of ExtractedField objects
        """
        fields = []

        for detection in detections:
            extracted = self.extract_from_detection(detection, image)
            fields.append(extracted)

        return fields

    @staticmethod
    def infer_ocr_from_invoice_number(fields: dict[str, str]) -> dict[str, str]:
        """
        Infer the OCR field from InvoiceNumber if it was not detected.

        On Swedish invoices, the OCR reference number is often identical to the
        invoice number, so when only InvoiceNumber is present we can infer OCR.

        Args:
            fields: Dict of field_name -> normalized_value

        Returns:
            Updated fields dict with inferred OCR if applicable
        """
        # If OCR already exists, no need to infer
        if fields.get('OCR'):
            return fields

        # If InvoiceNumber exists and is purely numeric, use it as OCR
        invoice_number = fields.get('InvoiceNumber')
        if invoice_number:
            # Accept only all-digit values of at least 5 digits (a plausible OCR reference)
            digits_only = re.sub(r'\D', '', invoice_number)
            if len(digits_only) >= 5 and len(digits_only) == len(invoice_number):
                fields['OCR'] = invoice_number

        return fields
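
    # Example (illustrative values): {'InvoiceNumber': '1234567'} gains
    # OCR='1234567' (all digits, length >= 5), while 'INV-1234567' is left
    # alone because the hyphen makes len(digits_only) != len(invoice_number).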

    # =========================================================================
    # Multi-Source Fusion with Confidence Weighting
    # =========================================================================

    def fuse_multiple_detections(
        self,
        extracted_fields: list[ExtractedField]
    ) -> list[ExtractedField]:
        """
        Fuse multiple detections of the same field using confidence-weighted voting.

        When YOLO detects the same field type multiple times (e.g., multiple Amount
        boxes), this method selects the best value or combines them intelligently.

        Strategies:
        1. For numeric fields (Amount, OCR): prefer values that pass validation
        2. For date fields: prefer values in the expected range
        3. For giro numbers: prefer values with a valid Luhn checksum
        4. General: weighted vote by confidence scores

        Args:
            extracted_fields: List of all extracted fields (may have duplicates)

        Returns:
            List with duplicates resolved to a single best value per field
        """
        # Group fields by name
        fields_by_name: dict[str, list[ExtractedField]] = defaultdict(list)
        for extracted in extracted_fields:
            fields_by_name[extracted.field_name].append(extracted)

        fused_fields = []

        for field_name, candidates in fields_by_name.items():
            if len(candidates) == 1:
                # No fusion needed
                fused_fields.append(candidates[0])
            else:
                # Multiple candidates - fuse them
                fused = self._fuse_field_candidates(field_name, candidates)
                fused_fields.append(fused)

        return fused_fields
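
    # Behavioral sketch (illustrative values): two Amount detections that
    # normalize to '15658.00' (confidence 0.91) and '1565.80' (confidence
    # 0.42) collapse into one field valued '15658.00', with ('1565.80', 0.42)
    # recorded in alternative_values and extraction_method set to 'fused'.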

    def _fuse_field_candidates(
        self,
        field_name: str,
        candidates: list[ExtractedField]
    ) -> ExtractedField:
        """
        Fuse multiple candidates for a single field.

        Returns the best candidate with alternatives recorded.
        """
        # Sort by confidence (descending)
        sorted_candidates = sorted(candidates, key=lambda x: x.confidence, reverse=True)

        # Collect all unique values with their max confidence
        value_scores: dict[str, tuple[float, ExtractedField]] = {}
        for c in sorted_candidates:
            if c.normalized_value:
                if c.normalized_value not in value_scores:
                    value_scores[c.normalized_value] = (c.confidence, c)
                else:
                    # Keep the higher-confidence one
                    if c.confidence > value_scores[c.normalized_value][0]:
                        value_scores[c.normalized_value] = (c.confidence, c)

        if not value_scores:
            # No valid values, return the highest-confidence candidate
            return sorted_candidates[0]

        # Field-specific fusion strategy
        best_value, best_field = self._select_best_value(field_name, value_scores)

        # Record alternatives
        alternatives = [
            (v, score) for v, (score, _) in value_scores.items()
            if v != best_value
        ]

        # Create fused result
        result = ExtractedField(
            field_name=field_name,
            raw_text=best_field.raw_text,
            normalized_value=best_value,
            confidence=value_scores[best_value][0],
            detection_confidence=best_field.detection_confidence,
            ocr_confidence=best_field.ocr_confidence,
            bbox=best_field.bbox,
            page_no=best_field.page_no,
            is_valid=best_field.is_valid,
            validation_error=best_field.validation_error,
            alternative_values=alternatives,
            extraction_method='fused' if len(value_scores) > 1 else 'single'
        )

        return result

    def _select_best_value(
        self,
        field_name: str,
        value_scores: dict[str, tuple[float, ExtractedField]]
    ) -> tuple[str, ExtractedField]:
        """
        Select the best value for a field using field-specific logic.

        Returns (best_value, best_field)
        """
        items = list(value_scores.items())

        # Field-specific selection
        if field_name in ('Bankgiro', 'Plusgiro', 'OCR'):
            # Prefer values with a valid Luhn checksum
            for value, (score, extracted) in items:
                digits = re.sub(r'\D', '', value)
                if FieldValidators.luhn_checksum(digits):
                    return value, extracted

        elif field_name == 'Amount':
            # Prefer larger amounts (usually the total, not a subtotal)
            amounts = []
            for value, (score, extracted) in items:
                try:
                    amt = float(value.replace(',', '.'))
                    amounts.append((amt, value, extracted))
                except ValueError:
                    continue
            if amounts:
                # Return the largest amount (sort on the numeric key only, since
                # ExtractedField objects are not orderable)
                amounts.sort(key=lambda x: x[0], reverse=True)
                return amounts[0][1], amounts[0][2]

        elif field_name in ('InvoiceDate', 'InvoiceDueDate'):
            # Prefer dates in a reasonable range
            from datetime import datetime
            for value, (score, extracted) in items:
                try:
                    dt = datetime.strptime(value, '%Y-%m-%d')
                    # Prefer recent dates (within the last 2 years or the next year)
                    now = datetime.now()
                    if now.year - 2 <= dt.year <= now.year + 1:
                        return value, extracted
                except ValueError:
                    continue

        # Default: return the highest-confidence value
        best = max(items, key=lambda x: x[1][0])
        return best[0], best[1][1]
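
    # Luhn example, using the Bankgiro from the payment-line docstring above:
    # for '48666036', doubling every second digit from the right (and summing
    # the digits of any two-digit result) gives 8+8+3+6+3+0+6+6 = 40, which is
    # divisible by 10, so this candidate passes and is preferred over any
    # alternative that fails the checksum.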

    # =========================================================================
    # Apply OCR Corrections to Raw Text
    # =========================================================================

    def apply_ocr_corrections(
        self,
        field_name: str,
        raw_text: str
    ) -> tuple[str, list[str]]:
        """
        Apply OCR corrections to raw text based on field type.

        Returns (corrected_text, list_of_corrections_applied)
        """
        corrections_applied = []

        if field_name in ('OCR', 'Bankgiro', 'Plusgiro', 'supplier_org_number'):
            # Aggressive correction for purely numeric fields
            result = OCRCorrections.correct_digits(raw_text, aggressive=True)
            if result.corrections_applied:
                corrections_applied = [f"{c[1]}->{c[2]}" for c in result.corrections_applied]
            return result.corrected, corrections_applied

        elif field_name in ('Amount', 'InvoiceDate', 'InvoiceDueDate'):
            # Conservative correction for amounts and dates
            # (preserves decimal and date separators)
            result = OCRCorrections.correct_digits(raw_text, aggressive=False)
            if result.corrections_applied:
                corrections_applied = [f"{c[1]}->{c[2]}" for c in result.corrections_applied]
            return result.corrected, corrections_applied

        # No correction for other fields
        return raw_text, []
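
    # Intent sketch (the exact substitution table lives in OCRCorrections and
    # is assumed here): aggressive digit correction maps common look-alikes
    # such as 'O'->'0' and 'l'->'1', so a misread Bankgiro '48666O36' would
    # become '48666036' and be reported as an 'O->0' correction.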

    # =========================================================================
    # Extraction with All Enhancements
    # =========================================================================

    def extract_with_enhancements(
        self,
        detection: Detection,
        pdf_tokens: list,
        image_width: int,
        image_height: int,
        use_enhanced_parsing: bool = True
    ) -> ExtractedField:
        """
        Extract a field value with all enhancements enabled.

        Combines:
        1. OCR error correction
        2. Enhanced amount/date parsing
        3. Multi-strategy extraction

        Args:
            detection: Detection object
            pdf_tokens: PDF text tokens
            image_width: Image width in pixels
            image_height: Image height in pixels
            use_enhanced_parsing: Whether to use enhanced parsing methods

        Returns:
            ExtractedField with enhancements applied
        """
        # First, extract using the standard method
        base_result = self.extract_from_detection_with_pdf(
            detection, pdf_tokens, image_width, image_height
        )

        if not use_enhanced_parsing:
            return base_result

        # Apply OCR corrections
        corrected_text, corrections = self.apply_ocr_corrections(
            base_result.field_name, base_result.raw_text
        )

        # Re-normalize with enhanced methods if corrections were applied
        # or the standard pass produced no value
        if corrections or base_result.normalized_value is None:
            # Use enhanced normalizers for Amount and date fields
            if base_result.field_name == 'Amount':
                enhanced_normalizer = EnhancedAmountNormalizer()
                result = enhanced_normalizer.normalize(corrected_text)
                normalized, is_valid, error = result.to_tuple()
            elif base_result.field_name in ('InvoiceDate', 'InvoiceDueDate'):
                enhanced_normalizer = EnhancedDateNormalizer()
                result = enhanced_normalizer.normalize(corrected_text)
                normalized, is_valid, error = result.to_tuple()
            else:
                # Re-run standard normalization with the corrected text
                normalized, is_valid, error = self._normalize_and_validate(
                    base_result.field_name, corrected_text
                )

            # Update the result if we got a better value
            if normalized and (not base_result.normalized_value or is_valid):
                base_result.normalized_value = normalized
                base_result.is_valid = is_valid
                base_result.validation_error = error
                base_result.ocr_corrections_applied = corrections
                if corrections:
                    base_result.extraction_method = 'corrected'

        return base_result
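
    # End-to-end sketch (illustrative; `extractor`, `detection` and
    # `pdf_tokens` come from earlier pipeline stages; 2480x3508 px is A4
    # rendered at 300 DPI):
    #
    #     extracted = extractor.extract_with_enhancements(
    #         detection, pdf_tokens, image_width=2480, image_height=3508
    #     )
    #     if extracted.extraction_method == 'corrected':
    #         print(extracted.ocr_corrections_applied)  # e.g. ['O->0']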