"""
|
|
OCR Extraction Module using PaddleOCR
|
|
|
|
Extracts text tokens with bounding boxes from scanned PDFs.
|
|
"""

import os
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import Generator

import numpy as np

# Suppress PaddlePaddle reinitialization warnings
os.environ.setdefault('GLOG_minloglevel', '2')
warnings.filterwarnings('ignore', message='.*PDX has already been initialized.*')
warnings.filterwarnings('ignore', message='.*reinitialization.*')


@dataclass
class OCRToken:
    """Represents an OCR-extracted text token with its bounding box.
    """
    text: str
    bbox: tuple[float, float, float, float]  # (x0, y0, x1, y1)
    confidence: float
    page_no: int = 0

    @property
    def x0(self) -> float:
        return self.bbox[0]

    @property
    def y0(self) -> float:
        return self.bbox[1]

    @property
    def x1(self) -> float:
        return self.bbox[2]

    @property
    def y1(self) -> float:
        return self.bbox[3]

    @property
    def center(self) -> tuple[float, float]:
        return ((self.x0 + self.x1) / 2, (self.y0 + self.y1) / 2)


@dataclass
class OCRResult:
    """Result from OCR extraction including tokens and preprocessed image."""
    tokens: list[OCRToken]
    output_img: np.ndarray | None = None  # Preprocessed image from PaddleOCR


class OCREngine:
    """PaddleOCR wrapper for text extraction."""

    def __init__(
        self,
        lang: str = "en",
        det_model_dir: str | None = None,
        rec_model_dir: str | None = None,
        use_doc_orientation_classify: bool = True,
        use_doc_unwarping: bool = False
    ):
        """
        Initialize the OCR engine.

        Args:
            lang: Language code ('en', 'sv', 'ch', etc.)
            det_model_dir: Custom detection model directory
            rec_model_dir: Custom recognition model directory
            use_doc_orientation_classify: Whether to auto-detect and correct
                document orientation. Defaults to True to handle rotated documents.
            use_doc_unwarping: Whether to apply UVDoc document unwarping for
                curved or warped documents. Defaults to False to preserve the
                original image layout, which is especially important for payment
                OCR lines at the bottom of the page. Enable it for severely
                warped documents at the cost of potentially losing bottom content.

        Note:
            PaddleOCR 3.x automatically uses the GPU if one is available via
            PaddlePaddle. Call `paddle.set_device('gpu')` before initialization
            to force GPU execution.
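
        Example (minimal sketch; the language code is illustrative and the
        device call assumes a CUDA-enabled PaddlePaddle build):

            import paddle
            paddle.set_device('gpu')       # force GPU before constructing the engine
            engine = OCREngine(lang='sv')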
        """
        # Suppress warnings during import and initialization
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')
            from paddleocr import PaddleOCR

        # PaddleOCR 3.x init (use_gpu was removed; the device is controlled
        # via paddle.set_device)
        init_params = {
            'lang': lang,
            # Enable orientation classification to handle rotated documents
            'use_doc_orientation_classify': use_doc_orientation_classify,
            # Disable UVDoc unwarping to preserve the original image layout.
            # This prevents the bottom payment OCR line from being cut off.
            # For severely warped documents, enable it but expect potential content loss.
            'use_doc_unwarping': use_doc_unwarping,
        }
        if det_model_dir:
            init_params['text_detection_model_dir'] = det_model_dir
        if rec_model_dir:
            init_params['text_recognition_model_dir'] = rec_model_dir

        self.ocr = PaddleOCR(**init_params)

    def extract_from_image(
        self,
        image: str | Path | np.ndarray,
        page_no: int = 0,
        max_size: int = 2000,
        scale_to_pdf_points: float | None = None,
        scan_bottom_region: bool = True,
        bottom_region_ratio: float = 0.15
    ) -> list[OCRToken]:
        """
        Extract text tokens from an image.

        Args:
            image: Image path or numpy array
            page_no: Page number for reference
            max_size: Maximum image dimension. Larger images are scaled down,
                since PaddleOCR can fail on very large inputs.
            scale_to_pdf_points: If provided, scale bbox coordinates by this
                factor to convert from pixel to PDF point coordinates.
                Use (72 / dpi) for images rendered at a specific DPI.
            scan_bottom_region: If True, also scan the bottom region separately
                to catch OCR payment lines that a full-page scan may miss.
            bottom_region_ratio: Fraction of the page height to scan as the
                bottom region (default 0.15 = 15%)

        Returns:
            List of OCRToken objects with bbox in pixel coordinates (or PDF
            points if scale_to_pdf_points is set)
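
        Example (minimal sketch; the file name and DPI are placeholder values):

            engine = OCREngine(lang='sv')
            # The page image was rendered at 300 DPI, so 72 / 300 maps pixels
            # to PDF points
            tokens = engine.extract_from_image('page.png', scale_to_pdf_points=72 / 300)
            for tok in tokens:
                print(tok.text, tok.bbox, tok.confidence)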
        """
        # extract_with_image would otherwise scan the bottom region itself;
        # disable that here, since this method runs (and merges) the bottom
        # scan below.
        result = self.extract_with_image(
            image, page_no, max_size, scale_to_pdf_points, scan_bottom_region=False
        )
        tokens = result.tokens

        # Optionally scan the bottom region separately for Swedish OCR payment lines
        if scan_bottom_region:
            bottom_tokens = self._scan_bottom_region(
                image, page_no, max_size, scale_to_pdf_points, bottom_region_ratio
            )
            tokens = self._merge_tokens(tokens, bottom_tokens)

        return tokens

    def _scan_bottom_region(
        self,
        image: str | Path | np.ndarray,
        page_no: int,
        max_size: int,
        scale_to_pdf_points: float | None,
        bottom_ratio: float
    ) -> list[OCRToken]:
        """Scan the bottom region of the image separately."""
        from PIL import Image as PILImage

        # Load image if given as a path
        if isinstance(image, (str, Path)):
            img = PILImage.open(str(image))
            img_array = np.array(img)
        else:
            img_array = image

        h, w = img_array.shape[:2]
        crop_y = int(h * (1 - bottom_ratio))

        # Crop the bottom region (slicing the first axis works for both
        # grayscale and color arrays)
        bottom_crop = img_array[crop_y:h]

        # OCR the cropped region (without a recursive bottom scan)
        result = self.extract_with_image(
            bottom_crop, page_no, max_size,
            scale_to_pdf_points=None,
            scan_bottom_region=False  # Important: disable to prevent recursion
        )

        # Shift bbox y-coordinates back into full-image space, then apply the
        # PDF point scale. Note: this assumes the crop was not downscaled
        # internally, i.e. its dimensions stay within max_size.
        scale = scale_to_pdf_points if scale_to_pdf_points else 1.0
        adjusted_tokens = []
        for token in result.tokens:
            adjusted_bbox = (
                token.bbox[0] * scale,
                (token.bbox[1] + crop_y) * scale,
                token.bbox[2] * scale,
                (token.bbox[3] + crop_y) * scale
            )
            adjusted_tokens.append(OCRToken(
                text=token.text,
                bbox=adjusted_bbox,
                confidence=token.confidence,
                page_no=token.page_no
            ))

        return adjusted_tokens

    def _merge_tokens(
        self,
        main_tokens: list[OCRToken],
        bottom_tokens: list[OCRToken]
    ) -> list[OCRToken]:
        """Merge tokens from the main scan and the bottom-region scan, removing duplicates."""
        if not bottom_tokens:
            return main_tokens

        # Collect existing token texts for deduplication
        existing_texts = {t.text.strip() for t in main_tokens}

        # Add bottom tokens that aren't duplicates
        merged = list(main_tokens)
        for token in bottom_tokens:
            if token.text.strip() not in existing_texts:
                merged.append(token)
                existing_texts.add(token.text.strip())

        return merged

    def extract_with_image(
        self,
        image: str | Path | np.ndarray,
        page_no: int = 0,
        max_size: int = 2000,
        scale_to_pdf_points: float | None = None,
        scan_bottom_region: bool = True,
        bottom_region_ratio: float = 0.15
    ) -> OCRResult:
        """
        Extract text tokens from an image and return the preprocessed image.

        PaddleOCR applies document preprocessing (unwarping, rotation,
        enhancement) and returns coordinates relative to the preprocessed
        image (output_img). This method returns both the tokens and
        output_img, so the caller can save the image that actually matches
        the coordinates.

        Args:
            image: Image path or numpy array
            page_no: Page number for reference
            max_size: Maximum image dimension. Larger images are scaled down,
                since PaddleOCR can fail on very large inputs.
            scale_to_pdf_points: If provided, scale bbox coordinates by this
                factor to convert from pixel to PDF point coordinates.
                Use (72 / dpi) for images rendered at a specific DPI.
            scan_bottom_region: If True, also scan the bottom region separately
                to catch OCR payment lines that a full-page scan may miss.
            bottom_region_ratio: Fraction of the page height to scan as the
                bottom region (default 0.15 = 15%)

        Returns:
            OCRResult with tokens and output_img (the preprocessed image from
            PaddleOCR)
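
        Example (minimal sketch; file names are placeholders):

            from PIL import Image

            engine = OCREngine()
            result = engine.extract_with_image('scan.png')
            if result.output_img is not None:
                # Save the image that the token bboxes actually refer to
                Image.fromarray(result.output_img).save('scan_preprocessed.png')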
        """
        from PIL import Image as PILImage

        # Load image if given as a path
        if isinstance(image, (str, Path)):
            img = PILImage.open(str(image))
            img_array = np.array(img)
        else:
            img_array = image

        # Check whether the image needs scaling for OCR
        h, w = img_array.shape[:2]
        ocr_scale_factor = 1.0

        if max(h, w) > max_size:
            ocr_scale_factor = max_size / max(h, w)
            new_w = int(w * ocr_scale_factor)
            new_h = int(h * ocr_scale_factor)
            # Resize the image for OCR
            img = PILImage.fromarray(img_array)
            img = img.resize((new_w, new_h), PILImage.Resampling.LANCZOS)
            img_array = np.array(img)

        # PaddleOCR 3.x uses the predict() method instead of ocr()
        result = self.ocr.predict(img_array)

        tokens = []
        output_img = None

        if result:
            for item in result:
                # PaddleOCR 3.x returns a list of dicts with 'rec_texts',
                # 'rec_scores' and 'dt_polys'
                if isinstance(item, dict):
                    rec_texts = item.get('rec_texts', [])
                    rec_scores = item.get('rec_scores', [])
                    dt_polys = item.get('dt_polys', [])

                    # Get output_img from doc_preprocessor_res; this is the
                    # preprocessed image that the coordinates are relative to
                    doc_preproc = item.get('doc_preprocessor_res', {})
                    if isinstance(doc_preproc, dict):
                        output_img = doc_preproc.get('output_img')

                    # Coordinates are relative to output_img (the preprocessed
                    # image), so no rotation compensation is needed
                    for text, score, poly in zip(rec_texts, rec_scores, dt_polys):
                        # poly is [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
                        x_coords = [float(p[0]) for p in poly]
                        y_coords = [float(p[1]) for p in poly]

                        # Convert to PDF points if requested. Undo any internal
                        # downscaling first, so the (72 / dpi) factor applies to
                        # the original pixel space.
                        if scale_to_pdf_points is not None:
                            final_scale = scale_to_pdf_points / ocr_scale_factor
                        else:
                            final_scale = 1.0

                        bbox = (
                            min(x_coords) * final_scale,
                            min(y_coords) * final_scale,
                            max(x_coords) * final_scale,
                            max(y_coords) * final_scale
                        )

                        tokens.append(OCRToken(
                            text=text,
                            bbox=bbox,
                            confidence=float(score),
                            page_no=page_no
                        ))
                elif isinstance(item, (list, tuple)) and len(item) == 2:
                    # Legacy format: [[bbox_points], (text, confidence)]
                    bbox_points, (text, confidence) = item

                    x_coords = [p[0] for p in bbox_points]
                    y_coords = [p[1] for p in bbox_points]

                    # Same scaling rule as above
                    if scale_to_pdf_points is not None:
                        final_scale = scale_to_pdf_points / ocr_scale_factor
                    else:
                        final_scale = 1.0

                    bbox = (
                        min(x_coords) * final_scale,
                        min(y_coords) * final_scale,
                        max(x_coords) * final_scale,
                        max(y_coords) * final_scale
                    )

                    tokens.append(OCRToken(
                        text=text,
                        bbox=bbox,
                        confidence=confidence,
                        page_no=page_no
                    ))

        # If no output_img was found, fall back to the (possibly resized) input array
        if output_img is None:
            output_img = img_array

        # Optionally scan the bottom region separately for Swedish OCR payment lines
        if scan_bottom_region:
            bottom_tokens = self._scan_bottom_region(
                image, page_no, max_size, scale_to_pdf_points, bottom_region_ratio
            )
            tokens = self._merge_tokens(tokens, bottom_tokens)

        return OCRResult(tokens=tokens, output_img=output_img)

    def extract_from_pdf(
        self,
        pdf_path: str | Path,
        dpi: int = 300
    ) -> Generator[list[OCRToken], None, None]:
        """
        Extract text from all pages of a scanned PDF.

        Args:
            pdf_path: Path to the PDF file
            dpi: Resolution for rendering

        Yields:
            List of OCRToken for each page
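
        Example (minimal sketch; the path is a placeholder):

            engine = OCREngine()
            for page_no, tokens in enumerate(engine.extract_from_pdf('scan.pdf')):
                print(f'page {page_no}: {len(tokens)} tokens')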
        """
        import io

        from PIL import Image

        from ..pdf.renderer import render_pdf_to_images

        for page_no, image_bytes in render_pdf_to_images(pdf_path, dpi=dpi):
            # Convert the rendered bytes to a numpy array
            image = Image.open(io.BytesIO(image_bytes))
            image_array = np.array(image)

            tokens = self.extract_from_image(image_array, page_no=page_no)
            yield tokens


def extract_ocr_tokens(
    image_path: str | Path,
    lang: str = "en",
    page_no: int = 0
) -> list[OCRToken]:
    """
    Convenience function to extract OCR tokens from an image.

    Args:
        image_path: Path to the image file
        lang: Language code
        page_no: Page number for reference

    Returns:
        List of OCRToken objects
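
    Note:
        This constructs a new OCREngine (and loads the models) on every call;
        reuse a single engine for batch work.

    Example (minimal sketch; the path is a placeholder):

        tokens = extract_ocr_tokens('page_0.png', lang='sv')
        full_text = ' '.join(t.text for t in tokens)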
    """
    engine = OCREngine(lang=lang)
    return engine.extract_from_image(image_path, page_no=page_no)