# invoice-master-poc-v2/packages/training/training/processing/gpu_pool.py

"""
GPU Worker Pool for OCR processing.

This pool handles GPU-bound tasks like PaddleOCR for scanned PDF processing.
"""

from __future__ import annotations

import logging
import os
from typing import Any, Callable, Optional

from training.processing.worker_pool import WorkerPool

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Global OCR instance for GPU workers (initialized once per process).
# Assigned by _init_gpu_worker() and handed out by get_ocr_instance();
# it stays None in any process where the initializer has not run.
_ocr_instance: Optional[Any] = None
# Set to True by _init_gpu_worker() right after the OCR instance is created.
# NOTE(review): nothing in the visible part of this module reads this flag —
# confirm whether other code depends on it.
_gpu_initialized: bool = False


def _init_gpu_worker(gpu_id: int = 0) -> None:
    """
    Initialize a GPU worker process with PaddleOCR.

    Restricts the process to the requested GPU through ``CUDA_VISIBLE_DEVICES``,
    creates a single PaddleOCR instance and stores it in the module-level
    ``_ocr_instance`` so that ``get_ocr_instance()`` can return it afterwards.

    Args:
        gpu_id: GPU device ID to use.

    Raises:
        Exception: Any error raised while importing paddle / PaddleOCR,
            selecting the device or building the OCR instance is logged
            (with traceback) and re-raised unchanged.
    """
    global _ocr_instance, _gpu_initialized

    # Set GPU device before importing paddle
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)

    # Reduce logging noise
    os.environ["GLOG_minloglevel"] = "2"

    # Suppress PaddleX warnings
    import warnings
    warnings.filterwarnings("ignore", message=".*PDX has already been initialized.*")
    warnings.filterwarnings("ignore", message=".*reinitialization.*")

    try:
        # Import PaddleOCR after setting environment
        # PaddleOCR 3.x uses paddle.set_device() for GPU control, not use_gpu param
        import paddle

        # CUDA_VISIBLE_DEVICES above exposes exactly one device to this process,
        # and visible devices are renumbered starting at 0. The requested GPU is
        # therefore always ordinal 0 here; passing the original gpu_id would point
        # past the only visible device whenever gpu_id > 0.
        paddle.set_device("gpu:0")

        from paddleocr import PaddleOCR

        # PaddleOCR 3.x init - minimal params, GPU controlled via paddle.set_device
        _ocr_instance = PaddleOCR(lang="en")
        _gpu_initialized = True
        logger.info(
            "GPU worker initialized on GPU %s in process %s", gpu_id, os.getpid()
        )

    except Exception as e:
        # logger.exception keeps the traceback, which is otherwise lost when the
        # failure happens inside a pool worker process.
        logger.exception("Failed to initialize GPU worker: %s", e)
        raise


def get_ocr_instance() -> Any:
    """
    Return the OCR instance stored for the current process.

    The instance lives in the module-level ``_ocr_instance`` variable, which
    is ``None`` until the GPU worker initializer has populated it.

    Returns:
        The object currently held in ``_ocr_instance``.

    Raises:
        RuntimeError: If ``_ocr_instance`` is still ``None``.
    """
    # Read-only access to the module global, so no ``global`` statement is needed.
    ocr = _ocr_instance
    if ocr is not None:
        return ocr
    raise RuntimeError("OCR not initialized. This function must be called from a GPU worker.")


class GPUWorkerPool(WorkerPool):
    """
    WorkerPool specialization for GPU-bound OCR work.

    Intended for scanned PDF processing with PaddleOCR on a GPU. The pool
    supplies ``_init_gpu_worker`` as its initializer together with the
    configured GPU id as the initializer argument. One worker is the default
    so that several processes do not compete for GPU memory.

    Example:
        with GPUWorkerPool(max_workers=1, gpu_id=0) as pool:
            future = pool.submit(process_scanned_pdf, pdf_path)
            result = future.result()
    """

    def __init__(self, max_workers: int = 1, gpu_id: int = 0) -> None:
        """
        Create the pool, forwarding the settings to ``WorkerPool``.

        Args:
            max_workers: Number of GPU worker processes.
                        Defaults to 1 to avoid GPU memory conflicts.
            gpu_id: GPU device ID to use.
        """
        # GPU mode is always requested from the base class for this pool type.
        super().__init__(max_workers=max_workers, use_gpu=True, gpu_id=gpu_id)

    def get_initializer(self) -> Optional[Callable[..., None]]:
        """Return the callable used to initialize each GPU worker."""
        initializer = _init_gpu_worker
        return initializer

    def get_init_args(self) -> tuple:
        """Return the positional arguments for the GPU initializer."""
        # presumably self.gpu_id is stored by WorkerPool.__init__ — verify there
        device_id = self.gpu_id
        return (device_id,)