invoice-master-poc-v2/packages/shared/shared/ocr/paddle_ocr.py
"""
OCR Extraction Module using PaddleOCR
Extracts text tokens with bounding boxes from scanned PDFs.
"""
import os
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import Generator
import numpy as np
# Suppress PaddlePaddle reinitialization warnings
os.environ.setdefault('GLOG_minloglevel', '2')
warnings.filterwarnings('ignore', message='.*PDX has already been initialized.*')
warnings.filterwarnings('ignore', message='.*reinitialization.*')


@dataclass
class OCRToken:
    """Represents an OCR-extracted text token with its bounding box."""
    text: str
    bbox: tuple[float, float, float, float]  # (x0, y0, x1, y1)
    confidence: float
    page_no: int = 0

    @property
    def x0(self) -> float:
        return self.bbox[0]

    @property
    def y0(self) -> float:
        return self.bbox[1]

    @property
    def x1(self) -> float:
        return self.bbox[2]

    @property
    def y1(self) -> float:
        return self.bbox[3]

    @property
    def center(self) -> tuple[float, float]:
        return ((self.x0 + self.x1) / 2, (self.y0 + self.y1) / 2)


@dataclass
class OCRResult:
    """Result from OCR extraction including tokens and preprocessed image."""
    tokens: list[OCRToken]
    output_img: np.ndarray | None = None  # Preprocessed image from PaddleOCR
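

# Illustrative only (not part of the original module): an OCRToken stores pixel
# (or PDF-point) coordinates, and `center` is derived from the bbox corners.
#
#   token = OCRToken(text="1 234,56", bbox=(100.0, 700.0, 180.0, 715.0), confidence=0.98)
#   cx, cy = token.center  # (140.0, 707.5)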


class OCREngine:
    """PaddleOCR wrapper for text extraction."""

    def __init__(
        self,
        lang: str = "en",
        det_model_dir: str | None = None,
        rec_model_dir: str | None = None,
        use_doc_orientation_classify: bool = True,
        use_doc_unwarping: bool = False
    ):
        """
        Initialize OCR engine.

        Args:
            lang: Language code ('en', 'sv', 'ch', etc.)
            det_model_dir: Custom detection model directory
            rec_model_dir: Custom recognition model directory
            use_doc_orientation_classify: Whether to auto-detect and correct document orientation.
                Default True to handle rotated documents.
            use_doc_unwarping: Whether to use UVDoc document unwarping for curved/warped documents.
                Default False to preserve original image layout,
                especially important for payment OCR lines at bottom.
                Enable for severely warped documents at the cost of potentially
                losing bottom content.

        Note:
            PaddleOCR 3.x automatically uses GPU if available via PaddlePaddle.
            Use `paddle.set_device('gpu')` before initialization to force GPU.
        """
        # Suppress warnings during import and initialization
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')
            from paddleocr import PaddleOCR

            # PaddleOCR 3.x init (use_gpu removed, device controlled by paddle.set_device)
            init_params = {
                'lang': lang,
                # Enable orientation classification to handle rotated documents
                'use_doc_orientation_classify': use_doc_orientation_classify,
                # Disable UVDoc unwarping to preserve original image layout.
                # This prevents the bottom payment OCR line from being cut off.
                # For severely warped documents, enable this but expect potential content loss.
                'use_doc_unwarping': use_doc_unwarping,
            }
            if det_model_dir:
                init_params['text_detection_model_dir'] = det_model_dir
            if rec_model_dir:
                init_params['text_recognition_model_dir'] = rec_model_dir

            self.ocr = PaddleOCR(**init_params)
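
    # Usage sketch (not part of the original file): device selection follows the
    # docstring above. `paddle.set_device` is PaddlePaddle's device switch; call it
    # before constructing the engine to force GPU or CPU explicitly.
    #
    #   import paddle
    #   paddle.set_device('gpu')  # or 'cpu'; assumes a GPU build of PaddlePaddle
    #   engine = OCREngine(lang='en', use_doc_orientation_classify=True)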

    def extract_from_image(
        self,
        image: str | Path | np.ndarray,
        page_no: int = 0,
        max_size: int = 2000,
        scale_to_pdf_points: float | None = None,
        scan_bottom_region: bool = True,
        bottom_region_ratio: float = 0.15
    ) -> list[OCRToken]:
        """
        Extract text tokens from an image.

        Args:
            image: Image path or numpy array
            page_no: Page number for reference
            max_size: Maximum image dimension. Larger images will be scaled down
                to avoid OCR issues with PaddleOCR on large images.
            scale_to_pdf_points: If provided, scale bbox coordinates by this factor
                to convert from pixel to PDF point coordinates.
                Use (72 / dpi) for images rendered at a specific DPI.
            scan_bottom_region: If True, also scan the bottom region separately to catch
                OCR payment lines that may be missed in the full-page scan.
            bottom_region_ratio: Ratio of page height to scan as bottom region (default 0.15 = 15%)

        Returns:
            List of OCRToken objects with bbox in pixel coords (or PDF points if
            scale_to_pdf_points is set)
        """
        # Run the full-page pass only; the bottom-region pass is handled below,
        # so disable it here to avoid scanning the bottom region twice.
        result = self.extract_with_image(
            image, page_no, max_size, scale_to_pdf_points,
            scan_bottom_region=False
        )
        tokens = result.tokens

        # Optionally scan bottom region separately for Swedish OCR payment lines
        if scan_bottom_region:
            bottom_tokens = self._scan_bottom_region(
                image, page_no, max_size, scale_to_pdf_points, bottom_region_ratio
            )
            tokens = self._merge_tokens(tokens, bottom_tokens)

        return tokens
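
    # Usage sketch (illustrative; the file name is a placeholder, not from the
    # original code): for a page rendered at 300 DPI, passing
    # scale_to_pdf_points=72/300 returns bounding boxes in PDF points.
    #
    #   tokens = engine.extract_from_image("page_000.png", page_no=0,
    #                                      scale_to_pdf_points=72 / 300)
    #   for t in tokens:
    #       print(t.text, t.bbox, t.confidence)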

    def _scan_bottom_region(
        self,
        image: str | Path | np.ndarray,
        page_no: int,
        max_size: int,
        scale_to_pdf_points: float | None,
        bottom_ratio: float
    ) -> list[OCRToken]:
        """Scan the bottom region of the image separately."""
        from PIL import Image as PILImage

        # Load image if path
        if isinstance(image, (str, Path)):
            img = PILImage.open(str(image))
            img_array = np.array(img)
        else:
            img_array = image

        h, w = img_array.shape[:2]
        crop_y = int(h * (1 - bottom_ratio))

        # Crop bottom region
        bottom_crop = img_array[crop_y:h, :, :] if len(img_array.shape) == 3 else img_array[crop_y:h, :]

        # OCR the cropped region (without recursive bottom scan to avoid infinite loop)
        result = self.extract_with_image(
            bottom_crop, page_no, max_size,
            scale_to_pdf_points=None,
            scan_bottom_region=False  # Important: disable to prevent recursion
        )

        # Adjust bbox y-coordinates to full image space
        adjusted_tokens = []
        for token in result.tokens:
            # Scale factor for coordinates
            scale = scale_to_pdf_points if scale_to_pdf_points else 1.0
            adjusted_bbox = (
                token.bbox[0] * scale,
                (token.bbox[1] + crop_y) * scale,
                token.bbox[2] * scale,
                (token.bbox[3] + crop_y) * scale
            )
            adjusted_tokens.append(OCRToken(
                text=token.text,
                bbox=adjusted_bbox,
                confidence=token.confidence,
                page_no=token.page_no
            ))
        return adjusted_tokens

    def _merge_tokens(
        self,
        main_tokens: list[OCRToken],
        bottom_tokens: list[OCRToken]
    ) -> list[OCRToken]:
        """Merge tokens from the main scan and the bottom-region scan, dropping
        bottom-region tokens whose exact (stripped) text already appears in the main scan."""
        if not bottom_tokens:
            return main_tokens

        # Create a set of existing token texts for deduplication
        existing_texts = {t.text.strip() for t in main_tokens}

        # Add bottom tokens that aren't duplicates
        merged = list(main_tokens)
        for token in bottom_tokens:
            if token.text.strip() not in existing_texts:
                merged.append(token)
                existing_texts.add(token.text.strip())
        return merged

    def extract_with_image(
        self,
        image: str | Path | np.ndarray,
        page_no: int = 0,
        max_size: int = 2000,
        scale_to_pdf_points: float | None = None,
        scan_bottom_region: bool = True,
        bottom_region_ratio: float = 0.15
    ) -> OCRResult:
        """
        Extract text tokens from an image and return the preprocessed image.

        PaddleOCR applies document preprocessing (unwarping, rotation, enhancement)
        and returns coordinates relative to the preprocessed image (output_img).
        This method returns both tokens and output_img so the caller can save
        the correct image that matches the coordinates.

        Args:
            image: Image path or numpy array
            page_no: Page number for reference
            max_size: Maximum image dimension. Larger images will be scaled down
                to avoid OCR issues with PaddleOCR on large images.
            scale_to_pdf_points: If provided, scale bbox coordinates by this factor
                to convert from pixel to PDF point coordinates.
                Use (72 / dpi) for images rendered at a specific DPI.
            scan_bottom_region: If True, also scan the bottom region separately to catch
                OCR payment lines that may be missed in full-page scan.
            bottom_region_ratio: Ratio of page height to scan as bottom region (default 0.15 = 15%)

        Returns:
            OCRResult with tokens and output_img (preprocessed image from PaddleOCR)
        """
        from PIL import Image as PILImage

        # Load image if path
        if isinstance(image, (str, Path)):
            img = PILImage.open(str(image))
            img_array = np.array(img)
        else:
            img_array = image

        # Check if image needs scaling for OCR
        h, w = img_array.shape[:2]
        ocr_scale_factor = 1.0
        if max(h, w) > max_size:
            ocr_scale_factor = max_size / max(h, w)
            new_w = int(w * ocr_scale_factor)
            new_h = int(h * ocr_scale_factor)
            # Resize image for OCR
            img = PILImage.fromarray(img_array)
            img = img.resize((new_w, new_h), PILImage.Resampling.LANCZOS)
            img_array = np.array(img)

        # PaddleOCR 3.x uses predict() method instead of ocr()
        result = self.ocr.predict(img_array)

        tokens = []
        output_img = None
        if result:
            for item in result:
                # PaddleOCR 3.x returns a list of dicts with 'rec_texts', 'rec_scores', 'dt_polys'
                if isinstance(item, dict):
                    rec_texts = item.get('rec_texts', [])
                    rec_scores = item.get('rec_scores', [])
                    dt_polys = item.get('dt_polys', [])

                    # Get output_img from doc_preprocessor_res.
                    # This is the preprocessed image that coordinates are relative to.
                    doc_preproc = item.get('doc_preprocessor_res', {})
                    if isinstance(doc_preproc, dict):
                        output_img = doc_preproc.get('output_img')

                    # Coordinates are relative to output_img (preprocessed image).
                    # No rotation compensation needed - just use coordinates directly.
                    for text, score, poly in zip(rec_texts, rec_scores, dt_polys):
                        # poly is [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
                        x_coords = [float(p[0]) for p in poly]
                        y_coords = [float(p[1]) for p in poly]

                        # Apply PDF points scale if requested
                        final_scale = scale_to_pdf_points if scale_to_pdf_points is not None else 1.0
                        bbox = (
                            min(x_coords) * final_scale,
                            min(y_coords) * final_scale,
                            max(x_coords) * final_scale,
                            max(y_coords) * final_scale
                        )
                        tokens.append(OCRToken(
                            text=text,
                            bbox=bbox,
                            confidence=float(score),
                            page_no=page_no
                        ))
                elif isinstance(item, (list, tuple)) and len(item) == 2:
                    # Legacy format: [[bbox_points], (text, confidence)]
                    bbox_points, (text, confidence) = item
                    x_coords = [p[0] for p in bbox_points]
                    y_coords = [p[1] for p in bbox_points]

                    # Apply PDF points scale if requested
                    final_scale = scale_to_pdf_points if scale_to_pdf_points is not None else 1.0
                    bbox = (
                        min(x_coords) * final_scale,
                        min(y_coords) * final_scale,
                        max(x_coords) * final_scale,
                        max(y_coords) * final_scale
                    )
                    tokens.append(OCRToken(
                        text=text,
                        bbox=bbox,
                        confidence=confidence,
                        page_no=page_no
                    ))

        # If no output_img was found, use the original input array
        if output_img is None:
            output_img = img_array

        # Optionally scan bottom region separately for Swedish OCR payment lines
        if scan_bottom_region:
            bottom_tokens = self._scan_bottom_region(
                image, page_no, max_size, scale_to_pdf_points, bottom_region_ratio
            )
            tokens = self._merge_tokens(tokens, bottom_tokens)

        return OCRResult(tokens=tokens, output_img=output_img)
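
    # Usage sketch (illustrative; file names are placeholders, not from the original
    # code): because bbox coordinates refer to the preprocessed output_img, a caller
    # that saves a debug image should save that array rather than the raw input.
    #
    #   from PIL import Image
    #   res = engine.extract_with_image("scan.png", page_no=0)
    #   if res.output_img is not None:
    #       Image.fromarray(res.output_img).save("scan_preprocessed.png")
    #   print(len(res.tokens), "tokens extracted")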

    def extract_from_pdf(
        self,
        pdf_path: str | Path,
        dpi: int = 300
    ) -> Generator[list[OCRToken], None, None]:
        """
        Extract text from all pages of a scanned PDF.

        Args:
            pdf_path: Path to the PDF file
            dpi: Resolution for rendering

        Yields:
            List of OCRToken for each page
        """
        from ..pdf.renderer import render_pdf_to_images
        import io
        from PIL import Image

        for page_no, image_bytes in render_pdf_to_images(pdf_path, dpi=dpi):
            # Convert bytes to numpy array
            image = Image.open(io.BytesIO(image_bytes))
            image_array = np.array(image)
            tokens = self.extract_from_image(image_array, page_no=page_no)
            yield tokens
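
    # Usage sketch (illustrative; the PDF path is a placeholder): pages are OCR'd
    # one at a time, so results can be consumed as a stream. This method yields
    # pixel coordinates; call extract_from_image with scale_to_pdf_points=72/dpi
    # directly if PDF-point coordinates are needed.
    #
    #   for page_no, page_tokens in enumerate(engine.extract_from_pdf("invoice.pdf", dpi=300)):
    #       print(f"page {page_no}: {len(page_tokens)} tokens")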


def extract_ocr_tokens(
    image_path: str | Path,
    lang: str = "en",
    page_no: int = 0
) -> list[OCRToken]:
    """
    Convenience function to extract OCR tokens from an image.

    Args:
        image_path: Path to the image file
        lang: Language code
        page_no: Page number for reference

    Returns:
        List of OCRToken objects
    """
    engine = OCREngine(lang=lang)
    return engine.extract_from_image(image_path, page_no=page_no)
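

# Minimal smoke-test sketch (not part of the original module; the sample path is a
# placeholder). Note that extract_ocr_tokens builds a fresh OCREngine per call, so
# reuse a single OCREngine instance when processing many images.
if __name__ == "__main__":
    import sys

    sample = sys.argv[1] if len(sys.argv) > 1 else "sample_invoice.png"
    for tok in extract_ocr_tokens(sample, lang="en"):
        print(f"{tok.confidence:.2f}  {tok.bbox}  {tok.text}")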