"""
|
|
OCR Extraction Module using PaddleOCR
|
|
|
|
Extracts text tokens with bounding boxes from scanned PDFs.
|
|
"""

import os
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import Generator

import numpy as np

# Suppress PaddlePaddle reinitialization warnings
os.environ.setdefault('GLOG_minloglevel', '2')
warnings.filterwarnings('ignore', message='.*PDX has already been initialized.*')
warnings.filterwarnings('ignore', message='.*reinitialization.*')


@dataclass
class OCRToken:
    """Represents an OCR-extracted text token with its bounding box.
    """
    text: str
    bbox: tuple[float, float, float, float]  # (x0, y0, x1, y1)
    confidence: float
    page_no: int = 0

    @property
    def x0(self) -> float:
        return self.bbox[0]

    @property
    def y0(self) -> float:
        return self.bbox[1]

    @property
    def x1(self) -> float:
        return self.bbox[2]

    @property
    def y1(self) -> float:
        return self.bbox[3]

    @property
    def center(self) -> tuple[float, float]:
        return ((self.x0 + self.x1) / 2, (self.y0 + self.y1) / 2)


@dataclass
class OCRResult:
    """Result from OCR extraction including tokens and preprocessed image."""
    tokens: list[OCRToken]
    output_img: np.ndarray | None = None  # Preprocessed image from PaddleOCR


class OCREngine:
    """PaddleOCR wrapper for text extraction."""

    def __init__(
        self,
        lang: str = "en",
        det_model_dir: str | None = None,
        rec_model_dir: str | None = None,
        use_doc_orientation_classify: bool = True,
        use_doc_unwarping: bool = False
    ):
        """
        Initialize the OCR engine.

        Args:
            lang: Language code ('en', 'sv', 'ch', etc.)
            det_model_dir: Custom detection model directory
            rec_model_dir: Custom recognition model directory
            use_doc_orientation_classify: Whether to auto-detect and correct
                document orientation. Defaults to True to handle rotated documents.
            use_doc_unwarping: Whether to apply UVDoc document unwarping for
                curved or warped documents. Defaults to False to preserve the
                original image layout, which is especially important for payment
                OCR lines at the bottom of the page. Enable it for severely
                warped documents at the cost of potentially losing bottom content.

        Note:
            PaddleOCR 3.x automatically uses the GPU if one is available via
            PaddlePaddle. Call `paddle.set_device('gpu')` before initialization
            to force GPU execution.
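
        Example (minimal sketch; the language code is illustrative and the
        device call assumes a CUDA-enabled PaddlePaddle build):

            import paddle
            paddle.set_device('gpu')       # force GPU before constructing the engine
            engine = OCREngine(lang='sv')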
        """
        # Suppress warnings during import and initialization
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')
            from paddleocr import PaddleOCR

        # PaddleOCR 3.x init (use_gpu was removed; the device is controlled
        # via paddle.set_device)
        init_params = {
            'lang': lang,
            # Enable orientation classification to handle rotated documents
            'use_doc_orientation_classify': use_doc_orientation_classify,
            # Disable UVDoc unwarping to preserve the original image layout.
            # This prevents the bottom payment OCR line from being cut off.
            # For severely warped documents, enable it but expect potential content loss.
            'use_doc_unwarping': use_doc_unwarping,
        }
        if det_model_dir:
            init_params['text_detection_model_dir'] = det_model_dir
        if rec_model_dir:
            init_params['text_recognition_model_dir'] = rec_model_dir

        self.ocr = PaddleOCR(**init_params)

    def extract_from_image(
        self,
        image: str | Path | np.ndarray,
        page_no: int = 0,
        max_size: int = 2000,
        scale_to_pdf_points: float | None = None,
        scan_bottom_region: bool = True,
        bottom_region_ratio: float = 0.15
    ) -> list[OCRToken]:
        """
        Extract text tokens from an image.

        Args:
            image: Image path or numpy array
            page_no: Page number for reference
            max_size: Maximum image dimension. Larger images are scaled down,
                since PaddleOCR can fail on very large inputs.
            scale_to_pdf_points: If provided, scale bbox coordinates by this
                factor to convert from pixel to PDF point coordinates.
                Use (72 / dpi) for images rendered at a specific DPI.
            scan_bottom_region: If True, also scan the bottom region separately
                to catch OCR payment lines that a full-page scan may miss.
            bottom_region_ratio: Fraction of the page height to scan as the
                bottom region (default 0.15 = 15%)

        Returns:
            List of OCRToken objects with bbox in pixel coordinates (or PDF
            points if scale_to_pdf_points is set)
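
        Example (minimal sketch; the file name and DPI are placeholder values):

            engine = OCREngine(lang='sv')
            # The page image was rendered at 300 DPI, so 72 / 300 maps pixels
            # to PDF points
            tokens = engine.extract_from_image('page.png', scale_to_pdf_points=72 / 300)
            for tok in tokens:
                print(tok.text, tok.bbox, tok.confidence)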
        """
        # extract_with_image would otherwise scan the bottom region itself;
        # disable that here, since this method runs (and merges) the bottom
        # scan below.
        result = self.extract_with_image(
            image, page_no, max_size, scale_to_pdf_points, scan_bottom_region=False
        )
        tokens = result.tokens

        # Optionally scan the bottom region separately for Swedish OCR payment lines
        if scan_bottom_region:
            bottom_tokens = self._scan_bottom_region(
                image, page_no, max_size, scale_to_pdf_points, bottom_region_ratio
            )
            tokens = self._merge_tokens(tokens, bottom_tokens)

        return tokens

    def _scan_bottom_region(
        self,
        image: str | Path | np.ndarray,
        page_no: int,
        max_size: int,
        scale_to_pdf_points: float | None,
        bottom_ratio: float
    ) -> list[OCRToken]:
        """Scan the bottom region of the image separately."""
        from PIL import Image as PILImage

        # Load image if given as a path
        if isinstance(image, (str, Path)):
            img = PILImage.open(str(image))
            img_array = np.array(img)
        else:
            img_array = image

        h, w = img_array.shape[:2]
        crop_y = int(h * (1 - bottom_ratio))

        # Crop the bottom region (slicing the first axis works for both
        # grayscale and color arrays)
        bottom_crop = img_array[crop_y:h]

        # OCR the cropped region (without a recursive bottom scan)
        result = self.extract_with_image(
            bottom_crop, page_no, max_size,
            scale_to_pdf_points=None,
            scan_bottom_region=False  # Important: disable to prevent recursion
        )

        # Shift bbox y-coordinates back into full-image space, then apply the
        # PDF point scale. Note: this assumes the crop was not downscaled
        # internally, i.e. its dimensions stay within max_size.
        scale = scale_to_pdf_points if scale_to_pdf_points else 1.0
        adjusted_tokens = []
        for token in result.tokens:
            adjusted_bbox = (
                token.bbox[0] * scale,
                (token.bbox[1] + crop_y) * scale,
                token.bbox[2] * scale,
                (token.bbox[3] + crop_y) * scale
            )
            adjusted_tokens.append(OCRToken(
                text=token.text,
                bbox=adjusted_bbox,
                confidence=token.confidence,
                page_no=token.page_no
            ))

        return adjusted_tokens

    def _merge_tokens(
        self,
        main_tokens: list[OCRToken],
        bottom_tokens: list[OCRToken]
    ) -> list[OCRToken]:
        """Merge tokens from the main scan and the bottom-region scan, removing duplicates."""
        if not bottom_tokens:
            return main_tokens

        # Collect existing token texts for deduplication
        existing_texts = {t.text.strip() for t in main_tokens}

        # Add bottom tokens that aren't duplicates
        merged = list(main_tokens)
        for token in bottom_tokens:
            if token.text.strip() not in existing_texts:
                merged.append(token)
                existing_texts.add(token.text.strip())

        return merged

    def extract_with_image(
        self,
        image: str | Path | np.ndarray,
        page_no: int = 0,
        max_size: int = 2000,
        scale_to_pdf_points: float | None = None,
        scan_bottom_region: bool = True,
        bottom_region_ratio: float = 0.15
    ) -> OCRResult:
        """
        Extract text tokens from an image and return the preprocessed image.

        PaddleOCR applies document preprocessing (unwarping, rotation,
        enhancement) and returns coordinates relative to the preprocessed
        image (output_img). This method returns both the tokens and
        output_img, so the caller can save the image that actually matches
        the coordinates.

        Args:
            image: Image path or numpy array
            page_no: Page number for reference
            max_size: Maximum image dimension. Larger images are scaled down,
                since PaddleOCR can fail on very large inputs.
            scale_to_pdf_points: If provided, scale bbox coordinates by this
                factor to convert from pixel to PDF point coordinates.
                Use (72 / dpi) for images rendered at a specific DPI.
            scan_bottom_region: If True, also scan the bottom region separately
                to catch OCR payment lines that a full-page scan may miss.
            bottom_region_ratio: Fraction of the page height to scan as the
                bottom region (default 0.15 = 15%)

        Returns:
            OCRResult with tokens and output_img (the preprocessed image from
            PaddleOCR)
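
        Example (minimal sketch; file names are placeholders):

            from PIL import Image

            engine = OCREngine()
            result = engine.extract_with_image('scan.png')
            if result.output_img is not None:
                # Save the image that the token bboxes actually refer to
                Image.fromarray(result.output_img).save('scan_preprocessed.png')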
        """
        from PIL import Image as PILImage

        # Load image if given as a path
        if isinstance(image, (str, Path)):
            img = PILImage.open(str(image))
            img_array = np.array(img)
        else:
            img_array = image

        # Check whether the image needs scaling for OCR
        h, w = img_array.shape[:2]
        ocr_scale_factor = 1.0

        if max(h, w) > max_size:
            ocr_scale_factor = max_size / max(h, w)
            new_w = int(w * ocr_scale_factor)
            new_h = int(h * ocr_scale_factor)
            # Resize the image for OCR
            img = PILImage.fromarray(img_array)
            img = img.resize((new_w, new_h), PILImage.Resampling.LANCZOS)
            img_array = np.array(img)

        # PaddleOCR 3.x uses the predict() method instead of ocr()
        result = self.ocr.predict(img_array)

        tokens = []
        output_img = None

        if result:
            for item in result:
                # PaddleOCR 3.x returns a list of dicts with 'rec_texts',
                # 'rec_scores' and 'dt_polys'
                if isinstance(item, dict):
                    rec_texts = item.get('rec_texts', [])
                    rec_scores = item.get('rec_scores', [])
                    dt_polys = item.get('dt_polys', [])

                    # Get output_img from doc_preprocessor_res; this is the
                    # preprocessed image that the coordinates are relative to
                    doc_preproc = item.get('doc_preprocessor_res', {})
                    if isinstance(doc_preproc, dict):
                        output_img = doc_preproc.get('output_img')

                    # Coordinates are relative to output_img (the preprocessed
                    # image), so no rotation compensation is needed
                    for text, score, poly in zip(rec_texts, rec_scores, dt_polys):
                        # poly is [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
                        x_coords = [float(p[0]) for p in poly]
                        y_coords = [float(p[1]) for p in poly]

                        # Convert to PDF points if requested. Undo any internal
                        # downscaling first, so the (72 / dpi) factor applies to
                        # the original pixel space.
                        if scale_to_pdf_points is not None:
                            final_scale = scale_to_pdf_points / ocr_scale_factor
                        else:
                            final_scale = 1.0

                        bbox = (
                            min(x_coords) * final_scale,
                            min(y_coords) * final_scale,
                            max(x_coords) * final_scale,
                            max(y_coords) * final_scale
                        )

                        tokens.append(OCRToken(
                            text=text,
                            bbox=bbox,
                            confidence=float(score),
                            page_no=page_no
                        ))
                elif isinstance(item, (list, tuple)) and len(item) == 2:
                    # Legacy format: [[bbox_points], (text, confidence)]
                    bbox_points, (text, confidence) = item

                    x_coords = [p[0] for p in bbox_points]
                    y_coords = [p[1] for p in bbox_points]

                    # Same scaling rule as above
                    if scale_to_pdf_points is not None:
                        final_scale = scale_to_pdf_points / ocr_scale_factor
                    else:
                        final_scale = 1.0

                    bbox = (
                        min(x_coords) * final_scale,
                        min(y_coords) * final_scale,
                        max(x_coords) * final_scale,
                        max(y_coords) * final_scale
                    )

                    tokens.append(OCRToken(
                        text=text,
                        bbox=bbox,
                        confidence=confidence,
                        page_no=page_no
                    ))

        # If no output_img was found, fall back to the (possibly resized) input array
        if output_img is None:
            output_img = img_array

        # Optionally scan the bottom region separately for Swedish OCR payment lines
        if scan_bottom_region:
            bottom_tokens = self._scan_bottom_region(
                image, page_no, max_size, scale_to_pdf_points, bottom_region_ratio
            )
            tokens = self._merge_tokens(tokens, bottom_tokens)

        return OCRResult(tokens=tokens, output_img=output_img)

    def extract_from_pdf(
        self,
        pdf_path: str | Path,
        dpi: int = 300
    ) -> Generator[list[OCRToken], None, None]:
        """
        Extract text from all pages of a scanned PDF.

        Args:
            pdf_path: Path to the PDF file
            dpi: Resolution for rendering

        Yields:
            List of OCRToken for each page
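
        Example (minimal sketch; the path is a placeholder):

            engine = OCREngine()
            for page_no, tokens in enumerate(engine.extract_from_pdf('scan.pdf')):
                print(f'page {page_no}: {len(tokens)} tokens')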
        """
        import io

        from PIL import Image

        from ..pdf.renderer import render_pdf_to_images

        for page_no, image_bytes in render_pdf_to_images(pdf_path, dpi=dpi):
            # Convert the rendered bytes to a numpy array
            image = Image.open(io.BytesIO(image_bytes))
            image_array = np.array(image)

            tokens = self.extract_from_image(image_array, page_no=page_no)
            yield tokens


def extract_ocr_tokens(
    image_path: str | Path,
    lang: str = "en",
    page_no: int = 0
) -> list[OCRToken]:
    """
    Convenience function to extract OCR tokens from an image.

    Args:
        image_path: Path to the image file
        lang: Language code
        page_no: Page number for reference

    Returns:
        List of OCRToken objects
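
    Note:
        This constructs a new OCREngine (and loads the models) on every call;
        reuse a single engine for batch work.

    Example (minimal sketch; the path is a placeholder):

        tokens = extract_ocr_tokens('page_0.png', lang='sv')
        full_text = ' '.join(t.text for t in tokens)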
    """
    engine = OCREngine(lang=lang)
    return engine.extract_from_image(image_path, page_no=page_no)