118 lines
2.8 KiB
Python
118 lines
2.8 KiB
Python
"""
|
|
PDF Rendering Module
|
|
|
|
Converts PDF pages to images for YOLO training.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
from typing import Generator
|
|
import fitz # PyMuPDF
|
|
|
|
|
|
def render_pdf_to_images(
    pdf_path: str | Path,
    output_dir: str | Path | None = None,
    dpi: int = 300,
    image_format: str = "png"
) -> Generator[tuple[int, Path | bytes], None, None]:
    """
    Render PDF pages to images.

    This is a generator: the PDF is opened when iteration starts and is
    closed when iteration finishes, when an error occurs, or when the
    generator is closed before all pages were consumed.

    Args:
        pdf_path: Path to the PDF file
        output_dir: Directory to save images; created (with parents) if it
            does not exist. If None or empty, image bytes are yielded
            instead of file paths.
        dpi: Resolution for rendering (default 300)
        image_format: Output format. 'jpg' or 'jpeg' (case-insensitive)
            selects JPEG; any other value falls back to PNG.

    Yields:
        Tuple of (page_number, image_path or image_bytes); page_number is
        the 0-based position of the page in the document.
    """
    doc = fitz.open(pdf_path)
    try:
        # The output format does not change between pages, so decide once.
        use_jpeg = image_format.lower() in ("jpg", "jpeg")
        ext = "jpg" if use_jpeg else "png"

        # Calculate zoom factor for desired DPI (72 is base DPI for PDF)
        zoom = dpi / 72
        matrix = fitz.Matrix(zoom, zoom)

        if output_dir:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)

        pdf_name = Path(pdf_path).stem

        for page_no, page in enumerate(doc):
            # Render page to pixmap
            pix = page.get_pixmap(matrix=matrix)

            if output_dir:
                # Save to file, named <pdf stem>_page_<zero-padded index>.<ext>
                image_path = output_dir / f"{pdf_name}_page_{page_no:03d}.{ext}"

                if use_jpeg:
                    # The file uses the short ".jpg" suffix, so the encoder
                    # name is passed explicitly.
                    pix.save(str(image_path), "jpeg")
                else:
                    pix.save(str(image_path))

                yield page_no, image_path
            else:
                # Return encoded bytes instead of writing to disk
                yield page_no, pix.tobytes("jpeg" if use_jpeg else "png")
    finally:
        # Runs on normal exhaustion, on errors, and when the consumer stops
        # iterating early (generator close), so the handle is never leaked.
        doc.close()
|
|
|
|
|
|
def render_page_to_image(
    pdf_path: str | Path,
    page_no: int,
    dpi: int = 300
) -> bytes:
    """
    Render a single page to image bytes.

    Args:
        pdf_path: Path to the PDF file
        page_no: Page number (0-indexed). Only the upper bound is checked
            here; other values are handed to the document's indexing as-is.
        dpi: Resolution for rendering

    Returns:
        PNG image bytes

    Raises:
        ValueError: If page_no is greater than or equal to the number of
            pages in the PDF.
    """
    doc = fitz.open(pdf_path)
    try:
        # Read the page count while the document is still open; it is
        # needed for the error message below.
        page_count = len(doc)
        if page_no >= page_count:
            raise ValueError(f"Page {page_no} does not exist (PDF has {page_count} pages)")

        # Zoom factor for the desired DPI (72 is the base DPI for PDF)
        zoom = dpi / 72
        matrix = fitz.Matrix(zoom, zoom)

        page = doc[page_no]
        pix = page.get_pixmap(matrix=matrix)
        return pix.tobytes("png")
    finally:
        # Close the document on success and on every error path.
        doc.close()
|
|
|
|
|
|
def get_render_dimensions(pdf_path: str | Path, page_no: int = 0, dpi: int = 300) -> tuple[int, int]:
    """
    Get the dimensions of a rendered page.

    The size is computed from the page rectangle scaled by dpi / 72; the
    page itself is not rendered.

    Args:
        pdf_path: Path to the PDF file
        page_no: Index of the page to measure (default 0)
        dpi: Resolution the page would be rendered at (default 300)

    Returns:
        (width, height) in pixels, each truncated to an integer
    """
    doc = fitz.open(pdf_path)
    try:
        page = doc[page_no]

        # Zoom factor for the desired DPI (72 is the base DPI for PDF)
        zoom = dpi / 72
        rect = page.rect

        # NOTE(review): int() truncates; confirm this matches the pixmap size
        # produced by get_pixmap() for pages with fractional dimensions.
        width = int(rect.width * zoom)
        height = int(rect.height * zoom)

        return width, height
    finally:
        # Close the document even when the page lookup fails.
        doc.close()
|