""" PDF Type Detection Module Automatically distinguishes between: - Text-based PDFs (digitally generated) - Scanned image PDFs """ from pathlib import Path from typing import Literal import fitz # PyMuPDF PDFType = Literal["text", "scanned", "mixed"] def extract_text_first_page(pdf_path: str | Path) -> str: """Extract text from the first page of a PDF.""" doc = fitz.open(pdf_path) if len(doc) == 0: return "" first_page = doc[0] text = first_page.get_text() doc.close() return text def is_text_pdf(pdf_path: str | Path, min_chars: int = 30) -> bool: """ Check if PDF has extractable text layer. Args: pdf_path: Path to the PDF file min_chars: Minimum characters to consider it a text PDF Returns: True if PDF has text layer, False if scanned """ text = extract_text_first_page(pdf_path) return len(text.strip()) > min_chars def get_pdf_type(pdf_path: str | Path) -> PDFType: """ Determine the PDF type. Returns: 'text' - Has extractable text layer 'scanned' - Image-based, needs OCR 'mixed' - Some pages have text, some don't """ doc = fitz.open(pdf_path) if len(doc) == 0: doc.close() return "scanned" text_pages = 0 total_pages = len(doc) for page in doc: text = page.get_text().strip() if len(text) > 30: text_pages += 1 doc.close() if text_pages == total_pages: return "text" elif text_pages == 0: return "scanned" else: return "mixed" def get_page_info(pdf_path: str | Path) -> list[dict]: """ Get information about each page in the PDF. Returns: List of dicts with page info (number, width, height, has_text) """ doc = fitz.open(pdf_path) pages = [] for i, page in enumerate(doc): text = page.get_text().strip() rect = page.rect pages.append({ "page_no": i, "width": rect.width, "height": rect.height, "has_text": len(text) > 30, "char_count": len(text) }) doc.close() return pages