# invoice-master-poc-v2/src/pdf/detector.py

"""
PDF Type Detection Module

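Automatically distinguishes between:
- Text-based PDFs (digitally generated, with an extractable text layer)
- Scanned image PDFs (no extractable text, need OCR)
- Mixed PDFs (some pages have text, some do not)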
"""

from pathlib import Path
from typing import Literal
import fitz  # PyMuPDF


# Classification labels returned by get_pdf_type():
#   "text"    - every page yields extractable text
#   "scanned" - no page yields extractable text (or the document has no pages)
#   "mixed"   - only some pages yield extractable text
PDFType = Literal["text", "scanned", "mixed"]


def extract_text_first_page(pdf_path: str | Path) -> str:
    """
    Extract text from the first page of a PDF.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        The text of the first page as returned by ``page.get_text()``,
        or an empty string if the document has no pages
    """
    # The context manager closes the document on every exit path,
    # including the empty-document early return and any error raised
    # while extracting text.
    with fitz.open(pdf_path) as doc:
        if len(doc) == 0:
            return ""
        return doc[0].get_text()


def is_text_pdf(pdf_path: str | Path, min_chars: int = 30) -> bool:
    """
    Tell whether the first page of a PDF carries an extractable text layer.

    Args:
        pdf_path: Path to the PDF file
        min_chars: Threshold that the stripped first-page text length
            must exceed for the PDF to count as text-based

    Returns:
        True if the first page has more than ``min_chars`` non-padding
        characters, False otherwise (treated as scanned)
    """
    first_page_text = extract_text_first_page(pdf_path)
    # Surrounding whitespace is ignored so blank pages do not count as text.
    visible_length = len(first_page_text.strip())
    return min_chars < visible_length


def get_pdf_type(pdf_path: str | Path, min_chars: int = 30) -> PDFType:
    """
    Determine the PDF type.

    Args:
        pdf_path: Path to the PDF file
        min_chars: A page counts as a text page when its stripped text
            is longer than this many characters

    Returns:
        'text' - Every page has an extractable text layer
        'scanned' - No page has text (or the document has no pages), needs OCR
        'mixed' - Some pages have text, some don't
    """
    # The context manager guarantees the document is closed even if
    # text extraction fails on one of the pages.
    with fitz.open(pdf_path) as doc:
        total_pages = len(doc)
        text_pages = sum(
            1 for page in doc if len(page.get_text().strip()) > min_chars
        )

    # An empty document has no text pages; check this before comparing
    # against total_pages so that 0 == 0 is not reported as "text".
    if text_pages == 0:
        return "scanned"
    if text_pages == total_pages:
        return "text"
    return "mixed"


def get_page_info(pdf_path: str | Path, min_chars: int = 30) -> list[dict]:
    """
    Get information about each page in the PDF.

    Args:
        pdf_path: Path to the PDF file
        min_chars: A page is flagged as having text when its stripped
            text is longer than this many characters

    Returns:
        List of dicts, one per page in document order, with keys:
        page_no (0-based index), width, height (from the page rectangle),
        has_text (bool) and char_count (length of the stripped page text)
    """
    pages = []

    # The context manager guarantees the document is closed even if
    # reading one of the pages raises.
    with fitz.open(pdf_path) as doc:
        for page_no, page in enumerate(doc):
            char_count = len(page.get_text().strip())
            rect = page.rect
            pages.append({
                "page_no": page_no,
                "width": rect.width,
                "height": rect.height,
                "has_text": char_count > min_chars,
                "char_count": char_count,
            })

    return pages