99 lines
2.1 KiB
Python
99 lines
2.1 KiB
Python
"""
PDF Type Detection Module

Automatically distinguishes between:
- Text-based PDFs (digitally generated)
- Scanned image PDFs
- Mixed PDFs (some pages have a text layer, some do not)
"""
|
|
|
|
from pathlib import Path
|
|
from typing import Literal
|
|
import fitz # PyMuPDF
|
|
|
|
|
|
# Closed set of classification labels used as the return type of get_pdf_type().
PDFType = Literal["text", "scanned", "mixed"]
|
|
|
|
|
|
def extract_text_first_page(pdf_path: str | Path) -> str:
    """Extract text from the first page of a PDF.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The text returned by ``get_text()`` for the first page, or an empty
        string when the document contains no pages.
    """
    # Use a context manager so the document is closed on every exit path,
    # including the empty-document early return and any extraction error.
    with fitz.open(pdf_path) as doc:
        if len(doc) == 0:
            return ""
        return doc[0].get_text()
|
|
|
|
|
|
def is_text_pdf(pdf_path: str | Path, min_chars: int = 30) -> bool:
    """
    Report whether the PDF's first page carries an extractable text layer.

    Only the first page is inspected; its extracted text is stripped of
    surrounding whitespace before being measured.

    Args:
        pdf_path: Path to the PDF file
        min_chars: Character count the stripped first-page text must exceed

    Returns:
        True when the stripped text is longer than ``min_chars``, else False
    """
    stripped = extract_text_first_page(pdf_path).strip()
    char_count = len(stripped)
    return min_chars < char_count
|
|
|
|
|
|
def get_pdf_type(pdf_path: str | Path, min_chars: int = 30) -> PDFType:
    """
    Determine the PDF type by checking every page for extractable text.

    A page counts as a text page when its stripped text is longer than
    ``min_chars`` characters.

    Args:
        pdf_path: Path to the PDF file
        min_chars: Character count a page's stripped text must exceed for
            the page to count as having text (default 30)

    Returns:
        'text'    - Every page has extractable text
        'scanned' - No page has extractable text (or the document has no
                    pages); needs OCR
        'mixed'   - Some pages have text, some don't
    """
    # The context manager guarantees the document is closed even if text
    # extraction raises part-way through the page loop.
    with fitz.open(pdf_path) as doc:
        total_pages = len(doc)
        text_pages = sum(
            1 for page in doc if len(page.get_text().strip()) > min_chars
        )

    # An empty document has no text pages, so it falls into "scanned" too.
    if text_pages == 0:
        return "scanned"
    if text_pages == total_pages:
        return "text"
    return "mixed"
|
|
|
|
|
|
def get_page_info(pdf_path: str | Path, min_chars: int = 30) -> list[dict]:
    """
    Get information about each page in the PDF.

    Args:
        pdf_path: Path to the PDF file
        min_chars: Character count a page's stripped text must exceed for
            ``has_text`` to be True (default 30)

    Returns:
        List of dicts, one per page in document order, with keys:
            page_no    - 0-based page index
            width      - width of the page rectangle
            height     - height of the page rectangle
            has_text   - True when char_count is greater than ``min_chars``
            char_count - length of the page's whitespace-stripped text
    """
    pages = []

    # The context manager closes the document even if a page fails to
    # extract, so no handle is left open on error.
    with fitz.open(pdf_path) as doc:
        for page_no, page in enumerate(doc):
            char_count = len(page.get_text().strip())
            rect = page.rect
            pages.append({
                "page_no": page_no,
                "width": rect.width,
                "height": rect.height,
                "has_text": char_count > min_chars,
                "char_count": char_count,
            })

    return pages
|