Files
invoice-master-poc-v2/src/pdf/detector.py
2026-01-17 18:55:46 +01:00

99 lines
2.1 KiB
Python

"""
PDF Type Detection Module
Automatically distinguishes between:
- Text-based PDFs (digitally generated)
- Scanned image PDFs
- Mixed PDFs (some pages have a text layer, others do not)
"""
from pathlib import Path
from typing import Literal
import fitz # PyMuPDF
# Classification labels returned by get_pdf_type: "text" (every page has a
# text layer), "scanned" (no page does), or "mixed" (only some pages do).
PDFType = Literal["text", "scanned", "mixed"]
def extract_text_first_page(pdf_path: str | Path) -> str:
    """
    Extract text from the first page of a PDF.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        The text of the first page as returned by ``page.get_text()``,
        or an empty string if the document has no pages.
    """
    # The context manager closes the document on every exit path, including
    # the early return for an empty document and any exception raised while
    # extracting text (the previous version leaked the handle in both cases).
    with fitz.open(pdf_path) as doc:
        if len(doc) == 0:
            return ""
        return doc[0].get_text()
def is_text_pdf(pdf_path: str | Path, min_chars: int = 30) -> bool:
    """
    Report whether a PDF has an extractable text layer.

    Only the first page is inspected.

    Args:
        pdf_path: Path to the PDF file
        min_chars: Threshold; the first page must contain strictly more
            than this many characters (after stripping surrounding
            whitespace) to count as a text PDF

    Returns:
        True if the first page has a text layer, False if it looks scanned
    """
    first_page_text = extract_text_first_page(pdf_path)
    visible_chars = len(first_page_text.strip())
    return visible_chars > min_chars
def get_pdf_type(pdf_path: str | Path, min_chars: int = 30) -> PDFType:
    """
    Determine the PDF type.

    Args:
        pdf_path: Path to the PDF file
        min_chars: A page counts as a text page when its stripped text is
            strictly longer than this many characters

    Returns:
        'text' - Every page has an extractable text layer
        'scanned' - No page has a text layer (needs OCR); also returned
            for a document with zero pages
        'mixed' - Some pages have text, some don't
    """
    # The context manager guarantees the document is closed even if text
    # extraction raises part-way through the page loop.
    with fitz.open(pdf_path) as doc:
        total_pages = len(doc)
        text_pages = sum(
            1 for page in doc if len(page.get_text().strip()) > min_chars
        )

    # Check the zero case first so an empty document (0 == 0 pages) is
    # reported as "scanned" rather than "text".
    if text_pages == 0:
        return "scanned"
    if text_pages == total_pages:
        return "text"
    return "mixed"
def get_page_info(pdf_path: str | Path, min_chars: int = 30) -> list[dict]:
    """
    Get information about each page in the PDF.

    Args:
        pdf_path: Path to the PDF file
        min_chars: A page is flagged as having text when its stripped text
            is strictly longer than this many characters

    Returns:
        List of dicts, one per page in document order, with keys:
        'page_no' (0-based index), 'width', 'height' (from the page
        rectangle), 'has_text' and 'char_count' (length of the stripped
        page text)
    """
    pages = []
    # The context manager guarantees the document is closed even if reading
    # a page raises part-way through the loop.
    with fitz.open(pdf_path) as doc:
        for page_no, page in enumerate(doc):
            text = page.get_text().strip()
            rect = page.rect
            char_count = len(text)
            pages.append({
                "page_no": page_no,
                "width": rect.width,
                "height": rect.height,
                "has_text": char_count > min_chars,
                "char_count": char_count,
            })
    return pages