Initial commit: Invoice field extraction system using YOLO + OCR
Features:
- Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations
- Flexible date matching: year-month match, nearby date tolerance
- PDF text extraction with PyMuPDF
- OCR support for scanned documents (PaddleOCR)
- YOLO training and inference pipeline
- 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro, Plusgiro, Amount

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
98
src/pdf/detector.py
Normal file
98
src/pdf/detector.py
Normal file
@@ -0,0 +1,98 @@
|
||||
"""
|
||||
PDF Type Detection Module
|
||||
|
||||
Automatically distinguishes between:
|
||||
- Text-based PDFs (digitally generated)
|
||||
- Scanned image PDFs
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
import fitz # PyMuPDF
|
||||
|
||||
|
||||
# Classification labels that get_pdf_type() returns for a document.
PDFType = Literal["text", "scanned", "mixed"]
|
||||
|
||||
|
||||
def extract_text_first_page(pdf_path: str | Path) -> str:
    """Extract text from the first page of a PDF.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The text reported by PyMuPDF for the first page, or an empty
        string when the document contains no pages.
    """
    # The context manager closes the document on every exit path,
    # including the empty-document early return and any exception raised
    # while reading the page.
    with fitz.open(pdf_path) as doc:
        if len(doc) == 0:
            return ""
        return doc[0].get_text()
|
||||
|
||||
|
||||
def is_text_pdf(pdf_path: str | Path, min_chars: int = 30) -> bool:
    """
    Tell whether the first page of a PDF carries an extractable text layer.

    Args:
        pdf_path: Path to the PDF file
        min_chars: Threshold; the stripped first-page text must be
            strictly longer than this to count as a text PDF

    Returns:
        True if the first page has more than ``min_chars`` characters of
        text after stripping whitespace, False otherwise
    """
    first_page_text = extract_text_first_page(pdf_path)
    visible_chars = len(first_page_text.strip())
    if visible_chars > min_chars:
        return True
    return False
|
||||
|
||||
|
||||
def get_pdf_type(pdf_path: str | Path, min_chars: int = 30) -> PDFType:
    """
    Determine the PDF type by inspecting every page.

    Args:
        pdf_path: Path to the PDF file
        min_chars: A page counts as a text page when its stripped text is
            strictly longer than this many characters

    Returns:
        'text' - Every page has an extractable text layer
        'scanned' - No page has text (or the document has no pages);
            needs OCR
        'mixed' - Some pages have text, some don't
    """
    # Read the page count and the per-page text while the document is
    # still open: asking a closed PyMuPDF document for its length raises
    # ValueError. The context manager also closes the document if a page
    # fails to parse.
    with fitz.open(pdf_path) as doc:
        total_pages = len(doc)
        text_pages = sum(
            1 for page in doc if len(page.get_text().strip()) > min_chars
        )

    # An empty document has nothing to extract, so treat it as scanned.
    if total_pages == 0 or text_pages == 0:
        return "scanned"
    if text_pages == total_pages:
        return "text"
    return "mixed"
|
||||
|
||||
|
||||
def get_page_info(pdf_path: str | Path, min_chars: int = 30) -> list[dict]:
    """
    Get information about each page in the PDF.

    Args:
        pdf_path: Path to the PDF file
        min_chars: A page is flagged as having text when its stripped
            text is strictly longer than this many characters

    Returns:
        List of dicts, one per page in document order, with keys:
        page_no (zero-based index), width, height (taken from the page
        rectangle), has_text, and char_count (length of the stripped
        page text)
    """
    pages = []

    # The context manager guarantees the document is closed even when a
    # page raises during text extraction.
    with fitz.open(pdf_path) as doc:
        for page_no, page in enumerate(doc):
            char_count = len(page.get_text().strip())
            rect = page.rect
            pages.append({
                "page_no": page_no,
                "width": rect.width,
                "height": rect.height,
                "has_text": char_count > min_chars,
                "char_count": char_count,
            })

    return pages
|
||||
Reference in New Issue
Block a user