Initial commit: Invoice field extraction system using YOLO + OCR

Features:
- Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations
- Flexible date matching: year-month match, nearby date tolerance
- PDF text extraction with PyMuPDF
- OCR support for scanned documents (PaddleOCR)
- YOLO training and inference pipeline
- 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro, Plusgiro, Amount

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 17:44:14 +01:00
commit 8938661850
35 changed files with 5020 additions and 0 deletions
--- a/src/pdf/detector.py
+++ b/src/pdf/detector.py
@@ -0,0 +1,98 @@
+"""
+PDF Type Detection Module
+
+Automatically distinguishes between:
+- Text-based PDFs (digitally generated)
+- Scanned image PDFs
+"""
+
+from pathlib import Path
+from typing import Literal
+import fitz  # PyMuPDF
+
+
+# Classification labels for a PDF: "text" (extractable text layer),
+# "scanned" (no usable text), "mixed" (only some pages have text).
+PDFType = Literal["text", "scanned", "mixed"]
+
+
+def extract_text_first_page(pdf_path: str | Path) -> str:
+    """
+    Extract text from the first page of a PDF.
+
+    Args:
+        pdf_path: Path to the PDF file
+
+    Returns:
+        Text of the first page, or an empty string if the PDF has no pages
+    """
+    # The context manager closes the document on every exit path, including
+    # the early return for an empty document and any error raised while
+    # extracting text (the previous early return left the document open).
+    with fitz.open(pdf_path) as doc:
+        if len(doc) == 0:
+            return ""
+        return doc[0].get_text()
+
+
+def is_text_pdf(pdf_path: str | Path, min_chars: int = 30) -> bool:
+    """
+    Tell whether the first page of a PDF carries an extractable text layer.
+
+    Only the first page is inspected; surrounding whitespace is ignored
+    when counting characters.
+
+    Args:
+        pdf_path: Path to the PDF file
+        min_chars: Character count the stripped text must exceed
+
+    Returns:
+        True if the first page has more than min_chars characters of text,
+        False otherwise (treated as scanned)
+    """
+    first_page_text = extract_text_first_page(pdf_path)
+    visible_chars = len(first_page_text.strip())
+    return visible_chars > min_chars
+
+
+def get_pdf_type(pdf_path: str | Path, min_chars: int = 30) -> PDFType:
+    """
+    Determine the PDF type by checking every page for extractable text.
+
+    Args:
+        pdf_path: Path to the PDF file
+        min_chars: Character count a page's stripped text must exceed for
+            the page to count as having a text layer
+
+    Returns:
+        'text' - Every page has extractable text
+        'scanned' - No page has extractable text (also returned for a PDF
+            with zero pages); needs OCR
+        'mixed' - Some pages have text, some don't
+    """
+    with fitz.open(pdf_path) as doc:
+        # The page count must be read while the document is still open;
+        # PyMuPDF rejects len() on a closed document.
+        total_pages = len(doc)
+        text_pages = sum(
+            1 for page in doc if len(page.get_text().strip()) > min_chars
+        )
+
+    # Checking for zero first also covers the empty document, where
+    # text_pages == total_pages == 0 must still be reported as "scanned".
+    if text_pages == 0:
+        return "scanned"
+    if text_pages == total_pages:
+        return "text"
+    return "mixed"
+
+
+def get_page_info(pdf_path: str | Path, min_chars: int = 30) -> list[dict]:
+    """
+    Get information about each page in the PDF.
+
+    Args:
+        pdf_path: Path to the PDF file
+        min_chars: Character count a page's stripped text must exceed for
+            has_text to be True
+
+    Returns:
+        List of dicts, one per page in document order, with keys:
+            page_no - 0-based page index
+            width - Page rectangle width
+            height - Page rectangle height
+            has_text - True if char_count exceeds min_chars
+            char_count - Length of the page text after stripping whitespace
+    """
+    pages = []
+
+    # The context manager guarantees the document is closed even if text
+    # extraction fails part-way through the pages.
+    with fitz.open(pdf_path) as doc:
+        for i, page in enumerate(doc):
+            char_count = len(page.get_text().strip())
+            rect = page.rect
+            pages.append({
+                "page_no": i,
+                "width": rect.width,
+                "height": rect.height,
+                "has_text": char_count > min_chars,
+                "char_count": char_count,
+            })
+
+    return pages