Initial commit: Invoice field extraction system using YOLO + OCR

Features:
- Auto-labeling pipeline: CSV values -> PDF search -> YOLO annotations
- Flexible date matching: year-month match, nearby date tolerance
- PDF text extraction with PyMuPDF
- OCR support for scanned documents (PaddleOCR)
- YOLO training and inference pipeline
- 7 field types: InvoiceNumber, InvoiceDate, InvoiceDueDate, OCR, Bankgiro, Plusgiro, Amount

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Yaojia Wang
2026-01-10 17:44:14 +01:00
commit 8938661850
35 changed files with 5020 additions and 0 deletions

98
src/pdf/detector.py Normal file
View File

@@ -0,0 +1,98 @@
"""
PDF Type Detection Module
Automatically distinguishes between:
- Text-based PDFs (digitally generated)
- Scanned image PDFs
"""
from pathlib import Path
from typing import Literal
import fitz # PyMuPDF
# Closed set of classification labels used as the return type of get_pdf_type().
PDFType = Literal["text", "scanned", "mixed"]
def extract_text_first_page(pdf_path: str | Path) -> str:
    """
    Extract the text of the first page of a PDF.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        The text PyMuPDF extracts from the first page, or an empty string
        when the document has no pages
    """
    # The context manager closes the document on every exit path, including
    # the early return for an empty document and any error raised while
    # extracting text.
    with fitz.open(pdf_path) as doc:
        if len(doc) == 0:
            return ""
        return doc[0].get_text()
def is_text_pdf(pdf_path: str | Path, min_chars: int = 30) -> bool:
    """
    Report whether the first page of a PDF carries an extractable text layer.

    Args:
        pdf_path: Path to the PDF file
        min_chars: Threshold; the whitespace-stripped first-page text must be
            strictly longer than this many characters

    Returns:
        True if the first page yields more than min_chars characters of text,
        False otherwise (the PDF is treated as scanned)
    """
    first_page_text = extract_text_first_page(pdf_path)
    visible_chars = len(first_page_text.strip())
    return visible_chars > min_chars
def get_pdf_type(pdf_path: str | Path, min_chars: int = 30) -> PDFType:
    """
    Determine the PDF type by checking every page for extractable text.

    Args:
        pdf_path: Path to the PDF file
        min_chars: A page counts as a text page when its whitespace-stripped
            text is strictly longer than this many characters

    Returns:
        'text' - Every page has an extractable text layer
        'scanned' - No page has text (needs OCR); also returned for a
            document with no pages
        'mixed' - Some pages have text, some don't
    """
    with fitz.open(pdf_path) as doc:
        # The page count must be read while the document is still open;
        # querying it after close() fails.
        total_pages = len(doc)
        if total_pages == 0:
            return "scanned"
        text_pages = sum(
            1 for page in doc if len(page.get_text().strip()) > min_chars
        )

    if text_pages == total_pages:
        return "text"
    if text_pages == 0:
        return "scanned"
    return "mixed"
def get_page_info(pdf_path: str | Path, min_chars: int = 30) -> list[dict]:
    """
    Get information about each page in the PDF.

    Args:
        pdf_path: Path to the PDF file
        min_chars: A page is flagged as having text when its
            whitespace-stripped text is strictly longer than this many
            characters

    Returns:
        List of dicts, one per page in document order, with keys:
        page_no (0-based index), width, height (taken from the page
        rectangle), has_text, char_count (length of the stripped text)
    """
    pages = []
    # The context manager guarantees the document is closed even if text
    # extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        for page_no, page in enumerate(doc):
            text = page.get_text().strip()
            rect = page.rect
            char_count = len(text)
            pages.append({
                "page_no": page_no,
                "width": rect.width,
                "height": rect.height,
                "has_text": char_count > min_chars,
                "char_count": char_count,
            })
    return pages