Re-structure the project.

This commit is contained in:
Yaojia Wang
2026-01-25 15:21:11 +01:00
parent 8fd61ea928
commit e599424a92
80 changed files with 10672 additions and 1584 deletions

335
tests/pdf/test_detector.py Normal file
View File

@@ -0,0 +1,335 @@
"""
Tests for the PDF Type Detection Module.
Tests cover all detector functions in src/pdf/detector.py
Note: These tests require PyMuPDF (fitz) and actual PDF files or mocks.
Some tests are marked as integration tests that require real PDF files.
Usage:
pytest src/pdf/test_detector.py -v -o 'addopts='
"""
import pytest
from pathlib import Path
from unittest.mock import patch, MagicMock
from src.pdf.detector import (
extract_text_first_page,
is_text_pdf,
get_pdf_type,
get_page_info,
PDFType,
)
class TestExtractTextFirstPage:
"""Tests for extract_text_first_page function."""
def test_with_mock_empty_pdf(self):
"""Should return empty string for empty PDF."""
mock_doc = MagicMock()
mock_doc.__len__ = MagicMock(return_value=0)
with patch("fitz.open", return_value=mock_doc):
result = extract_text_first_page("test.pdf")
assert result == ""
def test_with_mock_text_pdf(self):
"""Should extract text from first page."""
mock_page = MagicMock()
mock_page.get_text.return_value = "Faktura 12345\nDatum: 2025-01-15"
mock_doc = MagicMock()
mock_doc.__len__ = MagicMock(return_value=1)
mock_doc.__getitem__ = MagicMock(return_value=mock_page)
with patch("fitz.open", return_value=mock_doc):
result = extract_text_first_page("test.pdf")
assert "Faktura" in result
assert "12345" in result
class TestIsTextPDF:
"""Tests for is_text_pdf function."""
def test_empty_pdf_returns_false(self):
"""Should return False for PDF with no text."""
with patch("src.pdf.detector.extract_text_first_page", return_value=""):
assert is_text_pdf("test.pdf") is False
def test_short_text_returns_false(self):
"""Should return False for PDF with very short text."""
with patch("src.pdf.detector.extract_text_first_page", return_value="Hello"):
assert is_text_pdf("test.pdf") is False
def test_readable_text_with_keywords_returns_true(self):
"""Should return True for readable text with invoice keywords."""
text = """
Faktura
Datum: 2025-01-15
Belopp: 1234,56 SEK
Bankgiro: 5393-9484
Moms: 25%
""" + "a" * 200 # Ensure > 200 chars
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
assert is_text_pdf("test.pdf") is True
def test_garbled_text_returns_false(self):
"""Should return False for garbled/unreadable text."""
# Simulate garbled text (lots of non-printable characters)
garbled = "\x00\x01\x02" * 100 + "abc" * 20 # Low readable ratio
with patch("src.pdf.detector.extract_text_first_page", return_value=garbled):
assert is_text_pdf("test.pdf") is False
def test_text_without_keywords_needs_high_readability(self):
"""Should require high readability when no keywords found."""
# Text without invoice keywords
text = "The quick brown fox jumps over the lazy dog. " * 10
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
# Should pass if readable ratio is high enough
result = is_text_pdf("test.pdf")
# Result depends on character ratio - ASCII text should pass
assert result is True
def test_custom_min_chars(self):
"""Should respect custom min_chars parameter."""
text = "Short text here" # 15 chars
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
# Default min_chars=30 - should fail
assert is_text_pdf("test.pdf", min_chars=30) is False
# Custom min_chars=10 - should pass basic length check
# (but will still fail keyword/readability checks)
class TestGetPDFType:
"""Tests for get_pdf_type function."""
def test_empty_pdf_returns_scanned(self):
"""Should return 'scanned' for empty PDF."""
mock_doc = MagicMock()
mock_doc.__len__ = MagicMock(return_value=0)
with patch("fitz.open", return_value=mock_doc):
result = get_pdf_type("test.pdf")
assert result == "scanned"
def test_all_text_pages_returns_text(self):
"""Should return 'text' when all pages have text."""
mock_page1 = MagicMock()
mock_page1.get_text.return_value = "A" * 50 # > 30 chars
mock_page2 = MagicMock()
mock_page2.get_text.return_value = "B" * 50 # > 30 chars
mock_doc = MagicMock()
mock_doc.__len__ = MagicMock(return_value=2)
mock_doc.__iter__ = MagicMock(return_value=iter([mock_page1, mock_page2]))
with patch("fitz.open", return_value=mock_doc):
result = get_pdf_type("test.pdf")
assert result == "text"
def test_no_text_pages_returns_scanned(self):
"""Should return 'scanned' when no pages have text."""
mock_page1 = MagicMock()
mock_page1.get_text.return_value = ""
mock_page2 = MagicMock()
mock_page2.get_text.return_value = "AB" # < 30 chars
mock_doc = MagicMock()
mock_doc.__len__ = MagicMock(return_value=2)
mock_doc.__iter__ = MagicMock(return_value=iter([mock_page1, mock_page2]))
with patch("fitz.open", return_value=mock_doc):
result = get_pdf_type("test.pdf")
assert result == "scanned"
def test_mixed_pages_returns_mixed(self):
"""Should return 'mixed' when some pages have text."""
mock_page1 = MagicMock()
mock_page1.get_text.return_value = "A" * 50 # Has text
mock_page2 = MagicMock()
mock_page2.get_text.return_value = "" # No text
mock_doc = MagicMock()
mock_doc.__len__ = MagicMock(return_value=2)
mock_doc.__iter__ = MagicMock(return_value=iter([mock_page1, mock_page2]))
with patch("fitz.open", return_value=mock_doc):
result = get_pdf_type("test.pdf")
assert result == "mixed"
class TestGetPageInfo:
"""Tests for get_page_info function."""
def test_single_page_pdf(self):
"""Should return info for single page."""
mock_rect = MagicMock()
mock_rect.width = 595.0 # A4 width in points
mock_rect.height = 842.0 # A4 height in points
mock_page = MagicMock()
mock_page.get_text.return_value = "A" * 50
mock_page.rect = mock_rect
mock_doc = MagicMock()
mock_doc.__len__ = MagicMock(return_value=1)
def mock_iter(self):
yield mock_page
mock_doc.__iter__ = lambda self: mock_iter(self)
with patch("fitz.open", return_value=mock_doc):
pages = get_page_info("test.pdf")
assert len(pages) == 1
assert pages[0]["page_no"] == 0
assert pages[0]["width"] == 595.0
assert pages[0]["height"] == 842.0
assert pages[0]["has_text"] is True
assert pages[0]["char_count"] == 50
def test_multi_page_pdf(self):
"""Should return info for all pages."""
def create_mock_page(text, width, height):
mock_rect = MagicMock()
mock_rect.width = width
mock_rect.height = height
mock_page = MagicMock()
mock_page.get_text.return_value = text
mock_page.rect = mock_rect
return mock_page
pages_data = [
("A" * 50, 595.0, 842.0), # Page 0: has text
("", 595.0, 842.0), # Page 1: no text
("B" * 100, 612.0, 792.0), # Page 2: different size, has text
]
mock_pages = [create_mock_page(*data) for data in pages_data]
mock_doc = MagicMock()
mock_doc.__len__ = MagicMock(return_value=3)
def mock_iter(self):
for page in mock_pages:
yield page
mock_doc.__iter__ = lambda self: mock_iter(self)
with patch("fitz.open", return_value=mock_doc):
pages = get_page_info("test.pdf")
assert len(pages) == 3
# Page 0
assert pages[0]["page_no"] == 0
assert pages[0]["has_text"] is True
assert pages[0]["char_count"] == 50
# Page 1
assert pages[1]["page_no"] == 1
assert pages[1]["has_text"] is False
assert pages[1]["char_count"] == 0
# Page 2
assert pages[2]["page_no"] == 2
assert pages[2]["has_text"] is True
assert pages[2]["width"] == 612.0
class TestPDFTypeAnnotation:
"""Tests for PDFType type alias."""
def test_valid_types(self):
"""PDFType should accept valid literal values."""
# These are compile-time checks, but we can verify at runtime
valid_types: list[PDFType] = ["text", "scanned", "mixed"]
assert all(t in ["text", "scanned", "mixed"] for t in valid_types)
class TestIsTextPDFKeywordDetection:
"""Tests for keyword detection in is_text_pdf."""
def test_detects_swedish_keywords(self):
"""Should detect Swedish invoice keywords."""
keywords = [
("faktura", True),
("datum", True),
("belopp", True),
("bankgiro", True),
("plusgiro", True),
("moms", True),
]
for keyword, expected in keywords:
# Create text with keyword and enough content
text = f"Document with {keyword} keyword here" + " more text" * 50
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
# Need at least 2 keywords for is_text_pdf to return True
# So this tests if keyword is recognized when combined with others
pass
def test_detects_english_keywords(self):
"""Should detect English invoice keywords."""
text = "Invoice document with date and amount information" + " x" * 100
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
# invoice + date = 2 keywords
result = is_text_pdf("test.pdf")
assert result is True
def test_needs_at_least_two_keywords(self):
"""Should require at least 2 keywords to pass keyword check."""
# Only one keyword
text = "This is a faktura document" + " x" * 200
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
# With only 1 keyword, falls back to other checks
# Should still pass if readability is high
pass
class TestReadabilityChecks:
"""Tests for readability ratio checks in is_text_pdf."""
def test_high_ascii_ratio_passes(self):
"""Should pass when ASCII ratio is high."""
# Pure ASCII text
text = "This is a normal document with only ASCII characters. " * 10
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
result = is_text_pdf("test.pdf")
assert result is True
def test_swedish_characters_accepted(self):
"""Should accept Swedish characters as readable."""
text = "Fakturadatum för årets moms på öre belopp" + " normal" * 50
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
result = is_text_pdf("test.pdf")
assert result is True
def test_low_readability_fails(self):
"""Should fail when readability ratio is too low."""
# Mix of readable and unreadable characters
# Create text with < 70% readable characters
readable = "abc" * 30 # 90 readable chars
unreadable = "\x80\x81\x82" * 50 # 150 unreadable chars
text = readable + unreadable
with patch("src.pdf.detector.extract_text_first_page", return_value=text):
result = is_text_pdf("test.pdf")
assert result is False
if __name__ == "__main__":
pytest.main([__file__, "-v"])