336 lines
12 KiB
Python
336 lines
12 KiB
Python
"""
|
|
Tests for the PDF Type Detection Module.
|
|
|
|
Tests cover all detector functions in src/pdf/detector.py
|
|
|
|
Note: These tests require PyMuPDF (fitz) and actual PDF files or mocks.
|
|
Some tests are marked as integration tests that require real PDF files.
|
|
|
|
Usage:
|
|
pytest src/pdf/test_detector.py -v -o 'addopts='
|
|
"""
|
|
|
|
import pytest
|
|
from pathlib import Path
|
|
from unittest.mock import patch, MagicMock
|
|
from shared.pdf.detector import (
|
|
extract_text_first_page,
|
|
is_text_pdf,
|
|
get_pdf_type,
|
|
get_page_info,
|
|
PDFType,
|
|
)
|
|
|
|
|
|
class TestExtractTextFirstPage:
    """Checks extract_text_first_page with ``fitz.open`` replaced by a mock."""

    def test_with_mock_empty_pdf(self):
        """A document reporting zero pages should yield an empty string."""
        empty_doc = MagicMock()
        empty_doc.__len__.return_value = 0

        with patch("fitz.open", return_value=empty_doc):
            extracted = extract_text_first_page("test.pdf")

        assert extracted == ""

    def test_with_mock_text_pdf(self):
        """Text reported by the page mock should appear in the result."""
        first_page = MagicMock()
        first_page.get_text.return_value = "Faktura 12345\nDatum: 2025-01-15"

        one_page_doc = MagicMock()
        one_page_doc.__len__.return_value = 1
        # Any index lookup on the document hands back the single page mock.
        one_page_doc.__getitem__.return_value = first_page

        with patch("fitz.open", return_value=one_page_doc):
            extracted = extract_text_first_page("test.pdf")

        for fragment in ("Faktura", "12345"):
            assert fragment in extracted
|
|
|
|
|
|
class TestIsTextPDF:
    """Checks is_text_pdf with the first-page text extraction stubbed out."""

    @staticmethod
    def _classify(page_text, **options):
        """Run is_text_pdf while extract_text_first_page returns *page_text*."""
        with patch(
            "shared.pdf.detector.extract_text_first_page",
            return_value=page_text,
        ):
            return is_text_pdf("test.pdf", **options)

    def test_empty_pdf_returns_false(self):
        """No extracted text at all should be rejected."""
        assert self._classify("") is False

    def test_short_text_returns_false(self):
        """A handful of characters should be rejected."""
        assert self._classify("Hello") is False

    def test_readable_text_with_keywords_returns_true(self):
        """Readable text containing invoice keywords should be accepted."""
        invoice_text = """
        Faktura
        Datum: 2025-01-15
        Belopp: 1234,56 SEK
        Bankgiro: 5393-9484
        Moms: 25%
        """ + "a" * 200  # padding so the sample is longer than 200 characters

        assert self._classify(invoice_text) is True

    def test_garbled_text_returns_false(self):
        """Text dominated by control characters should be rejected."""
        # 300 control characters against 60 letters: intended as a low
        # readable ratio.
        noise = "\x00\x01\x02" * 100 + "abc" * 20

        assert self._classify(noise) is False

    def test_text_without_keywords_needs_high_readability(self):
        """Keyword-free text is expected to pass when it is plain ASCII."""
        # Sentence chosen to contain no invoice keywords.
        plain_text = "The quick brown fox jumps over the lazy dog. " * 10

        outcome = self._classify(plain_text)

        # The outcome depends on the character ratio; ASCII text should pass.
        assert outcome is True

    def test_custom_min_chars(self):
        """The min_chars argument should be respected."""
        short_text = "Short text here"  # 15 characters

        # 15 characters is below min_chars=30, so the text is rejected.
        assert self._classify(short_text, min_chars=30) is False
        # NOTE(review): a min_chars=10 case (expected to clear the length
        # check) was described here but never asserted; confirm the expected
        # outcome against the detector before adding it.
|
|
|
|
|
|
class TestGetPDFType:
    """Tests for get_pdf_type function."""

    @staticmethod
    def _make_doc(page_texts):
        """Build a mock document with one page mock per entry in *page_texts*.

        Each page's ``get_text()`` returns the corresponding string.
        """
        pages = []
        for text in page_texts:
            page = MagicMock()
            page.get_text.return_value = text
            pages.append(page)

        doc = MagicMock()
        doc.__len__.return_value = len(pages)
        # Hand MagicMock a list rather than a ready-made iterator: an iterator
        # is exhausted after one pass, so any second iteration over the
        # document would silently see no pages. A list is re-iterated fresh
        # on every iter(doc).
        doc.__iter__.return_value = pages
        return doc

    def test_empty_pdf_returns_scanned(self):
        """Should return 'scanned' for empty PDF."""
        mock_doc = self._make_doc([])

        with patch("fitz.open", return_value=mock_doc):
            result = get_pdf_type("test.pdf")

        assert result == "scanned"

    def test_all_text_pages_returns_text(self):
        """Should return 'text' when all pages have text."""
        # Both pages carry 50 characters (> 30 chars).
        mock_doc = self._make_doc(["A" * 50, "B" * 50])

        with patch("fitz.open", return_value=mock_doc):
            result = get_pdf_type("test.pdf")

        assert result == "text"

    def test_no_text_pages_returns_scanned(self):
        """Should return 'scanned' when no pages have text."""
        # One empty page and one with only two characters (< 30 chars).
        mock_doc = self._make_doc(["", "AB"])

        with patch("fitz.open", return_value=mock_doc):
            result = get_pdf_type("test.pdf")

        assert result == "scanned"

    def test_mixed_pages_returns_mixed(self):
        """Should return 'mixed' when some pages have text."""
        # First page has text, second page has none.
        mock_doc = self._make_doc(["A" * 50, ""])

        with patch("fitz.open", return_value=mock_doc):
            result = get_pdf_type("test.pdf")

        assert result == "mixed"
|
|
|
|
|
|
class TestGetPageInfo:
    """Checks get_page_info against mocked fitz documents."""

    @staticmethod
    def _fake_page(text, width, height):
        """Return a page mock exposing ``get_text()`` and ``rect`` dimensions."""
        rect = MagicMock()
        rect.width = width
        rect.height = height

        page = MagicMock()
        page.get_text.return_value = text
        page.rect = rect
        return page

    @staticmethod
    def _fake_doc(pages):
        """Return a document mock that reports and iterates over *pages*."""
        doc = MagicMock()
        doc.__len__.return_value = len(pages)
        # A list return value gives a fresh iterator on every iter(doc).
        doc.__iter__.return_value = pages
        return doc

    def test_single_page_pdf(self):
        """A one-page document should produce one fully populated entry."""
        # 595 x 842 points is the A4 page size.
        a4_page = self._fake_page("A" * 50, 595.0, 842.0)

        with patch("fitz.open", return_value=self._fake_doc([a4_page])):
            pages = get_page_info("test.pdf")

        assert len(pages) == 1
        only = pages[0]
        assert only["page_no"] == 0
        assert only["width"] == 595.0
        assert only["height"] == 842.0
        assert only["has_text"] is True
        assert only["char_count"] == 50

    def test_multi_page_pdf(self):
        """Every page of a multi-page document should get its own entry."""
        specs = [
            ("A" * 50, 595.0, 842.0),   # page 0: has text
            ("", 595.0, 842.0),         # page 1: no text
            ("B" * 100, 612.0, 792.0),  # page 2: different size, has text
        ]
        fake_pages = [self._fake_page(*spec) for spec in specs]

        with patch("fitz.open", return_value=self._fake_doc(fake_pages)):
            pages = get_page_info("test.pdf")

        assert len(pages) == 3
        first, second, third = pages

        assert first["page_no"] == 0
        assert first["has_text"] is True
        assert first["char_count"] == 50

        assert second["page_no"] == 1
        assert second["has_text"] is False
        assert second["char_count"] == 0

        assert third["page_no"] == 2
        assert third["has_text"] is True
        assert third["width"] == 612.0
|
|
|
|
|
|
class TestPDFTypeAnnotation:
    """Tests for PDFType type alias."""

    def test_valid_types(self):
        """PDFType should declare exactly the three supported literal values."""
        from typing import get_args

        # Static side: a type checker validates these literals against PDFType.
        valid_types: list[PDFType] = ["text", "scanned", "mixed"]

        # Runtime side: compare against the values the alias itself declares,
        # so the test fails if a value is added to or removed from PDFType.
        # NOTE(review): assumes PDFType is a typing.Literal alias; confirm
        # against shared.pdf.detector.
        assert set(get_args(PDFType)) == set(valid_types)
|
|
|
|
|
|
class TestIsTextPDFKeywordDetection:
    """Tests for keyword detection in is_text_pdf."""

    def test_detects_swedish_keywords(self):
        """Should detect Swedish invoice keywords."""
        keywords = [
            ("faktura", True),
            ("datum", True),
            ("belopp", True),
            ("bankgiro", True),
            ("plusgiro", True),
            ("moms", True),
        ]

        for index, (keyword, expected) in enumerate(keywords):
            # The keyword check is meant to need at least two hits, so each
            # keyword is paired with its neighbour from the same list.
            partner = keywords[(index + 1) % len(keywords)][0]
            text = (
                f"Document with {keyword} and {partner} keywords here"
                + " more text" * 50
            )

            with patch(
                "shared.pdf.detector.extract_text_first_page",
                return_value=text,
            ):
                result = is_text_pdf("test.pdf")

            # NOTE(review): the filler is plain ASCII, which may also satisfy
            # the readability fallback, so this does not isolate the keyword
            # path; confirm against the detector's thresholds.
            assert result is expected, f"keyword {keyword!r} was not accepted"

    def test_detects_english_keywords(self):
        """Should detect English invoice keywords."""
        text = "Invoice document with date and amount information" + " x" * 100

        with patch(
            "shared.pdf.detector.extract_text_first_page",
            return_value=text,
        ):
            # invoice + date = 2 keywords
            result = is_text_pdf("test.pdf")

        assert result is True

    def test_needs_at_least_two_keywords(self):
        """A single keyword should fall through to the readability checks."""
        # Only one keyword
        text = "This is a faktura document" + " x" * 200

        with patch(
            "shared.pdf.detector.extract_text_first_page",
            return_value=text,
        ):
            result = is_text_pdf("test.pdf")

        # One keyword is intended to be insufficient for the keyword check,
        # but this all-ASCII text is expected to pass on readability alone.
        # NOTE(review): expectation taken from the original comments and the
        # sibling ASCII-text tests; confirm against the detector.
        assert result is True
|
|
|
|
|
|
class TestReadabilityChecks:
    """Checks the readability-ratio behaviour of is_text_pdf."""

    @staticmethod
    def _classify(page_text):
        """Run is_text_pdf while extract_text_first_page returns *page_text*."""
        with patch(
            "shared.pdf.detector.extract_text_first_page",
            return_value=page_text,
        ):
            return is_text_pdf("test.pdf")

    def test_high_ascii_ratio_passes(self):
        """Text made only of ASCII characters should be accepted."""
        ascii_only = "This is a normal document with only ASCII characters. " * 10

        assert self._classify(ascii_only) is True

    def test_swedish_characters_accepted(self):
        """Text containing Swedish letters should be accepted."""
        swedish = "Fakturadatum för årets moms på öre belopp" + " normal" * 50

        assert self._classify(swedish) is True

    def test_low_readability_fails(self):
        """Text that is mostly unreadable characters should be rejected."""
        # 90 letters against 150 control characters: intended to put the
        # readable share below 70%.
        letters = "abc" * 30
        junk = "\x80\x81\x82" * 50

        assert self._classify(letters + junk) is False
|
|
|
|
|
|
if __name__ == "__main__":
    # pytest.main returns the run's exit status; raise it so that running this
    # file as a script exits non-zero when tests fail.
    raise SystemExit(pytest.main([__file__, "-v"]))
|