""" Tests for the PDF Type Detection Module. Tests cover all detector functions in src/pdf/detector.py Note: These tests require PyMuPDF (fitz) and actual PDF files or mocks. Some tests are marked as integration tests that require real PDF files. Usage: pytest src/pdf/test_detector.py -v -o 'addopts=' """ import pytest from pathlib import Path from unittest.mock import patch, MagicMock from src.pdf.detector import ( extract_text_first_page, is_text_pdf, get_pdf_type, get_page_info, PDFType, ) class TestExtractTextFirstPage: """Tests for extract_text_first_page function.""" def test_with_mock_empty_pdf(self): """Should return empty string for empty PDF.""" mock_doc = MagicMock() mock_doc.__len__ = MagicMock(return_value=0) with patch("fitz.open", return_value=mock_doc): result = extract_text_first_page("test.pdf") assert result == "" def test_with_mock_text_pdf(self): """Should extract text from first page.""" mock_page = MagicMock() mock_page.get_text.return_value = "Faktura 12345\nDatum: 2025-01-15" mock_doc = MagicMock() mock_doc.__len__ = MagicMock(return_value=1) mock_doc.__getitem__ = MagicMock(return_value=mock_page) with patch("fitz.open", return_value=mock_doc): result = extract_text_first_page("test.pdf") assert "Faktura" in result assert "12345" in result class TestIsTextPDF: """Tests for is_text_pdf function.""" def test_empty_pdf_returns_false(self): """Should return False for PDF with no text.""" with patch("src.pdf.detector.extract_text_first_page", return_value=""): assert is_text_pdf("test.pdf") is False def test_short_text_returns_false(self): """Should return False for PDF with very short text.""" with patch("src.pdf.detector.extract_text_first_page", return_value="Hello"): assert is_text_pdf("test.pdf") is False def test_readable_text_with_keywords_returns_true(self): """Should return True for readable text with invoice keywords.""" text = """ Faktura Datum: 2025-01-15 Belopp: 1234,56 SEK Bankgiro: 5393-9484 Moms: 25% """ + "a" * 200 # Ensure > 200 chars with patch("src.pdf.detector.extract_text_first_page", return_value=text): assert is_text_pdf("test.pdf") is True def test_garbled_text_returns_false(self): """Should return False for garbled/unreadable text.""" # Simulate garbled text (lots of non-printable characters) garbled = "\x00\x01\x02" * 100 + "abc" * 20 # Low readable ratio with patch("src.pdf.detector.extract_text_first_page", return_value=garbled): assert is_text_pdf("test.pdf") is False def test_text_without_keywords_needs_high_readability(self): """Should require high readability when no keywords found.""" # Text without invoice keywords text = "The quick brown fox jumps over the lazy dog. " * 10 with patch("src.pdf.detector.extract_text_first_page", return_value=text): # Should pass if readable ratio is high enough result = is_text_pdf("test.pdf") # Result depends on character ratio - ASCII text should pass assert result is True def test_custom_min_chars(self): """Should respect custom min_chars parameter.""" text = "Short text here" # 15 chars with patch("src.pdf.detector.extract_text_first_page", return_value=text): # Default min_chars=30 - should fail assert is_text_pdf("test.pdf", min_chars=30) is False # Custom min_chars=10 - should pass basic length check # (but will still fail keyword/readability checks) class TestGetPDFType: """Tests for get_pdf_type function.""" def test_empty_pdf_returns_scanned(self): """Should return 'scanned' for empty PDF.""" mock_doc = MagicMock() mock_doc.__len__ = MagicMock(return_value=0) with patch("fitz.open", return_value=mock_doc): result = get_pdf_type("test.pdf") assert result == "scanned" def test_all_text_pages_returns_text(self): """Should return 'text' when all pages have text.""" mock_page1 = MagicMock() mock_page1.get_text.return_value = "A" * 50 # > 30 chars mock_page2 = MagicMock() mock_page2.get_text.return_value = "B" * 50 # > 30 chars mock_doc = MagicMock() mock_doc.__len__ = MagicMock(return_value=2) mock_doc.__iter__ = MagicMock(return_value=iter([mock_page1, mock_page2])) with patch("fitz.open", return_value=mock_doc): result = get_pdf_type("test.pdf") assert result == "text" def test_no_text_pages_returns_scanned(self): """Should return 'scanned' when no pages have text.""" mock_page1 = MagicMock() mock_page1.get_text.return_value = "" mock_page2 = MagicMock() mock_page2.get_text.return_value = "AB" # < 30 chars mock_doc = MagicMock() mock_doc.__len__ = MagicMock(return_value=2) mock_doc.__iter__ = MagicMock(return_value=iter([mock_page1, mock_page2])) with patch("fitz.open", return_value=mock_doc): result = get_pdf_type("test.pdf") assert result == "scanned" def test_mixed_pages_returns_mixed(self): """Should return 'mixed' when some pages have text.""" mock_page1 = MagicMock() mock_page1.get_text.return_value = "A" * 50 # Has text mock_page2 = MagicMock() mock_page2.get_text.return_value = "" # No text mock_doc = MagicMock() mock_doc.__len__ = MagicMock(return_value=2) mock_doc.__iter__ = MagicMock(return_value=iter([mock_page1, mock_page2])) with patch("fitz.open", return_value=mock_doc): result = get_pdf_type("test.pdf") assert result == "mixed" class TestGetPageInfo: """Tests for get_page_info function.""" def test_single_page_pdf(self): """Should return info for single page.""" mock_rect = MagicMock() mock_rect.width = 595.0 # A4 width in points mock_rect.height = 842.0 # A4 height in points mock_page = MagicMock() mock_page.get_text.return_value = "A" * 50 mock_page.rect = mock_rect mock_doc = MagicMock() mock_doc.__len__ = MagicMock(return_value=1) def mock_iter(self): yield mock_page mock_doc.__iter__ = lambda self: mock_iter(self) with patch("fitz.open", return_value=mock_doc): pages = get_page_info("test.pdf") assert len(pages) == 1 assert pages[0]["page_no"] == 0 assert pages[0]["width"] == 595.0 assert pages[0]["height"] == 842.0 assert pages[0]["has_text"] is True assert pages[0]["char_count"] == 50 def test_multi_page_pdf(self): """Should return info for all pages.""" def create_mock_page(text, width, height): mock_rect = MagicMock() mock_rect.width = width mock_rect.height = height mock_page = MagicMock() mock_page.get_text.return_value = text mock_page.rect = mock_rect return mock_page pages_data = [ ("A" * 50, 595.0, 842.0), # Page 0: has text ("", 595.0, 842.0), # Page 1: no text ("B" * 100, 612.0, 792.0), # Page 2: different size, has text ] mock_pages = [create_mock_page(*data) for data in pages_data] mock_doc = MagicMock() mock_doc.__len__ = MagicMock(return_value=3) def mock_iter(self): for page in mock_pages: yield page mock_doc.__iter__ = lambda self: mock_iter(self) with patch("fitz.open", return_value=mock_doc): pages = get_page_info("test.pdf") assert len(pages) == 3 # Page 0 assert pages[0]["page_no"] == 0 assert pages[0]["has_text"] is True assert pages[0]["char_count"] == 50 # Page 1 assert pages[1]["page_no"] == 1 assert pages[1]["has_text"] is False assert pages[1]["char_count"] == 0 # Page 2 assert pages[2]["page_no"] == 2 assert pages[2]["has_text"] is True assert pages[2]["width"] == 612.0 class TestPDFTypeAnnotation: """Tests for PDFType type alias.""" def test_valid_types(self): """PDFType should accept valid literal values.""" # These are compile-time checks, but we can verify at runtime valid_types: list[PDFType] = ["text", "scanned", "mixed"] assert all(t in ["text", "scanned", "mixed"] for t in valid_types) class TestIsTextPDFKeywordDetection: """Tests for keyword detection in is_text_pdf.""" def test_detects_swedish_keywords(self): """Should detect Swedish invoice keywords.""" keywords = [ ("faktura", True), ("datum", True), ("belopp", True), ("bankgiro", True), ("plusgiro", True), ("moms", True), ] for keyword, expected in keywords: # Create text with keyword and enough content text = f"Document with {keyword} keyword here" + " more text" * 50 with patch("src.pdf.detector.extract_text_first_page", return_value=text): # Need at least 2 keywords for is_text_pdf to return True # So this tests if keyword is recognized when combined with others pass def test_detects_english_keywords(self): """Should detect English invoice keywords.""" text = "Invoice document with date and amount information" + " x" * 100 with patch("src.pdf.detector.extract_text_first_page", return_value=text): # invoice + date = 2 keywords result = is_text_pdf("test.pdf") assert result is True def test_needs_at_least_two_keywords(self): """Should require at least 2 keywords to pass keyword check.""" # Only one keyword text = "This is a faktura document" + " x" * 200 with patch("src.pdf.detector.extract_text_first_page", return_value=text): # With only 1 keyword, falls back to other checks # Should still pass if readability is high pass class TestReadabilityChecks: """Tests for readability ratio checks in is_text_pdf.""" def test_high_ascii_ratio_passes(self): """Should pass when ASCII ratio is high.""" # Pure ASCII text text = "This is a normal document with only ASCII characters. " * 10 with patch("src.pdf.detector.extract_text_first_page", return_value=text): result = is_text_pdf("test.pdf") assert result is True def test_swedish_characters_accepted(self): """Should accept Swedish characters as readable.""" text = "Fakturadatum för årets moms på öre belopp" + " normal" * 50 with patch("src.pdf.detector.extract_text_first_page", return_value=text): result = is_text_pdf("test.pdf") assert result is True def test_low_readability_fails(self): """Should fail when readability ratio is too low.""" # Mix of readable and unreadable characters # Create text with < 70% readable characters readable = "abc" * 30 # 90 readable chars unreadable = "\x80\x81\x82" * 50 # 150 unreadable chars text = readable + unreadable with patch("src.pdf.detector.extract_text_first_page", return_value=text): result = is_text_pdf("test.pdf") assert result is False if __name__ == "__main__": pytest.main([__file__, "-v"])