# invoice-master-poc-v2/tests/pdf/test_detector.py

"""
Tests for the PDF Type Detection Module.

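Tests cover the detector functions imported from shared.pdf.detector
(extract_text_first_page, is_text_pdf, get_pdf_type, get_page_info) and the
PDFType alias.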

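Note: Every test in this file replaces ``fitz.open`` or
``extract_text_first_page`` with a mock, so no real PDF files are needed.
PyMuPDF (fitz) must still be importable, because ``patch("fitz.open")`` imports
the module in order to resolve its target.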

Usage:
    pytest tests/pdf/test_detector.py -v -o 'addopts='
"""

import pytest
from pathlib import Path
from unittest.mock import patch, MagicMock
from shared.pdf.detector import (
    extract_text_first_page,
    is_text_pdf,
    get_pdf_type,
    get_page_info,
    PDFType,
)


class TestExtractTextFirstPage:
    """Exercise extract_text_first_page with ``fitz.open`` replaced by a mock."""

    def test_with_mock_empty_pdf(self):
        """A document that reports zero pages should yield an empty string."""
        empty_doc = MagicMock()
        empty_doc.__len__.return_value = 0

        with patch("fitz.open", return_value=empty_doc):
            extracted = extract_text_first_page("test.pdf")

        assert extracted == ""

    def test_with_mock_text_pdf(self):
        """Text reported by the first page should show up in the result."""
        first_page = MagicMock()
        first_page.get_text.return_value = "Faktura 12345\nDatum: 2025-01-15"

        one_page_doc = MagicMock()
        one_page_doc.__len__.return_value = 1
        # Any index lookup on the document hands back the single fake page.
        one_page_doc.__getitem__.return_value = first_page

        with patch("fitz.open", return_value=one_page_doc):
            extracted = extract_text_first_page("test.pdf")

        for fragment in ("Faktura", "12345"):
            assert fragment in extracted


class TestIsTextPDF:
    """Tests for is_text_pdf function.

    Every test patches ``shared.pdf.detector.extract_text_first_page`` with a
    fixed string, so no PDF is read from disk; the assertions describe how
    is_text_pdf is expected to classify that string.
    """

    # Dotted path of the extraction helper that each test replaces.
    _EXTRACT_TARGET = "shared.pdf.detector.extract_text_first_page"

    def test_empty_pdf_returns_false(self):
        """Should return False for PDF with no text."""
        with patch(self._EXTRACT_TARGET, return_value=""):
            assert is_text_pdf("test.pdf") is False

    def test_short_text_returns_false(self):
        """Should return False for PDF with very short text."""
        with patch(self._EXTRACT_TARGET, return_value="Hello"):
            assert is_text_pdf("test.pdf") is False

    def test_readable_text_with_keywords_returns_true(self):
        """Should return True for readable text with invoice keywords."""
        text = """
        Faktura
        Datum: 2025-01-15
        Belopp: 1234,56 SEK
        Bankgiro: 5393-9484
        Moms: 25%
        """ + "a" * 200  # Ensure > 200 chars

        with patch(self._EXTRACT_TARGET, return_value=text):
            assert is_text_pdf("test.pdf") is True

    def test_garbled_text_returns_false(self):
        """Should return False for garbled/unreadable text."""
        # 300 control characters followed by 60 letters: mostly unreadable.
        garbled = "\x00\x01\x02" * 100 + "abc" * 20  # Low readable ratio

        with patch(self._EXTRACT_TARGET, return_value=garbled):
            assert is_text_pdf("test.pdf") is False

    def test_text_without_keywords_needs_high_readability(self):
        """Should require high readability when no keywords found."""
        # Plain ASCII prose that contains no invoice keywords.
        text = "The quick brown fox jumps over the lazy dog. " * 10

        with patch(self._EXTRACT_TARGET, return_value=text):
            # Pure ASCII text is expected to clear the readability check.
            assert is_text_pdf("test.pdf") is True

    def test_custom_min_chars(self):
        """Should respect custom min_chars parameter."""
        text = "Short text here"  # 15 chars

        with patch(self._EXTRACT_TARGET, return_value=text):
            # 15 characters is below a 30-character minimum, so this is rejected.
            assert is_text_pdf("test.pdf", min_chars=30) is False

            # With min_chars=10 the text is long enough; the final verdict then
            # depends on the detector's later checks, which this test does not
            # pin down. Only require that the call completes and returns a bool.
            # NOTE(review): tighten to an exact value once the intended outcome
            # for short keyword-free text is confirmed against the detector.
            assert isinstance(is_text_pdf("test.pdf", min_chars=10), bool)


class TestGetPDFType:
    """Tests for get_pdf_type function.

    ``fitz.open`` is patched to return a mock document whose pages expose only
    ``get_text()``; each test checks the label returned for that page mix.
    """

    @staticmethod
    def _doc_with_pages(*page_texts):
        """Build a mock document with one page mock per entry of *page_texts*.

        The pages are stored as a list on ``__iter__.return_value`` so the
        document can be iterated any number of times. Passing a one-shot
        ``iter([...])`` instead would be exhausted after the first pass, and
        any second pass would silently see zero pages.
        """
        pages = []
        for text in page_texts:
            page = MagicMock()
            page.get_text.return_value = text
            pages.append(page)

        doc = MagicMock()
        doc.__len__.return_value = len(pages)
        doc.__iter__.return_value = pages
        return doc

    def test_empty_pdf_returns_scanned(self):
        """Should return 'scanned' for empty PDF."""
        mock_doc = self._doc_with_pages()

        with patch("fitz.open", return_value=mock_doc):
            result = get_pdf_type("test.pdf")

        assert result == "scanned"

    def test_all_text_pages_returns_text(self):
        """Should return 'text' when all pages have text."""
        # Both pages carry 50 characters (> 30 chars).
        mock_doc = self._doc_with_pages("A" * 50, "B" * 50)

        with patch("fitz.open", return_value=mock_doc):
            result = get_pdf_type("test.pdf")

        assert result == "text"

    def test_no_text_pages_returns_scanned(self):
        """Should return 'scanned' when no pages have text."""
        # One empty page and one with only 2 characters (< 30 chars).
        mock_doc = self._doc_with_pages("", "AB")

        with patch("fitz.open", return_value=mock_doc):
            result = get_pdf_type("test.pdf")

        assert result == "scanned"

    def test_mixed_pages_returns_mixed(self):
        """Should return 'mixed' when some pages have text."""
        # First page has text, second page has none.
        mock_doc = self._doc_with_pages("A" * 50, "")

        with patch("fitz.open", return_value=mock_doc):
            result = get_pdf_type("test.pdf")

        assert result == "mixed"


class TestGetPageInfo:
    """Check the per-page dictionaries produced by get_page_info."""

    @staticmethod
    def _fake_page(text, width, height):
        """Return a page mock exposing ``get_text()`` and ``rect.width/height``."""
        page = MagicMock()
        page.get_text.return_value = text
        page.rect.width = width
        page.rect.height = height
        return page

    @staticmethod
    def _fake_doc(page_mocks):
        """Return a document mock that reports and iterates over *page_mocks*."""
        doc = MagicMock()
        doc.__len__.return_value = len(page_mocks)
        doc.__iter__.return_value = page_mocks
        return doc

    def test_single_page_pdf(self):
        """A one-page document yields exactly one fully populated entry."""
        # 595 x 842 points is A4.
        a4_page = self._fake_page("A" * 50, 595.0, 842.0)
        document = self._fake_doc([a4_page])

        with patch("fitz.open", return_value=document):
            pages = get_page_info("test.pdf")

        assert len(pages) == 1
        only = pages[0]
        assert only["page_no"] == 0
        assert only["width"] == 595.0
        assert only["height"] == 842.0
        assert only["has_text"] is True
        assert only["char_count"] == 50

    def test_multi_page_pdf(self):
        """Every page of a multi-page document gets its own entry, in order."""
        page_specs = [
            ("A" * 50, 595.0, 842.0),   # Page 0: has text
            ("", 595.0, 842.0),         # Page 1: no text
            ("B" * 100, 612.0, 792.0),  # Page 2: different size, has text
        ]
        document = self._fake_doc(
            [self._fake_page(text, w, h) for text, w, h in page_specs]
        )

        with patch("fitz.open", return_value=document):
            pages = get_page_info("test.pdf")

        assert len(pages) == 3

        # Page 0
        assert pages[0]["page_no"] == 0
        assert pages[0]["has_text"] is True
        assert pages[0]["char_count"] == 50

        # Page 1
        assert pages[1]["page_no"] == 1
        assert pages[1]["has_text"] is False
        assert pages[1]["char_count"] == 0

        # Page 2
        assert pages[2]["page_no"] == 2
        assert pages[2]["has_text"] is True
        assert pages[2]["width"] == 612.0


class TestPDFTypeAnnotation:
    """Tests for PDFType type alias."""

    def test_valid_types(self):
        """PDFType should enumerate exactly the three supported labels."""
        # Local import keeps this test self-contained.
        from typing import get_args

        valid_types: list[PDFType] = ["text", "scanned", "mixed"]

        # Inspect the alias itself rather than comparing two hand-written
        # lists, which could never fail.
        # NOTE(review): assumes PDFType is defined as a typing.Literal alias
        # (get_args returns an empty tuple for other kinds) - confirm against
        # the detector module.
        assert set(get_args(PDFType)) == set(valid_types)


class TestIsTextPDFKeywordDetection:
    """Tests for keyword detection in is_text_pdf.

    All inputs are injected by patching
    ``shared.pdf.detector.extract_text_first_page``.
    """

    # Dotted path of the extraction helper that each test replaces.
    _EXTRACT_TARGET = "shared.pdf.detector.extract_text_first_page"

    def test_detects_swedish_keywords(self):
        """Readable text containing a Swedish invoice keyword is accepted."""
        keywords = [
            ("faktura", True),
            ("datum", True),
            ("belopp", True),
            ("bankgiro", True),
            ("plusgiro", True),
            ("moms", True),
        ]

        for keyword, expected in keywords:
            # One keyword embedded in plenty of plain ASCII filler.
            text = f"Document with {keyword} keyword here" + " more text" * 50

            with patch(self._EXTRACT_TARGET, return_value=text):
                # NOTE(review): a single keyword is below the two-keyword
                # threshold this suite describes, so the verdict presumably
                # comes from the readability check on the ASCII filler -
                # confirm against the detector.
                assert is_text_pdf("test.pdf") is expected, keyword

    def test_detects_english_keywords(self):
        """Should detect English invoice keywords."""
        text = "Invoice document with date and amount information" + " x" * 100

        with patch(self._EXTRACT_TARGET, return_value=text):
            # invoice + date = 2 keywords
            result = is_text_pdf("test.pdf")

        assert result is True

    def test_needs_at_least_two_keywords(self):
        """A lone keyword falls back to the readability check."""
        # Only one keyword, padded with plain ASCII filler.
        text = "This is a faktura document" + " x" * 200

        with patch(self._EXTRACT_TARGET, return_value=text):
            result = is_text_pdf("test.pdf")

        # With only 1 keyword the keyword check alone is not expected to pass;
        # the text is entirely ASCII, so it should still be accepted on
        # readability.
        # NOTE(review): confirm this expectation against the detector.
        assert result is True


class TestReadabilityChecks:
    """Verify how is_text_pdf reacts to the share of readable characters."""

    @staticmethod
    def _classify(extracted_text):
        """Run is_text_pdf with the extraction helper patched to return *extracted_text*."""
        target = "shared.pdf.detector.extract_text_first_page"
        with patch(target, return_value=extracted_text):
            return is_text_pdf("test.pdf")

    def test_high_ascii_ratio_passes(self):
        """Text made up solely of ASCII characters is accepted."""
        ascii_only = "This is a normal document with only ASCII characters. " * 10

        assert self._classify(ascii_only) is True

    def test_swedish_characters_accepted(self):
        """Swedish letters do not stop the text from being accepted."""
        swedish = "Fakturadatum för årets moms på öre belopp" + " normal" * 50

        assert self._classify(swedish) is True

    def test_low_readability_fails(self):
        """Text dominated by unreadable characters is rejected."""
        # 90 readable characters against 150 unreadable ones, i.e. well under
        # 70% readable.
        readable_part = "abc" * 30
        unreadable_part = "\x80\x81\x82" * 50

        assert self._classify(readable_part + unreadable_part) is False


if __name__ == "__main__":
    # Run this file's tests verbosely when executed directly, and pass
    # pytest's exit status on to the process so that failures produce a
    # non-zero exit code instead of being silently discarded.
    raise SystemExit(pytest.main([__file__, "-v"]))