# invoice-master-poc-v2/tests/pdf/test_extractor.py

"""
Tests for the PDF Text Extraction Module.

Tests cover all extractor functions in src/pdf/extractor.py

Note: These tests require PyMuPDF (fitz) and use mocks for unit testing.

Usage:
    pytest tests/pdf/test_extractor.py -v -o 'addopts='
"""

import pytest
from pathlib import Path
from unittest.mock import patch, MagicMock
from src.pdf.extractor import (
    Token,
    PDFDocument,
    extract_text_tokens,
    extract_words,
    extract_lines,
    get_page_dimensions,
)


class TestToken:
    """Unit tests for the Token dataclass and its bbox-derived properties."""

    # Bounding box (x0, y0, x1, y1) shared by the single-property tests.
    _BBOX = (10.0, 20.0, 50.0, 35.0)

    @classmethod
    def _token(cls, bbox=None):
        """Build a page-0 Token with text "test" and the given (or shared) bbox."""
        chosen = cls._BBOX if bbox is None else bbox
        return Token(text="test", bbox=chosen, page_no=0)

    def test_creation(self):
        """Constructor arguments are readable back from the instance."""
        created = Token(text="Hello", bbox=(10.0, 20.0, 50.0, 35.0), page_no=0)

        assert created.text == "Hello"
        assert created.bbox == (10.0, 20.0, 50.0, 35.0)
        assert created.page_no == 0

    def test_x0_property(self):
        """x0 is the first bbox coordinate."""
        assert self._token().x0 == 10.0

    def test_y0_property(self):
        """y0 is the second bbox coordinate."""
        assert self._token().y0 == 20.0

    def test_x1_property(self):
        """x1 is the third bbox coordinate."""
        assert self._token().x1 == 50.0

    def test_y1_property(self):
        """y1 is the fourth bbox coordinate."""
        assert self._token().y1 == 35.0

    def test_width_property(self):
        """width is the horizontal extent (50 - 10)."""
        assert self._token().width == 40.0

    def test_height_property(self):
        """height is the vertical extent (35 - 20)."""
        assert self._token().height == 15.0

    def test_center_property(self):
        """center is the midpoint of the bbox on both axes."""
        taller = self._token(bbox=(10.0, 20.0, 50.0, 40.0))
        midpoint = taller.center
        assert midpoint == (30.0, 30.0)


class TestPDFDocument:
    """Exercise PDFDocument as a context manager over a mocked fitz document."""

    @staticmethod
    def _doc_with_page_size(width, height):
        """Build a mock document whose indexed page reports the given rect size."""
        rect = MagicMock(width=width, height=height)
        page = MagicMock(rect=rect)
        document = MagicMock()
        document.__getitem__ = MagicMock(return_value=page)
        return document

    def test_context_manager_opens_and_closes(self):
        """Entering opens the file through fitz.open; leaving closes the document."""
        fake_doc = MagicMock()

        with patch("fitz.open", return_value=fake_doc) as opener:
            with PDFDocument("test.pdf") as handle:
                opener.assert_called_once_with(Path("test.pdf"))
                assert handle._doc is not None

            # The inner block has exited, so the document must be closed by now.
            fake_doc.close.assert_called_once()

    def test_doc_property_raises_outside_context(self):
        """Reading .doc before entering the context raises RuntimeError."""
        unopened = PDFDocument("test.pdf")

        with pytest.raises(RuntimeError, match="must be used within a context manager"):
            _ = unopened.doc

    def test_page_count(self):
        """page_count reflects the length of the underlying document."""
        fake_doc = MagicMock()
        fake_doc.__len__ = MagicMock(return_value=5)

        with patch("fitz.open", return_value=fake_doc), PDFDocument("test.pdf") as pdf:
            pages = pdf.page_count

        assert pages == 5

    def test_get_page_dimensions(self):
        """get_page_dimensions returns the page rect's width and height."""
        fake_doc = self._doc_with_page_size(595.0, 842.0)

        with patch("fitz.open", return_value=fake_doc), PDFDocument("test.pdf") as pdf:
            width, height = pdf.get_page_dimensions(0)

        assert width == 595.0
        assert height == 842.0

    def test_get_page_dimensions_cached(self):
        """A repeated lookup for the same page must not index the document again."""
        fake_doc = self._doc_with_page_size(595.0, 842.0)

        with patch("fitz.open", return_value=fake_doc), PDFDocument("test.pdf") as pdf:
            for _ in range(2):
                pdf.get_page_dimensions(0)

            # Snapshot the count while the document is still open.
            page_lookups = fake_doc.__getitem__.call_count

        assert page_lookups == 1

    def test_get_render_dimensions(self):
        """Render dimensions scale the point size by dpi / 72 and truncate to int."""
        # 595 x 842 points is an A4 page.
        fake_doc = self._doc_with_page_size(595.0, 842.0)

        with patch("fitz.open", return_value=fake_doc), PDFDocument("test.pdf") as pdf:
            # 72 DPI is a 1:1 mapping from points to pixels.
            native_w, native_h = pdf.get_render_dimensions(0, dpi=72)
            # 150 DPI is a zoom factor of 150/72 (about 2.08x).
            scaled_w, scaled_h = pdf.get_render_dimensions(0, dpi=150)

        assert native_w == 595
        assert native_h == 842
        assert scaled_w == int(595 * 150 / 72)
        assert scaled_h == int(842 * 150 / 72)


class TestPDFDocumentExtractTextTokens:
    """Tests for PDFDocument.extract_text_tokens method."""

    @staticmethod
    def _text_block(*spans):
        """Wrap spans into a single-line text block (type 0) of a dict payload."""
        return {"type": 0, "lines": [{"spans": list(spans)}]}

    @staticmethod
    def _tokens_of_page_zero(page):
        """Open a mocked document serving ``page`` and list the tokens of page 0."""
        document = MagicMock()
        document.__getitem__ = MagicMock(return_value=page)

        with patch("fitz.open", return_value=document):
            with PDFDocument("test.pdf") as pdf:
                return list(pdf.extract_text_tokens(0))

    def test_extract_from_dict_mode(self):
        """Should extract one token per span, in order, using dict mode."""
        page = MagicMock()
        page.get_text.return_value = {
            "blocks": [
                self._text_block(
                    {"text": "Hello", "bbox": [10, 20, 50, 35]},
                    {"text": "World", "bbox": [60, 20, 100, 35]},
                )
            ]
        }

        tokens = self._tokens_of_page_zero(page)

        assert len(tokens) == 2
        assert tokens[0].text == "Hello"
        assert tokens[1].text == "World"

    def test_skips_non_text_blocks(self):
        """Should skip non-text blocks (like images)."""
        page = MagicMock()
        page.get_text.return_value = {
            "blocks": [
                {"type": 1},  # Image block - should be skipped
                self._text_block({"text": "Text", "bbox": [0, 0, 50, 20]}),
            ]
        }

        tokens = self._tokens_of_page_zero(page)

        assert len(tokens) == 1
        assert tokens[0].text == "Text"

    def test_skips_empty_text(self):
        """Should skip spans whose text is empty or whitespace-only."""
        page = MagicMock()
        page.get_text.return_value = {
            "blocks": [
                self._text_block(
                    {"text": "", "bbox": [0, 0, 10, 10]},
                    {"text": "  ", "bbox": [10, 0, 20, 10]},
                    {"text": "Valid", "bbox": [20, 0, 50, 10]},
                )
            ]
        }

        tokens = self._tokens_of_page_zero(page)

        assert len(tokens) == 1
        assert tokens[0].text == "Valid"

    def test_fallback_to_words_mode(self):
        """Should fallback to words mode if dict mode yields nothing."""

        def fake_get_text(option="text", *args, **kwargs):
            # Mirror PyMuPDF's parameter name and tolerate extra options, so the
            # test only fails when the fallback itself is broken and not
            # because the extractor passes additional arguments to get_text.
            if option == "dict":
                return {"blocks": []}  # Dict mode finds nothing
            return [(10, 20, 50, 35, "Fallback", 0, 0, 0)]

        page = MagicMock()
        page.get_text.side_effect = fake_get_text

        tokens = self._tokens_of_page_zero(page)

        assert len(tokens) == 1
        assert tokens[0].text == "Fallback"


class TestExtractTextTokensFunction:
    """Cover the module-level extract_text_tokens function."""

    @staticmethod
    def _page(*spans):
        """Mock page whose get_text returns one text block holding one line of spans."""
        page = MagicMock()
        page.get_text.return_value = {
            "blocks": [{"type": 0, "lines": [{"spans": list(spans)}]}]
        }
        return page

    def test_extract_all_pages(self):
        """With page_no=None, tokens from every page are produced in page order."""
        pages = [
            self._page({"text": "Page0", "bbox": [0, 0, 50, 20]}),
            self._page({"text": "Page1", "bbox": [0, 0, 50, 20]}),
        ]

        document = MagicMock()
        document.__len__ = MagicMock(return_value=2)
        # Serve a different page per index so page numbers can be told apart.
        document.__getitem__.side_effect = lambda index: pages[index]

        with patch("fitz.open", return_value=document):
            tokens = list(extract_text_tokens("test.pdf", page_no=None))

        assert len(tokens) == 2
        first, second = tokens
        assert first.text == "Page0"
        assert first.page_no == 0
        assert second.text == "Page1"
        assert second.page_no == 1

    def test_extract_specific_page(self):
        """With an explicit page_no, tokens carry that page number."""
        document = MagicMock()
        document.__len__ = MagicMock(return_value=3)
        document.__getitem__ = MagicMock(
            return_value=self._page({"text": "Specific", "bbox": [0, 0, 50, 20]})
        )

        with patch("fitz.open", return_value=document):
            tokens = list(extract_text_tokens("test.pdf", page_no=1))

        assert len(tokens) == 1
        assert tokens[0].page_no == 1

    def test_skips_corrupted_bbox(self):
        """A span with an absurdly large coordinate is dropped; sane ones are kept."""
        document = MagicMock()
        document.__len__ = MagicMock(return_value=1)
        document.__getitem__ = MagicMock(
            return_value=self._page(
                {"text": "Good", "bbox": [0, 0, 50, 20]},
                {"text": "Bad", "bbox": [1e10, 0, 50, 20]},  # Corrupted
            )
        )

        with patch("fitz.open", return_value=document):
            tokens = list(extract_text_tokens("test.pdf", page_no=0))

        assert len(tokens) == 1
        assert tokens[0].text == "Good"


class TestExtractWordsFunction:
    """Cover the module-level extract_words function."""

    @staticmethod
    def _words_from(raw_words):
        """Run extract_words on page 0 of a one-page mock serving ``raw_words``."""
        page = MagicMock()
        page.get_text.return_value = raw_words

        document = MagicMock()
        document.__len__ = MagicMock(return_value=1)
        document.__getitem__ = MagicMock(return_value=page)

        with patch("fitz.open", return_value=document):
            return list(extract_words("test.pdf", page_no=0))

    def test_extract_words(self):
        """Each word tuple becomes a token with its text and 4-value bbox."""
        tokens = self._words_from(
            [
                (10, 20, 50, 35, "Hello", 0, 0, 0),
                (60, 20, 100, 35, "World", 0, 0, 1),
            ]
        )

        assert len(tokens) == 2
        hello, world = tokens
        assert hello.text == "Hello"
        assert hello.bbox == (10, 20, 50, 35)
        assert world.text == "World"

    def test_skips_empty_words(self):
        """Words whose text is empty or whitespace-only produce no token."""
        tokens = self._words_from(
            [
                (10, 20, 50, 35, "", 0, 0, 0),
                (60, 20, 100, 35, "  ", 0, 0, 1),
                (110, 20, 150, 35, "Valid", 0, 0, 2),
            ]
        )

        assert len(tokens) == 1
        assert tokens[0].text == "Valid"


class TestExtractLinesFunction:
    """Cover the module-level extract_lines function."""

    @staticmethod
    def _lines_from(*lines):
        """Run extract_lines on page 0 of a mock whose text block holds ``lines``.

        Each positional argument is the list of spans making up one line.
        """
        page = MagicMock()
        page.get_text.return_value = {
            "blocks": [
                {"type": 0, "lines": [{"spans": spans} for spans in lines]}
            ]
        }

        document = MagicMock()
        document.__len__ = MagicMock(return_value=1)
        document.__getitem__ = MagicMock(return_value=page)

        with patch("fitz.open", return_value=document):
            return list(extract_lines("test.pdf", page_no=0))

    def test_extract_lines(self):
        """Spans of one line are merged into a single token spanning all of them."""
        tokens = self._lines_from(
            [
                {"text": "Hello", "bbox": [10, 20, 50, 35]},
                {"text": "World", "bbox": [55, 20, 100, 35]},
            ],
            [
                {"text": "Second line", "bbox": [10, 40, 100, 55]},
            ],
        )

        assert len(tokens) == 2
        merged = tokens[0]
        assert merged.text == "Hello World"
        # The merged bbox must reach from the leftmost to the rightmost span.
        assert merged.bbox[0] == 10  # min x0
        assert merged.bbox[2] == 100  # max x1

    def test_skips_empty_lines(self):
        """A line without any spans yields no token."""
        tokens = self._lines_from(
            [],  # Empty line
            [{"text": "Valid", "bbox": [0, 0, 50, 20]}],
        )

        assert len(tokens) == 1
        assert tokens[0].text == "Valid"


class TestGetPageDimensionsFunction:
    """Cover the module-level get_page_dimensions function."""

    @staticmethod
    def _doc_with_page_size(width, height):
        """Build a mock document whose indexed page reports the given rect size."""
        rect = MagicMock(width=width, height=height)
        page = MagicMock(rect=rect)
        document = MagicMock()
        document.__getitem__ = MagicMock(return_value=page)
        return document

    def test_get_dimensions(self):
        """The page rect's width and height are returned as a pair."""
        # 612 x 792 points is a US Letter page.
        document = self._doc_with_page_size(612.0, 792.0)

        with patch("fitz.open", return_value=document):
            width, height = get_page_dimensions("test.pdf", page_no=0)

        assert width == 612.0
        assert height == 792.0

    def test_get_dimensions_different_page(self):
        """The requested page number is the index used on the document."""
        document = self._doc_with_page_size(595.0, 842.0)

        with patch("fitz.open", return_value=document):
            get_page_dimensions("test.pdf", page_no=2)

        document.__getitem__.assert_called_with(2)


class TestPDFDocumentIsTextPDF:
    """Cover PDFDocument.is_text_pdf."""

    def test_delegates_to_detector(self):
        """is_text_pdf forwards the path and min_chars to the standalone checker."""
        checker_target = "src.pdf.extractor._is_text_pdf_standalone"

        with patch("fitz.open", return_value=MagicMock()), \
                patch(checker_target, return_value=True) as checker, \
                PDFDocument("test.pdf") as pdf:
            verdict = pdf.is_text_pdf(min_chars=50)

            # The checker receives the path as a Path and min_chars positionally.
            checker.assert_called_once_with(Path("test.pdf"), 50)
            assert verdict is True


class TestPDFDocumentRenderPage:
    """Cover the page-rendering methods of PDFDocument."""

    @staticmethod
    def _doc_and_pixmap():
        """Return (document, pixmap): a mock document whose page renders to pixmap."""
        pixmap = MagicMock()
        page = MagicMock()
        page.get_pixmap.return_value = pixmap
        document = MagicMock()
        document.__getitem__ = MagicMock(return_value=page)
        return document, pixmap

    def test_render_page(self, tmp_path):
        """render_page builds a dpi/72 zoom matrix, saves the pixmap, returns the path."""
        document, pixmap = self._doc_and_pixmap()
        destination = tmp_path / "output.png"

        with patch("fitz.open", return_value=document), \
                patch("fitz.Matrix") as matrix_cls, \
                PDFDocument("test.pdf") as pdf:
            returned = pdf.render_page(0, destination, dpi=150)

            # The matrix is built once with the same zoom on both axes.
            expected_zoom = 150 / 72
            matrix_cls.assert_called_once_with(expected_zoom, expected_zoom)

            # The pixmap is written to the destination given as a string path.
            pixmap.save.assert_called_once_with(str(destination))

            assert returned == destination

    def test_render_all_pages(self, tmp_path):
        """render_all_pages yields one result per page, led by the page number."""
        document, _pixmap = self._doc_and_pixmap()
        document.__len__ = MagicMock(return_value=2)
        # Originally annotated "For filename generation"; presumably unused by
        # the extractor since it sits on the document mock - TODO confirm.
        document.stem = "test"

        with patch("fitz.open", return_value=document), \
                patch("fitz.Matrix"), \
                PDFDocument(tmp_path / "test.pdf") as pdf:
            rendered = list(pdf.render_all_pages(tmp_path, dpi=150))

            assert len(rendered) == 2
            assert rendered[0][0] == 0  # Page number
            assert rendered[1][0] == 1


if __name__ == "__main__":
    # Propagate pytest's exit status so that running this file directly ends
    # with a non-zero exit code when a test fails, instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))