"""
Unit tests for the PDF text extraction module.

Exercises the public API imported from ``shared.pdf.extractor``: the ``Token``
dataclass, the ``PDFDocument`` context manager, and the standalone
``extract_text_tokens`` / ``extract_words`` / ``extract_lines`` /
``get_page_dimensions`` functions.

Note: PyMuPDF (fitz) must be importable because ``fitz.open`` and
``fitz.Matrix`` are patched by name; every document handed to the code under
test is a ``MagicMock`` returned by the patched ``fitz.open``.

Usage:
    pytest src/pdf/test_extractor.py -v -o 'addopts='
"""

from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from shared.pdf.extractor import (
    Token,
    PDFDocument,
    extract_text_tokens,
    extract_words,
    extract_lines,
    get_page_dimensions,
)


# ---------------------------------------------------------------------------
# Shared fixtures-as-functions
# ---------------------------------------------------------------------------

# Bounding box (x0, y0, x1, y1) reused by most Token property tests.
_SAMPLE_BBOX = (10.0, 20.0, 50.0, 35.0)


def _sample_token(bbox=_SAMPLE_BBOX):
    """Return a Token on page 0 with text ``"test"`` and the given bbox."""
    return Token(text="test", bbox=bbox, page_no=0)


def _stub_document(page=None, page_count=None):
    """Build a MagicMock standing in for an opened fitz document.

    ``__getitem__`` is replaced only when *page* is given (it then returns
    that page for any index) and ``__len__`` only when *page_count* is given,
    so a test that leaves either out keeps MagicMock's default behaviour.
    """
    document = MagicMock()
    if page_count is not None:
        document.__len__ = MagicMock(return_value=page_count)
    if page is not None:
        document.__getitem__ = MagicMock(return_value=page)
    return document


def _stub_page_with_rect(width, height):
    """Build a mock page whose ``rect`` exposes *width* and *height*."""
    rect = MagicMock()
    rect.width = width
    rect.height = height
    page = MagicMock()
    page.rect = rect
    return page


def _stub_text_page(payload):
    """Build a mock page whose ``get_text`` returns *payload* for any call."""
    page = MagicMock()
    page.get_text.return_value = payload
    return page


def _span(text, bbox):
    """Return one span entry shaped like the dict-mode payloads used here."""
    return {"text": text, "bbox": bbox}


def _line(*spans):
    """Return one line entry wrapping the given spans."""
    return {"spans": list(spans)}


def _text_block(*lines):
    """Return a block entry with ``type`` 0 (treated as text by these tests)."""
    return {"type": 0, "lines": list(lines)}


def _page_dict(*blocks):
    """Return the top-level ``{"blocks": [...]}`` payload."""
    return {"blocks": list(blocks)}


# ---------------------------------------------------------------------------
# Token
# ---------------------------------------------------------------------------

class TestToken:
    """Token construction and its derived geometry properties."""

    def test_creation(self):
        """All three constructor fields are stored as given."""
        token = Token(text="Hello", bbox=_SAMPLE_BBOX, page_no=0)

        assert token.text == "Hello"
        assert token.bbox == (10.0, 20.0, 50.0, 35.0)
        assert token.page_no == 0

    def test_x0_property(self):
        """x0 is the first bbox component."""
        assert _sample_token().x0 == 10.0

    def test_y0_property(self):
        """y0 is the second bbox component."""
        assert _sample_token().y0 == 20.0

    def test_x1_property(self):
        """x1 is the third bbox component."""
        assert _sample_token().x1 == 50.0

    def test_y1_property(self):
        """y1 is the fourth bbox component."""
        assert _sample_token().y1 == 35.0

    def test_width_property(self):
        """width is x1 - x0 (50 - 10)."""
        assert _sample_token().width == 40.0

    def test_height_property(self):
        """height is y1 - y0 (35 - 20)."""
        assert _sample_token().height == 15.0

    def test_center_property(self):
        """center is the midpoint of the bbox on both axes."""
        token = _sample_token(bbox=(10.0, 20.0, 50.0, 40.0))

        midpoint = token.center

        assert midpoint == (30.0, 30.0)


# ---------------------------------------------------------------------------
# PDFDocument: lifecycle and dimensions
# ---------------------------------------------------------------------------

class TestPDFDocument:
    """PDFDocument used as a context manager."""

    def test_context_manager_opens_and_closes(self):
        """Entering opens via fitz.open(Path); leaving closes the document."""
        fake_doc = MagicMock()

        with patch("fitz.open", return_value=fake_doc) as fake_open:
            with PDFDocument("test.pdf") as pdf:
                fake_open.assert_called_once_with(Path("test.pdf"))
                assert pdf._doc is not None
            # Checked only after the PDFDocument block has exited.
            fake_doc.close.assert_called_once()

    def test_doc_property_raises_outside_context(self):
        """Reading ``doc`` before entering the context raises RuntimeError."""
        pdf = PDFDocument("test.pdf")

        with pytest.raises(RuntimeError, match="must be used within a context manager"):
            _ = pdf.doc

    def test_page_count(self):
        """page_count reflects len() of the underlying document."""
        fake_doc = _stub_document(page_count=5)

        with patch("fitz.open", return_value=fake_doc), PDFDocument("test.pdf") as pdf:
            assert pdf.page_count == 5

    def test_get_page_dimensions(self):
        """Width and height come from the page rect."""
        fake_doc = _stub_document(_stub_page_with_rect(595.0, 842.0))

        with patch("fitz.open", return_value=fake_doc), PDFDocument("test.pdf") as pdf:
            width, height = pdf.get_page_dimensions(0)

        assert width == 595.0
        assert height == 842.0

    def test_get_page_dimensions_cached(self):
        """A repeated lookup for the same page does not index the doc again."""
        fake_doc = _stub_document(_stub_page_with_rect(595.0, 842.0))

        with patch("fitz.open", return_value=fake_doc), PDFDocument("test.pdf") as pdf:
            for _ in range(2):
                pdf.get_page_dimensions(0)

        # Two calls, one page access: the second must be served from a cache.
        assert fake_doc.__getitem__.call_count == 1

    def test_get_render_dimensions(self):
        """Render size scales the page size (A4, in points) by dpi / 72."""
        fake_doc = _stub_document(_stub_page_with_rect(595.0, 842.0))
        expectations = [
            # 72 DPI is 1:1 with the page size.
            (72, (595, 842)),
            # 150 DPI is a ~2.08x zoom, truncated to int.
            (150, (int(595 * 150 / 72), int(842 * 150 / 72))),
        ]

        with patch("fitz.open", return_value=fake_doc), PDFDocument("test.pdf") as pdf:
            for dpi, (want_w, want_h) in expectations:
                got_w, got_h = pdf.get_render_dimensions(0, dpi=dpi)
                assert got_w == want_w
                assert got_h == want_h


# ---------------------------------------------------------------------------
# PDFDocument.extract_text_tokens
# ---------------------------------------------------------------------------

class TestPDFDocumentExtractTextTokens:
    """PDFDocument.extract_text_tokens against mocked page payloads."""

    @staticmethod
    def _tokens_for(page):
        """Open a stub document serving *page* and collect page-0 tokens."""
        fake_doc = _stub_document(page)
        with patch("fitz.open", return_value=fake_doc), PDFDocument("test.pdf") as pdf:
            return list(pdf.extract_text_tokens(0))

    def test_extract_from_dict_mode(self):
        """Each non-empty span in a text block becomes one token, in order."""
        page = _stub_text_page(
            _page_dict(
                _text_block(
                    _line(
                        _span("Hello", [10, 20, 50, 35]),
                        _span("World", [60, 20, 100, 35]),
                    )
                )
            )
        )

        tokens = self._tokens_for(page)

        assert [tok.text for tok in tokens] == ["Hello", "World"]

    def test_skips_non_text_blocks(self):
        """Blocks whose type is not 0 (e.g. images) contribute no tokens."""
        page = _stub_text_page(
            _page_dict(
                {"type": 1},  # image block: expected to be ignored
                _text_block(_line(_span("Text", [0, 0, 50, 20]))),
            )
        )

        tokens = self._tokens_for(page)

        assert [tok.text for tok in tokens] == ["Text"]

    def test_skips_empty_text(self):
        """Empty and whitespace-only spans are dropped."""
        page = _stub_text_page(
            _page_dict(
                _text_block(
                    _line(
                        _span("", [0, 0, 10, 10]),
                        _span("   ", [10, 0, 20, 10]),
                        _span("Valid", [20, 0, 50, 10]),
                    )
                )
            )
        )

        tokens = self._tokens_for(page)

        assert [tok.text for tok in tokens] == ["Valid"]

    def test_fallback_to_words_mode(self):
        """When dict mode yields nothing, words mode supplies the tokens."""

        def fake_get_text(mode):
            # Dict mode has no blocks; any other mode returns one word tuple.
            if mode == "dict":
                return {"blocks": []}
            return [(10, 20, 50, 35, "Fallback", 0, 0, 0)]

        page = MagicMock()
        page.get_text.side_effect = fake_get_text

        tokens = self._tokens_for(page)

        assert [tok.text for tok in tokens] == ["Fallback"]


# ---------------------------------------------------------------------------
# extract_text_tokens (standalone)
# ---------------------------------------------------------------------------

class TestExtractTextTokensFunction:
    """Module-level extract_text_tokens(path, page_no=...)."""

    def test_extract_all_pages(self):
        """page_no=None walks every page and tags tokens with their page."""
        pages = [
            _stub_text_page(
                _page_dict(_text_block(_line(_span(label, [0, 0, 50, 20]))))
            )
            for label in ("Page0", "Page1")
        ]

        def pick_page(self, idx):
            # Plain functions assigned as magic methods receive the mock as self.
            return pages[idx]

        fake_doc = MagicMock()
        fake_doc.__len__ = MagicMock(return_value=2)
        fake_doc.__getitem__ = pick_page

        with patch("fitz.open", return_value=fake_doc):
            tokens = list(extract_text_tokens("test.pdf", page_no=None))

        assert [(tok.text, tok.page_no) for tok in tokens] == [
            ("Page0", 0),
            ("Page1", 1),
        ]

    def test_extract_specific_page(self):
        """An explicit page_no restricts extraction to that page."""
        page = _stub_text_page(
            _page_dict(_text_block(_line(_span("Specific", [0, 0, 50, 20]))))
        )
        fake_doc = _stub_document(page, page_count=3)

        with patch("fitz.open", return_value=fake_doc):
            tokens = list(extract_text_tokens("test.pdf", page_no=1))

        assert len(tokens) == 1
        assert tokens[0].page_no == 1

    def test_skips_corrupted_bbox(self):
        """A span with an implausible bbox coordinate (1e10) is dropped."""
        page = _stub_text_page(
            _page_dict(
                _text_block(
                    _line(
                        _span("Good", [0, 0, 50, 20]),
                        _span("Bad", [1e10, 0, 50, 20]),  # corrupted
                    )
                )
            )
        )
        fake_doc = _stub_document(page, page_count=1)

        with patch("fitz.open", return_value=fake_doc):
            tokens = list(extract_text_tokens("test.pdf", page_no=0))

        assert [tok.text for tok in tokens] == ["Good"]


# ---------------------------------------------------------------------------
# extract_words
# ---------------------------------------------------------------------------

class TestExtractWordsFunction:
    """Module-level extract_words(path, page_no=...)."""

    @staticmethod
    def _words_for(word_tuples):
        """Run extract_words on page 0 of a one-page stub document."""
        fake_doc = _stub_document(_stub_text_page(word_tuples), page_count=1)
        with patch("fitz.open", return_value=fake_doc):
            return list(extract_words("test.pdf", page_no=0))

    def test_extract_words(self):
        """Each word tuple becomes a token carrying its text and bbox."""
        tokens = self._words_for(
            [
                (10, 20, 50, 35, "Hello", 0, 0, 0),
                (60, 20, 100, 35, "World", 0, 0, 1),
            ]
        )

        assert len(tokens) == 2
        first, second = tokens
        assert first.text == "Hello"
        assert first.bbox == (10, 20, 50, 35)
        assert second.text == "World"

    def test_skips_empty_words(self):
        """Empty and whitespace-only words are dropped."""
        tokens = self._words_for(
            [
                (10, 20, 50, 35, "", 0, 0, 0),
                (60, 20, 100, 35, "   ", 0, 0, 1),
                (110, 20, 150, 35, "Valid", 0, 0, 2),
            ]
        )

        assert [tok.text for tok in tokens] == ["Valid"]


# ---------------------------------------------------------------------------
# extract_lines
# ---------------------------------------------------------------------------

class TestExtractLinesFunction:
    """Module-level extract_lines(path, page_no=...)."""

    @staticmethod
    def _lines_for(payload):
        """Run extract_lines on page 0 of a one-page stub document."""
        fake_doc = _stub_document(_stub_text_page(payload), page_count=1)
        with patch("fitz.open", return_value=fake_doc):
            return list(extract_lines("test.pdf", page_no=0))

    def test_extract_lines(self):
        """Spans on one line are joined and the bbox covers all of them."""
        tokens = self._lines_for(
            _page_dict(
                _text_block(
                    _line(
                        _span("Hello", [10, 20, 50, 35]),
                        _span("World", [55, 20, 100, 35]),
                    ),
                    _line(_span("Second line", [10, 40, 100, 55])),
                )
            )
        )

        assert len(tokens) == 2
        first_line = tokens[0]
        assert first_line.text == "Hello World"
        # The merged bbox spans both spans horizontally.
        assert first_line.bbox[0] == 10  # min x0
        assert first_line.bbox[2] == 100  # max x1

    def test_skips_empty_lines(self):
        """A line without spans produces no token."""
        tokens = self._lines_for(
            _page_dict(
                _text_block(
                    _line(),  # empty line
                    _line(_span("Valid", [0, 0, 50, 20])),
                )
            )
        )

        assert [tok.text for tok in tokens] == ["Valid"]


# ---------------------------------------------------------------------------
# get_page_dimensions (standalone)
# ---------------------------------------------------------------------------

class TestGetPageDimensionsFunction:
    """Module-level get_page_dimensions(path, page_no=...)."""

    def test_get_dimensions(self):
        """Returns the rect width/height of the requested page (Letter size)."""
        fake_doc = _stub_document(_stub_page_with_rect(612.0, 792.0))

        with patch("fitz.open", return_value=fake_doc):
            width, height = get_page_dimensions("test.pdf", page_no=0)

        assert width == 612.0
        assert height == 792.0

    def test_get_dimensions_different_page(self):
        """The requested page_no is the index used on the document."""
        fake_doc = _stub_document(_stub_page_with_rect(595.0, 842.0))

        with patch("fitz.open", return_value=fake_doc):
            get_page_dimensions("test.pdf", page_no=2)

        fake_doc.__getitem__.assert_called_with(2)


# ---------------------------------------------------------------------------
# PDFDocument.is_text_pdf
# ---------------------------------------------------------------------------

class TestPDFDocumentIsTextPDF:
    """PDFDocument.is_text_pdf."""

    def test_delegates_to_detector(self):
        """The call is forwarded as (path, min_chars) and its result returned."""
        detector_target = "shared.pdf.extractor._is_text_pdf_standalone"

        with patch("fitz.open", return_value=MagicMock()):
            with patch(detector_target, return_value=True) as detector:
                with PDFDocument("test.pdf") as pdf:
                    verdict = pdf.is_text_pdf(min_chars=50)

        detector.assert_called_once_with(Path("test.pdf"), 50)
        assert verdict is True


# ---------------------------------------------------------------------------
# PDFDocument rendering
# ---------------------------------------------------------------------------

class TestPDFDocumentRenderPage:
    """PDFDocument.render_page and render_all_pages."""

    def test_render_page(self, tmp_path):
        """Builds a dpi/72 zoom matrix, saves the pixmap, returns the path."""
        pixmap = MagicMock()
        page = MagicMock()
        page.get_pixmap.return_value = pixmap
        fake_doc = _stub_document(page)
        target = tmp_path / "output.png"

        with patch("fitz.open", return_value=fake_doc):
            with patch("fitz.Matrix") as matrix_cls:
                with PDFDocument("test.pdf") as pdf:
                    returned = pdf.render_page(0, target, dpi=150)

        # The matrix is built with the same zoom factor on both axes.
        scale = 150 / 72
        matrix_cls.assert_called_once_with(scale, scale)

        # The pixmap is written to the output path passed as a string.
        pixmap.save.assert_called_once_with(str(target))
        assert returned == target

    def test_render_all_pages(self, tmp_path):
        """Yields one entry per page, each led by its zero-based page number."""
        page = MagicMock()
        page.get_pixmap.return_value = MagicMock()
        fake_doc = _stub_document(page, page_count=2)
        # Kept from the original setup, which labelled it "for filename
        # generation". NOTE(review): confirm the extractor really reads a stem
        # from the document object rather than from the PDF path.
        fake_doc.stem = "test"

        with patch("fitz.open", return_value=fake_doc), patch("fitz.Matrix"):
            with PDFDocument(tmp_path / "test.pdf") as pdf:
                rendered = list(pdf.render_all_pages(tmp_path, dpi=150))

        assert [entry[0] for entry in rendered] == [0, 1]


if __name__ == "__main__":
    pytest.main([__file__, "-v"])