Re-structure the project.

This commit is contained in:
Yaojia Wang
2026-01-25 15:21:11 +01:00
parent 8fd61ea928
commit e599424a92
80 changed files with 10672 additions and 1584 deletions

572
tests/pdf/test_extractor.py Normal file
View File

@@ -0,0 +1,572 @@
"""
Tests for the PDF Text Extraction Module.
Tests cover all extractor functions in src/pdf/extractor.py
Note: These tests require PyMuPDF (fitz) and use mocks for unit testing.
Usage:
pytest tests/pdf/test_extractor.py -v -o 'addopts='
"""
import pytest
from pathlib import Path
from unittest.mock import patch, MagicMock
from src.pdf.extractor import (
Token,
PDFDocument,
extract_text_tokens,
extract_words,
extract_lines,
get_page_dimensions,
)
class TestToken:
    """Unit tests for the Token dataclass and its bbox-derived properties."""

    @staticmethod
    def _sample(bbox=(10.0, 20.0, 50.0, 35.0)):
        """Build a page-0 Token with text "test" and the given bbox."""
        return Token(text="test", bbox=bbox, page_no=0)

    def test_creation(self):
        """Constructor arguments should be readable back as attributes."""
        box = (10.0, 20.0, 50.0, 35.0)
        tok = Token(text="Hello", bbox=box, page_no=0)
        assert tok.text == "Hello"
        assert tok.bbox == (10.0, 20.0, 50.0, 35.0)
        assert tok.page_no == 0

    def test_x0_property(self):
        """x0 should be 10.0 for the sample bbox."""
        assert self._sample().x0 == 10.0

    def test_y0_property(self):
        """y0 should be 20.0 for the sample bbox."""
        assert self._sample().y0 == 20.0

    def test_x1_property(self):
        """x1 should be 50.0 for the sample bbox."""
        assert self._sample().x1 == 50.0

    def test_y1_property(self):
        """y1 should be 35.0 for the sample bbox."""
        assert self._sample().y1 == 35.0

    def test_width_property(self):
        """width should be 40.0 for the sample bbox (x from 10 to 50)."""
        assert self._sample().width == 40.0

    def test_height_property(self):
        """height should be 15.0 for the sample bbox (y from 20 to 35)."""
        assert self._sample().height == 15.0

    def test_center_property(self):
        """center should be (30.0, 30.0) for a bbox of (10, 20, 50, 40)."""
        midpoint = self._sample(bbox=(10.0, 20.0, 50.0, 40.0)).center
        assert midpoint == (30.0, 30.0)
class TestPDFDocument:
    """Tests for the PDFDocument context manager, with fitz.open patched out."""

    @staticmethod
    def _doc_with_rect(width, height):
        """Return a mock document whose pages report a rect of the given size."""
        page = MagicMock()
        page.rect = MagicMock(width=width, height=height)
        doc = MagicMock()
        doc.__getitem__ = MagicMock(return_value=page)
        return doc

    def test_context_manager_opens_and_closes(self):
        """Entering should open the document; leaving should close it."""
        fake_doc = MagicMock()
        with patch("fitz.open", return_value=fake_doc) as opener, PDFDocument("test.pdf") as handle:
            opener.assert_called_once_with(Path("test.pdf"))
            assert handle._doc is not None
        # Checked only after the PDFDocument block has been left.
        fake_doc.close.assert_called_once()

    def test_doc_property_raises_outside_context(self):
        """Reading .doc without entering the context should raise RuntimeError."""
        unopened = PDFDocument("test.pdf")
        with pytest.raises(RuntimeError, match="must be used within a context manager"):
            _ = unopened.doc

    def test_page_count(self):
        """page_count should equal the length of the underlying document."""
        fake_doc = MagicMock()
        fake_doc.__len__.return_value = 5
        with patch("fitz.open", return_value=fake_doc), PDFDocument("test.pdf") as handle:
            assert handle.page_count == 5

    def test_get_page_dimensions(self):
        """get_page_dimensions should return the page rect's width and height."""
        fake_doc = self._doc_with_rect(595.0, 842.0)
        with patch("fitz.open", return_value=fake_doc), PDFDocument("test.pdf") as handle:
            page_w, page_h = handle.get_page_dimensions(0)
        assert page_w == 595.0
        assert page_h == 842.0

    def test_get_page_dimensions_cached(self):
        """Repeated dimension lookups for one page should hit the document once."""
        fake_doc = self._doc_with_rect(595.0, 842.0)
        with patch("fitz.open", return_value=fake_doc), PDFDocument("test.pdf") as handle:
            for _ in range(2):
                handle.get_page_dimensions(0)
            # The second lookup is expected to be answered from the cache.
            assert fake_doc.__getitem__.call_count == 1

    def test_get_render_dimensions(self):
        """Render dimensions should scale the page size by dpi / 72."""
        fake_doc = self._doc_with_rect(595.0, 842.0)  # A4 size in points
        cases = [
            # (dpi, expected width, expected height)
            (72, 595, 842),  # 72 DPI is 1:1
            (150, int(595 * 150 / 72), int(842 * 150 / 72)),  # ~2.08x zoom
        ]
        with patch("fitz.open", return_value=fake_doc), PDFDocument("test.pdf") as handle:
            for dpi, want_w, want_h in cases:
                got_w, got_h = handle.get_render_dimensions(0, dpi=dpi)
                assert got_w == want_w
                assert got_h == want_h
class TestPDFDocumentExtractTextTokens:
    """Tests for the PDFDocument.extract_text_tokens method."""

    @staticmethod
    def _page_returning(payload):
        """Return a mock page whose get_text() always yields ``payload``."""
        page = MagicMock()
        page.get_text.return_value = payload
        return page

    @staticmethod
    def _tokens_from(page):
        """Collect extract_text_tokens(0) from a mocked document serving ``page``."""
        doc = MagicMock()
        doc.__getitem__ = MagicMock(return_value=page)
        with patch("fitz.open", return_value=doc), PDFDocument("test.pdf") as pdf:
            return list(pdf.extract_text_tokens(0))

    def test_extract_from_dict_mode(self):
        """Each span of a text block should come back as one token."""
        spans = [
            {"text": "Hello", "bbox": [10, 20, 50, 35]},
            {"text": "World", "bbox": [60, 20, 100, 35]},
        ]
        text_block = {"type": 0, "lines": [{"spans": spans}]}  # type 0: text block
        found = self._tokens_from(self._page_returning({"blocks": [text_block]}))
        assert len(found) == 2
        assert found[0].text == "Hello"
        assert found[1].text == "World"

    def test_skips_non_text_blocks(self):
        """Blocks that are not text (e.g. images) should produce no tokens."""
        image_block = {"type": 1}  # image block, expected to be skipped
        text_block = {
            "type": 0,
            "lines": [{"spans": [{"text": "Text", "bbox": [0, 0, 50, 20]}]}],
        }
        found = self._tokens_from(
            self._page_returning({"blocks": [image_block, text_block]})
        )
        assert len(found) == 1
        assert found[0].text == "Text"

    def test_skips_empty_text(self):
        """Spans whose text is empty or a single space should be dropped."""
        spans = [
            {"text": "", "bbox": [0, 0, 10, 10]},
            {"text": " ", "bbox": [10, 0, 20, 10]},
            {"text": "Valid", "bbox": [20, 0, 50, 10]},
        ]
        payload = {"blocks": [{"type": 0, "lines": [{"spans": spans}]}]}
        found = self._tokens_from(self._page_returning(payload))
        assert len(found) == 1
        assert found[0].text == "Valid"

    def test_fallback_to_words_mode(self):
        """When dict mode yields nothing, words mode should supply the tokens."""

        def fake_get_text(mode):
            # Dict mode reports no blocks; any other mode returns one word tuple.
            if mode == "dict":
                return {"blocks": []}
            return [(10, 20, 50, 35, "Fallback", 0, 0, 0)]

        page = MagicMock()
        page.get_text.side_effect = fake_get_text
        found = self._tokens_from(page)
        assert len(found) == 1
        assert found[0].text == "Fallback"
class TestExtractTextTokensFunction:
    """Tests for the module-level extract_text_tokens function."""

    @staticmethod
    def _single_span_page(text):
        """Return a mock page holding one text block with a single span."""
        page = MagicMock()
        page.get_text.return_value = {
            "blocks": [
                {"type": 0, "lines": [{"spans": [{"text": text, "bbox": [0, 0, 50, 20]}]}]}
            ]
        }
        return page

    def test_extract_all_pages(self):
        """With page_no=None, tokens from every page should be returned in order."""
        pages = [self._single_span_page("Page0"), self._single_span_page("Page1")]

        def page_at(_self, index):
            # A plain function assigned as a magic method receives the mock itself first.
            return pages[index]

        doc = MagicMock()
        doc.__len__ = MagicMock(return_value=2)
        doc.__getitem__ = page_at
        with patch("fitz.open", return_value=doc):
            found = list(extract_text_tokens("test.pdf", page_no=None))
        assert len(found) == 2
        for index, label in enumerate(("Page0", "Page1")):
            assert found[index].text == label
            assert found[index].page_no == index

    def test_extract_specific_page(self):
        """With an explicit page_no, tokens should carry that page number."""
        doc = MagicMock()
        doc.__len__ = MagicMock(return_value=3)
        doc.__getitem__ = MagicMock(return_value=self._single_span_page("Specific"))
        with patch("fitz.open", return_value=doc):
            found = list(extract_text_tokens("test.pdf", page_no=1))
        assert len(found) == 1
        assert found[0].page_no == 1

    def test_skips_corrupted_bbox(self):
        """A span whose bbox holds an absurd coordinate should be dropped."""
        spans = [
            {"text": "Good", "bbox": [0, 0, 50, 20]},
            {"text": "Bad", "bbox": [1e10, 0, 50, 20]},  # corrupted x0
        ]
        page = MagicMock()
        page.get_text.return_value = {
            "blocks": [{"type": 0, "lines": [{"spans": spans}]}]
        }
        doc = MagicMock()
        doc.__len__ = MagicMock(return_value=1)
        doc.__getitem__ = MagicMock(return_value=page)
        with patch("fitz.open", return_value=doc):
            found = list(extract_text_tokens("test.pdf", page_no=0))
        assert len(found) == 1
        assert found[0].text == "Good"
class TestExtractWordsFunction:
    """Tests for the module-level extract_words function."""

    @staticmethod
    def _words_from(rows):
        """Run extract_words on page 0 of a one-page mock whose get_text() yields ``rows``."""
        page = MagicMock()
        page.get_text.return_value = rows
        doc = MagicMock()
        doc.__len__ = MagicMock(return_value=1)
        doc.__getitem__ = MagicMock(return_value=page)
        with patch("fitz.open", return_value=doc):
            return list(extract_words("test.pdf", page_no=0))

    def test_extract_words(self):
        """Each word tuple should become a token with matching text and bbox."""
        found = self._words_from(
            [
                (10, 20, 50, 35, "Hello", 0, 0, 0),
                (60, 20, 100, 35, "World", 0, 0, 1),
            ]
        )
        assert len(found) == 2
        first, second = found
        assert first.text == "Hello"
        assert first.bbox == (10, 20, 50, 35)
        assert second.text == "World"

    def test_skips_empty_words(self):
        """Word tuples with empty or single-space text should be dropped."""
        found = self._words_from(
            [
                (10, 20, 50, 35, "", 0, 0, 0),
                (60, 20, 100, 35, " ", 0, 0, 1),
                (110, 20, 150, 35, "Valid", 0, 0, 2),
            ]
        )
        assert len(found) == 1
        assert found[0].text == "Valid"
class TestExtractLinesFunction:
    """Tests for the module-level extract_lines function."""

    @staticmethod
    def _lines_from(line_dicts):
        """Run extract_lines on page 0 of a mock with one text block of ``line_dicts``."""
        page = MagicMock()
        page.get_text.return_value = {"blocks": [{"type": 0, "lines": line_dicts}]}
        doc = MagicMock()
        doc.__len__ = MagicMock(return_value=1)
        doc.__getitem__ = MagicMock(return_value=page)
        with patch("fitz.open", return_value=doc):
            return list(extract_lines("test.pdf", page_no=0))

    def test_extract_lines(self):
        """Spans of one line should merge into a single token spanning them all."""
        first_line = {
            "spans": [
                {"text": "Hello", "bbox": [10, 20, 50, 35]},
                {"text": "World", "bbox": [55, 20, 100, 35]},
            ]
        }
        second_line = {"spans": [{"text": "Second line", "bbox": [10, 40, 100, 55]}]}
        found = self._lines_from([first_line, second_line])
        assert len(found) == 2
        merged = found[0]
        assert merged.text == "Hello World"
        # The merged bbox should reach from the leftmost x0 to the rightmost x1.
        assert merged.bbox[0] == 10  # min x0
        assert merged.bbox[2] == 100  # max x1

    def test_skips_empty_lines(self):
        """A line without any spans should not produce a token."""
        blank_line = {"spans": []}
        text_line = {"spans": [{"text": "Valid", "bbox": [0, 0, 50, 20]}]}
        found = self._lines_from([blank_line, text_line])
        assert len(found) == 1
        assert found[0].text == "Valid"
class TestGetPageDimensionsFunction:
    """Tests for the module-level get_page_dimensions function."""

    @staticmethod
    def _doc_with_rect(width, height):
        """Return a mock document whose pages report a rect of the given size."""
        page = MagicMock()
        page.rect = MagicMock(width=width, height=height)
        doc = MagicMock()
        doc.__getitem__ = MagicMock(return_value=page)
        return doc

    def test_get_dimensions(self):
        """The function should return the page rect's width and height."""
        doc = self._doc_with_rect(612.0, 792.0)  # Letter size
        with patch("fitz.open", return_value=doc):
            page_w, page_h = get_page_dimensions("test.pdf", page_no=0)
        assert page_w == 612.0
        assert page_h == 792.0

    def test_get_dimensions_different_page(self):
        """The requested page number should be the index used on the document."""
        doc = self._doc_with_rect(595.0, 842.0)
        with patch("fitz.open", return_value=doc):
            get_page_dimensions("test.pdf", page_no=2)
        doc.__getitem__.assert_called_with(2)
class TestPDFDocumentIsTextPDF:
    """Tests for the PDFDocument.is_text_pdf method."""

    def test_delegates_to_detector(self):
        """is_text_pdf should forward the path and min_chars to the patched helper."""
        helper_target = "src.pdf.extractor._is_text_pdf_standalone"
        fake_doc = MagicMock()
        with patch("fitz.open", return_value=fake_doc), patch(
            helper_target, return_value=True
        ) as detector:
            with PDFDocument("test.pdf") as pdf:
                outcome = pdf.is_text_pdf(min_chars=50)
        detector.assert_called_once_with(Path("test.pdf"), 50)
        assert outcome is True
class TestPDFDocumentRenderPage:
    """Tests for the PDFDocument page-rendering methods."""

    @staticmethod
    def _doc_rendering(pixmap):
        """Return a mock document whose pages hand back ``pixmap`` from get_pixmap()."""
        page = MagicMock()
        page.get_pixmap.return_value = pixmap
        doc = MagicMock()
        doc.__getitem__ = MagicMock(return_value=page)
        return doc

    def test_render_page(self, tmp_path):
        """render_page should build the zoom matrix, save the pixmap and return the path."""
        pixmap = MagicMock()
        doc = self._doc_rendering(pixmap)
        target = tmp_path / "output.png"
        with patch("fitz.open", return_value=doc), patch("fitz.Matrix") as matrix_cls:
            with PDFDocument("test.pdf") as pdf:
                returned = pdf.render_page(0, target, dpi=150)
        # The matrix must be built with the same zoom factor on both axes.
        scale = 150 / 72
        matrix_cls.assert_called_once_with(scale, scale)
        # The pixmap must be written to the requested path, passed as a string.
        pixmap.save.assert_called_once_with(str(target))
        assert returned == target

    def test_render_all_pages(self, tmp_path):
        """render_all_pages should yield one result per page, numbered from 0."""
        doc = self._doc_rendering(MagicMock())
        doc.__len__ = MagicMock(return_value=2)
        # Presumably meant for filename generation; kept from the original setup.
        doc.stem = "test"
        with patch("fitz.open", return_value=doc), patch("fitz.Matrix"):
            with PDFDocument(tmp_path / "test.pdf") as pdf:
                rendered = list(pdf.render_all_pages(tmp_path, dpi=150))
        assert len(rendered) == 2
        for expected_no, entry in enumerate(rendered):
            assert entry[0] == expected_no  # first element is the page number
if __name__ == "__main__":
    # pytest.main() returns the run's exit code; propagate it so that running
    # this file directly reports test failures to the calling shell instead of
    # always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))