Re-structure the project.
This commit is contained in:
0
tests/pdf/__init__.py
Normal file
0
tests/pdf/__init__.py
Normal file
335
tests/pdf/test_detector.py
Normal file
335
tests/pdf/test_detector.py
Normal file
@@ -0,0 +1,335 @@
|
||||
"""
|
||||
Tests for the PDF Type Detection Module.
|
||||
|
||||
Tests cover all detector functions in src/pdf/detector.py
|
||||
|
||||
Note: These tests require PyMuPDF (fitz) and actual PDF files or mocks.
|
||||
Some tests are marked as integration tests that require real PDF files.
|
||||
|
||||
Usage:
|
||||
pytest tests/pdf/test_detector.py -v -o 'addopts='
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
from src.pdf.detector import (
|
||||
extract_text_first_page,
|
||||
is_text_pdf,
|
||||
get_pdf_type,
|
||||
get_page_info,
|
||||
PDFType,
|
||||
)
|
||||
|
||||
|
||||
class TestExtractTextFirstPage:
    """Unit tests for ``extract_text_first_page`` with ``fitz.open`` mocked."""

    def test_with_mock_empty_pdf(self):
        """A document reporting zero pages yields an empty string."""
        empty_doc = MagicMock()
        empty_doc.__len__.return_value = 0

        with patch("fitz.open", return_value=empty_doc):
            extracted = extract_text_first_page("test.pdf")

        assert extracted == ""

    def test_with_mock_text_pdf(self):
        """Text produced by the indexed page's ``get_text`` appears in the result."""
        first_page = MagicMock()
        first_page.get_text.return_value = "Faktura 12345\nDatum: 2025-01-15"

        one_page_doc = MagicMock()
        one_page_doc.__len__.return_value = 1
        one_page_doc.__getitem__.return_value = first_page

        with patch("fitz.open", return_value=one_page_doc):
            extracted = extract_text_first_page("test.pdf")

        for fragment in ("Faktura", "12345"):
            assert fragment in extracted
|
||||
|
||||
|
||||
class TestIsTextPDF:
    """Unit tests for ``is_text_pdf`` with first-page extraction patched out."""

    @staticmethod
    def _classify(page_text, **kwargs):
        """Call ``is_text_pdf`` while the patched extractor returns *page_text*."""
        with patch("src.pdf.detector.extract_text_first_page", return_value=page_text):
            return is_text_pdf("test.pdf", **kwargs)

    def test_empty_pdf_returns_false(self):
        """An empty extraction result is classified as not text."""
        assert self._classify("") is False

    def test_short_text_returns_false(self):
        """A five-character extraction result is classified as not text."""
        assert self._classify("Hello") is False

    def test_readable_text_with_keywords_returns_true(self):
        """Readable text containing several invoice keywords is classified as text."""
        invoice_body = (
            "\n"
            "Faktura\n"
            "Datum: 2025-01-15\n"
            "Belopp: 1234,56 SEK\n"
            "Bankgiro: 5393-9484\n"
            "Moms: 25%\n"
        )
        # The padding guarantees the text is longer than 200 characters.
        padded = invoice_body + "a" * 200

        assert self._classify(padded) is True

    def test_garbled_text_returns_false(self):
        """Text dominated by control characters is classified as not text."""
        # 300 control characters followed by only 60 letters.
        garbled = "\x00\x01\x02" * 100 + "abc" * 20

        assert self._classify(garbled) is False

    def test_text_without_keywords_needs_high_readability(self):
        """Plain ASCII prose without invoice keywords is still classified as text."""
        prose = "The quick brown fox jumps over the lazy dog. " * 10

        # No invoice keywords here, so the outcome rests on the readability check.
        outcome = self._classify(prose)

        assert outcome is True

    def test_custom_min_chars(self):
        """A 15-character text is rejected when ``min_chars`` is 30."""
        short_text = "Short text here"  # 15 chars

        assert self._classify(short_text, min_chars=30) is False
        # NOTE(review): the original test describes a follow-up check with
        # min_chars=10 (expected to pass the length check but still fail the
        # keyword/readability checks) and never asserts it; still unasserted.
|
||||
|
||||
|
||||
class TestGetPDFType:
    """Unit tests for ``get_pdf_type`` against mocked documents.

    Each test builds a mock document whose pages return canned ``get_text``
    values, patches ``fitz.open`` to return it, and checks the reported type.
    """

    @staticmethod
    def _doc_with_page_texts(*page_texts):
        """Build a mock document with one page per entry in *page_texts*.

        Iteration is configured through ``__iter__.return_value`` with a list,
        so every pass over the document gets a fresh iterator. Assigning a
        pre-built ``iter([...])`` would be exhausted after the first pass and
        make any later pass see zero pages.
        """
        pages = []
        for text in page_texts:
            page = MagicMock()
            page.get_text.return_value = text
            pages.append(page)

        doc = MagicMock()
        doc.__len__.return_value = len(pages)
        doc.__iter__.return_value = pages
        return doc

    def _detect(self, *page_texts):
        """Return ``get_pdf_type`` for a mocked document with the given page texts."""
        doc = self._doc_with_page_texts(*page_texts)
        with patch("fitz.open", return_value=doc):
            return get_pdf_type("test.pdf")

    def test_empty_pdf_returns_scanned(self):
        """A document with no pages is reported as 'scanned'."""
        assert self._detect() == "scanned"

    def test_all_text_pages_returns_text(self):
        """Two pages of 50 characters each are reported as 'text'."""
        # 50 chars per page; the original comments cite a 30-char threshold.
        assert self._detect("A" * 50, "B" * 50) == "text"

    def test_no_text_pages_returns_scanned(self):
        """An empty page plus a two-character page are reported as 'scanned'."""
        assert self._detect("", "AB") == "scanned"

    def test_mixed_pages_returns_mixed(self):
        """One page with text and one without are reported as 'mixed'."""
        assert self._detect("A" * 50, "") == "mixed"
|
||||
|
||||
|
||||
class TestGetPageInfo:
    """Unit tests for ``get_page_info`` against mocked documents."""

    @staticmethod
    def _page(text, width, height):
        """Build a mock page with the given text and ``rect`` dimensions."""
        rect = MagicMock()
        rect.width = width
        rect.height = height

        page = MagicMock()
        page.get_text.return_value = text
        page.rect = rect
        return page

    @staticmethod
    def _page_info_for(pages):
        """Run ``get_page_info`` on a mock document that iterates over *pages*."""
        doc = MagicMock()
        doc.__len__.return_value = len(pages)
        doc.__iter__.return_value = pages

        with patch("fitz.open", return_value=doc):
            return get_page_info("test.pdf")

    def test_single_page_pdf(self):
        """A one-page document yields one entry describing that page."""
        # 595 x 842 points is the A4 page size.
        infos = self._page_info_for([self._page("A" * 50, 595.0, 842.0)])

        assert len(infos) == 1
        only = infos[0]
        assert only["page_no"] == 0
        assert only["width"] == 595.0
        assert only["height"] == 842.0
        assert only["has_text"] is True
        assert only["char_count"] == 50

    def test_multi_page_pdf(self):
        """A three-page document yields one entry per page, in order."""
        layout = [
            ("A" * 50, 595.0, 842.0),   # Page 0: has text
            ("", 595.0, 842.0),         # Page 1: no text
            ("B" * 100, 612.0, 792.0),  # Page 2: different size, has text
        ]
        infos = self._page_info_for([self._page(*spec) for spec in layout])

        assert len(infos) == 3

        first, second, third = infos[0], infos[1], infos[2]

        # Page 0
        assert first["page_no"] == 0
        assert first["has_text"] is True
        assert first["char_count"] == 50

        # Page 1
        assert second["page_no"] == 1
        assert second["has_text"] is False
        assert second["char_count"] == 0

        # Page 2
        assert third["page_no"] == 2
        assert third["has_text"] is True
        assert third["width"] == 612.0
|
||||
|
||||
|
||||
class TestPDFTypeAnnotation:
    """Sanity check for the ``PDFType`` type alias."""

    def test_valid_types(self):
        """The three expected literals can be annotated as ``PDFType``."""
        # Type checkers validate the annotation; at runtime we only confirm
        # each value is one of the three expected strings.
        allowed = ["text", "scanned", "mixed"]
        valid_types: list[PDFType] = ["text", "scanned", "mixed"]

        assert all(candidate in allowed for candidate in valid_types)
|
||||
|
||||
|
||||
class TestIsTextPDFKeywordDetection:
    """Tests for keyword detection in ``is_text_pdf``.

    ``extract_text_first_page`` is patched in every test, so only the
    classification logic runs.
    """

    def test_detects_swedish_keywords(self):
        """Text containing a Swedish invoice keyword is classified as expected.

        The loop previously ended in ``pass`` and asserted nothing; each case
        is now checked against its declared expectation.
        """
        cases = [
            ("faktura", True),
            ("datum", True),
            ("belopp", True),
            ("bankgiro", True),
            ("plusgiro", True),
            ("moms", True),
        ]

        for keyword, expected in cases:
            # Text with the keyword plus enough filler content.
            text = f"Document with {keyword} keyword here" + " more text" * 50

            with patch("src.pdf.detector.extract_text_first_page", return_value=text):
                # NOTE(review): the filler is plain ASCII, so a readability
                # fallback may also accept it; this may not isolate keyword
                # matching. Confirm against the detector's thresholds.
                assert is_text_pdf("test.pdf") is expected, keyword

    def test_detects_english_keywords(self):
        """Text containing English invoice keywords is classified as text."""
        text = "Invoice document with date and amount information" + " x" * 100

        with patch("src.pdf.detector.extract_text_first_page", return_value=text):
            # invoice + date = 2 keywords
            result = is_text_pdf("test.pdf")

        assert result is True

    def test_needs_at_least_two_keywords(self):
        """A single keyword does not satisfy the keyword check on its own.

        The body previously ended in ``pass``; it now runs the detector and
        checks that a boolean verdict comes back.
        """
        # Only one keyword
        text = "This is a faktura document" + " x" * 200

        with patch("src.pdf.detector.extract_text_first_page", return_value=text):
            result = is_text_pdf("test.pdf")

        # NOTE(review): the original comments say that with one keyword the
        # detector falls back to other checks and should pass if readability
        # is high. The thresholds are not visible from this file, so only the
        # result type is pinned here; tighten to an exact value once confirmed.
        assert isinstance(result, bool)
|
||||
|
||||
|
||||
class TestReadabilityChecks:
    """Tests for the readability-ratio checks in ``is_text_pdf``."""

    @staticmethod
    def _verdict(page_text):
        """Call ``is_text_pdf`` while the patched extractor returns *page_text*."""
        with patch("src.pdf.detector.extract_text_first_page", return_value=page_text):
            return is_text_pdf("test.pdf")

    def test_high_ascii_ratio_passes(self):
        """Pure ASCII prose is classified as text."""
        ascii_prose = "This is a normal document with only ASCII characters. " * 10

        assert self._verdict(ascii_prose) is True

    def test_swedish_characters_accepted(self):
        """Text containing Swedish letters is classified as text."""
        swedish = "Fakturadatum för årets moms på öre belopp" + " normal" * 50

        assert self._verdict(swedish) is True

    def test_low_readability_fails(self):
        """Text made mostly of unreadable characters is classified as not text."""
        # 90 readable characters against 150 unreadable ones keeps the
        # readable share below the 70% mentioned in the original comments.
        readable_part = "abc" * 30
        unreadable_part = "\x80\x81\x82" * 50

        assert self._verdict(readable_part + unreadable_part) is False
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this test module directly, in verbose mode.
    cli_args = [__file__, "-v"]
    pytest.main(cli_args)
|
||||
572
tests/pdf/test_extractor.py
Normal file
572
tests/pdf/test_extractor.py
Normal file
@@ -0,0 +1,572 @@
|
||||
"""
|
||||
Tests for the PDF Text Extraction Module.
|
||||
|
||||
Tests cover all extractor functions in src/pdf/extractor.py
|
||||
|
||||
Note: These tests require PyMuPDF (fitz) and use mocks for unit testing.
|
||||
|
||||
Usage:
|
||||
pytest tests/pdf/test_extractor.py -v -o 'addopts='
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
from src.pdf.extractor import (
|
||||
Token,
|
||||
PDFDocument,
|
||||
extract_text_tokens,
|
||||
extract_words,
|
||||
extract_lines,
|
||||
get_page_dimensions,
|
||||
)
|
||||
|
||||
|
||||
class TestToken:
    """Tests for the ``Token`` dataclass and its bbox-derived properties."""

    @staticmethod
    def _sample(bbox=(10.0, 20.0, 50.0, 35.0)):
        """Build a token on page 0 with text ``"test"`` and the given bbox."""
        return Token(text="test", bbox=bbox, page_no=0)

    def test_creation(self):
        """All constructor fields are stored unchanged."""
        box = (10.0, 20.0, 50.0, 35.0)
        token = Token(text="Hello", bbox=box, page_no=0)

        assert token.text == "Hello"
        assert token.bbox == (10.0, 20.0, 50.0, 35.0)
        assert token.page_no == 0

    def test_x0_property(self):
        """``x0`` is the first bbox component."""
        assert self._sample().x0 == 10.0

    def test_y0_property(self):
        """``y0`` is the second bbox component."""
        assert self._sample().y0 == 20.0

    def test_x1_property(self):
        """``x1`` is the third bbox component."""
        assert self._sample().x1 == 50.0

    def test_y1_property(self):
        """``y1`` is the fourth bbox component."""
        assert self._sample().y1 == 35.0

    def test_width_property(self):
        """``width`` is the horizontal extent of the bbox."""
        assert self._sample().width == 40.0

    def test_height_property(self):
        """``height`` is the vertical extent of the bbox."""
        assert self._sample().height == 15.0

    def test_center_property(self):
        """``center`` is the midpoint of the bbox."""
        midpoint = self._sample(bbox=(10.0, 20.0, 50.0, 40.0)).center

        assert midpoint == (30.0, 30.0)
|
||||
|
||||
|
||||
class TestPDFDocument:
    """Tests for the ``PDFDocument`` context manager with ``fitz.open`` mocked."""

    @staticmethod
    def _doc_with_page_size(width, height):
        """Build a mock document whose indexed page has the given ``rect`` size."""
        rect = MagicMock()
        rect.width = width
        rect.height = height

        page = MagicMock()
        page.rect = rect

        doc = MagicMock()
        doc.__getitem__.return_value = page
        return doc

    def test_context_manager_opens_and_closes(self):
        """Entering opens the document; leaving closes it exactly once."""
        fake_doc = MagicMock()

        with patch("fitz.open", return_value=fake_doc) as open_spy:
            with PDFDocument("test.pdf") as pdf:
                open_spy.assert_called_once_with(Path("test.pdf"))
                assert pdf._doc is not None

            fake_doc.close.assert_called_once()

    def test_doc_property_raises_outside_context(self):
        """Reading ``doc`` before entering the context raises ``RuntimeError``."""
        unopened = PDFDocument("test.pdf")

        with pytest.raises(RuntimeError, match="must be used within a context manager"):
            _ = unopened.doc

    def test_page_count(self):
        """``page_count`` reflects the length of the underlying document."""
        fake_doc = MagicMock()
        fake_doc.__len__.return_value = 5

        with patch("fitz.open", return_value=fake_doc), PDFDocument("test.pdf") as pdf:
            assert pdf.page_count == 5

    def test_get_page_dimensions(self):
        """``get_page_dimensions`` returns the page rect's width and height."""
        fake_doc = self._doc_with_page_size(595.0, 842.0)

        with patch("fitz.open", return_value=fake_doc), PDFDocument("test.pdf") as pdf:
            width, height = pdf.get_page_dimensions(0)

            assert width == 595.0
            assert height == 842.0

    def test_get_page_dimensions_cached(self):
        """Repeated lookups for the same page index the document only once."""
        fake_doc = self._doc_with_page_size(595.0, 842.0)

        with patch("fitz.open", return_value=fake_doc), PDFDocument("test.pdf") as pdf:
            for _ in range(2):
                pdf.get_page_dimensions(0)

            # A second lookup served from the cache must not index the doc again.
            assert fake_doc.__getitem__.call_count == 1

    def test_get_render_dimensions(self):
        """Render dimensions scale the page size by ``dpi / 72``."""
        # 595 x 842 points is the A4 page size.
        fake_doc = self._doc_with_page_size(595.0, 842.0)

        with patch("fitz.open", return_value=fake_doc), PDFDocument("test.pdf") as pdf:
            # At 72 DPI the scale is 1:1, so the dimensions match the page.
            w72, h72 = pdf.get_render_dimensions(0, dpi=72)
            assert w72 == 595
            assert h72 == 842

            # At 150 DPI the zoom factor is 150/72 (about 2.08x).
            w150, h150 = pdf.get_render_dimensions(0, dpi=150)
            assert w150 == int(595 * 150 / 72)
            assert h150 == int(842 * 150 / 72)
|
||||
|
||||
|
||||
class TestPDFDocumentExtractTextTokens:
    """Tests for the ``PDFDocument.extract_text_tokens`` method."""

    @staticmethod
    def _tokens_from_page(page):
        """Return the tokens extracted from page 0 of a mock doc serving *page*."""
        doc = MagicMock()
        doc.__getitem__.return_value = page

        with patch("fitz.open", return_value=doc), PDFDocument("test.pdf") as pdf:
            return list(pdf.extract_text_tokens(0))

    def test_extract_from_dict_mode(self):
        """Each span of a text block becomes one token, in order."""
        page = MagicMock()
        page.get_text.return_value = {
            "blocks": [
                {
                    "type": 0,  # Text block
                    "lines": [
                        {
                            "spans": [
                                {"text": "Hello", "bbox": [10, 20, 50, 35]},
                                {"text": "World", "bbox": [60, 20, 100, 35]},
                            ]
                        }
                    ],
                }
            ]
        }

        tokens = self._tokens_from_page(page)

        assert len(tokens) == 2
        assert [tokens[0].text, tokens[1].text] == ["Hello", "World"]

    def test_skips_non_text_blocks(self):
        """Blocks whose type is not 0 (such as images) contribute no tokens."""
        image_block = {"type": 1}  # Image block - should be skipped
        text_block = {
            "type": 0,
            "lines": [{"spans": [{"text": "Text", "bbox": [0, 0, 50, 20]}]}],
        }
        page = MagicMock()
        page.get_text.return_value = {"blocks": [image_block, text_block]}

        tokens = self._tokens_from_page(page)

        assert len(tokens) == 1
        assert tokens[0].text == "Text"

    def test_skips_empty_text(self):
        """Spans with empty or whitespace-only text contribute no tokens."""
        spans = [
            {"text": "", "bbox": [0, 0, 10, 10]},
            {"text": " ", "bbox": [10, 0, 20, 10]},
            {"text": "Valid", "bbox": [20, 0, 50, 10]},
        ]
        page = MagicMock()
        page.get_text.return_value = {
            "blocks": [{"type": 0, "lines": [{"spans": spans}]}]
        }

        tokens = self._tokens_from_page(page)

        assert len(tokens) == 1
        assert tokens[0].text == "Valid"

    def test_fallback_to_words_mode(self):
        """When dict mode yields no blocks, tokens come from words mode."""

        def fake_get_text(mode):
            # Dict mode reports no blocks; any other mode returns one word tuple.
            if mode == "dict":
                return {"blocks": []}
            return [(10, 20, 50, 35, "Fallback", 0, 0, 0)]

        page = MagicMock()
        page.get_text.side_effect = fake_get_text

        tokens = self._tokens_from_page(page)

        assert len(tokens) == 1
        assert tokens[0].text == "Fallback"
|
||||
|
||||
|
||||
class TestExtractTextTokensFunction:
    """Tests for the standalone ``extract_text_tokens`` function."""

    @staticmethod
    def _single_span_page(text):
        """Build a mock page whose dict output holds one span with *text*."""
        page = MagicMock()
        page.get_text.return_value = {
            "blocks": [
                {"type": 0, "lines": [{"spans": [{"text": text, "bbox": [0, 0, 50, 20]}]}]}
            ]
        }
        return page

    @staticmethod
    def _extract(doc, page_no):
        """Run ``extract_text_tokens`` with ``fitz.open`` returning *doc*."""
        with patch("fitz.open", return_value=doc):
            return list(extract_text_tokens("test.pdf", page_no=page_no))

    def test_extract_all_pages(self):
        """With ``page_no=None`` tokens from every page are returned in order."""
        pages = [self._single_span_page("Page0"), self._single_span_page("Page1")]

        def page_at(_doc, index):
            return pages[index]

        doc = MagicMock()
        doc.__len__.return_value = 2
        doc.__getitem__ = page_at

        tokens = self._extract(doc, None)

        assert len(tokens) == 2
        first, second = tokens
        assert first.text == "Page0"
        assert first.page_no == 0
        assert second.text == "Page1"
        assert second.page_no == 1

    def test_extract_specific_page(self):
        """With an explicit ``page_no`` only that page's tokens are returned."""
        doc = MagicMock()
        doc.__len__.return_value = 3
        doc.__getitem__.return_value = self._single_span_page("Specific")

        tokens = self._extract(doc, 1)

        assert len(tokens) == 1
        assert tokens[0].page_no == 1

    def test_skips_corrupted_bbox(self):
        """A span whose bbox holds an absurdly large coordinate is dropped."""
        spans = [
            {"text": "Good", "bbox": [0, 0, 50, 20]},
            {"text": "Bad", "bbox": [1e10, 0, 50, 20]},  # Corrupted
        ]
        page = MagicMock()
        page.get_text.return_value = {
            "blocks": [{"type": 0, "lines": [{"spans": spans}]}]
        }

        doc = MagicMock()
        doc.__len__.return_value = 1
        doc.__getitem__.return_value = page

        tokens = self._extract(doc, 0)

        assert len(tokens) == 1
        assert tokens[0].text == "Good"
|
||||
|
||||
|
||||
class TestExtractWordsFunction:
    """Tests for the ``extract_words`` function."""

    @staticmethod
    def _words_from(word_tuples):
        """Run ``extract_words`` on page 0 of a one-page mock document."""
        page = MagicMock()
        page.get_text.return_value = word_tuples

        doc = MagicMock()
        doc.__len__.return_value = 1
        doc.__getitem__.return_value = page

        with patch("fitz.open", return_value=doc):
            return list(extract_words("test.pdf", page_no=0))

    def test_extract_words(self):
        """Each word tuple becomes a token carrying its text and bbox."""
        tokens = self._words_from(
            [
                (10, 20, 50, 35, "Hello", 0, 0, 0),
                (60, 20, 100, 35, "World", 0, 0, 1),
            ]
        )

        assert len(tokens) == 2
        assert tokens[0].text == "Hello"
        assert tokens[0].bbox == (10, 20, 50, 35)
        assert tokens[1].text == "World"

    def test_skips_empty_words(self):
        """Word tuples with empty or whitespace-only text are dropped."""
        tokens = self._words_from(
            [
                (10, 20, 50, 35, "", 0, 0, 0),
                (60, 20, 100, 35, " ", 0, 0, 1),
                (110, 20, 150, 35, "Valid", 0, 0, 2),
            ]
        )

        assert len(tokens) == 1
        assert tokens[0].text == "Valid"
|
||||
|
||||
|
||||
class TestExtractLinesFunction:
    """Tests for the ``extract_lines`` function."""

    @staticmethod
    def _lines_from(text_lines):
        """Run ``extract_lines`` on page 0 whose single text block has *text_lines*."""
        page = MagicMock()
        page.get_text.return_value = {
            "blocks": [{"type": 0, "lines": text_lines}]
        }

        doc = MagicMock()
        doc.__len__.return_value = 1
        doc.__getitem__.return_value = page

        with patch("fitz.open", return_value=doc):
            return list(extract_lines("test.pdf", page_no=0))

    def test_extract_lines(self):
        """Spans on one line are merged into a single token spanning them."""
        first_line = {
            "spans": [
                {"text": "Hello", "bbox": [10, 20, 50, 35]},
                {"text": "World", "bbox": [55, 20, 100, 35]},
            ]
        }
        second_line = {
            "spans": [
                {"text": "Second line", "bbox": [10, 40, 100, 55]},
            ]
        }

        tokens = self._lines_from([first_line, second_line])

        assert len(tokens) == 2
        merged = tokens[0]
        assert merged.text == "Hello World"
        # The merged bbox must cover both spans.
        assert merged.bbox[0] == 10  # min x0
        assert merged.bbox[2] == 100  # max x1

    def test_skips_empty_lines(self):
        """A line without spans produces no token."""
        tokens = self._lines_from(
            [
                {"spans": []},  # Empty line
                {"spans": [{"text": "Valid", "bbox": [0, 0, 50, 20]}]},
            ]
        )

        assert len(tokens) == 1
        assert tokens[0].text == "Valid"
|
||||
|
||||
|
||||
class TestGetPageDimensionsFunction:
    """Tests for the standalone ``get_page_dimensions`` function."""

    @staticmethod
    def _doc_with_page_size(width, height):
        """Build a mock document whose indexed page has the given ``rect`` size."""
        rect = MagicMock()
        rect.width = width
        rect.height = height

        page = MagicMock()
        page.rect = rect

        doc = MagicMock()
        doc.__getitem__.return_value = page
        return doc

    def test_get_dimensions(self):
        """The page rect's width and height are returned."""
        # 612 x 792 points is the US Letter page size.
        doc = self._doc_with_page_size(612.0, 792.0)

        with patch("fitz.open", return_value=doc):
            width, height = get_page_dimensions("test.pdf", page_no=0)

        assert width == 612.0
        assert height == 792.0

    def test_get_dimensions_different_page(self):
        """The requested page number is the index used on the document."""
        doc = self._doc_with_page_size(595.0, 842.0)

        with patch("fitz.open", return_value=doc):
            get_page_dimensions("test.pdf", page_no=2)

        doc.__getitem__.assert_called_with(2)
|
||||
|
||||
|
||||
class TestPDFDocumentIsTextPDF:
    """Tests for the ``PDFDocument.is_text_pdf`` method."""

    def test_delegates_to_detector(self):
        """The call is forwarded to ``_is_text_pdf_standalone`` with path and min_chars."""
        fake_doc = MagicMock()
        open_patch = patch("fitz.open", return_value=fake_doc)
        detector_patch = patch(
            "src.pdf.extractor._is_text_pdf_standalone", return_value=True
        )

        with open_patch, detector_patch as detector_spy:
            with PDFDocument("test.pdf") as pdf:
                outcome = pdf.is_text_pdf(min_chars=50)

            detector_spy.assert_called_once_with(Path("test.pdf"), 50)
            assert outcome is True
|
||||
|
||||
|
||||
class TestPDFDocumentRenderPage:
    """Tests for the ``PDFDocument`` render methods."""

    @staticmethod
    def _renderable_doc():
        """Build a mock doc whose page returns a mock pixmap; return (doc, pixmap)."""
        pixmap = MagicMock()

        page = MagicMock()
        page.get_pixmap.return_value = pixmap

        doc = MagicMock()
        doc.__getitem__.return_value = page
        return doc, pixmap

    def test_render_page(self, tmp_path):
        """Rendering builds the zoom matrix, saves the pixmap and returns the path."""
        doc, pixmap = self._renderable_doc()
        target = tmp_path / "output.png"

        with patch("fitz.open", return_value=doc), patch("fitz.Matrix") as matrix_spy:
            with PDFDocument("test.pdf") as pdf:
                returned = pdf.render_page(0, target, dpi=150)

            # The matrix must be built with the dpi/72 zoom on both axes.
            zoom = 150 / 72
            matrix_spy.assert_called_once_with(zoom, zoom)

            # The pixmap must be saved to the requested path, as a string.
            pixmap.save.assert_called_once_with(str(target))

            assert returned == target

    def test_render_all_pages(self, tmp_path):
        """Every page of a two-page document is rendered, numbered from 0."""
        doc, _pixmap = self._renderable_doc()
        doc.__len__.return_value = 2
        doc.stem = "test"  # For filename generation

        with patch("fitz.open", return_value=doc), patch("fitz.Matrix"):
            with PDFDocument(tmp_path / "test.pdf") as pdf:
                rendered = list(pdf.render_all_pages(tmp_path, dpi=150))

            assert len(rendered) == 2
            assert rendered[0][0] == 0  # Page number
            assert rendered[1][0] == 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this test module directly, in verbose mode.
    cli_args = [__file__, "-v"]
    pytest.main(cli_args)
|
||||
Reference in New Issue
Block a user