"""
Tests for PP-StructureV3 Table Detection

TDD tests for TableDetector class.
Tests are designed to run without requiring the actual PP-StructureV3 library
by using mock objects.
"""

from dataclasses import dataclass  # noqa: F401  (kept: pre-existing import)
from pathlib import Path
from typing import Any
from unittest.mock import MagicMock, patch

import numpy as np
import pytest

from backend.table.structure_detector import (
    TableDetectionResult,
    TableDetector,
    TableDetectorConfig,
)

# NOTE(review): the HTML fixture strings in this module contain only text and
# newlines. They look like table markup whose tags were lost somewhere
# upstream -- confirm against the intended fixtures. The tests only compare
# the detector's output with the very same string that was fed in, so they
# are self-consistent either way.


# ---------------------------------------------------------------------------
# Shared helpers
# ---------------------------------------------------------------------------


def _blank_image(height: int = 100, width: int = 100) -> np.ndarray:
    """Return an all-zero, 3-channel uint8 image of the given size."""
    return np.zeros((height, width, 3), dtype=np.uint8)


def _make_element(*, missing: tuple = (), **attrs: Any) -> MagicMock:
    """Build a mock layout element.

    Args:
        missing: Attribute names to delete from the mock so that accessing
            them raises AttributeError (a plain MagicMock would otherwise
            auto-create them).
        **attrs: Attributes to set on the mock before the deletions.
    """
    element = MagicMock()
    for name, value in attrs.items():
        setattr(element, name, value)
    for name in missing:
        delattr(element, name)
    return element


def _legacy_result(elements: list) -> MagicMock:
    """Build a mock legacy-API result exposing only ``layout_elements``."""
    # spec=["layout_elements"] keeps MagicMock from auto-creating a 'get'
    # method, so the result looks like the legacy (non dict-like) API.
    result = MagicMock(spec=["layout_elements"])
    result.layout_elements = elements
    return result


def _make_detector(predict_value: Any, config: Any = None) -> tuple:
    """Return ``(detector, pipeline)`` with ``pipeline.predict`` stubbed.

    ``config`` is only forwarded when given, so the detector's own default
    is used otherwise.
    """
    pipeline = MagicMock()
    pipeline.predict.return_value = predict_value
    kwargs = {"pipeline": pipeline}
    if config is not None:
        kwargs["config"] = config
    return TableDetector(**kwargs), pipeline


# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------


class TestTableDetectionResult:
    """Tests for TableDetectionResult dataclass."""

    def test_create_with_required_fields(self):
        """Test creating result with required fields."""
        result = TableDetectionResult(
            bbox=(10.0, 20.0, 300.0, 400.0),
            html="\nTest\n",
            confidence=0.95,
            table_type="wired",
        )

        assert result.bbox == (10.0, 20.0, 300.0, 400.0)
        assert result.html == "\nTest\n"
        assert result.confidence == 0.95
        assert result.table_type == "wired"
        assert result.cells == []

    def test_create_with_cells(self):
        """Test creating result with cell data."""
        cells = [
            {"text": "Header1", "row": 0, "col": 0},
            {"text": "Value1", "row": 1, "col": 0},
        ]
        result = TableDetectionResult(
            bbox=(0, 0, 100, 100),
            html="\n",
            confidence=0.9,
            table_type="wireless",
            cells=cells,
        )

        assert len(result.cells) == 2
        assert result.cells[0]["text"] == "Header1"
        assert result.table_type == "wireless"

    def test_bbox_is_tuple_of_floats(self):
        """Test that bbox accepts int inputs and keeps four coordinates."""
        result = TableDetectionResult(
            bbox=(10, 20, 300, 400),  # int inputs
            html="",
            confidence=0.9,
            table_type="wired",
        )

        # Should work with int inputs (duck typing)
        assert len(result.bbox) == 4


class TestTableDetectorConfig:
    """Tests for TableDetectorConfig dataclass."""

    def test_default_values(self):
        """Test default configuration values."""
        config = TableDetectorConfig()

        assert config.device == "gpu:0"
        assert config.use_doc_orientation_classify is False
        assert config.use_doc_unwarping is False
        assert config.use_textline_orientation is False
        # SLANeXt models for better table recognition accuracy
        assert config.wired_table_model == "SLANeXt_wired"
        assert config.wireless_table_model == "SLANeXt_wireless"
        assert config.layout_model == "PP-DocLayout_plus-L"
        assert config.min_confidence == 0.5

    def test_custom_values(self):
        """Test custom configuration values."""
        config = TableDetectorConfig(
            device="cpu",
            min_confidence=0.7,
            wired_table_model="SLANet_plus",
        )

        assert config.device == "cpu"
        assert config.min_confidence == 0.7
        assert config.wired_table_model == "SLANet_plus"


class TestTableDetectorInitialization:
    """Tests for TableDetector initialization."""

    def test_init_with_default_config(self):
        """Test initialization with default config."""
        detector = TableDetector()

        assert detector.config is not None
        assert detector.config.device == "gpu:0"
        assert detector._initialized is False

    def test_init_with_custom_config(self):
        """Test initialization with custom config."""
        config = TableDetectorConfig(device="cpu", min_confidence=0.8)
        detector = TableDetector(config=config)

        assert detector.config.device == "cpu"
        assert detector.config.min_confidence == 0.8

    def test_init_with_mock_pipeline(self):
        """Test initialization with pre-initialized pipeline."""
        mock_pipeline = MagicMock()
        detector = TableDetector(pipeline=mock_pipeline)

        assert detector._initialized is True
        assert detector._pipeline is mock_pipeline


class TestTableDetectorDetection:
    """Tests for TableDetector.detect() method."""

    def create_mock_element(
        self,
        label: str = "table",
        bbox: tuple = (10, 20, 300, 400),
        html: str = "\nTest\n",
        score: float = 0.95,
    ) -> MagicMock:
        """Create a mock PP-StructureV3 element."""
        element = MagicMock()
        element.label = label
        element.bbox = bbox
        element.html = html
        element.score = score
        element.cells = []
        return element

    def create_mock_result(self, elements: list) -> MagicMock:
        """Create a mock PP-StructureV3 result (legacy API without 'get')."""
        return _legacy_result(elements)

    def test_detect_single_table(self):
        """Test detecting a single table in image."""
        element = self.create_mock_element()
        detector, pipeline = _make_detector([self.create_mock_result([element])])

        results = detector.detect(_blank_image())

        assert len(results) == 1
        assert results[0].bbox == (10.0, 20.0, 300.0, 400.0)
        assert results[0].confidence == 0.95
        assert results[0].table_type == "wired"
        pipeline.predict.assert_called_once()

    def test_detect_multiple_tables(self):
        """Test detecting multiple tables in image."""
        element1 = self.create_mock_element(bbox=(10, 20, 300, 200), html="1\n")
        element2 = self.create_mock_element(bbox=(10, 220, 300, 400), html="2\n")
        detector, _ = _make_detector(
            [self.create_mock_result([element1, element2])]
        )

        results = detector.detect(_blank_image(500, 400))

        assert len(results) == 2
        assert results[0].html == "1\n"
        assert results[1].html == "2\n"

    def test_detect_no_tables(self):
        """Test handling of image with no tables."""
        # Return result with non-table elements
        text_element = _make_element(label="text")
        detector, _ = _make_detector([self.create_mock_result([text_element])])

        results = detector.detect(_blank_image())

        assert len(results) == 0

    def test_detect_filters_low_confidence(self):
        """Test that low confidence tables are filtered out."""
        low_conf_element = self.create_mock_element(score=0.3)
        high_conf_element = self.create_mock_element(score=0.9)
        detector, _ = _make_detector(
            [self.create_mock_result([low_conf_element, high_conf_element])],
            config=TableDetectorConfig(min_confidence=0.5),
        )

        results = detector.detect(_blank_image())

        assert len(results) == 1
        assert results[0].confidence == 0.9

    def test_detect_wireless_table(self):
        """Test detecting wireless (borderless) table."""
        element = self.create_mock_element(label="wireless_table")
        detector, _ = _make_detector([self.create_mock_result([element])])

        results = detector.detect(_blank_image())

        assert len(results) == 1
        assert results[0].table_type == "wireless"

    def test_detect_with_file_path(self):
        """Test detection with file path input."""
        element = self.create_mock_element()
        detector, pipeline = _make_detector([self.create_mock_result([element])])

        # Should accept string path
        detector.detect("/path/to/image.png")

        pipeline.predict.assert_called_with("/path/to/image.png")

    def test_detect_returns_empty_on_none_results(self):
        """Test handling of None results from pipeline."""
        detector, _ = _make_detector(None)

        results = detector.detect(_blank_image())

        assert results == []


class TestTableDetectorLazyInit:
    """Tests for lazy initialization of PP-StructureV3."""

    def test_lazy_init_flag_starts_false(self):
        """Test that pipeline is not initialized on construction."""
        detector = TableDetector()

        assert detector._initialized is False
        assert detector._pipeline is None

    def test_lazy_init_with_injected_pipeline(self):
        """Test that injected pipeline skips lazy initialization."""
        detector, pipeline = _make_detector([])

        assert detector._initialized is True
        assert detector._pipeline is pipeline

        # Detection should work without triggering _ensure_initialized import
        results = detector.detect(_blank_image())

        assert results == []
        pipeline.predict.assert_called_once()

    def test_import_error_without_paddleocr(self):
        """Test ImportError when paddleocr is not available."""
        detector = TableDetector()

        # Simulate paddleocr not being installed
        with patch.dict("sys.modules", {"paddleocr": None}):
            with pytest.raises(ImportError) as exc_info:
                detector._ensure_initialized()

        # Checked after the raises-block: code following the raising call
        # inside it would never run.
        assert "paddleocr" in str(exc_info.value).lower()


class TestTableDetectorParseResults:
    """Tests for result parsing logic."""

    def test_parse_element_with_box_attribute(self):
        """Test parsing element with 'box' instead of 'bbox'."""
        element = _make_element(
            label="table",
            box=[10, 20, 300, 400],  # 'box' instead of 'bbox'
            html="\n",
            score=0.9,
            cells=[],
            missing=("bbox",),
        )
        detector, _ = _make_detector([_legacy_result([element])])

        results = detector.detect(_blank_image())

        assert len(results) == 1
        assert results[0].bbox == (10.0, 20.0, 300.0, 400.0)

    def test_parse_element_with_table_html_attribute(self):
        """Test parsing element with 'table_html' instead of 'html'."""
        element = _make_element(
            label="table",
            bbox=[0, 0, 100, 100],
            table_html="\nContent\n",
            score=0.9,
            cells=[],
            missing=("html",),
        )
        detector, _ = _make_detector([_legacy_result([element])])

        results = detector.detect(_blank_image())

        assert len(results) == 1
        # Assert on real fixture text; `"" in html` would hold for any string.
        assert "Content" in results[0].html

    def test_parse_element_with_type_attribute(self):
        """Test parsing element with 'type' instead of 'label'."""
        element = _make_element(
            type="table",  # 'type' instead of 'label'
            bbox=[0, 0, 100, 100],
            html="\n",
            score=0.9,
            cells=[],
            missing=("label",),
        )
        detector, _ = _make_detector([_legacy_result([element])])

        results = detector.detect(_blank_image())

        assert len(results) == 1

    def test_parse_cells_data(self):
        """Test parsing cell-level data from element."""
        cell1 = _make_element(
            text="Header", row=0, col=0, row_span=1, col_span=1,
            bbox=[0, 0, 50, 20],
        )
        cell2 = _make_element(
            text="Value", row=1, col=0, row_span=1, col_span=1,
            bbox=[0, 20, 50, 40],
        )
        element = _make_element(
            label="table",
            bbox=[0, 0, 100, 100],
            html="\n",
            score=0.9,
            cells=[cell1, cell2],
        )
        detector, _ = _make_detector([_legacy_result([element])])

        results = detector.detect(_blank_image())

        assert len(results) == 1
        assert len(results[0].cells) == 2
        assert results[0].cells[0]["text"] == "Header"
        assert results[0].cells[0]["row"] == 0
        assert results[0].cells[1]["text"] == "Value"
        assert results[0].cells[1]["row"] == 1


class TestTableDetectorEdgeCases:
    """Tests for edge cases and error handling."""

    def test_handles_malformed_element_gracefully(self):
        """Test graceful handling of malformed element data."""
        # Element missing required attributes (no bbox and no box)
        bad_element = _make_element(label="table", missing=("bbox", "box"))
        good_element = _make_element(
            label="table",
            bbox=[0, 0, 100, 100],
            html="\n",
            score=0.9,
            cells=[],
        )
        detector, _ = _make_detector(
            [_legacy_result([bad_element, good_element])]
        )

        # Should not raise, should skip bad element
        results = detector.detect(_blank_image())

        assert len(results) == 1

    def test_handles_empty_layout_elements(self):
        """Test handling of empty layout_elements list."""
        detector, _ = _make_detector([_legacy_result([])])

        results = detector.detect(_blank_image())

        assert results == []

    def test_handles_result_without_layout_elements(self):
        """Test handling of result without layout_elements attribute."""
        mock_result = MagicMock(spec=[])  # No attributes
        detector, _ = _make_detector([mock_result])

        results = detector.detect(_blank_image())

        assert results == []

    def test_confidence_as_list(self):
        """Test handling confidence score as list."""
        element = _make_element(
            label="table",
            bbox=[0, 0, 100, 100],
            html="\n",
            score=[0.95],  # Score as list
            cells=[],
        )
        detector, _ = _make_detector([_legacy_result([element])])

        results = detector.detect(_blank_image())

        assert len(results) == 1
        assert results[0].confidence == 0.95


class TestPaddleX3xAPI:
    """Tests for PaddleX 3.x API support (LayoutParsingResultV2)."""

    def test_parse_paddlex_result_with_tables(self):
        """Test parsing PaddleX 3.x LayoutParsingResultV2 with tables."""
        # Simulate PaddleX 3.x dict-like result
        mock_result = {
            "table_res_list": [
                {
                    "cell_box_list": [[0, 0, 50, 20], [50, 0, 100, 20]],
                    "pred_html": "\nCell1Cell2\n",
                    "table_ocr_pred": ["Cell1", "Cell2"],
                    "table_region_id": 0,
                }
            ],
            "parsing_res_list": [
                {"label": "table", "bbox": [10, 20, 200, 300]},
            ],
        }
        detector, _ = _make_detector([mock_result])

        results = detector.detect(_blank_image())

        assert len(results) == 1
        assert results[0].html == "\nCell1Cell2\n"
        assert results[0].bbox == (10.0, 20.0, 200.0, 300.0)
        assert len(results[0].cells) == 2
        assert results[0].cells[0]["text"] == "Cell1"
        assert results[0].cells[1]["text"] == "Cell2"

    def test_parse_paddlex_result_empty_tables(self):
        """Test parsing PaddleX 3.x result with no tables."""
        mock_result = {
            "table_res_list": None,
            "parsing_res_list": [
                {"label": "text", "bbox": [10, 20, 200, 300]},
            ],
        }
        detector, _ = _make_detector([mock_result])

        results = detector.detect(_blank_image())

        assert len(results) == 0

    def test_parse_paddlex_result_multiple_tables(self):
        """Test parsing PaddleX 3.x result with multiple tables."""
        mock_result = {
            "table_res_list": [
                {
                    "cell_box_list": [[0, 0, 50, 20]],
                    "pred_html": "1\n",
                    "table_ocr_pred": ["Text1"],
                    "table_region_id": 0,
                },
                {
                    "cell_box_list": [[0, 0, 100, 40]],
                    "pred_html": "2\n",
                    "table_ocr_pred": ["Text2"],
                    "table_region_id": 1,
                },
            ],
            "parsing_res_list": [
                {"label": "table", "bbox": [10, 20, 200, 300]},
                {"label": "table", "bbox": [10, 350, 200, 600]},
            ],
        }
        detector, _ = _make_detector([mock_result])

        results = detector.detect(_blank_image())

        assert len(results) == 2
        assert results[0].html == "1\n"
        assert results[1].html == "2\n"
        assert results[0].bbox == (10.0, 20.0, 200.0, 300.0)
        assert results[1].bbox == (10.0, 350.0, 200.0, 600.0)

    def test_parse_paddlex_result_with_numpy_arrays(self):
        """Test parsing PaddleX 3.x result where bbox/cell_box are numpy arrays."""
        # Simulate PaddleX 3.x result with numpy arrays
        # (real PP-StructureV3 returns these)
        mock_result = {
            "table_res_list": [
                {
                    "cell_box_list": [
                        np.array([0.0, 0.0, 50.0, 20.0]),
                        np.array([50.0, 0.0, 100.0, 20.0]),
                    ],
                    "pred_html": "\nAB\n",
                    "table_ocr_pred": ["A", "B"],
                }
            ],
            "parsing_res_list": [
                {"label": "table", "bbox": np.array([10.0, 20.0, 200.0, 300.0])},
            ],
        }
        detector, _ = _make_detector([mock_result])

        results = detector.detect(_blank_image())

        assert len(results) == 1
        assert results[0].bbox == (10.0, 20.0, 200.0, 300.0)
        assert results[0].html == "\nAB\n"
        assert len(results[0].cells) == 2
        assert results[0].cells[0]["text"] == "A"
        assert results[0].cells[0]["bbox"] == [0.0, 0.0, 50.0, 20.0]
        assert results[0].cells[1]["text"] == "B"

    def test_parse_paddlex_result_with_empty_numpy_arrays(self):
        """Test parsing PaddleX 3.x result where some arrays are empty."""
        mock_result = {
            "table_res_list": [
                {
                    "cell_box_list": np.array([]),  # Empty numpy array
                    "pred_html": "\n",
                    "table_ocr_pred": np.array([]),  # Empty numpy array
                }
            ],
            "parsing_res_list": [
                {"label": "table", "bbox": np.array([10.0, 20.0, 200.0, 300.0])},
            ],
        }
        detector, _ = _make_detector([mock_result])

        results = detector.detect(_blank_image())

        assert len(results) == 1
        assert results[0].cells == []  # Empty cells list
        assert results[0].html == "\n"

    def test_parse_paddlex_result_with_dict_ocr_data(self):
        """Test parsing PaddleX 3.x result with dict-format table_ocr_pred."""
        mock_result = {
            "table_res_list": [
                {
                    "cell_box_list": [[0, 0, 50, 20], [50, 0, 100, 20]],
                    "pred_html": "\nAB\n",
                    "table_ocr_pred": {
                        "rec_texts": ["A", "B"],
                        "rec_scores": [0.99, 0.98],
                    },
                }
            ],
            "parsing_res_list": [
                {"label": "table", "bbox": [10, 20, 200, 300]},
            ],
        }
        detector, _ = _make_detector([mock_result])

        results = detector.detect(_blank_image())

        assert len(results) == 1
        assert len(results[0].cells) == 2
        assert results[0].cells[0]["text"] == "A"
        assert results[0].cells[1]["text"] == "B"

    def test_parse_paddlex_result_no_bbox_in_parsing_res(self):
        """Test parsing PaddleX 3.x result when table bbox not in parsing_res."""
        mock_result = {
            "table_res_list": [
                {
                    "cell_box_list": [[0, 0, 50, 20]],
                    "pred_html": "\nA\n",
                    "table_ocr_pred": ["A"],
                }
            ],
            "parsing_res_list": [
                {"label": "text", "bbox": [10, 20, 200, 300]},  # Not a table
            ],
        }
        detector, _ = _make_detector([mock_result])

        results = detector.detect(_blank_image())

        assert len(results) == 1
        # Should use default bbox [0,0,0,0] when not found
        assert results[0].bbox == (0.0, 0.0, 0.0, 0.0)


class TestIteratorResults:
    """Tests for iterator/generator result handling."""

    def test_handles_iterator_results(self):
        """Test handling of iterator results from pipeline."""

        # Return a generator instead of list
        def result_generator():
            element = _make_element(
                label="table",
                bbox=[0, 0, 100, 100],
                html="\n",
                score=0.9,
                cells=[],
            )
            yield _legacy_result([element])

        detector, _ = _make_detector(result_generator())

        results = detector.detect(_blank_image())

        assert len(results) == 1

    def test_handles_failed_iterator_conversion(self):
        """Test handling when iterator conversion fails."""

        # Create an object that has __iter__ but fails when converted to list
        class FailingIterator:
            def __iter__(self):
                raise RuntimeError("Iterator failed")

        detector, _ = _make_detector(FailingIterator())

        results = detector.detect(_blank_image())

        # Should return empty list, not raise
        assert results == []


class TestPathConversion:
    """Tests for path handling."""

    def test_converts_path_object_to_string(self):
        """Test that Path objects are converted to strings."""
        detector, pipeline = _make_detector([])

        detector.detect(Path("/some/path/to/image.png"))

        # Should be called with string, not Path
        pipeline.predict.assert_called_with("/some/path/to/image.png")


class TestHtmlExtraction:
    """Tests for HTML extraction from different element formats."""

    def test_extracts_html_from_res_dict(self):
        """Test extracting HTML from element.res dictionary."""
        element = _make_element(
            label="table",
            bbox=[0, 0, 100, 100],
            res={"html": "\nFrom res\n"},
            score=0.9,
            cells=[],
            # Remove direct html attributes
            missing=("html", "table_html"),
        )
        detector, _ = _make_detector([_legacy_result([element])])

        results = detector.detect(_blank_image())

        assert len(results) == 1
        assert results[0].html == "\nFrom res\n"

    def test_returns_empty_html_when_not_found(self):
        """Test empty HTML when no html attribute found."""
        element = _make_element(
            label="table",
            bbox=[0, 0, 100, 100],
            score=0.9,
            cells=[],
            # Remove all html attributes
            missing=("html", "table_html", "res"),
        )
        detector, _ = _make_detector([_legacy_result([element])])

        results = detector.detect(_blank_image())

        assert len(results) == 1
        assert results[0].html == ""


class TestTableTypeDetection:
    """Tests for table type detection."""

    def test_detects_borderless_table(self):
        """Test detection of borderless table type via _get_table_type."""
        detector = TableDetector()
        element = _make_element(label="borderless_table")

        assert detector._get_table_type(element) == "wireless"

    def test_detects_wireless_table_label(self):
        """Test detection of wireless table type."""
        detector = TableDetector()
        element = _make_element(label="wireless_table")

        assert detector._get_table_type(element) == "wireless"

    def test_defaults_to_wired_table(self):
        """Test default table type is wired."""
        detector = TableDetector()
        element = _make_element(label="table")

        assert detector._get_table_type(element) == "wired"

    def test_type_attribute_instead_of_label(self):
        """Test table type detection using type attribute."""
        detector = TableDetector()
        element = _make_element(type="wireless", missing=("label",))

        assert detector._get_table_type(element) == "wireless"


class TestPipelineRuntimeError:
    """Tests for pipeline runtime errors."""

    def test_raises_runtime_error_when_pipeline_none(self):
        """Test RuntimeError when pipeline is None during detect."""
        detector = TableDetector()
        detector._initialized = True  # Bypass lazy init
        detector._pipeline = None

        with pytest.raises(RuntimeError) as exc_info:
            detector.detect(_blank_image())

        assert "not initialized" in str(exc_info.value).lower()