- Extract models.py (LineItem, LineItemsResult dataclasses) - Extract html_table_parser.py (ColumnMapper, HtmlTableParser) - Extract merged_cell_handler.py (MergedCellHandler for PP-StructureV3 merged cells) - Reduce line_items_extractor.py from 971 to 396 lines - Add constants for magic numbers (MIN_AMOUNT_THRESHOLD, ROW_GROUPING_THRESHOLD, etc.) - Fix row grouping algorithm in text_line_items_extractor.py - Demote INFO logs to DEBUG level in structure_detector.py - Add 209 tests achieving 85%+ coverage on main modules Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
903 lines
30 KiB
Python
903 lines
30 KiB
Python
"""
|
|
Tests for PP-StructureV3 Table Detection
|
|
|
|
TDD tests for TableDetector class. Tests are designed to run without
|
|
requiring the actual PP-StructureV3 library by using mock objects.
|
|
"""
|
|
|
|
import pytest
|
|
from dataclasses import dataclass
|
|
from typing import Any
|
|
from unittest.mock import MagicMock, patch
|
|
import numpy as np
|
|
|
|
from backend.table.structure_detector import (
|
|
TableDetectionResult,
|
|
TableDetector,
|
|
TableDetectorConfig,
|
|
)
|
|
|
|
|
|
class TestTableDetectionResult:
    """Field-level checks for the TableDetectionResult dataclass."""

    def test_create_with_required_fields(self):
        """Only the four required fields are passed; cells must come back as []."""
        expected_bbox = (10.0, 20.0, 300.0, 400.0)
        expected_html = "<table><tr><td>Test</td></tr></table>"

        detection = TableDetectionResult(
            bbox=expected_bbox,
            html=expected_html,
            confidence=0.95,
            table_type="wired",
        )

        assert detection.bbox == expected_bbox
        assert detection.html == expected_html
        assert detection.confidence == 0.95
        assert detection.table_type == "wired"
        # cells was not supplied, so the dataclass default is expected here.
        assert detection.cells == []

    def test_create_with_cells(self):
        """Explicitly supplied cell dicts are stored and readable by index."""
        header_cell = {"text": "Header1", "row": 0, "col": 0}
        value_cell = {"text": "Value1", "row": 1, "col": 0}

        detection = TableDetectionResult(
            bbox=(0, 0, 100, 100),
            html="<table></table>",
            confidence=0.9,
            table_type="wireless",
            cells=[header_cell, value_cell],
        )

        assert len(detection.cells) == 2
        assert detection.cells[0]["text"] == "Header1"
        assert detection.table_type == "wireless"

    def test_bbox_is_tuple_of_floats(self):
        """Integer coordinates are accepted and bbox keeps its four entries.

        Note: despite the test name, only the length of bbox is asserted;
        no float conversion is checked here.
        """
        int_bbox = (10, 20, 300, 400)

        detection = TableDetectionResult(
            bbox=int_bbox,
            html="",
            confidence=0.9,
            table_type="wired",
        )

        # Construction with ints must not fail (duck typing).
        assert len(detection.bbox) == 4
|
|
|
|
|
|
class TestTableDetectorConfig:
    """Checks default and overridden values of the TableDetectorConfig dataclass."""

    def test_default_values(self):
        """A config built with no arguments exposes the expected defaults."""
        defaults = TableDetectorConfig()

        assert defaults.device == "gpu:0"

        # All optional pre-processing switches are expected to be off.
        assert defaults.use_doc_orientation_classify is False
        assert defaults.use_doc_unwarping is False
        assert defaults.use_textline_orientation is False

        # Default model names (SLANeXt table models, PP-DocLayout layout model).
        assert defaults.wired_table_model == "SLANeXt_wired"
        assert defaults.wireless_table_model == "SLANeXt_wireless"
        assert defaults.layout_model == "PP-DocLayout_plus-L"

        assert defaults.min_confidence == 0.5

    def test_custom_values(self):
        """Keyword overrides passed to the constructor are stored unchanged."""
        overrides = {
            "device": "cpu",
            "min_confidence": 0.7,
            "wired_table_model": "SLANet_plus",
        }

        custom = TableDetectorConfig(**overrides)

        assert custom.device == overrides["device"]
        assert custom.min_confidence == overrides["min_confidence"]
        assert custom.wired_table_model == overrides["wired_table_model"]
|
|
|
|
|
|
class TestTableDetectorInitialization:
    """Checks the state of a TableDetector right after construction."""

    def test_init_with_default_config(self):
        """Without arguments a config is created and the detector is uninitialized."""
        plain_detector = TableDetector()

        assert plain_detector.config is not None
        assert plain_detector.config.device == "gpu:0"
        assert plain_detector._initialized is False

    def test_init_with_custom_config(self):
        """A config passed by the caller is the one exposed on the detector."""
        cpu_config = TableDetectorConfig(device="cpu", min_confidence=0.8)

        configured = TableDetector(config=cpu_config)

        assert configured.config.device == "cpu"
        assert configured.config.min_confidence == 0.8

    def test_init_with_mock_pipeline(self):
        """Passing a pipeline object marks the detector as initialized."""
        injected = MagicMock()

        prewired = TableDetector(pipeline=injected)

        assert prewired._pipeline is injected
        assert prewired._initialized is True
|
|
|
|
|
|
class TestTableDetectorDetection:
    """Tests for TableDetector.detect() method.

    Every test injects a MagicMock pipeline into the detector, so
    ``pipeline.predict`` is fully controlled by the test.
    """

    def create_mock_element(
        self,
        label: str = "table",
        bbox: tuple = (10, 20, 300, 400),
        html: str = "<table><tr><td>Test</td></tr></table>",
        score: float = 0.95,
    ) -> MagicMock:
        """Create a mock PP-StructureV3 layout element.

        Args:
            label: Value stored on ``element.label``.
            bbox: Value stored on ``element.bbox``.
            html: Value stored on ``element.html``.
            score: Value stored on ``element.score``.

        Returns:
            A MagicMock carrying the attributes above plus an empty
            ``cells`` list.
        """
        element = MagicMock()
        element.label = label
        element.bbox = bbox
        element.html = html
        element.score = score
        element.cells = []
        return element

    def create_mock_result(self, elements: list) -> MagicMock:
        """Create a mock PP-StructureV3 result (legacy API without 'get').

        Args:
            elements: Objects to expose through ``result.layout_elements``.

        Returns:
            A MagicMock whose only attribute is ``layout_elements``.
        """
        # spec=["layout_elements"] limits the mock to that single attribute,
        # so it has no 'get' method. This simulates the legacy API that uses
        # the layout_elements attribute.
        result = MagicMock(spec=["layout_elements"])
        result.layout_elements = elements
        return result

    def test_detect_single_table(self):
        """Test detecting a single table in image."""
        mock_pipeline = MagicMock()
        element = self.create_mock_element()
        mock_result = self.create_mock_result([element])
        mock_pipeline.predict.return_value = [mock_result]

        detector = TableDetector(pipeline=mock_pipeline)
        image = np.zeros((100, 100, 3), dtype=np.uint8)

        results = detector.detect(image)

        assert len(results) == 1
        assert results[0].bbox == (10.0, 20.0, 300.0, 400.0)
        assert results[0].confidence == 0.95
        assert results[0].table_type == "wired"
        mock_pipeline.predict.assert_called_once()

    def test_detect_multiple_tables(self):
        """Test detecting multiple tables in image."""
        mock_pipeline = MagicMock()
        element1 = self.create_mock_element(
            bbox=(10, 20, 300, 200),
            html="<table>1</table>",
        )
        element2 = self.create_mock_element(
            bbox=(10, 220, 300, 400),
            html="<table>2</table>",
        )
        mock_result = self.create_mock_result([element1, element2])
        mock_pipeline.predict.return_value = [mock_result]

        detector = TableDetector(pipeline=mock_pipeline)
        image = np.zeros((500, 400, 3), dtype=np.uint8)

        results = detector.detect(image)

        # Both tables are expected, in the order the pipeline reported them.
        assert len(results) == 2
        assert results[0].html == "<table>1</table>"
        assert results[1].html == "<table>2</table>"

    def test_detect_no_tables(self):
        """Test handling of image with no tables."""
        mock_pipeline = MagicMock()
        # The only layout element is labelled "text", not a table.
        text_element = MagicMock()
        text_element.label = "text"
        mock_result = self.create_mock_result([text_element])
        mock_pipeline.predict.return_value = [mock_result]

        detector = TableDetector(pipeline=mock_pipeline)
        image = np.zeros((100, 100, 3), dtype=np.uint8)

        results = detector.detect(image)

        assert len(results) == 0

    def test_detect_filters_low_confidence(self):
        """Test that low confidence tables are filtered out."""
        mock_pipeline = MagicMock()
        low_conf_element = self.create_mock_element(score=0.3)
        high_conf_element = self.create_mock_element(score=0.9)
        mock_result = self.create_mock_result([low_conf_element, high_conf_element])
        mock_pipeline.predict.return_value = [mock_result]

        # Threshold sits between the two scores, so only the 0.9 table survives.
        config = TableDetectorConfig(min_confidence=0.5)
        detector = TableDetector(config=config, pipeline=mock_pipeline)
        image = np.zeros((100, 100, 3), dtype=np.uint8)

        results = detector.detect(image)

        assert len(results) == 1
        assert results[0].confidence == 0.9

    def test_detect_wireless_table(self):
        """Test detecting wireless (borderless) table."""
        mock_pipeline = MagicMock()
        element = self.create_mock_element(label="wireless_table")
        mock_result = self.create_mock_result([element])
        mock_pipeline.predict.return_value = [mock_result]

        detector = TableDetector(pipeline=mock_pipeline)
        image = np.zeros((100, 100, 3), dtype=np.uint8)

        results = detector.detect(image)

        assert len(results) == 1
        assert results[0].table_type == "wireless"

    def test_detect_with_file_path(self):
        """Test detection with file path input."""
        mock_pipeline = MagicMock()
        element = self.create_mock_element()
        mock_result = self.create_mock_result([element])
        mock_pipeline.predict.return_value = [mock_result]

        detector = TableDetector(pipeline=mock_pipeline)
        image_path = "/path/to/image.png"

        # Should accept string path
        results = detector.detect(image_path)

        # The path must reach the pipeline unchanged, in exactly one call.
        mock_pipeline.predict.assert_called_once_with(image_path)
        # The mocked table must be returned for path input as well.
        assert len(results) == 1

    def test_detect_returns_empty_on_none_results(self):
        """Test handling of None results from pipeline."""
        mock_pipeline = MagicMock()
        mock_pipeline.predict.return_value = None

        detector = TableDetector(pipeline=mock_pipeline)
        image = np.zeros((100, 100, 3), dtype=np.uint8)

        results = detector.detect(image)

        assert results == []
|
|
|
|
|
|
class TestTableDetectorLazyInit:
    """Checks around deferred creation of the PP-StructureV3 pipeline."""

    def test_lazy_init_flag_starts_false(self):
        """A freshly constructed detector holds no pipeline and is not initialized."""
        fresh = TableDetector()

        assert fresh._pipeline is None
        assert fresh._initialized is False

    def test_lazy_init_with_injected_pipeline(self):
        """An injected pipeline is used directly by detect()."""
        injected = MagicMock()
        injected.predict.return_value = []

        detector = TableDetector(pipeline=injected)

        assert detector._pipeline is injected
        assert detector._initialized is True

        # detect() must run against the injected mock and call predict once.
        blank = np.zeros((100, 100, 3), dtype=np.uint8)
        found = detector.detect(blank)

        assert found == []
        injected.predict.assert_called_once()

    def test_import_error_without_paddleocr(self):
        """_ensure_initialized() raises ImportError naming paddleocr when it is missing."""
        detector = TableDetector()

        # Mapping the module name to None in sys.modules simulates paddleocr
        # not being installed.
        missing_paddleocr = patch.dict("sys.modules", {"paddleocr": None})
        with missing_paddleocr, pytest.raises(ImportError) as exc_info:
            detector._ensure_initialized()

        message = str(exc_info.value)
        assert "paddleocr" in message.lower()
|
|
|
|
|
|
class TestTableDetectorParseResults:
    """Checks how detect() reads attributes off legacy-style layout elements."""

    @staticmethod
    def _detect_elements(elements):
        """Run detect() on a blank image with a mocked result exposing `elements`."""
        page = MagicMock(spec=["layout_elements"])
        page.layout_elements = elements

        pipeline = MagicMock()
        pipeline.predict.return_value = [page]

        blank = np.zeros((100, 100, 3), dtype=np.uint8)
        return TableDetector(pipeline=pipeline).detect(blank)

    def test_parse_element_with_box_attribute(self):
        """The bounding box is read from 'box' when 'bbox' is absent."""
        table = MagicMock()
        del table.bbox  # accessing table.bbox now raises AttributeError
        table.label = "table"
        table.box = [10, 20, 300, 400]
        table.html = "<table></table>"
        table.score = 0.9
        table.cells = []

        detections = self._detect_elements([table])

        assert len(detections) == 1
        assert detections[0].bbox == (10.0, 20.0, 300.0, 400.0)

    def test_parse_element_with_table_html_attribute(self):
        """The markup is read from 'table_html' when 'html' is absent."""
        table = MagicMock()
        del table.html
        table.label = "table"
        table.bbox = [0, 0, 100, 100]
        table.table_html = "<table><tr><td>Content</td></tr></table>"
        table.score = 0.9
        table.cells = []

        detections = self._detect_elements([table])

        assert len(detections) == 1
        assert "<table>" in detections[0].html

    def test_parse_element_with_type_attribute(self):
        """An element is recognised via 'type' when 'label' is absent."""
        table = MagicMock()
        del table.label
        table.type = "table"
        table.bbox = [0, 0, 100, 100]
        table.html = "<table></table>"
        table.score = 0.9
        table.cells = []

        detections = self._detect_elements([table])

        assert len(detections) == 1

    def test_parse_cells_data(self):
        """Per-cell text and row index are carried into the result's cell dicts."""

        def make_cell(text, row, bbox):
            # Both cells sit in column 0 and span exactly one row and column.
            cell = MagicMock()
            cell.text = text
            cell.row = row
            cell.col = 0
            cell.row_span = 1
            cell.col_span = 1
            cell.bbox = bbox
            return cell

        header = make_cell("Header", 0, [0, 0, 50, 20])
        value = make_cell("Value", 1, [0, 20, 50, 40])

        table = MagicMock()
        table.label = "table"
        table.bbox = [0, 0, 100, 100]
        table.html = "<table></table>"
        table.score = 0.9
        table.cells = [header, value]

        detections = self._detect_elements([table])

        assert len(detections) == 1
        parsed_cells = detections[0].cells
        assert len(parsed_cells) == 2
        assert parsed_cells[0]["text"] == "Header"
        assert parsed_cells[0]["row"] == 0
        assert parsed_cells[1]["text"] == "Value"
        assert parsed_cells[1]["row"] == 1
|
|
|
|
|
|
class TestTableDetectorEdgeCases:
    """Checks detect() against malformed, empty, or unusual pipeline output."""

    @staticmethod
    def _page(elements):
        """Build a mocked legacy-style result exposing only `layout_elements`."""
        page = MagicMock(spec=["layout_elements"])
        page.layout_elements = elements
        return page

    @staticmethod
    def _detect(predict_output):
        """Run detect() on a blank image with predict() returning `predict_output`."""
        pipeline = MagicMock()
        pipeline.predict.return_value = predict_output
        blank = np.zeros((100, 100, 3), dtype=np.uint8)
        return TableDetector(pipeline=pipeline).detect(blank)

    def test_handles_malformed_element_gracefully(self):
        """A table element with no bbox/box is dropped; the valid one is kept."""
        # Malformed: labelled as a table, but both bounding-box attributes
        # are removed so reading either raises AttributeError.
        broken = MagicMock()
        broken.label = "table"
        del broken.bbox
        del broken.box

        valid = MagicMock()
        valid.label = "table"
        valid.bbox = [0, 0, 100, 100]
        valid.html = "<table></table>"
        valid.score = 0.9
        valid.cells = []

        # Must not raise; only the valid element should produce a result.
        detections = self._detect([self._page([broken, valid])])

        assert len(detections) == 1

    def test_handles_empty_layout_elements(self):
        """An empty layout_elements list gives an empty result list."""
        detections = self._detect([self._page([])])

        assert detections == []

    def test_handles_result_without_layout_elements(self):
        """A result object with no attributes at all gives an empty result list."""
        bare_page = MagicMock(spec=[])  # spec=[] leaves the mock without attributes

        detections = self._detect([bare_page])

        assert detections == []

    def test_confidence_as_list(self):
        """A score wrapped in a one-element list is reported as a plain number."""
        table = MagicMock()
        table.label = "table"
        table.bbox = [0, 0, 100, 100]
        table.html = "<table></table>"
        table.score = [0.95]  # list instead of a scalar
        table.cells = []

        detections = self._detect([self._page([table])])

        assert len(detections) == 1
        assert detections[0].confidence == 0.95
|
|
|
|
|
|
class TestPaddleX3xAPI:
    """Checks parsing of PaddleX 3.x dict-style results (LayoutParsingResultV2)."""

    @staticmethod
    def _detect_page(page):
        """Run detect() on a blank image with predict() returning `[page]`."""
        pipeline = MagicMock()
        pipeline.predict.return_value = [page]
        blank = np.zeros((100, 100, 3), dtype=np.uint8)
        return TableDetector(pipeline=pipeline).detect(blank)

    def test_parse_paddlex_result_with_tables(self):
        """One table entry yields one result with html, bbox and per-cell text."""
        table_entry = {
            "cell_box_list": [[0, 0, 50, 20], [50, 0, 100, 20]],
            "pred_html": "<table><tr><td>Cell1</td><td>Cell2</td></tr></table>",
            "table_ocr_pred": ["Cell1", "Cell2"],
            "table_region_id": 0,
        }
        page = {
            "table_res_list": [table_entry],
            "parsing_res_list": [
                {"label": "table", "bbox": [10, 20, 200, 300]},
            ],
        }

        detections = self._detect_page(page)

        assert len(detections) == 1
        table = detections[0]
        assert table.html == "<table><tr><td>Cell1</td><td>Cell2</td></tr></table>"
        assert table.bbox == (10.0, 20.0, 200.0, 300.0)
        assert len(table.cells) == 2
        assert table.cells[0]["text"] == "Cell1"
        assert table.cells[1]["text"] == "Cell2"

    def test_parse_paddlex_result_empty_tables(self):
        """A None table_res_list yields no results."""
        page = {
            "table_res_list": None,
            "parsing_res_list": [
                {"label": "text", "bbox": [10, 20, 200, 300]},
            ],
        }

        detections = self._detect_page(page)

        assert len(detections) == 0

    def test_parse_paddlex_result_multiple_tables(self):
        """Two table entries are paired in order with the two table bboxes."""
        first_table = {
            "cell_box_list": [[0, 0, 50, 20]],
            "pred_html": "<table>1</table>",
            "table_ocr_pred": ["Text1"],
            "table_region_id": 0,
        }
        second_table = {
            "cell_box_list": [[0, 0, 100, 40]],
            "pred_html": "<table>2</table>",
            "table_ocr_pred": ["Text2"],
            "table_region_id": 1,
        }
        page = {
            "table_res_list": [first_table, second_table],
            "parsing_res_list": [
                {"label": "table", "bbox": [10, 20, 200, 300]},
                {"label": "table", "bbox": [10, 350, 200, 600]},
            ],
        }

        detections = self._detect_page(page)

        assert len(detections) == 2
        assert detections[0].html == "<table>1</table>"
        assert detections[0].bbox == (10.0, 20.0, 200.0, 300.0)
        assert detections[1].html == "<table>2</table>"
        assert detections[1].bbox == (10.0, 350.0, 200.0, 600.0)

    def test_parse_paddlex_result_with_numpy_arrays(self):
        """Numpy-array bbox and cell boxes are accepted and returned as plain values."""
        # Boxes are numpy arrays here instead of Python lists.
        table_entry = {
            "cell_box_list": [
                np.array([0.0, 0.0, 50.0, 20.0]),
                np.array([50.0, 0.0, 100.0, 20.0]),
            ],
            "pred_html": "<table><tr><td>A</td><td>B</td></tr></table>",
            "table_ocr_pred": ["A", "B"],
        }
        page = {
            "table_res_list": [table_entry],
            "parsing_res_list": [
                {"label": "table", "bbox": np.array([10.0, 20.0, 200.0, 300.0])},
            ],
        }

        detections = self._detect_page(page)

        assert len(detections) == 1
        table = detections[0]
        assert table.bbox == (10.0, 20.0, 200.0, 300.0)
        assert table.html == "<table><tr><td>A</td><td>B</td></tr></table>"
        assert len(table.cells) == 2
        assert table.cells[0]["text"] == "A"
        assert table.cells[0]["bbox"] == [0.0, 0.0, 50.0, 20.0]
        assert table.cells[1]["text"] == "B"

    def test_parse_paddlex_result_with_empty_numpy_arrays(self):
        """Empty numpy arrays for cell boxes and OCR text give an empty cells list."""
        table_entry = {
            "cell_box_list": np.array([]),
            "pred_html": "<table></table>",
            "table_ocr_pred": np.array([]),
        }
        page = {
            "table_res_list": [table_entry],
            "parsing_res_list": [
                {"label": "table", "bbox": np.array([10.0, 20.0, 200.0, 300.0])},
            ],
        }

        detections = self._detect_page(page)

        assert len(detections) == 1
        assert detections[0].html == "<table></table>"
        assert detections[0].cells == []

    def test_parse_paddlex_result_with_dict_ocr_data(self):
        """Cell text is taken from 'rec_texts' when table_ocr_pred is a dict."""
        ocr_block = {
            "rec_texts": ["A", "B"],
            "rec_scores": [0.99, 0.98],
        }
        table_entry = {
            "cell_box_list": [[0, 0, 50, 20], [50, 0, 100, 20]],
            "pred_html": "<table><tr><td>A</td><td>B</td></tr></table>",
            "table_ocr_pred": ocr_block,
        }
        page = {
            "table_res_list": [table_entry],
            "parsing_res_list": [
                {"label": "table", "bbox": [10, 20, 200, 300]},
            ],
        }

        detections = self._detect_page(page)

        assert len(detections) == 1
        assert len(detections[0].cells) == 2
        assert detections[0].cells[0]["text"] == "A"
        assert detections[0].cells[1]["text"] == "B"

    def test_parse_paddlex_result_no_bbox_in_parsing_res(self):
        """With no table entry in parsing_res_list the bbox is all zeros."""
        table_entry = {
            "cell_box_list": [[0, 0, 50, 20]],
            "pred_html": "<table><tr><td>A</td></tr></table>",
            "table_ocr_pred": ["A"],
        }
        page = {
            "table_res_list": [table_entry],
            "parsing_res_list": [
                # The only parsed region is text, so no table bbox is available.
                {"label": "text", "bbox": [10, 20, 200, 300]},
            ],
        }

        detections = self._detect_page(page)

        assert len(detections) == 1
        # Expected fallback bbox when no matching table region exists.
        assert detections[0].bbox == (0.0, 0.0, 0.0, 0.0)
|
|
|
|
|
|
class TestIteratorResults:
    """Checks detect() when predict() returns an iterator instead of a list."""

    def test_handles_iterator_results(self):
        """Results yielded by a generator are consumed like a list of results."""

        def yield_one_page():
            # Built lazily inside the generator: one page holding one table.
            table = MagicMock()
            table.label = "table"
            table.bbox = [0, 0, 100, 100]
            table.html = "<table></table>"
            table.score = 0.9
            table.cells = []

            page = MagicMock(spec=["layout_elements"])
            page.layout_elements = [table]
            yield page

        pipeline = MagicMock()
        pipeline.predict.return_value = yield_one_page()

        blank = np.zeros((100, 100, 3), dtype=np.uint8)
        detections = TableDetector(pipeline=pipeline).detect(blank)

        assert len(detections) == 1

    def test_handles_failed_iterator_conversion(self):
        """An iterable whose __iter__ raises gives [] rather than an exception."""

        class FailingIterator:
            """Object with an __iter__ method that always raises RuntimeError."""

            def __iter__(self):
                raise RuntimeError("Iterator failed")

        pipeline = MagicMock()
        pipeline.predict.return_value = FailingIterator()

        blank = np.zeros((100, 100, 3), dtype=np.uint8)
        detections = TableDetector(pipeline=pipeline).detect(blank)

        # The failure must be absorbed by detect(), not propagated.
        assert detections == []
|
|
|
|
|
|
class TestPathConversion:
    """Tests for path handling."""

    def test_converts_path_object_to_string(self):
        """Test that Path objects are converted to strings.

        The expected argument is derived with ``str(path)`` instead of a
        hard-coded POSIX literal, because ``str(Path(...))`` is platform
        dependent (it uses backslashes on Windows).
        """
        from pathlib import Path

        mock_pipeline = MagicMock()
        mock_pipeline.predict.return_value = []

        detector = TableDetector(pipeline=mock_pipeline)
        path = Path("/some/path/to/image.png")

        detector.detect(path)

        # Should be called with the string form of the path, not the Path object.
        mock_pipeline.predict.assert_called_with(str(path))
        forwarded = mock_pipeline.predict.call_args[0][0]
        assert isinstance(forwarded, str)
|
|
|
|
|
|
class TestHtmlExtraction:
    """Checks where detect() finds the table markup on a layout element."""

    @staticmethod
    def _detect_single(element):
        """Run detect() on a blank image with a mocked result holding `element`."""
        page = MagicMock(spec=["layout_elements"])
        page.layout_elements = [element]

        pipeline = MagicMock()
        pipeline.predict.return_value = [page]

        blank = np.zeros((100, 100, 3), dtype=np.uint8)
        return TableDetector(pipeline=pipeline).detect(blank)

    def test_extracts_html_from_res_dict(self):
        """With html and table_html absent, markup comes from element.res['html']."""
        table = MagicMock()
        # Remove the direct markup attributes so only `res` can supply it.
        del table.html
        del table.table_html
        table.label = "table"
        table.bbox = [0, 0, 100, 100]
        table.res = {"html": "<table><tr><td>From res</td></tr></table>"}
        table.score = 0.9
        table.cells = []

        detections = self._detect_single(table)

        assert len(detections) == 1
        assert detections[0].html == "<table><tr><td>From res</td></tr></table>"

    def test_returns_empty_html_when_not_found(self):
        """With html, table_html and res all absent, the result html is ''."""
        table = MagicMock()
        # Remove every attribute that could carry markup.
        del table.html
        del table.table_html
        del table.res
        table.label = "table"
        table.bbox = [0, 0, 100, 100]
        table.score = 0.9
        table.cells = []

        detections = self._detect_single(table)

        assert len(detections) == 1
        assert detections[0].html == ""
|
|
|
|
|
|
class TestTableTypeDetection:
    """Checks the label-to-table-type mapping done by TableDetector._get_table_type."""

    def test_detects_borderless_table(self):
        """A 'borderless_table' label maps to the 'wireless' type."""
        borderless = MagicMock()
        borderless.label = "borderless_table"

        table_type = TableDetector()._get_table_type(borderless)

        assert table_type == "wireless"

    def test_detects_wireless_table_label(self):
        """A 'wireless_table' label maps to the 'wireless' type."""
        wireless = MagicMock()
        wireless.label = "wireless_table"

        table_type = TableDetector()._get_table_type(wireless)

        assert table_type == "wireless"

    def test_defaults_to_wired_table(self):
        """A plain 'table' label maps to the 'wired' type."""
        plain = MagicMock()
        plain.label = "table"

        table_type = TableDetector()._get_table_type(plain)

        assert table_type == "wired"

    def test_type_attribute_instead_of_label(self):
        """With 'label' absent, the 'type' attribute decides the table type."""
        typed = MagicMock()
        del typed.label  # accessing typed.label now raises AttributeError
        typed.type = "wireless"

        table_type = TableDetector()._get_table_type(typed)

        assert table_type == "wireless"
|
|
|
|
|
|
class TestPipelineRuntimeError:
    """Checks the error raised when detect() runs without a pipeline."""

    def test_raises_runtime_error_when_pipeline_none(self):
        """detect() raises RuntimeError mentioning 'not initialized' if no pipeline is set."""
        broken_detector = TableDetector()
        # Mark the detector as initialized while leaving the pipeline unset,
        # so lazy initialization is bypassed.
        broken_detector._pipeline = None
        broken_detector._initialized = True

        blank = np.zeros((100, 100, 3), dtype=np.uint8)

        with pytest.raises(RuntimeError) as exc_info:
            broken_detector.detect(blank)

        message = str(exc_info.value)
        assert "not initialized" in message.lower()
|