refactor: split line_items_extractor into smaller modules with comprehensive tests
- Extract models.py (LineItem, LineItemsResult dataclasses)
- Extract html_table_parser.py (ColumnMapper, HtmlTableParser)
- Extract merged_cell_handler.py (MergedCellHandler for PP-StructureV3 merged cells)
- Reduce line_items_extractor.py from 971 to 396 lines
- Add constants for magic numbers (MIN_AMOUNT_THRESHOLD, ROW_GROUPING_THRESHOLD, etc.)
- Fix row grouping algorithm in text_line_items_extractor.py
- Demote INFO logs to DEBUG level in structure_detector.py
- Add 209 tests achieving 85%+ coverage on main modules

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -658,3 +658,245 @@ class TestPaddleX3xAPI:
|
||||
assert len(results) == 1
|
||||
assert results[0].cells == [] # Empty cells list
|
||||
assert results[0].html == "<table></table>"
|
||||
|
||||
def test_parse_paddlex_result_with_dict_ocr_data(self):
    """Test parsing PaddleX 3.x result with dict-format table_ocr_pred.

    The mocked pipeline returns one table whose ``table_ocr_pred`` is a
    dict holding ``rec_texts`` and ``rec_scores``; the test expects one
    detection with two cells whose ``text`` values are "A" and "B".
    """
    # Build the mocked prediction from the innermost piece outwards.
    ocr_prediction = {
        "rec_texts": ["A", "B"],
        "rec_scores": [0.99, 0.98],
    }
    table_entry = {
        "cell_box_list": [[0, 0, 50, 20], [50, 0, 100, 20]],
        "pred_html": "<table><tr><td>A</td><td>B</td></tr></table>",
        "table_ocr_pred": ocr_prediction,
    }
    layout_entry = {"label": "table", "bbox": [10, 20, 200, 300]}
    prediction = {
        "table_res_list": [table_entry],
        "parsing_res_list": [layout_entry],
    }

    pipeline = MagicMock()
    pipeline.predict.return_value = [prediction]

    blank_image = np.zeros((100, 100, 3), dtype=np.uint8)
    tables = TableDetector(pipeline=pipeline).detect(blank_image)

    assert len(tables) == 1
    cells = tables[0].cells
    assert len(cells) == 2
    assert cells[0]["text"] == "A"
    assert cells[1]["text"] == "B"
|
||||
|
||||
def test_parse_paddlex_result_no_bbox_in_parsing_res(self):
    """Test parsing PaddleX 3.x result when table bbox not in parsing_res.

    ``parsing_res_list`` only contains a "text" entry, so no table bbox is
    available; the test expects the detection to carry an all-zero bbox.
    """
    table_entry = {
        "cell_box_list": [[0, 0, 50, 20]],
        "pred_html": "<table><tr><td>A</td></tr></table>",
        "table_ocr_pred": ["A"],
    }
    # The only layout entry is labelled "text", i.e. not a table.
    non_table_entry = {"label": "text", "bbox": [10, 20, 200, 300]}
    prediction = {
        "table_res_list": [table_entry],
        "parsing_res_list": [non_table_entry],
    }

    pipeline = MagicMock()
    pipeline.predict.return_value = [prediction]

    blank_image = np.zeros((100, 100, 3), dtype=np.uint8)
    tables = TableDetector(pipeline=pipeline).detect(blank_image)

    assert len(tables) == 1
    # Should use default bbox [0,0,0,0] when not found
    assert tables[0].bbox == (0.0, 0.0, 0.0, 0.0)
|
||||
|
||||
|
||||
class TestIteratorResults:
    """Tests for iterator/generator result handling."""

    def test_handles_iterator_results(self):
        """Test handling of iterator results from pipeline.

        ``predict`` returns a generator (not a list) that yields a single
        result with one table element; one detection is expected.
        """

        def lazy_predictions():
            # The mocks are created lazily, only once the generator is consumed.
            table = MagicMock(
                label="table",
                bbox=[0, 0, 100, 100],
                html="<table></table>",
                score=0.9,
                cells=[],
            )
            # spec restricts the result mock to the single listed attribute.
            page = MagicMock(spec=["layout_elements"])
            page.layout_elements = [table]
            yield page

        pipeline = MagicMock()
        pipeline.predict.return_value = lazy_predictions()

        blank_image = np.zeros((100, 100, 3), dtype=np.uint8)
        tables = TableDetector(pipeline=pipeline).detect(blank_image)

        assert len(tables) == 1

    def test_handles_failed_iterator_conversion(self):
        """Test handling when iterator conversion fails.

        ``predict`` returns an object whose ``__iter__`` raises; the test
        expects ``detect`` to return an empty list instead of propagating.
        """

        class FailingIterator:
            """Iterable whose ``__iter__`` always raises RuntimeError."""

            def __iter__(self):
                raise RuntimeError("Iterator failed")

        pipeline = MagicMock()
        pipeline.predict.return_value = FailingIterator()

        blank_image = np.zeros((100, 100, 3), dtype=np.uint8)
        tables = TableDetector(pipeline=pipeline).detect(blank_image)

        # Should return empty list, not raise
        assert tables == []
|
||||
|
||||
|
||||
class TestPathConversion:
    """Tests for path handling."""

    def test_converts_path_object_to_string(self):
        """Test that Path objects are converted to strings.

        The expected argument is derived with ``str(path)`` rather than a
        hard-coded POSIX literal, because ``str(Path(...))`` uses the
        platform's separator (backslashes on Windows). A ``Path`` never
        compares equal to a ``str``, so the assertion still fails if the
        detector forwards the ``Path`` object unchanged.
        """
        from pathlib import Path

        mock_pipeline = MagicMock()
        mock_pipeline.predict.return_value = []

        detector = TableDetector(pipeline=mock_pipeline)
        path = Path("/some/path/to/image.png")

        detector.detect(path)

        # Should be called with string, not Path
        # NOTE(review): assumes the detector converts via str(path) — confirm.
        mock_pipeline.predict.assert_called_with(str(path))
|
||||
|
||||
|
||||
class TestHtmlExtraction:
    """Tests for HTML extraction from different element formats."""

    @staticmethod
    def _detect_with_element(element):
        """Run detection with a mocked pipeline reporting ``element`` as its only layout element."""
        # spec restricts the result mock to the single listed attribute.
        page = MagicMock(spec=["layout_elements"])
        page.layout_elements = [element]

        pipeline = MagicMock()
        pipeline.predict.return_value = [page]

        detector = TableDetector(pipeline=pipeline)
        blank_image = np.zeros((100, 100, 3), dtype=np.uint8)
        return detector.detect(blank_image)

    def test_extracts_html_from_res_dict(self):
        """Test extracting HTML from element.res dictionary."""
        element = MagicMock(
            label="table",
            bbox=[0, 0, 100, 100],
            res={"html": "<table><tr><td>From res</td></tr></table>"},
            score=0.9,
            cells=[],
        )
        # Remove direct html attribute
        # (deleting makes attribute access on the mock raise AttributeError).
        del element.html
        del element.table_html

        tables = self._detect_with_element(element)

        assert len(tables) == 1
        assert tables[0].html == "<table><tr><td>From res</td></tr></table>"

    def test_returns_empty_html_when_not_found(self):
        """Test empty HTML when no html attribute found."""
        element = MagicMock(
            label="table",
            bbox=[0, 0, 100, 100],
            score=0.9,
            cells=[],
        )
        # Remove all html attributes
        del element.html
        del element.table_html
        del element.res

        tables = self._detect_with_element(element)

        assert len(tables) == 1
        assert tables[0].html == ""
|
||||
|
||||
|
||||
class TestTableTypeDetection:
    """Tests for table type detection."""

    def test_detects_borderless_table(self):
        """Test detection of borderless table type via _get_table_type."""
        # Create mock element with borderless label
        borderless = MagicMock(label="borderless_table")

        table_type = TableDetector()._get_table_type(borderless)

        assert table_type == "wireless"

    def test_detects_wireless_table_label(self):
        """Test detection of wireless table type."""
        wireless = MagicMock(label="wireless_table")

        table_type = TableDetector()._get_table_type(wireless)

        assert table_type == "wireless"

    def test_defaults_to_wired_table(self):
        """Test default table type is wired."""
        plain = MagicMock(label="table")

        table_type = TableDetector()._get_table_type(plain)

        assert table_type == "wired"

    def test_type_attribute_instead_of_label(self):
        """Test table type detection using type attribute."""
        element = MagicMock()
        del element.label  # Remove label
        element.type = "wireless"

        table_type = TableDetector()._get_table_type(element)

        assert table_type == "wireless"
|
||||
|
||||
|
||||
class TestPipelineRuntimeError:
    """Tests for pipeline runtime errors."""

    def test_raises_runtime_error_when_pipeline_none(self):
        """Test RuntimeError when pipeline is None during detect.

        The detector is flagged as initialized while its pipeline is left
        as ``None``; ``detect`` is expected to raise a ``RuntimeError``
        whose message contains "not initialized" (case-insensitive).
        """
        detector = TableDetector()
        detector._initialized = True  # Bypass lazy init
        detector._pipeline = None

        blank_image = np.zeros((100, 100, 3), dtype=np.uint8)

        with pytest.raises(RuntimeError) as raised:
            detector.detect(blank_image)

        # Inspect the message only after the context manager has captured it.
        message = str(raised.value)
        assert "not initialized" in message.lower()
|
||||
|
||||
Reference in New Issue
Block a user