refactor: split line_items_extractor into smaller modules with comprehensive tests
- Extract models.py (LineItem, LineItemsResult dataclasses)
- Extract html_table_parser.py (ColumnMapper, HtmlTableParser)
- Extract merged_cell_handler.py (MergedCellHandler for PP-StructureV3 merged cells)
- Reduce line_items_extractor.py from 971 to 396 lines
- Add constants for magic numbers (MIN_AMOUNT_THRESHOLD, ROW_GROUPING_THRESHOLD, etc.)
- Fix row grouping algorithm in text_line_items_extractor.py
- Demote INFO logs to DEBUG level in structure_detector.py
- Add 209 tests achieving 85%+ coverage on main modules

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -142,6 +142,33 @@ class TestTextLineItemsExtractor:
|
||||
rows = extractor._group_by_row(elements)
|
||||
assert len(rows) == 2
|
||||
|
||||
def test_group_by_row_varying_heights_uses_average(self, extractor):
    """Check row grouping when elements on the same row differ in height.

    A short and a tall element share the same top edge, so their vertical
    centres differ (105 vs 115). The test expects both to be placed in one
    row, and the element centred at 160 to form a second row by itself,
    i.e. the tall element must not be pushed into the following row.
    """
    short = TextElement(text="Short", bbox=(0, 100, 100, 110))  # center_y = 105
    tall = TextElement(text="Tall item", bbox=(150, 100, 250, 130))  # center_y = 115
    below = TextElement(text="Next row", bbox=(0, 150, 100, 170))  # center_y = 160

    grouped = extractor._group_by_row([short, tall, below])

    # Exactly two rows: the first holds "Short" and "Tall item",
    # the second holds only "Next row".
    assert [len(row) for row in grouped] == [2, 1]
|
||||
|
||||
def test_group_by_row_empty_input(self, extractor):
    """Grouping an empty list of elements must produce an empty list."""
    no_elements = []
    assert extractor._group_by_row(no_elements) == []
|
||||
|
||||
def test_looks_like_line_item_with_amount(self, extractor):
|
||||
"""Test line item detection with amount."""
|
||||
row = [
|
||||
@@ -253,6 +280,67 @@ class TestTextLineItemsExtractor:
|
||||
assert len(elements) == 4
|
||||
|
||||
|
||||
class TestExceptionHandling:
    """Robustness checks for ``TextLineItemsExtractor._extract_text_elements``.

    Each test passes one or more unusable entries together with a single
    well-formed entry and expects only the well-formed one to be returned.
    """

    def test_extract_text_elements_handles_missing_bbox(self):
        """An entry without a ``bbox`` key is dropped."""
        raw = [
            {"label": "text", "text": "No bbox"},  # no bbox key at all
            {"label": "text", "bbox": [0, 100, 200, 120], "text": "Valid"},
        ]

        parsed = TextLineItemsExtractor()._extract_text_elements(raw)

        # Only the well-formed entry survives.
        assert [item.text for item in parsed] == ["Valid"]

    def test_extract_text_elements_handles_invalid_bbox(self):
        """An entry whose ``bbox`` has fewer than 4 values is dropped."""
        raw = [
            {"label": "text", "bbox": [0, 100], "text": "Invalid bbox"},  # 2 values only
            {"label": "text", "bbox": [0, 100, 200, 120], "text": "Valid"},
        ]

        parsed = TextLineItemsExtractor()._extract_text_elements(raw)

        assert [item.text for item in parsed] == ["Valid"]

    def test_extract_text_elements_handles_none_text(self):
        """An entry whose ``text`` is ``None`` is dropped."""
        raw = [
            {"label": "text", "bbox": [0, 100, 200, 120], "text": None},
            {"label": "text", "bbox": [0, 150, 200, 170], "text": "Valid"},
        ]

        parsed = TextLineItemsExtractor()._extract_text_elements(raw)

        assert [item.text for item in parsed] == ["Valid"]

    def test_extract_text_elements_handles_empty_string(self):
        """An entry whose ``text`` is the empty string is skipped."""
        raw = [
            {"label": "text", "bbox": [0, 100, 200, 120], "text": ""},
            {"label": "text", "bbox": [0, 150, 200, 170], "text": "Valid"},
        ]

        parsed = TextLineItemsExtractor()._extract_text_elements(raw)

        assert [item.text for item in parsed] == ["Valid"]

    def test_extract_text_elements_handles_malformed_element(self):
        """Entries that are not dicts at all are dropped."""
        raw = [
            "not a dict",  # a string where a dict is expected
            123,  # a number where a dict is expected
            {"label": "text", "bbox": [0, 100, 200, 120], "text": "Valid"},
        ]

        parsed = TextLineItemsExtractor()._extract_text_elements(raw)

        assert [item.text for item in parsed] == ["Valid"]
|
||||
|
||||
|
||||
class TestConvertTextLineItem:
|
||||
"""Tests for convert_text_line_item function."""
|
||||
|
||||
|
||||
Reference in New Issue
Block a user