refactor: split line_items_extractor into smaller modules with comprehensive tests

- Extract models.py (LineItem, LineItemsResult dataclasses)
- Extract html_table_parser.py (ColumnMapper, HtmlTableParser)
- Extract merged_cell_handler.py (MergedCellHandler for PP-StructureV3 merged cells)
- Reduce line_items_extractor.py from 971 to 396 lines
- Add constants for magic numbers (MIN_AMOUNT_THRESHOLD, ROW_GROUPING_THRESHOLD, etc.)
- Fix row grouping algorithm in text_line_items_extractor.py
- Demote INFO logs to DEBUG level in structure_detector.py
- Add 209 tests achieving 85%+ coverage on main modules

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Yaojia Wang
2026-02-03 23:02:00 +01:00
parent c2c8f2dd04
commit 8723ef4653
11 changed files with 2230 additions and 841 deletions

View File

@@ -142,6 +142,33 @@ class TestTextLineItemsExtractor:
rows = extractor._group_by_row(elements)
assert len(rows) == 2
def test_group_by_row_varying_heights_uses_average(self, extractor):
    """Check that elements of differing heights on one line share a row.

    The row center is expected to be recalculated as elements are added
    (a dynamic average), so a tall element stays with its own row instead
    of being grouped with the row below it.
    """
    # Two elements on the same visual line, one short and one tall,
    # followed by a single element on the next line.
    short = TextElement(text="Short", bbox=(0, 100, 100, 110))  # center_y = 105
    tall = TextElement(text="Tall item", bbox=(150, 100, 250, 130))  # center_y = 115
    below = TextElement(text="Next row", bbox=(0, 150, 100, 170))  # center_y = 160

    grouped = extractor._group_by_row([short, tall, below])

    # Expected layout: [Short, Tall item] then [Next row].
    assert len(grouped) == 2
    first_row, second_row = grouped
    assert len(first_row) == 2
    assert len(second_row) == 1
def test_group_by_row_empty_input(self, extractor):
    """Grouping an empty element list yields an empty list of rows."""
    assert extractor._group_by_row([]) == []
def test_looks_like_line_item_with_amount(self, extractor):
"""Test line item detection with amount."""
row = [
@@ -253,6 +280,67 @@ class TestTextLineItemsExtractor:
assert len(elements) == 4
class TestExceptionHandling:
    """Tests that text element extraction tolerates malformed input entries."""

    @staticmethod
    def _assert_only_valid_kept(parsing_res):
        """Extract elements and require that only the "Valid" entry is kept.

        Args:
            parsing_res: List of raw entries passed to
                ``_extract_text_elements``; exactly one of them is expected
                to produce an element, with text ``"Valid"``.
        """
        extracted = TextLineItemsExtractor()._extract_text_elements(parsing_res)
        assert len(extracted) == 1
        assert extracted[0].text == "Valid"

    def test_extract_text_elements_handles_missing_bbox(self):
        """An entry without a bbox key is skipped; the valid one is kept."""
        self._assert_only_valid_kept(
            [
                {"label": "text", "text": "No bbox"},  # no bbox key at all
                {"label": "text", "bbox": [0, 100, 200, 120], "text": "Valid"},
            ]
        )

    def test_extract_text_elements_handles_invalid_bbox(self):
        """An entry whose bbox has fewer than 4 values is skipped."""
        self._assert_only_valid_kept(
            [
                # bbox carries only 2 of the 4 coordinates
                {"label": "text", "bbox": [0, 100], "text": "Invalid bbox"},
                {"label": "text", "bbox": [0, 100, 200, 120], "text": "Valid"},
            ]
        )

    def test_extract_text_elements_handles_none_text(self):
        """An entry whose text is None is skipped."""
        self._assert_only_valid_kept(
            [
                {"label": "text", "bbox": [0, 100, 200, 120], "text": None},
                {"label": "text", "bbox": [0, 150, 200, 170], "text": "Valid"},
            ]
        )

    def test_extract_text_elements_handles_empty_string(self):
        """An entry whose text is the empty string is skipped."""
        self._assert_only_valid_kept(
            [
                {"label": "text", "bbox": [0, 100, 200, 120], "text": ""},
                {"label": "text", "bbox": [0, 150, 200, 170], "text": "Valid"},
            ]
        )

    def test_extract_text_elements_handles_malformed_element(self):
        """Entries that are not dicts at all are skipped."""
        self._assert_only_valid_kept(
            [
                "not a dict",  # a string where a dict is expected
                123,  # a number where a dict is expected
                {"label": "text", "bbox": [0, 100, 200, 120], "text": "Valid"},
            ]
        )
class TestConvertTextLineItem:
"""Tests for convert_text_line_item function."""