refactor: split line_items_extractor into smaller modules with comprehensive tests

- Extract models.py (LineItem, LineItemsResult dataclasses)
- Extract html_table_parser.py (ColumnMapper, HtmlTableParser)
- Extract merged_cell_handler.py (MergedCellHandler for PP-StructureV3 merged cells)
- Reduce line_items_extractor.py from 971 to 396 lines
- Add constants for magic numbers (MIN_AMOUNT_THRESHOLD, ROW_GROUPING_THRESHOLD, etc.)
- Fix row grouping algorithm in text_line_items_extractor.py
- Demote INFO logs to DEBUG level in structure_detector.py
- Add 209 tests achieving 85%+ coverage on main modules

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Yaojia Wang
2026-02-03 23:02:00 +01:00
parent c2c8f2dd04
commit 8723ef4653
11 changed files with 2230 additions and 841 deletions

View File

@@ -15,6 +15,11 @@ import logging
logger = logging.getLogger(__name__)
# Configuration constants
DEFAULT_ROW_TOLERANCE = 15.0 # Max vertical distance (pixels) to consider same row
MIN_ITEMS_FOR_VALID_EXTRACTION = 2 # Minimum items required for valid extraction
MIN_TEXT_ELEMENTS_FOR_EXTRACTION = 5 # Minimum text elements needed to attempt extraction
@dataclass
class TextElement:
@@ -65,7 +70,10 @@ class TextLineItemsResult:
extraction_method: str = "text_spatial"
# Swedish amount pattern: 1 234,56 or 1234.56 or 1,234.56
# Amount pattern matches Swedish, US, and simple numeric formats
# Handles: "1 234,56", "1,234.56", "1234.56", "100 kr", "50:-", "-100,00"
# Does NOT handle: amounts with more than 2 decimal places, scientific notation
# See tests in test_text_line_items_extractor.py::TestAmountPattern
AMOUNT_PATTERN = re.compile(
r"(?<![0-9])(?:"
r"-?\d{1,3}(?:\s\d{3})*(?:,\d{2})?" # Swedish: 1 234,56
@@ -128,17 +136,17 @@ class TextLineItemsExtractor:
def __init__(
self,
row_tolerance: float = 15.0, # Max vertical distance to consider same row
min_items_for_valid: int = 2, # Minimum items to consider extraction valid
row_tolerance: float = DEFAULT_ROW_TOLERANCE,
min_items_for_valid: int = MIN_ITEMS_FOR_VALID_EXTRACTION,
):
"""
Initialize extractor.
Args:
row_tolerance: Maximum vertical distance (pixels) between elements
to consider them on the same row.
to consider them on the same row. Default: 15.0
min_items_for_valid: Minimum number of line items required for
extraction to be considered successful.
extraction to be considered successful. Default: 2
"""
self.row_tolerance = row_tolerance
self.min_items_for_valid = min_items_for_valid
@@ -161,10 +169,13 @@ class TextLineItemsExtractor:
# Extract text elements from parsing results
text_elements = self._extract_text_elements(parsing_res_list)
logger.info(f"TextLineItemsExtractor: found {len(text_elements)} text elements")
logger.debug(f"TextLineItemsExtractor: found {len(text_elements)} text elements")
if len(text_elements) < 5: # Need at least a few elements
logger.debug("Too few text elements for line item extraction")
if len(text_elements) < MIN_TEXT_ELEMENTS_FOR_EXTRACTION:
logger.debug(
f"Too few text elements ({len(text_elements)}) for line item extraction, "
f"need at least {MIN_TEXT_ELEMENTS_FOR_EXTRACTION}"
)
return None
return self.extract_from_text_elements(text_elements)
@@ -183,11 +194,11 @@ class TextLineItemsExtractor:
"""
# Group elements by row
rows = self._group_by_row(text_elements)
logger.info(f"TextLineItemsExtractor: grouped into {len(rows)} rows")
logger.debug(f"TextLineItemsExtractor: grouped into {len(rows)} rows")
# Find the line items section
item_rows = self._identify_line_item_rows(rows)
logger.info(f"TextLineItemsExtractor: identified {len(item_rows)} potential item rows")
logger.debug(f"TextLineItemsExtractor: identified {len(item_rows)} potential item rows")
if len(item_rows) < self.min_items_for_valid:
logger.debug(f"Found only {len(item_rows)} item rows, need at least {self.min_items_for_valid}")
@@ -195,7 +206,7 @@ class TextLineItemsExtractor:
# Extract structured items
items = self._parse_line_items(item_rows)
logger.info(f"TextLineItemsExtractor: extracted {len(items)} line items")
logger.debug(f"TextLineItemsExtractor: extracted {len(items)} line items")
if len(items) < self.min_items_for_valid:
return None
@@ -209,7 +220,11 @@ class TextLineItemsExtractor:
def _extract_text_elements(
self, parsing_res_list: list[dict[str, Any]]
) -> list[TextElement]:
"""Extract TextElement objects from parsing_res_list."""
"""Extract TextElement objects from parsing_res_list.
Handles both dict and LayoutBlock object formats from PP-StructureV3.
Gracefully skips invalid elements with appropriate logging.
"""
elements = []
for elem in parsing_res_list:
@@ -220,11 +235,15 @@ class TextLineItemsExtractor:
bbox = elem.get("bbox", [])
# Try both 'text' and 'content' keys
text = elem.get("text", "") or elem.get("content", "")
else:
elif hasattr(elem, "label"):
label = getattr(elem, "label", "")
bbox = getattr(elem, "bbox", [])
# LayoutBlock objects use 'content' attribute
text = getattr(elem, "content", "") or getattr(elem, "text", "")
else:
# Element is neither dict nor has expected attributes
logger.debug(f"Skipping element with unexpected type: {type(elem).__name__}")
continue
# Only process text elements (skip images, tables, etc.)
if label not in ("text", "paragraph_title", "aside_text"):
@@ -232,6 +251,7 @@ class TextLineItemsExtractor:
# Validate bbox
if not self._valid_bbox(bbox):
logger.debug(f"Skipping element with invalid bbox: {bbox}")
continue
# Clean text
@@ -250,8 +270,13 @@ class TextLineItemsExtractor:
),
)
)
except (KeyError, TypeError, ValueError, AttributeError) as e:
# Expected format issues - log at debug level
logger.debug(f"Skipping element due to format issue: {e}")
continue
except Exception as e:
logger.debug(f"Failed to parse element: {e}")
# Unexpected errors - log at warning level for visibility
logger.warning(f"Unexpected error parsing element: {type(e).__name__}: {e}")
continue
return elements
@@ -270,6 +295,7 @@ class TextLineItemsExtractor:
Group text elements into rows based on vertical position.
Elements within row_tolerance of each other are considered same row.
Uses dynamic average center_y to handle varying element heights more accurately.
"""
if not elements:
return []
@@ -277,22 +303,22 @@ class TextLineItemsExtractor:
# Sort by vertical position
sorted_elements = sorted(elements, key=lambda e: e.center_y)
rows = []
current_row = [sorted_elements[0]]
current_y = sorted_elements[0].center_y
rows: list[list[TextElement]] = []
current_row: list[TextElement] = [sorted_elements[0]]
for elem in sorted_elements[1:]:
if abs(elem.center_y - current_y) <= self.row_tolerance:
# Same row
# Calculate dynamic average center_y for current row
avg_center_y = sum(e.center_y for e in current_row) / len(current_row)
if abs(elem.center_y - avg_center_y) <= self.row_tolerance:
# Same row - add element and recalculate average on next iteration
current_row.append(elem)
else:
# New row
if current_row:
# Sort row by horizontal position
current_row.sort(key=lambda e: e.center_x)
rows.append(current_row)
# New row - finalize current row
# Sort row by horizontal position (left to right)
current_row.sort(key=lambda e: e.center_x)
rows.append(current_row)
current_row = [elem]
current_y = elem.center_y
# Don't forget last row
if current_row: