refactor: split line_items_extractor into smaller modules with comprehensive tests
- Extract models.py (LineItem, LineItemsResult dataclasses)
- Extract html_table_parser.py (ColumnMapper, HtmlTableParser)
- Extract merged_cell_handler.py (MergedCellHandler for PP-StructureV3 merged cells)
- Reduce line_items_extractor.py from 971 to 396 lines
- Add constants for magic numbers (MIN_AMOUNT_THRESHOLD, ROW_GROUPING_THRESHOLD, etc.)
- Fix row grouping algorithm in text_line_items_extractor.py
- Demote INFO logs to DEBUG level in structure_detector.py
- Add 209 tests achieving 85%+ coverage on main modules

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -15,6 +15,11 @@ import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration constants
|
||||
DEFAULT_ROW_TOLERANCE = 15.0 # Max vertical distance (pixels) to consider same row
|
||||
MIN_ITEMS_FOR_VALID_EXTRACTION = 2 # Minimum items required for valid extraction
|
||||
MIN_TEXT_ELEMENTS_FOR_EXTRACTION = 5 # Minimum text elements needed to attempt extraction
|
||||
|
||||
|
||||
@dataclass
|
||||
class TextElement:
|
||||
@@ -65,7 +70,10 @@ class TextLineItemsResult:
|
||||
extraction_method: str = "text_spatial"
|
||||
|
||||
|
||||
# Swedish amount pattern: 1 234,56 or 1234.56 or 1,234.56
|
||||
# Amount pattern matches Swedish, US, and simple numeric formats
|
||||
# Handles: "1 234,56", "1,234.56", "1234.56", "100 kr", "50:-", "-100,00"
|
||||
# Does NOT handle: amounts with more than 2 decimal places, scientific notation
|
||||
# See tests in test_text_line_items_extractor.py::TestAmountPattern
|
||||
AMOUNT_PATTERN = re.compile(
|
||||
r"(?<![0-9])(?:"
|
||||
r"-?\d{1,3}(?:\s\d{3})*(?:,\d{2})?" # Swedish: 1 234,56
|
||||
@@ -128,17 +136,17 @@ class TextLineItemsExtractor:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
row_tolerance: float = 15.0, # Max vertical distance to consider same row
|
||||
min_items_for_valid: int = 2, # Minimum items to consider extraction valid
|
||||
row_tolerance: float = DEFAULT_ROW_TOLERANCE,
|
||||
min_items_for_valid: int = MIN_ITEMS_FOR_VALID_EXTRACTION,
|
||||
):
|
||||
"""
|
||||
Initialize extractor.
|
||||
|
||||
Args:
|
||||
row_tolerance: Maximum vertical distance (pixels) between elements
|
||||
to consider them on the same row.
|
||||
to consider them on the same row. Default: 15.0
|
||||
min_items_for_valid: Minimum number of line items required for
|
||||
extraction to be considered successful.
|
||||
extraction to be considered successful. Default: 2
|
||||
"""
|
||||
self.row_tolerance = row_tolerance
|
||||
self.min_items_for_valid = min_items_for_valid
|
||||
@@ -161,10 +169,13 @@ class TextLineItemsExtractor:
|
||||
|
||||
# Extract text elements from parsing results
|
||||
text_elements = self._extract_text_elements(parsing_res_list)
|
||||
logger.info(f"TextLineItemsExtractor: found {len(text_elements)} text elements")
|
||||
logger.debug(f"TextLineItemsExtractor: found {len(text_elements)} text elements")
|
||||
|
||||
if len(text_elements) < 5: # Need at least a few elements
|
||||
logger.debug("Too few text elements for line item extraction")
|
||||
if len(text_elements) < MIN_TEXT_ELEMENTS_FOR_EXTRACTION:
|
||||
logger.debug(
|
||||
f"Too few text elements ({len(text_elements)}) for line item extraction, "
|
||||
f"need at least {MIN_TEXT_ELEMENTS_FOR_EXTRACTION}"
|
||||
)
|
||||
return None
|
||||
|
||||
return self.extract_from_text_elements(text_elements)
|
||||
@@ -183,11 +194,11 @@ class TextLineItemsExtractor:
|
||||
"""
|
||||
# Group elements by row
|
||||
rows = self._group_by_row(text_elements)
|
||||
logger.info(f"TextLineItemsExtractor: grouped into {len(rows)} rows")
|
||||
logger.debug(f"TextLineItemsExtractor: grouped into {len(rows)} rows")
|
||||
|
||||
# Find the line items section
|
||||
item_rows = self._identify_line_item_rows(rows)
|
||||
logger.info(f"TextLineItemsExtractor: identified {len(item_rows)} potential item rows")
|
||||
logger.debug(f"TextLineItemsExtractor: identified {len(item_rows)} potential item rows")
|
||||
|
||||
if len(item_rows) < self.min_items_for_valid:
|
||||
logger.debug(f"Found only {len(item_rows)} item rows, need at least {self.min_items_for_valid}")
|
||||
@@ -195,7 +206,7 @@ class TextLineItemsExtractor:
|
||||
|
||||
# Extract structured items
|
||||
items = self._parse_line_items(item_rows)
|
||||
logger.info(f"TextLineItemsExtractor: extracted {len(items)} line items")
|
||||
logger.debug(f"TextLineItemsExtractor: extracted {len(items)} line items")
|
||||
|
||||
if len(items) < self.min_items_for_valid:
|
||||
return None
|
||||
@@ -209,7 +220,11 @@ class TextLineItemsExtractor:
|
||||
def _extract_text_elements(
|
||||
self, parsing_res_list: list[dict[str, Any]]
|
||||
) -> list[TextElement]:
|
||||
"""Extract TextElement objects from parsing_res_list."""
|
||||
"""Extract TextElement objects from parsing_res_list.
|
||||
|
||||
Handles both dict and LayoutBlock object formats from PP-StructureV3.
|
||||
Gracefully skips invalid elements with appropriate logging.
|
||||
"""
|
||||
elements = []
|
||||
|
||||
for elem in parsing_res_list:
|
||||
@@ -220,11 +235,15 @@ class TextLineItemsExtractor:
|
||||
bbox = elem.get("bbox", [])
|
||||
# Try both 'text' and 'content' keys
|
||||
text = elem.get("text", "") or elem.get("content", "")
|
||||
else:
|
||||
elif hasattr(elem, "label"):
|
||||
label = getattr(elem, "label", "")
|
||||
bbox = getattr(elem, "bbox", [])
|
||||
# LayoutBlock objects use 'content' attribute
|
||||
text = getattr(elem, "content", "") or getattr(elem, "text", "")
|
||||
else:
|
||||
# Element is neither dict nor has expected attributes
|
||||
logger.debug(f"Skipping element with unexpected type: {type(elem).__name__}")
|
||||
continue
|
||||
|
||||
# Only process text elements (skip images, tables, etc.)
|
||||
if label not in ("text", "paragraph_title", "aside_text"):
|
||||
@@ -232,6 +251,7 @@ class TextLineItemsExtractor:
|
||||
|
||||
# Validate bbox
|
||||
if not self._valid_bbox(bbox):
|
||||
logger.debug(f"Skipping element with invalid bbox: {bbox}")
|
||||
continue
|
||||
|
||||
# Clean text
|
||||
@@ -250,8 +270,13 @@ class TextLineItemsExtractor:
|
||||
),
|
||||
)
|
||||
)
|
||||
except (KeyError, TypeError, ValueError, AttributeError) as e:
|
||||
# Expected format issues - log at debug level
|
||||
logger.debug(f"Skipping element due to format issue: {e}")
|
||||
continue
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to parse element: {e}")
|
||||
# Unexpected errors - log at warning level for visibility
|
||||
logger.warning(f"Unexpected error parsing element: {type(e).__name__}: {e}")
|
||||
continue
|
||||
|
||||
return elements
|
||||
@@ -270,6 +295,7 @@ class TextLineItemsExtractor:
|
||||
Group text elements into rows based on vertical position.
|
||||
|
||||
Elements within row_tolerance of each other are considered same row.
|
||||
Uses dynamic average center_y to handle varying element heights more accurately.
|
||||
"""
|
||||
if not elements:
|
||||
return []
|
||||
@@ -277,22 +303,22 @@ class TextLineItemsExtractor:
|
||||
# Sort by vertical position
|
||||
sorted_elements = sorted(elements, key=lambda e: e.center_y)
|
||||
|
||||
rows = []
|
||||
current_row = [sorted_elements[0]]
|
||||
current_y = sorted_elements[0].center_y
|
||||
rows: list[list[TextElement]] = []
|
||||
current_row: list[TextElement] = [sorted_elements[0]]
|
||||
|
||||
for elem in sorted_elements[1:]:
|
||||
if abs(elem.center_y - current_y) <= self.row_tolerance:
|
||||
# Same row
|
||||
# Calculate dynamic average center_y for current row
|
||||
avg_center_y = sum(e.center_y for e in current_row) / len(current_row)
|
||||
|
||||
if abs(elem.center_y - avg_center_y) <= self.row_tolerance:
|
||||
# Same row - add element and recalculate average on next iteration
|
||||
current_row.append(elem)
|
||||
else:
|
||||
# New row
|
||||
if current_row:
|
||||
# Sort row by horizontal position
|
||||
current_row.sort(key=lambda e: e.center_x)
|
||||
rows.append(current_row)
|
||||
# New row - finalize current row
|
||||
# Sort row by horizontal position (left to right)
|
||||
current_row.sort(key=lambda e: e.center_x)
|
||||
rows.append(current_row)
|
||||
current_row = [elem]
|
||||
current_y = elem.center_y
|
||||
|
||||
# Don't forget last row
|
||||
if current_row:
|
||||
|
||||
Reference in New Issue
Block a user