refactor: split line_items_extractor into smaller modules with comprehensive tests

- Extract models.py (LineItem, LineItemsResult dataclasses)
- Extract html_table_parser.py (ColumnMapper, HtmlTableParser)
- Extract merged_cell_handler.py (MergedCellHandler for PP-StructureV3 merged cells)
- Reduce line_items_extractor.py from 971 to 396 lines
- Add constants for magic numbers (MIN_AMOUNT_THRESHOLD, ROW_GROUPING_THRESHOLD, etc.)
- Fix row grouping algorithm in text_line_items_extractor.py
- Demote INFO logs to DEBUG level in structure_detector.py
- Add 209 tests achieving 85%+ coverage on main modules

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 23:02:00 +01:00
parent c2c8f2dd04
commit 8723ef4653
11 changed files with 2230 additions and 841 deletions
--- a/packages/backend/backend/table/text_line_items_extractor.py
+++ b/packages/backend/backend/table/text_line_items_extractor.py
@@ -15,6 +15,11 @@ import logging

 logger = logging.getLogger(__name__)

+# Configuration constants
+DEFAULT_ROW_TOLERANCE = 15.0  # Max vertical distance (pixels) to consider same row
+MIN_ITEMS_FOR_VALID_EXTRACTION = 2  # Minimum items required for valid extraction
+MIN_TEXT_ELEMENTS_FOR_EXTRACTION = 5  # Minimum text elements needed to attempt extraction
+

@dataclass
 class TextElement:
@@ -65,7 +70,10 @@ class TextLineItemsResult:
    extraction_method: str = "text_spatial"


-# Swedish amount pattern: 1 234,56 or 1234.56 or 1,234.56
+# Amount pattern matches Swedish, US, and simple numeric formats
+# Handles: "1 234,56", "1,234.56", "1234.56", "100 kr", "50:-", "-100,00"
+# Does NOT handle: amounts with more than 2 decimal places, scientific notation
+# See tests in test_text_line_items_extractor.py::TestAmountPattern
 AMOUNT_PATTERN = re.compile(
    r"(?<![0-9])(?:"
    r"-?\d{1,3}(?:\s\d{3})*(?:,\d{2})?"  # Swedish: 1 234,56
@@ -128,17 +136,17 @@ class TextLineItemsExtractor:

    def __init__(
        self,
-        row_tolerance: float = 15.0,  # Max vertical distance to consider same row
-        min_items_for_valid: int = 2,  # Minimum items to consider extraction valid
+        row_tolerance: float = DEFAULT_ROW_TOLERANCE,
+        min_items_for_valid: int = MIN_ITEMS_FOR_VALID_EXTRACTION,
    ):
        """
        Initialize extractor.

        Args:
            row_tolerance: Maximum vertical distance (pixels) between elements
-                          to consider them on the same row.
+                          to consider them on the same row. Default: 15.0
            min_items_for_valid: Minimum number of line items required for
-                                extraction to be considered successful.
+                                extraction to be considered successful. Default: 2
        """
        self.row_tolerance = row_tolerance
        self.min_items_for_valid = min_items_for_valid
@@ -161,10 +169,13 @@ class TextLineItemsExtractor:

        # Extract text elements from parsing results
        text_elements = self._extract_text_elements(parsing_res_list)
-        logger.info(f"TextLineItemsExtractor: found {len(text_elements)} text elements")
+        logger.debug(f"TextLineItemsExtractor: found {len(text_elements)} text elements")

-        if len(text_elements) < 5:  # Need at least a few elements
-            logger.debug("Too few text elements for line item extraction")
+        if len(text_elements) < MIN_TEXT_ELEMENTS_FOR_EXTRACTION:
+            logger.debug(
+                f"Too few text elements ({len(text_elements)}) for line item extraction, "
+                f"need at least {MIN_TEXT_ELEMENTS_FOR_EXTRACTION}"
+            )
            return None

        return self.extract_from_text_elements(text_elements)
@@ -183,11 +194,11 @@ class TextLineItemsExtractor:
        """
        # Group elements by row
        rows = self._group_by_row(text_elements)
-        logger.info(f"TextLineItemsExtractor: grouped into {len(rows)} rows")
+        logger.debug(f"TextLineItemsExtractor: grouped into {len(rows)} rows")

        # Find the line items section
        item_rows = self._identify_line_item_rows(rows)
-        logger.info(f"TextLineItemsExtractor: identified {len(item_rows)} potential item rows")
+        logger.debug(f"TextLineItemsExtractor: identified {len(item_rows)} potential item rows")

        if len(item_rows) < self.min_items_for_valid:
            logger.debug(f"Found only {len(item_rows)} item rows, need at least {self.min_items_for_valid}")
@@ -195,7 +206,7 @@ class TextLineItemsExtractor:

        # Extract structured items
        items = self._parse_line_items(item_rows)
-        logger.info(f"TextLineItemsExtractor: extracted {len(items)} line items")
+        logger.debug(f"TextLineItemsExtractor: extracted {len(items)} line items")

        if len(items) < self.min_items_for_valid:
            return None
@@ -209,7 +220,11 @@ class TextLineItemsExtractor:
    def _extract_text_elements(
        self, parsing_res_list: list[dict[str, Any]]
    ) -> list[TextElement]:
-        """Extract TextElement objects from parsing_res_list."""
+        """Extract TextElement objects from parsing_res_list.
+
+        Handles both dict and LayoutBlock object formats from PP-StructureV3.
+        Gracefully skips invalid elements with appropriate logging.
+        """
        elements = []

        for elem in parsing_res_list:
@@ -220,11 +235,15 @@ class TextLineItemsExtractor:
                    bbox = elem.get("bbox", [])
                    # Try both 'text' and 'content' keys
                    text = elem.get("text", "") or elem.get("content", "")
-                else:
+                elif hasattr(elem, "label"):
                    label = getattr(elem, "label", "")
                    bbox = getattr(elem, "bbox", [])
                    # LayoutBlock objects use 'content' attribute
                    text = getattr(elem, "content", "") or getattr(elem, "text", "")
+                else:
+                    # Element is neither dict nor has expected attributes
+                    logger.debug(f"Skipping element with unexpected type: {type(elem).__name__}")
+                    continue

                # Only process text elements (skip images, tables, etc.)
                if label not in ("text", "paragraph_title", "aside_text"):
@@ -232,6 +251,7 @@ class TextLineItemsExtractor:

                # Validate bbox
                if not self._valid_bbox(bbox):
+                    logger.debug(f"Skipping element with invalid bbox: {bbox}")
                    continue

                # Clean text
@@ -250,8 +270,13 @@ class TextLineItemsExtractor:
                        ),
                    )
                )
+            except (KeyError, TypeError, ValueError, AttributeError) as e:
+                # Expected format issues - log at debug level
+                logger.debug(f"Skipping element due to format issue: {e}")
+                continue
            except Exception as e:
-                logger.debug(f"Failed to parse element: {e}")
+                # Unexpected errors - log at warning level for visibility
+                logger.warning(f"Unexpected error parsing element: {type(e).__name__}: {e}")
                continue

        return elements
@@ -270,6 +295,7 @@ class TextLineItemsExtractor:
        Group text elements into rows based on vertical position.

        Elements within row_tolerance of each other are considered same row.
+        Uses dynamic average center_y to handle varying element heights more accurately.
        """
        if not elements:
            return []
@@ -277,22 +303,22 @@ class TextLineItemsExtractor:
        # Sort by vertical position
        sorted_elements = sorted(elements, key=lambda e: e.center_y)

-        rows = []
-        current_row = [sorted_elements[0]]
-        current_y = sorted_elements[0].center_y
+        rows: list[list[TextElement]] = []
+        current_row: list[TextElement] = [sorted_elements[0]]

        for elem in sorted_elements[1:]:
-            if abs(elem.center_y - current_y) <= self.row_tolerance:
-                # Same row
+            # Calculate dynamic average center_y for current row
+            avg_center_y = sum(e.center_y for e in current_row) / len(current_row)
+
+            if abs(elem.center_y - avg_center_y) <= self.row_tolerance:
+                # Same row - add element and recalculate average on next iteration
                current_row.append(elem)
            else:
-                # New row
-                if current_row:
-                    # Sort row by horizontal position
-                    current_row.sort(key=lambda e: e.center_x)
-                    rows.append(current_row)
+                # New row - finalize current row
+                # Sort row by horizontal position (left to right)
+                current_row.sort(key=lambda e: e.center_x)
+                rows.append(current_row)
                current_row = [elem]
-                current_y = elem.center_y

        # Don't forget last row
        if current_row: