Update paddle, and support invoice line item

2026-02-03 21:28:06 +01:00
parent c4e3773df1
commit 35988b1ebf
41 changed files with 6832 additions and 48 deletions
--- a/packages/backend/backend/table/text_line_items_extractor.py
+++ b/packages/backend/backend/table/text_line_items_extractor.py
@@ -0,0 +1,449 @@
+"""
+Text-Based Line Items Extractor
+
+Fallback extraction for invoices where PP-StructureV3 cannot detect table structures
+(e.g., borderless/wireless tables). Uses spatial analysis of OCR text elements to
+identify and group line items.
+"""
+
+from dataclasses import dataclass, field
+from decimal import Decimal, InvalidOperation
+import re
+from typing import Any
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class TextElement:
+    """One recognised piece of text together with its bounding box."""
+
+    text: str
+    bbox: tuple[float, float, float, float]  # (x1, y1, x2, y2)
+    confidence: float = 1.0
+
+    @property
+    def center_y(self) -> float:
+        """Midpoint between the box's y1 and y2 coordinates."""
+        y1, y2 = self.bbox[1], self.bbox[3]
+        return (y1 + y2) / 2
+
+    @property
+    def center_x(self) -> float:
+        """Midpoint between the box's x1 and x2 coordinates."""
+        x1, x2 = self.bbox[0], self.bbox[2]
+        return (x1 + x2) / 2
+
+    @property
+    def height(self) -> float:
+        """Distance from the box's y1 to its y2 coordinate."""
+        y1, y2 = self.bbox[1], self.bbox[3]
+        return y2 - y1
+
+
+@dataclass
+class TextLineItem:
+    """Line item extracted from text elements.
+
+    Value fields keep the text as it was matched on the page; nothing in this
+    class converts them to numeric types.
+    """
+
+    # Position of the row among the candidate rows handed to the parser.
+    row_index: int
+    # Free-text description of the item, if one was found.
+    description: str | None = None
+    # Quantity text as found on the row.
+    quantity: str | None = None
+    # Unit of measure. NOTE(review): the text-based parser in this module
+    # never fills this in; it stays at the default.
+    unit: str | None = None
+    # Price per unit, as text.
+    unit_price: str | None = None
+    # Line total, as text.
+    amount: str | None = None
+    # Article/SKU number. NOTE(review): not filled in by the text-based parser
+    # in this module; it stays at the default.
+    article_number: str | None = None
+    # VAT percentage as text.
+    vat_rate: str | None = None
+    is_deduction: bool = False  # True if this row is a deduction/discount
+    confidence: float = 0.7  # Lower default confidence for text-based extraction
+
+
+@dataclass
+class TextLineItemsResult:
+    """Result of text-based line items extraction."""
+
+    # Parsed line items, in the order their rows were processed.
+    items: list[TextLineItem]
+    # Column header texts. TextLineItemsExtractor in this module always
+    # supplies an empty list here.
+    header_row: list[str]
+    # Tag naming the strategy that produced this result.
+    extraction_method: str = "text_spatial"
+
+
+# Amount pattern covering Swedish (1 234,56), US (1,234.56) and plain
+# (1234,56 / 1234.56) notations, optionally followed by a currency suffix.
+#
+# The trailing lookahead rejects a match that is immediately followed by either
+# another digit or a decimal part ("." or "," plus a digit). The second case
+# matters when scanning text with finditer()/findall(): without it the first
+# alternative is satisfied by the integer part alone, so "99.00" was reported
+# as "99" and "00", and "1,234.56" as "1", "234" and "56". Rejecting such a
+# partial match forces the engine on to the alternative that consumes the
+# whole number.
+AMOUNT_PATTERN = re.compile(
+    r"(?<![0-9])(?:"
+    r"-?\d{1,3}(?:\s\d{3})*(?:,\d{2})?"  # Swedish: 1 234,56
+    r"|-?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"  # US: 1,234.56
+    r"|-?\d+(?:[.,]\d{2})?"  # Simple: 1234,56 or 1234.56
+    r")(?:\s*(?:kr|SEK|:-))?"  # Optional currency suffix
+    r"(?![0-9]|[.,][0-9])"  # Not followed by more of the same number
+)
+
+# Quantity pattern. Anchored at both ends, so it only accepts a string that
+# consists entirely of a number (optional "." or "," decimals) followed by an
+# optional unit token from the list below. Matching is case-insensitive.
+QUANTITY_PATTERN = re.compile(
+    r"^(?:"
+    r"\d+(?:[.,]\d+)?\s*(?:st|pcs|m|kg|l|h|tim|timmar)?"  # Number with optional unit
+    r")$",
+    re.IGNORECASE,
+)
+
+# VAT rate pattern: a run of digits followed by "%", optionally separated by
+# whitespace. Group 1 captures the digits only.
+# NOTE(review): only the digits directly before "%" are captured, so a rate
+# written with decimals (e.g. "12,5%") would yield just the fractional digits.
+VAT_RATE_PATTERN = re.compile(r"(\d+)\s*%")
+
+# Keywords indicating a line item area (column-header vocabulary, Swedish).
+# TextLineItemsExtractor tests these as substrings of the lower-cased row text,
+# so entries must be lower case; a row containing any of them is treated as a
+# header and is not itself returned as an item.
+LINE_ITEM_KEYWORDS = [
+    "beskrivning",
+    "artikel",
+    "produkt",
+    "belopp",
+    "summa",
+    "antal",
+    "pris",
+    "á-pris",
+    "a-pris",
+    "moms",
+]
+
+# Keywords indicating NOT line items (summary area).
+# Also tested as lower-case substrings of the row text. They are checked
+# before LINE_ITEM_KEYWORDS, and a row containing any of them is skipped.
+# NOTE(review): because matching is by substring, "summa att betala" is
+# already covered by "att betala".
+SUMMARY_KEYWORDS = [
+    "att betala",
+    "total",
+    "summa att betala",
+    "betalningsvillkor",
+    "förfallodatum",
+    "bankgiro",
+    "plusgiro",
+    "ocr-nummer",
+    "fakturabelopp",
+    "exkl. moms",
+    "inkl. moms",
+    "varav moms",
+]
+
+
+class TextLineItemsExtractor:
+    """
+    Extract line items from text elements using spatial analysis.
+
+    This is a fallback for when PP-StructureV3 cannot detect table structures.
+    It groups text elements by vertical position and identifies patterns
+    that match line item rows.
+    """
+
+    def __init__(
+        self,
+        row_tolerance: float = 15.0,  # Max vertical distance to consider same row
+        min_items_for_valid: int = 2,  # Minimum items to consider extraction valid
+    ):
+        """
+        Store the tuning parameters used by the extraction steps.
+
+        Args:
+            row_tolerance: Largest difference in vertical centre (same units
+                          as the bounding boxes) at which an element still
+                          joins the current row.
+            min_items_for_valid: Fewest line items an extraction must yield
+                                before a result is returned instead of None.
+        """
+        self.min_items_for_valid = min_items_for_valid
+        self.row_tolerance = row_tolerance
+
+    def extract_from_parsing_res(
+        self, parsing_res_list: list[dict[str, Any]]
+    ) -> TextLineItemsResult | None:
+        """
+        Run line item extraction on a PP-StructureV3 parsing_res_list.
+
+        The list is first reduced to text elements; extraction continues only
+        when at least five of them remain.
+
+        Args:
+            parsing_res_list: Parsed layout elements from PP-StructureV3.
+
+        Returns:
+            TextLineItemsResult when enough line items were found, otherwise
+            None (also for an empty list or too few text elements).
+        """
+        if not parsing_res_list:
+            logger.debug("No parsing_res_list provided")
+            return None
+
+        # Keep only the entries that carry usable text and a bounding box
+        text_elements = self._extract_text_elements(parsing_res_list)
+        logger.info(f"TextLineItemsExtractor: found {len(text_elements)} text elements")
+
+        has_enough_elements = len(text_elements) >= 5
+        if has_enough_elements:
+            return self.extract_from_text_elements(text_elements)
+
+        logger.debug("Too few text elements for line item extraction")
+        return None
+
+    def extract_from_text_elements(
+        self, text_elements: list[TextElement]
+    ) -> TextLineItemsResult | None:
+        """
+        Turn positioned text elements into structured line items.
+
+        Steps: group elements into rows, keep the rows that look like line
+        items, then parse each kept row. The attempt is abandoned if either
+        the candidate rows or the parsed items number fewer than
+        ``min_items_for_valid``.
+
+        Args:
+            text_elements: TextElement objects to analyse.
+
+        Returns:
+            TextLineItemsResult on success, None otherwise.
+        """
+        rows = self._group_by_row(text_elements)
+        logger.info(f"TextLineItemsExtractor: grouped into {len(rows)} rows")
+
+        item_rows = self._identify_line_item_rows(rows)
+        logger.info(f"TextLineItemsExtractor: identified {len(item_rows)} potential item rows")
+
+        too_few_rows = len(item_rows) < self.min_items_for_valid
+        if too_few_rows:
+            logger.debug(f"Found only {len(item_rows)} item rows, need at least {self.min_items_for_valid}")
+            return None
+
+        items = self._parse_line_items(item_rows)
+        logger.info(f"TextLineItemsExtractor: extracted {len(items)} line items")
+
+        if len(items) >= self.min_items_for_valid:
+            # Text-based extraction has no explicit header, hence the empty list
+            return TextLineItemsResult(
+                items=items,
+                header_row=[],
+                extraction_method="text_spatial",
+            )
+        return None
+
+    def _extract_text_elements(
+        self, parsing_res_list: list[dict[str, Any]]
+    ) -> list[TextElement]:
+        """
+        Convert raw parsing results into TextElement objects.
+
+        Entries may be dicts or objects exposing the same fields as attributes.
+        An entry is kept only if its label is one of the textual labels, its
+        bbox passes ``_valid_bbox`` and its text is non-empty after stripping.
+        An entry that raises while being read is logged at debug level and
+        skipped, so one malformed entry never aborts the whole extraction.
+        """
+        wanted_labels = ("text", "paragraph_title", "aside_text")
+        collected = []
+
+        for entry in parsing_res_list:
+            try:
+                if isinstance(entry, dict):
+                    label = entry.get("label", "")
+                    box = entry.get("bbox", [])
+                    # Dicts: prefer 'text', fall back to 'content'
+                    raw_text = entry.get("text", "") or entry.get("content", "")
+                else:
+                    label = getattr(entry, "label", "")
+                    box = getattr(entry, "bbox", [])
+                    # Objects: prefer 'content', fall back to 'text'
+                    raw_text = getattr(entry, "content", "") or getattr(entry, "text", "")
+
+                # Images, tables and other non-text labels are ignored
+                if label in wanted_labels and self._valid_bbox(box):
+                    cleaned = str(raw_text).strip() if raw_text else ""
+                    if cleaned:
+                        corners = tuple(float(box[i]) for i in range(4))
+                        collected.append(TextElement(text=cleaned, bbox=corners))
+            except Exception as e:
+                logger.debug(f"Failed to parse element: {e}")
+                continue
+
+        return collected
+
+    def _valid_bbox(self, bbox: Any) -> bool:
+        """Return True when *bbox* is sized and reports at least four entries."""
+        try:
+            if not hasattr(bbox, "__len__"):
+                return False
+            return len(bbox) >= 4
+        except (TypeError, ValueError):
+            return False
+
+    def _group_by_row(
+        self, elements: list[TextElement]
+    ) -> list[list[TextElement]]:
+        """
+        Bucket text elements into rows by their vertical centre.
+
+        Elements are visited top to bottom. Each row is anchored at the
+        centre of its first element; a later element joins the row while its
+        own centre is within ``row_tolerance`` of that anchor, otherwise it
+        starts a new row. Every returned row is ordered left to right.
+        """
+        rows = []
+        bucket = []
+        anchor_y = 0.0
+
+        for elem in sorted(elements, key=lambda e: e.center_y):
+            joins_bucket = bool(bucket) and (
+                abs(elem.center_y - anchor_y) <= self.row_tolerance
+            )
+            if joins_bucket:
+                bucket.append(elem)
+                continue
+
+            # Close the finished row (if any) before opening the next one
+            if bucket:
+                rows.append(sorted(bucket, key=lambda e: e.center_x))
+            bucket = [elem]
+            anchor_y = elem.center_y
+
+        # Flush the row that was still open when the input ran out
+        if bucket:
+            rows.append(sorted(bucket, key=lambda e: e.center_x))
+
+        return rows
+
+    def _identify_line_item_rows(
+        self, rows: list[list[TextElement]]
+    ) -> list[list[TextElement]]:
+        """
+        Select the rows that are likely line items.
+
+        A row is dropped when its lower-cased text contains a summary keyword
+        (totals, payment details) or a header keyword (column titles). Every
+        remaining row is kept if ``_looks_like_line_item`` accepts it.
+
+        No "inside the item section" state is tracked: a row always has to
+        pass ``_looks_like_line_item`` on its own, so its position relative to
+        a header or summary row never changes the outcome. The earlier
+        section flag and the second call to ``_looks_like_line_item`` per row
+        had no effect on the result and were removed.
+
+        Args:
+            rows: Rows of text elements, as produced by ``_group_by_row``.
+
+        Returns:
+            The accepted rows, in their original order.
+        """
+        item_rows = []
+
+        for row in rows:
+            row_text = " ".join(e.text for e in row).lower()
+
+            # Summary keywords are checked first so that a phrase such as
+            # "summa att betala" is not mistaken for the "summa" header.
+            if any(kw in row_text for kw in SUMMARY_KEYWORDS):
+                continue
+
+            # Header rows carry column titles, not item data
+            if any(kw in row_text for kw in LINE_ITEM_KEYWORDS):
+                continue
+
+            if self._looks_like_line_item(row):
+                item_rows.append(row)
+
+        return item_rows
+
+    def _looks_like_line_item(self, row: list[TextElement]) -> bool:
+        """
+        Decide whether a row has the shape of a line item.
+
+        Requires at least two elements, an amount somewhere in the row's
+        combined text, and at least one element longer than three characters
+        that is not purely an amount (taken to be the description).
+        """
+        if len(row) < 2:
+            return False
+
+        combined = " ".join(e.text for e in row)
+        if AMOUNT_PATTERN.search(combined) is None:
+            return False
+
+        for elem in row:
+            is_long_enough = len(elem.text) > 3
+            if is_long_enough and not AMOUNT_PATTERN.fullmatch(elem.text.strip()):
+                return True
+        return False
+
+    def _parse_line_items(
+        self, item_rows: list[list[TextElement]]
+    ) -> list[TextLineItem]:
+        """
+        Parse every candidate row, keeping the ones that yield an item.
+
+        Each row is passed to ``_parse_single_row`` together with its index in
+        ``item_rows``; rows for which that returns nothing are left out.
+        """
+        return [
+            parsed
+            for position, row in enumerate(item_rows)
+            if (parsed := self._parse_single_row(row, position))
+        ]
+
+    def _parse_single_row(
+        self, row: list[TextElement], row_index: int
+    ) -> TextLineItem | None:
+        """
+        Parse a single row of text elements into a line item.
+
+        Heuristics, applied to the row's elements in left-to-right order:
+
+        - Amounts are searched inside each element separately. Searching the
+          space-joined row text let neighbouring cells fuse into one number
+          (a quantity "2" next to a price "200,00" became "2 200,00").
+        - A number directly followed by "%" is a rate, not money, and is not
+          considered as an amount.
+        - The last amount is the line total; the one before it, if any, is
+          the unit price.
+        - The quantity is the first element that is entirely a number with an
+          optional unit, except the element that supplied the line total.
+        - The VAT rate is the first "<digits> %" occurrence in the row.
+        - The description is the longest element that is neither an amount
+          nor a quantity (the first one wins on ties).
+
+        Args:
+            row: Elements of one row.
+            row_index: Index stored on the resulting item.
+
+        Returns:
+            A TextLineItem with confidence 0.7, or None when the row is empty
+            or holds no monetary amount. ``unit``, ``article_number`` and
+            ``is_deduction`` are left at their defaults.
+        """
+        if not row:
+            return None
+
+        texts = [elem.text.strip() for elem in row]
+
+        # (element index, matched amount text) for every monetary candidate
+        candidates: list[tuple[int, str]] = []
+        for idx, text in enumerate(texts):
+            for match in AMOUNT_PATTERN.finditer(text):
+                if text[match.end():].lstrip().startswith("%"):
+                    continue  # percentage, e.g. a VAT rate column
+                candidates.append((idx, match.group(0).strip()))
+
+        if not candidates:
+            return None
+
+        # Rightmost amount is typically the line total
+        amount_idx, amount = candidates[-1]
+
+        # Second to last might be unit price
+        unit_price = candidates[-2][1] if len(candidates) >= 2 else None
+
+        # Look for quantity, never reusing the token chosen as the line total
+        quantity = None
+        for idx, text in enumerate(texts):
+            if idx == amount_idx and text == amount:
+                continue
+            if QUANTITY_PATTERN.match(text):
+                quantity = text
+                break
+
+        # VAT is searched on the joined text so that "25" and "%" split across
+        # neighbouring elements are still recognised.
+        vat_rate = None
+        vat_match = VAT_RATE_PATTERN.search(" ".join(e.text for e in row))
+        if vat_match:
+            vat_rate = vat_match.group(1)
+
+        # Description is typically the longest non-numeric text
+        descriptive = [
+            text
+            for text in texts
+            if text
+            and not AMOUNT_PATTERN.fullmatch(text)
+            and not QUANTITY_PATTERN.match(text)
+        ]
+        description = max(descriptive, key=len, default=None)
+
+        return TextLineItem(
+            row_index=row_index,
+            description=description,
+            quantity=quantity,
+            unit_price=unit_price,
+            amount=amount,
+            vat_rate=vat_rate,
+            confidence=0.7,
+        )
+
+
+def convert_text_line_item(item: TextLineItem) -> "LineItem":
+    """Build a standard LineItem carrying the same field values as *item*."""
+    from .line_items_extractor import LineItem
+
+    # Fields copied one-to-one from TextLineItem to LineItem
+    shared_fields = (
+        "row_index",
+        "description",
+        "quantity",
+        "unit",
+        "unit_price",
+        "amount",
+        "article_number",
+        "vat_rate",
+        "is_deduction",
+        "confidence",
+    )
+    return LineItem(**{name: getattr(item, name) for name in shared_fields})