# invoice-master-poc-v2/packages/backend/backend/table/text_line_items_extractor.py

"""
Text-Based Line Items Extractor

Fallback extraction for invoices where PP-StructureV3 cannot detect table structures
(e.g., borderless/wireless tables). Uses spatial analysis of OCR text elements to
identify and group line items.
"""

from dataclasses import dataclass, field
from decimal import Decimal, InvalidOperation
import re
from typing import Any

import logging

logger = logging.getLogger(__name__)

# Configuration constants
# DEFAULT_ROW_TOLERANCE and MIN_ITEMS_FOR_VALID_EXTRACTION are the default
# constructor arguments of TextLineItemsExtractor; MIN_TEXT_ELEMENTS_FOR_EXTRACTION
# is read directly by TextLineItemsExtractor.extract_from_parsing_res.
DEFAULT_ROW_TOLERANCE = 15.0  # Max vertical distance (pixels) to consider same row
MIN_ITEMS_FOR_VALID_EXTRACTION = 2  # Minimum items required for valid extraction
MIN_TEXT_ELEMENTS_FOR_EXTRACTION = 5  # Minimum text elements needed to attempt extraction


@dataclass
class TextElement:
    """One OCR text fragment together with its bounding box.

    Attributes:
        text: The recognized text.
        bbox: Box corners in the order ``(x1, y1, x2, y2)``.
        confidence: Recognition confidence; 1.0 when not supplied.
    """

    text: str
    bbox: tuple[float, float, float, float]  # x1, y1, x2, y2
    confidence: float = 1.0

    @property
    def center_y(self) -> float:
        """Midpoint between the y1 and y2 edges of the box."""
        y1, y2 = self.bbox[1], self.bbox[3]
        return (y1 + y2) / 2

    @property
    def center_x(self) -> float:
        """Midpoint between the x1 and x2 edges of the box."""
        x1, x2 = self.bbox[0], self.bbox[2]
        return (x1 + x2) / 2

    @property
    def height(self) -> float:
        """Vertical extent of the box (y2 minus y1)."""
        y1, y2 = self.bbox[1], self.bbox[3]
        return y2 - y1


@dataclass
class TextLineItem:
    """Line item extracted from text elements.

    Value fields hold the strings as they were found in the OCR text; this
    class performs no numeric parsing or normalization.
    """

    # Position of the row among the rows handed to the row parser (0-based).
    row_index: int
    # Longest element text on the row that is neither an amount nor a quantity.
    description: str | None = None
    # First element text on the row that matches QUANTITY_PATTERN.
    quantity: str | None = None
    # Not populated by TextLineItemsExtractor in this file; stays None.
    unit: str | None = None
    # Second-to-last amount found on the row, when there are at least two.
    unit_price: str | None = None
    # Last amount found on the row.
    amount: str | None = None
    # Not populated by TextLineItemsExtractor in this file; stays None.
    article_number: str | None = None
    # Digits captured by VAT_RATE_PATTERN (without the percent sign).
    vat_rate: str | None = None
    # Not set by TextLineItemsExtractor in this file; stays False.
    is_deduction: bool = False  # True if this row is a deduction/discount
    confidence: float = 0.7  # Lower default confidence for text-based extraction


@dataclass
class TextLineItemsResult:
    """Result of text-based line items extraction."""

    # Parsed line items, in the order their rows were accepted.
    items: list[TextLineItem]
    # Header cells; TextLineItemsExtractor.extract_from_text_elements always
    # passes an empty list because no explicit header is captured.
    header_row: list[str]
    # Identifier of the strategy that produced this result.
    extraction_method: str = "text_spatial"


# Amount pattern matches Swedish, US, and simple numeric formats
# Handles: "1 234,56", "1,234.56", "1234.56", "100 kr", "50:-", "-100,00"
# Does NOT handle: amounts with more than 2 decimal places, scientific notation
# See tests in test_text_line_items_extractor.py::TestAmountPattern
#
# The trailing guard rejects a match that is directly followed by a digit, or
# by "." / "," and a digit. With a plain "not followed by a digit" guard the
# first alternative succeeds on a prefix when scanning free text
# (search/findall/finditer), so "1,234.56" was returned as "1", "234", "56"
# instead of reaching the US alternative. fullmatch() results are unaffected.
AMOUNT_PATTERN = re.compile(
    r"(?<![0-9])(?:"
    r"-?\d{1,3}(?:\s\d{3})*(?:,\d{2})?"  # Swedish: 1 234,56
    r"|-?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"  # US: 1,234.56
    r"|-?\d+(?:[.,]\d{2})?"  # Simple: 1234,56 or 1234.56
    r")(?:\s*(?:kr|SEK|:-))?"  # Optional currency suffix
    r"(?![.,]?[0-9])"
)

# Quantity patterns
# Anchored with ^...$ and applied to a single stripped element text, so the
# whole text must be a number with an optional unit suffix; matching is
# case-insensitive. Because the unit is optional, bare numbers such as "3" or
# "100,00" also match.
QUANTITY_PATTERN = re.compile(
    r"^(?:"
    r"\d+(?:[.,]\d+)?\s*(?:st|pcs|m|kg|l|h|tim|timmar)?"  # Number with optional unit
    r")$",
    re.IGNORECASE,
)

# VAT rate patterns
# Integer digits followed (optionally after whitespace) by "%"; group 1 holds
# the digits only. Used with search() on the joined text of a row.
VAT_RATE_PATTERN = re.compile(r"(\d+)\s*%")

# Keywords indicating a line item area
# Compared as plain substrings against the lowercased, space-joined text of a
# row; a row containing any of them is treated as a table header and skipped.
# Entries must therefore be lowercase.
LINE_ITEM_KEYWORDS = [
    "beskrivning",
    "artikel",
    "produkt",
    "belopp",
    "summa",
    "antal",
    "pris",
    "á-pris",
    "a-pris",
    "moms",
]

# Keywords indicating NOT line items (summary area)
# Compared as plain substrings against the lowercased, space-joined text of a
# row; a row containing any of them is skipped. This list is checked before
# LINE_ITEM_KEYWORDS, so it wins when a row contains keywords from both.
# Entries must therefore be lowercase.
SUMMARY_KEYWORDS = [
    "att betala",
    "total",
    "summa att betala",
    "betalningsvillkor",
    "förfallodatum",
    "bankgiro",
    "plusgiro",
    "ocr-nummer",
    "fakturabelopp",
    "exkl. moms",
    "inkl. moms",
    "varav moms",
]


class TextLineItemsExtractor:
    """
    Extract line items from text elements using spatial analysis.

    This is a fallback for when PP-StructureV3 cannot detect table structures.
    It groups text elements by vertical position and identifies patterns
    that match line item rows.
    """

    def __init__(
        self,
        row_tolerance: float = DEFAULT_ROW_TOLERANCE,
        min_items_for_valid: int = MIN_ITEMS_FOR_VALID_EXTRACTION,
    ):
        """
        Store the tuning parameters used by the extraction steps.

        Args:
            row_tolerance: Largest allowed distance between an element's
                vertical center and a row's average vertical center for the
                element to be placed on that row. Defaults to
                DEFAULT_ROW_TOLERANCE.
            min_items_for_valid: Fewest line items an extraction must yield
                before a result is returned instead of None. Defaults to
                MIN_ITEMS_FOR_VALID_EXTRACTION.
        """
        self.row_tolerance, self.min_items_for_valid = (
            row_tolerance,
            min_items_for_valid,
        )

    def extract_from_parsing_res(
        self, parsing_res_list: list[dict[str, Any]]
    ) -> TextLineItemsResult | None:
        """
        Run text-based extraction on a PP-StructureV3 parsing_res_list.

        The raw entries are first converted to TextElement objects; when
        enough of them survive, the work is delegated to
        extract_from_text_elements.

        Args:
            parsing_res_list: Parsed layout entries from PP-StructureV3.

        Returns:
            A TextLineItemsResult when line items were found; None when the
            input is empty, too few text elements remain, or no valid set of
            line items could be extracted.
        """
        if not parsing_res_list:
            logger.debug("No parsing_res_list provided")
            return None

        text_elements = self._extract_text_elements(parsing_res_list)
        logger.debug(f"TextLineItemsExtractor: found {len(text_elements)} text elements")

        # Enough material to work with: hand over to the element-based entry point.
        if len(text_elements) >= MIN_TEXT_ELEMENTS_FOR_EXTRACTION:
            return self.extract_from_text_elements(text_elements)

        logger.debug(
            f"Too few text elements ({len(text_elements)}) for line item extraction, "
            f"need at least {MIN_TEXT_ELEMENTS_FOR_EXTRACTION}"
        )
        return None

    def extract_from_text_elements(
        self, text_elements: list[TextElement]
    ) -> TextLineItemsResult | None:
        """
        Turn already-built text elements into structured line items.

        Pipeline: group elements into rows, keep the rows that look like line
        items, then parse each kept row.

        Args:
            text_elements: TextElement objects to analyze.

        Returns:
            A TextLineItemsResult, or None when fewer than
            ``min_items_for_valid`` rows or parsed items are found.
        """
        rows = self._group_by_row(text_elements)
        logger.debug(f"TextLineItemsExtractor: grouped into {len(rows)} rows")

        item_rows = self._identify_line_item_rows(rows)
        logger.debug(f"TextLineItemsExtractor: identified {len(item_rows)} potential item rows")

        # Bail out before parsing when there are not enough candidate rows.
        if len(item_rows) < self.min_items_for_valid:
            logger.debug(f"Found only {len(item_rows)} item rows, need at least {self.min_items_for_valid}")
            return None

        items = self._parse_line_items(item_rows)
        logger.debug(f"TextLineItemsExtractor: extracted {len(items)} line items")

        # Parsing may drop rows, so the threshold is checked a second time.
        if len(items) >= self.min_items_for_valid:
            return TextLineItemsResult(
                items=items,
                header_row=[],  # text-based extraction captures no header cells
                extraction_method="text_spatial",
            )
        return None

    def _extract_text_elements(
        self, parsing_res_list: list[dict[str, Any]]
    ) -> list[TextElement]:
        """Convert raw parsing results into TextElement objects.

        Each entry may be a dict (keys ``label``, ``bbox`` and ``text`` or
        ``content``) or an object exposing those names as attributes. An entry
        is dropped when its label is not one of the text labels, its bbox has
        fewer than four values, or its text is empty after stripping. Any
        exception raised while handling an entry is logged and that entry is
        skipped, so one bad entry never aborts the conversion.
        """
        text_labels = ("text", "paragraph_title", "aside_text")
        collected = []

        for elem in parsing_res_list:
            try:
                if isinstance(elem, dict):
                    # Dict entries: 'text' first, 'content' as fallback.
                    label, bbox = elem.get("label", ""), elem.get("bbox", [])
                    text = elem.get("text", "") or elem.get("content", "")
                elif hasattr(elem, "label"):
                    # Object entries: 'content' first, 'text' as fallback.
                    label, bbox = getattr(elem, "label", ""), getattr(elem, "bbox", [])
                    text = getattr(elem, "content", "") or getattr(elem, "text", "")
                else:
                    logger.debug(f"Skipping element with unexpected type: {type(elem).__name__}")
                    continue

                # Images, tables and other non-text layout entries are ignored.
                if label not in text_labels:
                    continue

                if not self._valid_bbox(bbox):
                    logger.debug(f"Skipping element with invalid bbox: {bbox}")
                    continue

                text = str(text).strip() if text else ""
                if text:
                    # Only the first four bbox values are used, coerced to float.
                    corners = tuple(float(bbox[i]) for i in range(4))
                    collected.append(TextElement(text=text, bbox=corners))
            except (KeyError, TypeError, ValueError, AttributeError) as e:
                # Malformed entries are expected now and then: debug level only.
                logger.debug(f"Skipping element due to format issue: {e}")
            except Exception as e:
                # Anything else is surprising, so make it visible in the logs.
                logger.warning(f"Unexpected error parsing element: {type(e).__name__}: {e}")

        return collected

    def _valid_bbox(self, bbox: Any) -> bool:
        """Check if bbox is valid (has 4 elements)."""
        try:
            return len(bbox) >= 4 if hasattr(bbox, "__len__") else False
        except (TypeError, ValueError):
            return False

    def _group_by_row(
        self, elements: list[TextElement]
    ) -> list[list[TextElement]]:
        """
        Group text elements into rows based on vertical position.

        Elements within row_tolerance of each other are considered same row.
        Uses dynamic average center_y to handle varying element heights more accurately.
        """
        if not elements:
            return []

        # Sort by vertical position
        sorted_elements = sorted(elements, key=lambda e: e.center_y)

        rows: list[list[TextElement]] = []
        current_row: list[TextElement] = [sorted_elements[0]]

        for elem in sorted_elements[1:]:
            # Calculate dynamic average center_y for current row
            avg_center_y = sum(e.center_y for e in current_row) / len(current_row)

            if abs(elem.center_y - avg_center_y) <= self.row_tolerance:
                # Same row - add element and recalculate average on next iteration
                current_row.append(elem)
            else:
                # New row - finalize current row
                # Sort row by horizontal position (left to right)
                current_row.sort(key=lambda e: e.center_x)
                rows.append(current_row)
                current_row = [elem]

        # Don't forget last row
        if current_row:
            current_row.sort(key=lambda e: e.center_x)
            rows.append(current_row)

        return rows

    def _identify_line_item_rows(
        self, rows: list[list[TextElement]]
    ) -> list[list[TextElement]]:
        """
        Select the rows that are likely line items.

        A row is kept when all of the following hold:
        - its lowercased text contains no SUMMARY_KEYWORDS entry,
        - its lowercased text contains no LINE_ITEM_KEYWORDS entry (such rows
          are treated as table headers),
        - ``_looks_like_line_item`` accepts it.

        Keyword checks are plain substring tests, and the summary check runs
        first. Seeing a header row does not relax the acceptance test for the
        rows that follow it; every kept row must pass ``_looks_like_line_item``.

        Args:
            rows: Rows of text elements, as produced by ``_group_by_row``.

        Returns:
            The accepted rows, in their original order.
        """
        item_rows = []

        for row in rows:
            row_text = " ".join(e.text for e in row).lower()

            # Summary/payment rows are never line items.
            if any(kw in row_text for kw in SUMMARY_KEYWORDS):
                continue

            # Header rows name the columns; they carry no item data themselves.
            if any(kw in row_text for kw in LINE_ITEM_KEYWORDS):
                continue

            if self._looks_like_line_item(row):
                item_rows.append(row)

        return item_rows

    def _looks_like_line_item(self, row: list[TextElement]) -> bool:
        """Decide whether a row has the shape of a line item.

        A row qualifies when it has at least two elements, its joined text
        contains at least one amount, and at least one element longer than
        three characters is something other than a bare amount (taken to be
        the description).
        """
        if len(row) < 2:
            return False

        joined = " ".join(cell.text for cell in row)
        if AMOUNT_PATTERN.search(joined) is None:
            # No amount anywhere on the row.
            return False

        for cell in row:
            if len(cell.text) > 3 and not AMOUNT_PATTERN.fullmatch(cell.text.strip()):
                return True
        return False

    def _parse_line_items(
        self, item_rows: list[list[TextElement]]
    ) -> list[TextLineItem]:
        """Parse line item rows into structured items."""
        items = []

        for idx, row in enumerate(item_rows):
            item = self._parse_single_row(row, idx)
            if item:
                items.append(item)

        return items

    def _parse_single_row(
        self, row: list[TextElement], row_index: int
    ) -> TextLineItem | None:
        """Parse a single row into a line item.

        Heuristics, applied to the elements in left-to-right order:
        - amount: the last amount found on the row.
        - unit_price: the second-to-last amount, when there are at least two.
          NOTE(review): on rows with only a quantity and a total this can be
          the quantity, because bare numbers also match the amount pattern.
        - quantity: the first element whose whole text matches
          QUANTITY_PATTERN.
        - vat_rate: the first ``<digits>%`` occurrence in the joined row text.
        - description: the longest element text that is neither a bare amount
          nor a quantity (the first one wins on ties).

        Amounts are searched inside each element separately. Searching the
        space-joined row text would let the thousands-separator alternative of
        AMOUNT_PATTERN fuse neighbouring cells (e.g. "2" and "100,00" into
        "2 100,00"). Numbers directly followed by "%" are percentages and are
        not considered amounts, so a trailing VAT column is not mistaken for
        the line total.

        Args:
            row: Elements of one row.
            row_index: Index to store on the resulting item.

        Returns:
            A TextLineItem, or None when the row is empty or holds no amount.
        """
        if not row:
            return None

        cell_texts = [elem.text.strip() for elem in row]

        # Collect amount candidates cell by cell, left to right.
        amounts: list[str] = []
        for text in cell_texts:
            for match in AMOUNT_PATTERN.finditer(text):
                if text[match.end():].lstrip().startswith("%"):
                    # Percentage (e.g. VAT rate), not a monetary amount.
                    continue
                amounts.append(match.group(0).strip())
        if not amounts:
            return None

        # Rightmost amount is typically the line total; the one before it the
        # unit price.
        amount = amounts[-1]
        unit_price = amounts[-2] if len(amounts) >= 2 else None

        quantity = next(
            (text for text in cell_texts if QUANTITY_PATTERN.match(text)), None
        )

        # VAT is looked up in the joined text so "25" and "%" split across two
        # elements are still recognized.
        all_text = " ".join(e.text for e in row)
        vat_match = VAT_RATE_PATTERN.search(all_text)
        vat_rate = vat_match.group(1) if vat_match else None

        description = max(
            (
                text
                for text in cell_texts
                if text
                and not AMOUNT_PATTERN.fullmatch(text)
                and not QUANTITY_PATTERN.match(text)
            ),
            key=len,
            default=None,
        )

        return TextLineItem(
            row_index=row_index,
            description=description,
            quantity=quantity,
            unit_price=unit_price,
            amount=amount,
            vat_rate=vat_rate,
            confidence=0.7,
        )


def convert_text_line_item(item: TextLineItem) -> "LineItem":
    """Build a standard LineItem carrying the same field values as *item*.

    LineItem is imported inside the function rather than at module level.
    """
    from .line_items_extractor import LineItem

    # Fields copied one-to-one, passed to LineItem as keyword arguments.
    shared_fields = (
        "row_index",
        "description",
        "quantity",
        "unit",
        "unit_price",
        "amount",
        "article_number",
        "vat_rate",
        "is_deduction",
        "confidence",
    )
    return LineItem(**{name: getattr(item, name) for name in shared_fields})