Files
invoice-master-poc-v2/packages/backend/backend/table/text_line_items_extractor.py
Yaojia Wang 8723ef4653 refactor: split line_items_extractor into smaller modules with comprehensive tests
- Extract models.py (LineItem, LineItemsResult dataclasses)
- Extract html_table_parser.py (ColumnMapper, HtmlTableParser)
- Extract merged_cell_handler.py (MergedCellHandler for PP-StructureV3 merged cells)
- Reduce line_items_extractor.py from 971 to 396 lines
- Add constants for magic numbers (MIN_AMOUNT_THRESHOLD, ROW_GROUPING_THRESHOLD, etc.)
- Fix row grouping algorithm in text_line_items_extractor.py
- Demote INFO logs to DEBUG level in structure_detector.py
- Add 209 tests achieving 85%+ coverage on main modules

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 23:02:00 +01:00

476 lines
15 KiB
Python

"""
Text-Based Line Items Extractor
Fallback extraction for invoices where PP-StructureV3 cannot detect table structures
(e.g., borderless/wireless tables). Uses spatial analysis of OCR text elements to
identify and group line items.
"""
from dataclasses import dataclass, field
from decimal import Decimal, InvalidOperation
import re
from typing import Any
import logging
logger = logging.getLogger(__name__)
# Configuration constants for the text-based fallback extractor.

# Largest vertical distance (pixels) at which elements still count as one row.
DEFAULT_ROW_TOLERANCE: float = 15.0

# Fewest line items an extraction must produce to be considered valid.
MIN_ITEMS_FOR_VALID_EXTRACTION: int = 2

# Fewest OCR text elements required before extraction is attempted at all.
MIN_TEXT_ELEMENTS_FOR_EXTRACTION: int = 5
@dataclass
class TextElement:
    """One piece of OCR text together with its bounding box."""

    # Recognized text content.
    text: str
    # Bounding box as (x1, y1, x2, y2).
    bbox: tuple[float, float, float, float]
    # Confidence score; defaults to 1.0 when none is supplied.
    confidence: float = 1.0

    @property
    def center_y(self) -> float:
        """Return the midpoint between the box's y1 and y2."""
        y1, y2 = self.bbox[1], self.bbox[3]
        return (y1 + y2) / 2

    @property
    def center_x(self) -> float:
        """Return the midpoint between the box's x1 and x2."""
        x1, x2 = self.bbox[0], self.bbox[2]
        return (x1 + x2) / 2

    @property
    def height(self) -> float:
        """Return the box's vertical extent (y2 - y1)."""
        y1, y2 = self.bbox[1], self.bbox[3]
        return y2 - y1
@dataclass
class TextLineItem:
"""Line item extracted from text elements."""
row_index: int
description: str | None = None
quantity: str | None = None
unit: str | None = None
unit_price: str | None = None
amount: str | None = None
article_number: str | None = None
vat_rate: str | None = None
is_deduction: bool = False # True if this row is a deduction/discount
confidence: float = 0.7 # Lower default confidence for text-based extraction
@dataclass
class TextLineItemsResult:
    """Outcome of a text-based line item extraction run."""

    # Parsed line items.
    items: list[TextLineItem]

    # Header cell texts belonging to the items.
    header_row: list[str]

    # Tag naming the technique that produced this result.
    extraction_method: str = "text_spatial"
# Amount pattern matches Swedish, US, and simple numeric formats
# Handles: "1 234,56", "1,234.56", "1234.56", "100 kr", "50:-", "-100,00"
# Does NOT handle: amounts with more than 2 decimal places, scientific notation
# See tests in test_text_line_items_extractor.py::TestAmountPattern
#
# The alternatives are tried in order and each has optional parts, so the
# closing lookahead must reject "separator + digit" as well as a bare digit.
# Without that, searching (finditer/findall) lets the first alternative stop
# in front of a separator and splits "1,234.56" into "1", "234", "56" and
# "12.50" into "12", "50".
AMOUNT_PATTERN = re.compile(
    r"(?<![0-9])(?:"
    r"-?\d{1,3}(?:\s\d{3})*(?:,\d{2})?"  # Swedish: 1 234,56
    r"|-?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"  # US: 1,234.56
    r"|-?\d+(?:[.,]\d{2})?"  # Simple: 1234,56 or 1234.56
    r")(?:\s*(?:kr|SEK|:-))?"  # Optional currency suffix
    r"(?![.,]?[0-9])"  # Not followed by a digit or by separator + digit
)
# Quantity pattern: the whole string must be a number (optional "." or ","
# decimals) optionally followed by one of the listed unit abbreviations.
# Matching is case-insensitive.
QUANTITY_PATTERN = re.compile(
    r"^(?:\d+(?:[.,]\d+)?\s*(?:st|pcs|m|kg|l|h|tim|timmar)?)$",
    flags=re.IGNORECASE,
)
# VAT rate pattern: a number directly (or after whitespace) followed by "%".
# Group 1 is the rate without the percent sign. An optional decimal part is
# included so that "12,5 %" yields "12,5" rather than just the trailing "5".
VAT_RATE_PATTERN = re.compile(r"(\d+(?:[.,]\d+)?)\s*%")
# Keywords indicating a line item area (Swedish column-header vocabulary).
# Entries are lowercase.
LINE_ITEM_KEYWORDS: list[str] = [
    "beskrivning",  # description
    "artikel",      # article
    "produkt",      # product
    "belopp",       # amount
    "summa",        # sum
    "antal",        # quantity
    "pris",         # price
    "á-pris",       # unit price
    "a-pris",       # unit price (unaccented spelling)
    "moms",         # VAT
]
# Keywords indicating NOT line items (summary / payment-details area).
# Entries are lowercase.
SUMMARY_KEYWORDS: list[str] = [
    "att betala",         # to pay
    "total",
    "summa att betala",   # sum to pay
    "betalningsvillkor",  # payment terms
    "förfallodatum",      # due date
    "bankgiro",
    "plusgiro",
    "ocr-nummer",         # OCR payment reference number
    "fakturabelopp",      # invoice amount
    "exkl. moms",         # excl. VAT
    "inkl. moms",         # incl. VAT
    "varav moms",         # of which VAT
]
class TextLineItemsExtractor:
    """
    Extract line items from text elements using spatial analysis.

    This is a fallback for when PP-StructureV3 cannot detect table structures.
    Text elements are grouped into rows by vertical position, rows that look
    like line items are selected, and each selected row is parsed into a
    TextLineItem.
    """

    def __init__(
        self,
        row_tolerance: float = DEFAULT_ROW_TOLERANCE,
        min_items_for_valid: int = MIN_ITEMS_FOR_VALID_EXTRACTION,
    ):
        """
        Initialize extractor.

        Args:
            row_tolerance: Maximum vertical distance (pixels) between elements
                to consider them on the same row. Default: 15.0
            min_items_for_valid: Minimum number of line items required for
                extraction to be considered successful. Default: 2
        """
        self.row_tolerance = row_tolerance
        self.min_items_for_valid = min_items_for_valid

    def extract_from_parsing_res(
        self, parsing_res_list: list[dict[str, Any]]
    ) -> TextLineItemsResult | None:
        """
        Extract line items from PP-StructureV3 parsing_res_list.

        Args:
            parsing_res_list: List of parsed elements from PP-StructureV3.

        Returns:
            TextLineItemsResult if line items found, None otherwise. None is
            also returned when the list is empty or yields fewer than
            MIN_TEXT_ELEMENTS_FOR_EXTRACTION usable text elements.
        """
        if not parsing_res_list:
            logger.debug("No parsing_res_list provided")
            return None

        # Extract text elements from parsing results
        text_elements = self._extract_text_elements(parsing_res_list)
        logger.debug(
            "TextLineItemsExtractor: found %d text elements", len(text_elements)
        )

        if len(text_elements) < MIN_TEXT_ELEMENTS_FOR_EXTRACTION:
            logger.debug(
                "Too few text elements (%d) for line item extraction, "
                "need at least %d",
                len(text_elements),
                MIN_TEXT_ELEMENTS_FOR_EXTRACTION,
            )
            return None

        return self.extract_from_text_elements(text_elements)

    def extract_from_text_elements(
        self, text_elements: list[TextElement]
    ) -> TextLineItemsResult | None:
        """
        Extract line items from a list of text elements.

        Args:
            text_elements: List of TextElement objects.

        Returns:
            TextLineItemsResult if at least ``min_items_for_valid`` line items
            were found, None otherwise.
        """
        # Group elements by row
        rows = self._group_by_row(text_elements)
        logger.debug("TextLineItemsExtractor: grouped into %d rows", len(rows))

        # Find the line items section
        item_rows = self._identify_line_item_rows(rows)
        logger.debug(
            "TextLineItemsExtractor: identified %d potential item rows",
            len(item_rows),
        )
        if len(item_rows) < self.min_items_for_valid:
            logger.debug(
                "Found only %d item rows, need at least %d",
                len(item_rows),
                self.min_items_for_valid,
            )
            return None

        # Extract structured items
        items = self._parse_line_items(item_rows)
        logger.debug("TextLineItemsExtractor: extracted %d line items", len(items))
        if len(items) < self.min_items_for_valid:
            return None

        return TextLineItemsResult(
            items=items,
            header_row=[],  # No explicit header in text-based extraction
            extraction_method="text_spatial",
        )

    def _extract_text_elements(
        self, parsing_res_list: list[dict[str, Any]]
    ) -> list[TextElement]:
        """Extract TextElement objects from parsing_res_list.

        Handles both dict entries (keys ``label``, ``bbox``, ``text`` or
        ``content``) and objects exposing ``label``/``bbox``/``content``
        attributes. Entries that are malformed, carry a non-text label, have
        an unusable bbox, or contain no text are skipped and never raise.
        """
        elements = []
        for elem in parsing_res_list:
            try:
                # Get label and bbox - handle both dict and LayoutBlock objects
                if isinstance(elem, dict):
                    label = elem.get("label", "")
                    bbox = elem.get("bbox", [])
                    # Try both 'text' and 'content' keys
                    text = elem.get("text", "") or elem.get("content", "")
                elif hasattr(elem, "label"):
                    label = getattr(elem, "label", "")
                    bbox = getattr(elem, "bbox", [])
                    # LayoutBlock objects use 'content' attribute
                    text = getattr(elem, "content", "") or getattr(elem, "text", "")
                else:
                    # Element is neither dict nor has expected attributes
                    logger.debug(
                        "Skipping element with unexpected type: %s",
                        type(elem).__name__,
                    )
                    continue

                # Only process text elements (skip images, tables, etc.)
                if label not in ("text", "paragraph_title", "aside_text"):
                    continue

                # Validate bbox
                if not self._valid_bbox(bbox):
                    logger.debug("Skipping element with invalid bbox: %s", bbox)
                    continue

                # Clean text
                text = str(text).strip() if text else ""
                if not text:
                    continue

                # float() raises for non-numeric coordinates; the handler
                # below turns that into a skipped element.
                elements.append(
                    TextElement(
                        text=text,
                        bbox=(
                            float(bbox[0]),
                            float(bbox[1]),
                            float(bbox[2]),
                            float(bbox[3]),
                        ),
                    )
                )
            except (KeyError, TypeError, ValueError, AttributeError) as e:
                # Expected format issues - log at debug level
                logger.debug("Skipping element due to format issue: %s", e)
                continue
            except Exception as e:
                # Unexpected errors - log at warning level for visibility
                logger.warning(
                    "Unexpected error parsing element: %s: %s",
                    type(e).__name__,
                    e,
                )
                continue
        return elements

    def _valid_bbox(self, bbox: Any) -> bool:
        """Check if bbox is valid (a sized object with at least 4 entries).

        Only the length is checked here; whether the entries are numeric is
        verified by the caller's float() conversion.
        """
        try:
            return len(bbox) >= 4 if hasattr(bbox, "__len__") else False
        except (TypeError, ValueError):
            return False

    def _group_by_row(
        self, elements: list[TextElement]
    ) -> list[list[TextElement]]:
        """
        Group text elements into rows based on vertical position.

        Elements are visited in order of increasing ``center_y``. An element
        joins the current row when its ``center_y`` lies within
        ``row_tolerance`` of the average ``center_y`` of the elements already
        in that row; otherwise it starts a new row. Each returned row is
        sorted left to right by ``center_x``.

        Returns:
            Rows ordered top to bottom; an empty list for empty input.
        """
        if not elements:
            return []

        # Sort by vertical position
        sorted_elements = sorted(elements, key=lambda e: e.center_y)

        rows: list[list[TextElement]] = []
        current_row: list[TextElement] = [sorted_elements[0]]
        # Running sum of center_y for the current row, so the average does
        # not have to be recomputed from scratch for every element.
        row_sum = sorted_elements[0].center_y

        for elem in sorted_elements[1:]:
            center_y = elem.center_y
            if abs(center_y - row_sum / len(current_row)) <= self.row_tolerance:
                # Same row
                current_row.append(elem)
                row_sum += center_y
            else:
                # New row - finalize current row, sorted left to right
                current_row.sort(key=lambda e: e.center_x)
                rows.append(current_row)
                current_row = [elem]
                row_sum = center_y

        # Don't forget last row
        current_row.sort(key=lambda e: e.center_x)
        rows.append(current_row)
        return rows

    def _identify_line_item_rows(
        self, rows: list[list[TextElement]]
    ) -> list[list[TextElement]]:
        """
        Identify which rows are likely line items.

        A row is kept when its lowercased text contains none of
        SUMMARY_KEYWORDS and none of LINE_ITEM_KEYWORDS (i.e. it is neither a
        summary row nor a header row) and ``_looks_like_line_item`` accepts
        it. Rows are judged individually; there is no gating on a row being
        located between a header row and a summary row.
        """
        item_rows = []
        for row in rows:
            row_text = " ".join(e.text for e in row).lower()

            # Summary rows (totals, payment details) are never line items
            if any(kw in row_text for kw in SUMMARY_KEYWORDS):
                continue

            # Header rows are skipped as well
            if any(kw in row_text for kw in LINE_ITEM_KEYWORDS):
                continue

            if self._looks_like_line_item(row):
                item_rows.append(row)
        return item_rows

    def _looks_like_line_item(self, row: list[TextElement]) -> bool:
        """Check if a row looks like a line item.

        Requires at least two elements, at least one amount-like value, and
        at least one element longer than three characters that is not itself
        a pure amount (the description).
        """
        if len(row) < 2:
            return False

        # Must have at least one amount
        if not any(AMOUNT_PATTERN.search(e.text) for e in row):
            return False

        # Should have some description text (not just numbers)
        return any(
            len(e.text) > 3 and not AMOUNT_PATTERN.fullmatch(e.text.strip())
            for e in row
        )

    def _parse_line_items(
        self, item_rows: list[list[TextElement]]
    ) -> list[TextLineItem]:
        """Parse line item rows into structured items.

        ``row_index`` of each item is the row's position in ``item_rows``;
        rows that yield no item are dropped without renumbering.
        """
        items = []
        for idx, row in enumerate(item_rows):
            item = self._parse_single_row(row, idx)
            if item is not None:
                items.append(item)
        return items

    def _parse_single_row(
        self, row: list[TextElement], row_index: int
    ) -> TextLineItem | None:
        """Parse a single row into a line item.

        Args:
            row: Elements of one row, in left-to-right order.
            row_index: Index to store on the resulting item.

        Returns:
            A TextLineItem, or None if the row is empty or contains no
            amount-like value.
        """
        if not row:
            return None

        texts = [elem.text.strip() for elem in row]

        # Search amounts element by element rather than in the joined row
        # text: joining with spaces lets the space-as-thousands-separator
        # format fuse neighbouring columns (quantity "10" followed by price
        # "950,00" would read as the single amount "10 950,00").
        amounts = [
            match.group(0).strip()
            for text in texts
            for match in AMOUNT_PATTERN.finditer(text)
        ]
        if not amounts:
            return None

        # Last amount is typically line total
        amount = amounts[-1]
        # Second to last might be unit price
        unit_price = amounts[-2] if len(amounts) >= 2 else None

        # Quantity: first element that is entirely a number (+ optional unit)
        quantity = next(
            (text for text in texts if QUANTITY_PATTERN.match(text)), None
        )

        # VAT rate is searched in the joined text so that a "%" recognized as
        # a separate element still pairs with the number before it.
        all_text = " ".join(e.text for e in row)
        vat_match = VAT_RATE_PATTERN.search(all_text)
        vat_rate = vat_match.group(1) if vat_match else None

        # Description is typically the longest non-numeric text; on ties the
        # leftmost candidate wins.
        description = max(
            (
                text
                for text in texts
                if text
                and not AMOUNT_PATTERN.fullmatch(text)
                and not QUANTITY_PATTERN.match(text)
            ),
            key=len,
            default=None,
        )

        return TextLineItem(
            row_index=row_index,
            description=description,
            quantity=quantity,
            unit_price=unit_price,
            amount=amount,
            vat_rate=vat_rate,
            confidence=0.7,
        )
def convert_text_line_item(item: TextLineItem) -> "LineItem":
    """Build a standard ``LineItem`` carrying the same field values as *item*.

    Args:
        item: Line item produced by the text-based extractor.

    Returns:
        A ``LineItem`` whose fields are copied one-to-one from *item*.
    """
    # Imported at call time rather than module level
    # (presumably to avoid a circular import - confirm).
    from .line_items_extractor import LineItem

    copied_fields = (
        "row_index",
        "description",
        "quantity",
        "unit",
        "unit_price",
        "amount",
        "article_number",
        "vat_rate",
        "is_deduction",
        "confidence",
    )
    return LineItem(**{name: getattr(item, name) for name in copied_fields})