Update paddle, and support invoice line item
This commit is contained in:
449
packages/backend/backend/table/text_line_items_extractor.py
Normal file
449
packages/backend/backend/table/text_line_items_extractor.py
Normal file
@@ -0,0 +1,449 @@
|
||||
"""
|
||||
Text-Based Line Items Extractor
|
||||
|
||||
Fallback extraction for invoices where PP-StructureV3 cannot detect table structures
|
||||
(e.g., borderless/wireless tables). Uses spatial analysis of OCR text elements to
|
||||
identify and group line items.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from decimal import Decimal, InvalidOperation
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class TextElement:
    """One text fragment produced by OCR, together with its bounding box.

    Attributes:
        text: Text content of the element.
        bbox: Bounding box as ``(x1, y1, x2, y2)``.
        confidence: Confidence value attached to the element (defaults to 1.0).
    """

    text: str
    bbox: tuple[float, float, float, float]  # x1, y1, x2, y2
    confidence: float = 1.0

    @property
    def center_x(self) -> float:
        """Midpoint between the box's x1 and x2 coordinates."""
        left, right = self.bbox[0], self.bbox[2]
        return (left + right) / 2

    @property
    def center_y(self) -> float:
        """Midpoint between the box's y1 and y2 coordinates."""
        top, bottom = self.bbox[1], self.bbox[3]
        return (top + bottom) / 2

    @property
    def height(self) -> float:
        """Vertical extent of the box (y2 minus y1)."""
        top, bottom = self.bbox[1], self.bbox[3]
        return bottom - top
|
||||
|
||||
|
||||
@dataclass
|
||||
class TextLineItem:
|
||||
"""Line item extracted from text elements."""
|
||||
|
||||
row_index: int
|
||||
description: str | None = None
|
||||
quantity: str | None = None
|
||||
unit: str | None = None
|
||||
unit_price: str | None = None
|
||||
amount: str | None = None
|
||||
article_number: str | None = None
|
||||
vat_rate: str | None = None
|
||||
is_deduction: bool = False # True if this row is a deduction/discount
|
||||
confidence: float = 0.7 # Lower default confidence for text-based extraction
|
||||
|
||||
|
||||
@dataclass
class TextLineItemsResult:
    """Outcome of a text-based line items extraction run.

    Attributes:
        items: Extracted line items.
        header_row: Header cell texts.
        extraction_method: Label for the technique that produced the result;
            defaults to ``"text_spatial"``.
    """

    items: list[TextLineItem]
    header_row: list[str]
    extraction_method: str = "text_spatial"
|
||||
|
||||
|
||||
# Amount pattern covering Swedish (1 234,56), US (1,234.56) and plain
# (1234,56 / 1234.56) notations, optionally followed by a currency suffix.
#
# The alternatives are tried in order, so when scanning free text the first
# branch must not be allowed to succeed on just the integer part of a number
# written in another notation. The trailing lookahead therefore rejects a match
# that ends right before a digit OR before a "." / "," that is itself followed
# by a digit; this forces the engine on to the branch that consumes the whole
# number (e.g. "100.50" is one match, not "100" and "50").
AMOUNT_PATTERN = re.compile(
    r"(?<![0-9])(?:"
    r"-?\d{1,3}(?:\s\d{3})*(?:,\d{2})?"  # Swedish: 1 234,56
    r"|-?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"  # US: 1,234.56
    r"|-?\d+(?:[.,]\d{2})?"  # Simple: 1234,56 or 1234.56
    r")(?:\s*(?:kr|SEK|:-))?"  # Optional currency suffix
    r"(?![.,]?\d)"  # Do not stop in the middle of a number
)
|
||||
|
||||
# Whole-string quantity: a number (optional decimal part with "." or ",")
# optionally followed by a unit abbreviation; matched case-insensitively.
QUANTITY_PATTERN = re.compile(
    r"^(?:\d+(?:[.,]\d+)?\s*(?:st|pcs|m|kg|l|h|tim|timmar)?)$",
    flags=re.I,
)

# VAT rate: digits followed by a percent sign; group 1 holds the digits.
VAT_RATE_PATTERN = re.compile(r"(\d+)\s*%")
|
||||
|
||||
# Lower-case keywords whose presence marks a row as belonging to the
# line item area (column headings).
LINE_ITEM_KEYWORDS = [
    "beskrivning", "artikel", "produkt",
    "belopp", "summa",
    "antal",
    "pris", "á-pris", "a-pris",
    "moms",
]

# Lower-case keywords whose presence marks a row as NOT a line item
# (summary / payment information area).
SUMMARY_KEYWORDS = [
    "att betala", "total", "summa att betala",
    "betalningsvillkor", "förfallodatum",
    "bankgiro", "plusgiro", "ocr-nummer",
    "fakturabelopp",
    "exkl. moms", "inkl. moms", "varav moms",
]
|
||||
|
||||
|
||||
class TextLineItemsExtractor:
    """
    Extract line items from text elements using spatial analysis.

    This is a fallback for when PP-StructureV3 cannot detect table structures.
    It groups text elements by vertical position and identifies patterns
    that match line item rows.
    """

    def __init__(
        self,
        row_tolerance: float = 15.0,  # Max vertical distance to consider same row
        min_items_for_valid: int = 2,  # Minimum items to consider extraction valid
    ):
        """
        Initialize extractor.

        Args:
            row_tolerance: Maximum vertical distance (pixels) between elements
                to consider them on the same row.
            min_items_for_valid: Minimum number of line items required for
                extraction to be considered successful.
        """
        self.row_tolerance = row_tolerance
        self.min_items_for_valid = min_items_for_valid

    def extract_from_parsing_res(
        self, parsing_res_list: list[dict[str, Any]]
    ) -> TextLineItemsResult | None:
        """
        Extract line items from PP-StructureV3 parsing_res_list.

        Args:
            parsing_res_list: List of parsed elements from PP-StructureV3.

        Returns:
            TextLineItemsResult if line items found, None otherwise.
        """
        if not parsing_res_list:
            logger.debug("No parsing_res_list provided")
            return None

        # Extract text elements from parsing results
        text_elements = self._extract_text_elements(parsing_res_list)
        logger.info(
            "TextLineItemsExtractor: found %d text elements", len(text_elements)
        )

        if len(text_elements) < 5:  # Need at least a few elements
            logger.debug("Too few text elements for line item extraction")
            return None

        return self.extract_from_text_elements(text_elements)

    def extract_from_text_elements(
        self, text_elements: list[TextElement]
    ) -> TextLineItemsResult | None:
        """
        Extract line items from a list of text elements.

        Args:
            text_elements: List of TextElement objects.

        Returns:
            TextLineItemsResult if at least ``min_items_for_valid`` line items
            were found, None otherwise.
        """
        # Group elements by row
        rows = self._group_by_row(text_elements)
        logger.info("TextLineItemsExtractor: grouped into %d rows", len(rows))

        # Find the line items section
        item_rows = self._identify_line_item_rows(rows)
        logger.info(
            "TextLineItemsExtractor: identified %d potential item rows",
            len(item_rows),
        )

        if len(item_rows) < self.min_items_for_valid:
            logger.debug(
                "Found only %d item rows, need at least %d",
                len(item_rows),
                self.min_items_for_valid,
            )
            return None

        # Extract structured items
        items = self._parse_line_items(item_rows)
        logger.info("TextLineItemsExtractor: extracted %d line items", len(items))

        # Parsing can drop rows, so the threshold is checked again.
        if len(items) < self.min_items_for_valid:
            return None

        return TextLineItemsResult(
            items=items,
            header_row=[],  # No explicit header in text-based extraction
            extraction_method="text_spatial",
        )

    def _extract_text_elements(
        self, parsing_res_list: list[dict[str, Any]]
    ) -> list[TextElement]:
        """
        Extract TextElement objects from parsing_res_list.

        Entries may be dicts or objects exposing ``label``, ``bbox`` and
        ``content``/``text`` attributes. Only entries labelled "text",
        "paragraph_title" or "aside_text" with a usable bbox and non-empty
        text are kept; anything that fails to parse is logged and skipped.
        """
        elements = []

        for elem in parsing_res_list:
            try:
                # Get label and bbox - handle both dict and LayoutBlock objects
                if isinstance(elem, dict):
                    label = elem.get("label", "")
                    bbox = elem.get("bbox", [])
                    # Try both 'text' and 'content' keys
                    text = elem.get("text", "") or elem.get("content", "")
                else:
                    label = getattr(elem, "label", "")
                    bbox = getattr(elem, "bbox", [])
                    # LayoutBlock objects use 'content' attribute
                    text = getattr(elem, "content", "") or getattr(elem, "text", "")

                # Only process text elements (skip images, tables, etc.)
                if label not in ("text", "paragraph_title", "aside_text"):
                    continue

                # Validate bbox
                if not self._valid_bbox(bbox):
                    continue

                # Clean text
                text = str(text).strip() if text else ""
                if not text:
                    continue

                elements.append(
                    TextElement(
                        text=text,
                        bbox=(
                            float(bbox[0]),
                            float(bbox[1]),
                            float(bbox[2]),
                            float(bbox[3]),
                        ),
                    )
                )
            except Exception as e:
                # Best-effort: one malformed entry must not abort the whole scan.
                logger.debug("Failed to parse element: %s", e)
                continue

        return elements

    def _valid_bbox(self, bbox: Any) -> bool:
        """Check if bbox is valid (is sized and has at least 4 elements)."""
        try:
            return len(bbox) >= 4 if hasattr(bbox, "__len__") else False
        except (TypeError, ValueError):
            return False

    def _group_by_row(
        self, elements: list[TextElement]
    ) -> list[list[TextElement]]:
        """
        Group text elements into rows based on vertical position.

        Elements are visited in order of ``center_y``. An element joins the
        current row when its ``center_y`` is within ``row_tolerance`` of the
        row's first element; otherwise it starts a new row. Each returned row
        is sorted left to right by ``center_x``.
        """
        if not elements:
            return []

        # Sort by vertical position
        sorted_elements = sorted(elements, key=lambda e: e.center_y)

        rows = []
        current_row = [sorted_elements[0]]
        # The first element of a row is the reference for the tolerance check.
        current_y = sorted_elements[0].center_y

        for elem in sorted_elements[1:]:
            if abs(elem.center_y - current_y) <= self.row_tolerance:
                # Same row
                current_row.append(elem)
            else:
                # New row: finish the current one, sorted by horizontal position
                current_row.sort(key=lambda e: e.center_x)
                rows.append(current_row)
                current_row = [elem]
                current_y = elem.center_y

        # Don't forget last row
        current_row.sort(key=lambda e: e.center_x)
        rows.append(current_row)

        return rows

    def _identify_line_item_rows(
        self, rows: list[list[TextElement]]
    ) -> list[list[TextElement]]:
        """
        Identify which rows are likely line items.

        A row is kept when it contains no summary keyword, no header keyword,
        and passes ``_looks_like_line_item``. Keyword checks are substring
        tests on the lower-cased, space-joined row text.
        """
        item_rows = []

        for row in rows:
            row_text = " ".join(e.text for e in row).lower()

            # Summary rows (totals, payment details) are never line items
            if any(kw in row_text for kw in SUMMARY_KEYWORDS):
                continue

            # Header rows are skipped themselves
            # NOTE(review): substring matching also skips item rows whose
            # description happens to contain a keyword (e.g. "pris") - confirm
            # whether that is intended.
            if any(kw in row_text for kw in LINE_ITEM_KEYWORDS):
                continue

            if self._looks_like_line_item(row):
                item_rows.append(row)

        return item_rows

    def _find_amounts(self, row: list[TextElement]) -> list[str]:
        """
        Collect amount strings from a row, element by element, left to right.

        Each element is scanned on its own so that neighbouring cells such as
        "2" and "100,00" are not fused into "2 100,00" by the space-separated
        thousands notation. Numbers directly followed by "%" are rates (e.g.
        VAT), not monetary amounts, and are left out.
        """
        found = []
        for elem in row:
            text = elem.text
            for match in AMOUNT_PATTERN.finditer(text):
                if text[match.end():].lstrip().startswith("%"):
                    continue
                found.append(match.group(0).strip())
        return found

    def _looks_like_line_item(self, row: list[TextElement]) -> bool:
        """
        Check if a row looks like a line item.

        Requires at least two elements, at least one amount, and at least one
        element longer than 3 characters that is not purely an amount.
        """
        if len(row) < 2:
            return False

        # Must have at least one amount
        if not self._find_amounts(row):
            return False

        # Should have some description text (not just numbers)
        return any(
            len(e.text) > 3 and not AMOUNT_PATTERN.fullmatch(e.text.strip())
            for e in row
        )

    def _parse_line_items(
        self, item_rows: list[list[TextElement]]
    ) -> list[TextLineItem]:
        """Parse line item rows into structured items, dropping unparsable rows."""
        parsed = (
            self._parse_single_row(row, idx) for idx, row in enumerate(item_rows)
        )
        return [item for item in parsed if item]

    def _parse_single_row(
        self, row: list[TextElement], row_index: int
    ) -> TextLineItem | None:
        """
        Parse a single row into a line item.

        Args:
            row: Elements of one row, ordered left to right.
            row_index: Index stored on the resulting item.

        Returns:
            A TextLineItem, or None when the row is empty or holds no amount.
        """
        if not row:
            return None

        # Find amounts (rightmost is usually the total)
        amounts = self._find_amounts(row)
        if not amounts:
            return None

        # Last amount is typically line total
        amount = amounts[-1]

        # Second to last might be unit price
        unit_price = amounts[-2] if len(amounts) >= 2 else None

        # Quantity: first element that is entirely a number with optional unit
        quantity = None
        for elem in row:
            text = elem.text.strip()
            if QUANTITY_PATTERN.match(text):
                quantity = text
                break

        # Look for VAT rate anywhere in the row
        vat_rate = None
        vat_match = VAT_RATE_PATTERN.search(" ".join(e.text for e in row))
        if vat_match:
            vat_rate = vat_match.group(1)

        # Description is typically the longest non-numeric text
        description = None
        max_len = 0
        for elem in row:
            text = elem.text.strip()
            # Skip if it looks like a number/amount
            if AMOUNT_PATTERN.fullmatch(text):
                continue
            if QUANTITY_PATTERN.match(text):
                continue
            if len(text) > max_len:
                description = text
                max_len = len(text)

        return TextLineItem(
            row_index=row_index,
            description=description,
            quantity=quantity,
            unit_price=unit_price,
            amount=amount,
            vat_rate=vat_rate,
            confidence=0.7,
        )
|
||||
|
||||
|
||||
def convert_text_line_item(item: TextLineItem) -> "LineItem":
    """Build a standard LineItem carrying the same field values as ``item``."""
    # Imported inside the function, as in the original module layout.
    from .line_items_extractor import LineItem

    # Fields copied one-to-one, passed to LineItem as keyword arguments.
    copied_fields = (
        "row_index",
        "description",
        "quantity",
        "unit",
        "unit_price",
        "amount",
        "article_number",
        "vat_rate",
        "is_deduction",
        "confidence",
    )
    values = {name: getattr(item, name) for name in copied_fields}
    return LineItem(**values)
|
||||
Reference in New Issue
Block a user