# Change summary:
# - Extract models.py (LineItem, LineItemsResult dataclasses)
# - Extract html_table_parser.py (ColumnMapper, HtmlTableParser)
# - Extract merged_cell_handler.py (MergedCellHandler for PP-StructureV3 merged cells)
# - Reduce line_items_extractor.py from 971 to 396 lines
# - Add constants for magic numbers (MIN_AMOUNT_THRESHOLD, ROW_GROUPING_THRESHOLD, etc.)
# - Fix row grouping algorithm in text_line_items_extractor.py
# - Demote INFO logs to DEBUG level in structure_detector.py
# - Add 209 tests achieving 85%+ coverage on main modules
# Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
#
# File info: 476 lines, 15 KiB, Python
"""
Text-Based Line Items Extractor

Fallback extraction for invoices where PP-StructureV3 cannot detect table structures
(e.g., borderless/wireless tables). Uses spatial analysis of OCR text elements to
identify and group line items.
"""
|
|
|
|
from dataclasses import dataclass, field
|
|
from decimal import Decimal, InvalidOperation
|
|
import re
|
|
from typing import Any
|
|
|
|
import logging
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
# Configuration constants

# Max vertical distance (pixels) to consider two elements part of the same row.
DEFAULT_ROW_TOLERANCE = 15.0

# Minimum number of items required for an extraction to count as valid.
MIN_ITEMS_FOR_VALID_EXTRACTION = 2

# Minimum number of text elements needed before extraction is attempted.
MIN_TEXT_ELEMENTS_FOR_EXTRACTION = 5
|
|
|
|
|
|
@dataclass
class TextElement:
    """A single piece of OCR text together with its bounding box."""

    # Recognised text content.
    text: str
    # Bounding box as (x1, y1, x2, y2).
    bbox: tuple[float, float, float, float]
    # Confidence value attached to the element; defaults to 1.0.
    confidence: float = 1.0

    @property
    def center_y(self) -> float:
        """Midpoint between the y1 and y2 coordinates of the box."""
        y1, y2 = self.bbox[1], self.bbox[3]
        return (y1 + y2) / 2

    @property
    def center_x(self) -> float:
        """Midpoint between the x1 and x2 coordinates of the box."""
        x1, x2 = self.bbox[0], self.bbox[2]
        return (x1 + x2) / 2

    @property
    def height(self) -> float:
        """Vertical extent of the box (y2 minus y1)."""
        y1, y2 = self.bbox[1], self.bbox[3]
        return y2 - y1
|
|
|
|
|
|
@dataclass
|
|
class TextLineItem:
|
|
"""Line item extracted from text elements."""
|
|
|
|
row_index: int
|
|
description: str | None = None
|
|
quantity: str | None = None
|
|
unit: str | None = None
|
|
unit_price: str | None = None
|
|
amount: str | None = None
|
|
article_number: str | None = None
|
|
vat_rate: str | None = None
|
|
is_deduction: bool = False # True if this row is a deduction/discount
|
|
confidence: float = 0.7 # Lower default confidence for text-based extraction
|
|
|
|
|
|
@dataclass
class TextLineItemsResult:
    """Container for the outcome of a text-based line item extraction."""

    # Extracted line items.
    items: list[TextLineItem]
    # Header cell texts.
    header_row: list[str]
    # Label of the extraction strategy that produced this result.
    extraction_method: str = "text_spatial"
|
|
|
|
|
|
# Amount pattern matches Swedish, US, and simple numeric formats
# Handles: "1 234,56", "1,234.56", "1234.56", "100 kr", "50:-", "-100,00"
# Does NOT handle: amounts with more than 2 decimal places, scientific notation
#
# The lookarounds keep a match from starting or stopping in the middle of a
# number: a match may not be preceded by a digit (or by a digit plus "." / ","),
# and may not be followed by a digit (or by "." / "," plus a digit). Without the
# separator-aware parts, searching "1,234.56" stopped after the leading "1" and
# split the number into "1", "234" and "56". fullmatch() results are unaffected
# because the lookarounds always succeed at the string boundaries.
# See tests in test_text_line_items_extractor.py::TestAmountPattern
AMOUNT_PATTERN = re.compile(
    r"(?<![0-9])(?<![0-9][.,])(?:"
    r"-?\d{1,3}(?:\s\d{3})*(?:,\d{2})?"  # Swedish: 1 234,56
    r"|-?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"  # US: 1,234.56
    r"|-?\d+(?:[.,]\d{2})?"  # Simple: 1234,56 or 1234.56
    r")(?:\s*(?:kr|SEK|:-))?"  # Optional currency suffix
    r"(?![0-9]|[.,][0-9])"
)
|
|
|
|
# Quantity pattern: a whole string consisting of a number (optionally with a
# "." or "," fraction) followed by an optional unit, matched case-insensitively.
QUANTITY_PATTERN = re.compile(
    r"^(?:\d+(?:[.,]\d+)?\s*(?:st|pcs|m|kg|l|h|tim|timmar)?)$",
    flags=re.IGNORECASE,
)

# VAT rate pattern: digits followed by optional whitespace and a percent sign;
# group 1 captures the digits.
VAT_RATE_PATTERN = re.compile(r"(\d+)\s*%")
|
|
|
|
# Keywords indicating a line item area (compared against lowercased row text).
LINE_ITEM_KEYWORDS = [
    "beskrivning", "artikel", "produkt",
    "belopp", "summa",
    "antal",
    "pris", "á-pris", "a-pris",
    "moms",
]

# Keywords indicating NOT line items (summary area).
SUMMARY_KEYWORDS = [
    "att betala", "total", "summa att betala",
    "betalningsvillkor", "förfallodatum",
    "bankgiro", "plusgiro", "ocr-nummer",
    "fakturabelopp",
    "exkl. moms", "inkl. moms", "varav moms",
]
|
|
|
|
|
|
class TextLineItemsExtractor:
    """
    Extract line items from text elements using spatial analysis.

    This is a fallback for when PP-StructureV3 cannot detect table structures.
    It groups text elements by vertical position and identifies patterns
    that match line item rows.
    """

    # Layout labels whose content is treated as text; elements carrying any
    # other label (images, tables, etc.) are ignored.
    _TEXT_LABELS = ("text", "paragraph_title", "aside_text")

    def __init__(
        self,
        row_tolerance: float = DEFAULT_ROW_TOLERANCE,
        min_items_for_valid: int = MIN_ITEMS_FOR_VALID_EXTRACTION,
    ):
        """
        Initialize extractor.

        Args:
            row_tolerance: Maximum vertical distance (pixels) between elements
                to consider them on the same row. Default: 15.0
            min_items_for_valid: Minimum number of line items required for
                extraction to be considered successful. Default: 2
        """
        self.row_tolerance = row_tolerance
        self.min_items_for_valid = min_items_for_valid

    def extract_from_parsing_res(
        self, parsing_res_list: list[dict[str, Any]]
    ) -> TextLineItemsResult | None:
        """
        Extract line items from PP-StructureV3 parsing_res_list.

        Args:
            parsing_res_list: List of parsed elements from PP-StructureV3.

        Returns:
            TextLineItemsResult if line items found, None otherwise. None is
            also returned when the input is empty or yields fewer than
            MIN_TEXT_ELEMENTS_FOR_EXTRACTION usable text elements.
        """
        if not parsing_res_list:
            logger.debug("No parsing_res_list provided")
            return None

        # Extract text elements from parsing results
        text_elements = self._extract_text_elements(parsing_res_list)
        logger.debug(
            "TextLineItemsExtractor: found %d text elements", len(text_elements)
        )

        if len(text_elements) < MIN_TEXT_ELEMENTS_FOR_EXTRACTION:
            logger.debug(
                "Too few text elements (%d) for line item extraction, "
                "need at least %d",
                len(text_elements),
                MIN_TEXT_ELEMENTS_FOR_EXTRACTION,
            )
            return None

        return self.extract_from_text_elements(text_elements)

    def extract_from_text_elements(
        self, text_elements: list[TextElement]
    ) -> TextLineItemsResult | None:
        """
        Extract line items from a list of text elements.

        Args:
            text_elements: List of TextElement objects.

        Returns:
            TextLineItemsResult if line items found, None otherwise. The
            result is None when fewer than ``min_items_for_valid`` candidate
            rows are identified or fewer than that many rows parse into items.
        """
        # Group elements by row
        rows = self._group_by_row(text_elements)
        logger.debug("TextLineItemsExtractor: grouped into %d rows", len(rows))

        # Find the line items section
        item_rows = self._identify_line_item_rows(rows)
        logger.debug(
            "TextLineItemsExtractor: identified %d potential item rows",
            len(item_rows),
        )

        if len(item_rows) < self.min_items_for_valid:
            logger.debug(
                "Found only %d item rows, need at least %s",
                len(item_rows),
                self.min_items_for_valid,
            )
            return None

        # Extract structured items
        items = self._parse_line_items(item_rows)
        logger.debug("TextLineItemsExtractor: extracted %d line items", len(items))

        if len(items) < self.min_items_for_valid:
            return None

        return TextLineItemsResult(
            items=items,
            header_row=[],  # No explicit header in text-based extraction
            extraction_method="text_spatial",
        )

    def _extract_text_elements(
        self, parsing_res_list: list[dict[str, Any]]
    ) -> list[TextElement]:
        """Extract TextElement objects from parsing_res_list.

        Handles both dict entries and objects exposing a ``label`` attribute
        (LayoutBlock-style objects from PP-StructureV3). Entries of any other
        shape, entries with a non-text label, an invalid bbox or empty text are
        skipped. Per-element failures are logged and never abort the loop.
        """
        elements = []

        for elem in parsing_res_list:
            try:
                # Get label and bbox - handle both dict and LayoutBlock objects
                if isinstance(elem, dict):
                    label = elem.get("label", "")
                    bbox = elem.get("bbox", [])
                    # Try both 'text' and 'content' keys
                    text = elem.get("text", "") or elem.get("content", "")
                elif hasattr(elem, "label"):
                    label = getattr(elem, "label", "")
                    bbox = getattr(elem, "bbox", [])
                    # LayoutBlock objects use 'content' attribute
                    text = getattr(elem, "content", "") or getattr(elem, "text", "")
                else:
                    # Element is neither dict nor has expected attributes
                    logger.debug(
                        "Skipping element with unexpected type: %s",
                        type(elem).__name__,
                    )
                    continue

                # Only process text elements (skip images, tables, etc.)
                if label not in self._TEXT_LABELS:
                    continue

                # Validate bbox
                if not self._valid_bbox(bbox):
                    logger.debug("Skipping element with invalid bbox: %s", bbox)
                    continue

                # Clean text
                text = str(text).strip() if text else ""
                if not text:
                    continue

                # float() may raise for non-numeric coordinates; that is
                # handled by the except clauses below.
                elements.append(
                    TextElement(
                        text=text,
                        bbox=(
                            float(bbox[0]),
                            float(bbox[1]),
                            float(bbox[2]),
                            float(bbox[3]),
                        ),
                    )
                )
            except (KeyError, TypeError, ValueError, AttributeError) as e:
                # Expected format issues - log at debug level
                logger.debug("Skipping element due to format issue: %s", e)
                continue
            except Exception as e:
                # Unexpected errors - log at warning level for visibility.
                # Deliberately best-effort: one bad element must not abort
                # extraction of the remaining ones.
                logger.warning(
                    "Unexpected error parsing element: %s: %s",
                    type(e).__name__,
                    e,
                )
                continue

        return elements

    def _valid_bbox(self, bbox: Any) -> bool:
        """Check if bbox is valid (a sized object with at least 4 elements)."""
        try:
            return len(bbox) >= 4 if hasattr(bbox, "__len__") else False
        except (TypeError, ValueError):
            return False

    def _group_by_row(
        self, elements: list[TextElement]
    ) -> list[list[TextElement]]:
        """
        Group text elements into rows based on vertical position.

        Elements are visited in order of increasing ``center_y``. An element
        joins the current row when its ``center_y`` lies within
        ``row_tolerance`` of the mean ``center_y`` of the elements already in
        that row; otherwise it starts a new row. Each finished row is sorted
        left to right by ``center_x``.
        """
        if not elements:
            return []

        # Sort by vertical position
        sorted_elements = sorted(elements, key=lambda e: e.center_y)

        rows: list[list[TextElement]] = []
        current_row: list[TextElement] = [sorted_elements[0]]
        # Running sum of center_y for the current row, so the row mean does
        # not have to be recomputed from scratch for every element.
        current_sum = sorted_elements[0].center_y

        for elem in sorted_elements[1:]:
            avg_center_y = current_sum / len(current_row)

            if abs(elem.center_y - avg_center_y) <= self.row_tolerance:
                # Same row - the mean picks up this element on the next pass
                current_row.append(elem)
                current_sum += elem.center_y
            else:
                # New row - finalize current row, ordered left to right
                current_row.sort(key=lambda e: e.center_x)
                rows.append(current_row)
                current_row = [elem]
                current_sum = elem.center_y

        # Don't forget last row
        current_row.sort(key=lambda e: e.center_x)
        rows.append(current_row)

        return rows

    def _identify_line_item_rows(
        self, rows: list[list[TextElement]]
    ) -> list[list[TextElement]]:
        """
        Identify which rows are likely line items.

        A row is skipped when its lowercased, space-joined text contains any
        SUMMARY_KEYWORDS entry (checked first) or any LINE_ITEM_KEYWORDS entry
        (header row). Every remaining row is kept if ``_looks_like_line_item``
        accepts it.
        """
        item_rows = []

        for row in rows:
            row_text = " ".join(e.text for e in row).lower()

            # Summary rows (totals, payment details) are never line items
            if any(kw in row_text for kw in SUMMARY_KEYWORDS):
                continue

            # Header rows name the columns; skip the header itself
            if any(kw in row_text for kw in LINE_ITEM_KEYWORDS):
                continue

            if self._looks_like_line_item(row):
                item_rows.append(row)

        return item_rows

    def _looks_like_line_item(self, row: list[TextElement]) -> bool:
        """Check if a row looks like a line item.

        Requires at least two elements, at least one amount-like value, and at
        least one element longer than three characters that is not itself an
        amount (the description).
        """
        if len(row) < 2:
            return False

        # Must have at least one amount
        if not any(AMOUNT_PATTERN.search(e.text) for e in row):
            return False

        # Should have some description text (not just numbers)
        return any(
            len(e.text) > 3 and not AMOUNT_PATTERN.fullmatch(e.text.strip())
            for e in row
        )

    def _parse_line_items(
        self, item_rows: list[list[TextElement]]
    ) -> list[TextLineItem]:
        """Parse line item rows into structured items.

        ``row_index`` is the position of the row within ``item_rows``; rows
        that cannot be parsed are dropped.
        """
        parsed = (
            self._parse_single_row(row, idx) for idx, row in enumerate(item_rows)
        )
        return [item for item in parsed if item is not None]

    def _find_amounts(self, row: list[TextElement]) -> list[str]:
        """Collect amount-like strings from a row, left to right.

        Each element is scanned on its own. Scanning the space-joined row text
        would let the thousands-separator rule fuse neighbouring cells, e.g.
        quantity "5" next to price "100,00" read as "5 100,00". Numbers
        followed by "%" within the same element are rates, not amounts, and
        are skipped.
        """
        found: list[str] = []
        for elem in row:
            text = elem.text
            for match in AMOUNT_PATTERN.finditer(text):
                if text[match.end():].lstrip().startswith("%"):
                    continue
                found.append(match.group(0).strip())
        return found

    def _parse_single_row(
        self, row: list[TextElement], row_index: int
    ) -> TextLineItem | None:
        """Parse a single row into a line item.

        Args:
            row: Elements of one row, ordered left to right.
            row_index: Index stored on the resulting item.

        Returns:
            A TextLineItem, or None when the row is empty or holds no amount.
        """
        if not row:
            return None

        # Find amounts (rightmost is usually the total)
        amounts = self._find_amounts(row)
        if not amounts:
            return None

        # Last amount is typically line total; second to last might be unit price
        amount = amounts[-1]
        unit_price = amounts[-2] if len(amounts) >= 2 else None

        cell_texts = [elem.text.strip() for elem in row]

        # Quantity is the first cell that is entirely a number with optional unit
        quantity = next(
            (text for text in cell_texts if QUANTITY_PATTERN.match(text)), None
        )

        # VAT rate is searched in the joined text so that a number and a "%"
        # sign split across neighbouring elements are still recognised.
        vat_match = VAT_RATE_PATTERN.search(" ".join(e.text for e in row))
        vat_rate = vat_match.group(1) if vat_match else None

        # Description is typically the longest non-numeric text
        description = None
        max_len = 0
        for text in cell_texts:
            # Skip if it looks like a number/amount or a quantity
            if AMOUNT_PATTERN.fullmatch(text) or QUANTITY_PATTERN.match(text):
                continue
            if len(text) > max_len:
                description = text
                max_len = len(text)

        return TextLineItem(
            row_index=row_index,
            description=description,
            quantity=quantity,
            unit_price=unit_price,
            amount=amount,
            vat_rate=vat_rate,
            confidence=0.7,
        )
|
|
|
|
|
|
def convert_text_line_item(item: TextLineItem) -> "LineItem":
    """Build a standard LineItem carrying the same field values as ``item``.

    LineItem is imported inside the function body rather than at module
    level. Each listed attribute is read from ``item`` and passed to the
    LineItem constructor as the keyword argument of the same name.
    """
    from .line_items_extractor import LineItem

    copied_fields = (
        "row_index",
        "description",
        "quantity",
        "unit",
        "unit_price",
        "amount",
        "article_number",
        "vat_rate",
        "is_deduction",
        "confidence",
    )
    kwargs = {name: getattr(item, name) for name in copied_fields}
    return LineItem(**kwargs)
|