refactor: split line_items_extractor into smaller modules with comprehensive tests

- Extract models.py (LineItem, LineItemsResult dataclasses)
- Extract html_table_parser.py (ColumnMapper, HTMLTableParser)
- Extract merged_cell_handler.py (MergedCellHandler for PP-StructureV3 merged cells)
- Reduce line_items_extractor.py from 971 to 396 lines
- Add constants for magic numbers (MIN_AMOUNT_THRESHOLD, ROW_GROUPING_THRESHOLD, etc.)
- Fix row grouping algorithm in text_line_items_extractor.py
- Demote INFO logs to DEBUG level in structure_detector.py
- Add 209 tests achieving 85%+ coverage on main modules

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Yaojia Wang
2026-02-03 23:02:00 +01:00
parent c2c8f2dd04
commit 8723ef4653
11 changed files with 2230 additions and 841 deletions

View File

@@ -0,0 +1,204 @@
"""
HTML Table Parser
Parses HTML tables into structured data and maps columns to field names.
"""
from html.parser import HTMLParser
import logging
logger = logging.getLogger(__name__)
# Configuration constants

# Minimum pattern length to avoid false positives from short substrings
MIN_PATTERN_MATCH_LENGTH = 3

# Exact match bonus for column mapping priority
EXACT_MATCH_BONUS = 100

# Swedish column name mappings
# Extended to support multiple invoice types: product invoices, rental invoices, utility bills
#
# Every pattern must be a non-empty string: an empty string is a substring of
# every header, so any substring-based keyword check (``pattern in header``)
# would report the owning field as present for all headers.
COLUMN_MAPPINGS: dict[str, list[str]] = {
    "article_number": [
        "art nummer",
        "artikelnummer",
        "artikel",
        "artnr",
        "art.nr",
        "art nr",
        "objektnummer",  # Rental: property reference
        "objekt",
    ],
    "description": [
        "beskrivning",
        "produktbeskrivning",
        "produkt",
        "tjänst",
        "text",
        "benämning",
        "vara/tjänst",
        "vara",
        # Rental invoice specific
        "specifikation",
        "spec",
        "hyresperiod",  # Rental period
        "period",
        "typ",  # Type of charge
        # Utility bills
        "förbrukning",  # Consumption
        "avläsning",  # Meter reading
    ],
    "quantity": ["antal", "qty", "st", "pcs", "kvantitet", "kvm"],
    "unit": ["enhet", "unit"],
    "unit_price": ["á-pris", "a-pris", "pris", "styckpris", "enhetspris", "à pris"],
    "amount": [
        "belopp",
        "summa",
        "total",
        "netto",
        "rad summa",
        # Rental specific
        "hyra",  # Rent
        "avgift",  # Fee
        "kostnad",  # Cost
        "debitering",  # Charge
        "totalt",  # Total
    ],
    "vat_rate": ["moms", "moms%", "vat", "skatt", "moms %"],
    # Additional field for rental: deductions/adjustments
    "deduction": [
        "avdrag",  # Deduction
        "rabatt",  # Discount
        "kredit",  # Credit
    ],
}

# Keywords that indicate NOT a line items table
SUMMARY_KEYWORDS: list[str] = [
    "frakt",
    "faktura.avg",
    "fakturavg",
    "exkl.moms",
    "att betala",
    "öresavr",
    "bankgiro",
    "plusgiro",
    "ocr",
    "forfallodatum",
    "förfallodatum",
]
class _TableHTMLParser(HTMLParser):
"""Internal HTML parser for tables."""
def __init__(self):
super().__init__()
self.rows: list[list[str]] = []
self.current_row: list[str] = []
self.current_cell: str = ""
self.in_td = False
self.in_thead = False
self.header_row: list[str] = []
def handle_starttag(self, tag, attrs):
if tag == "tr":
self.current_row = []
elif tag in ("td", "th"):
self.in_td = True
self.current_cell = ""
elif tag == "thead":
self.in_thead = True
def handle_endtag(self, tag):
if tag in ("td", "th"):
self.in_td = False
self.current_row.append(self.current_cell.strip())
elif tag == "tr":
if self.current_row:
if self.in_thead:
self.header_row = self.current_row
else:
self.rows.append(self.current_row)
elif tag == "thead":
self.in_thead = False
def handle_data(self, data):
if self.in_td:
self.current_cell += data
class HTMLTableParser:
    """Parse HTML tables into structured data."""

    def parse(self, html: str) -> tuple[list[str], list[list[str]]]:
        """
        Parse HTML table and return header and rows.

        Args:
            html: HTML string containing table.

        Returns:
            Tuple of (header_row, data_rows).
        """
        parser = _TableHTMLParser()
        parser.feed(html)
        # feed() may hold back trailing input it cannot interpret yet;
        # close() forces it to be processed as if end-of-file was reached.
        parser.close()
        return parser.header_row, parser.rows
class ColumnMapper:
"""Map column headers to field names."""
def __init__(self, mappings: dict[str, list[str]] | None = None):
"""
Initialize column mapper.
Args:
mappings: Custom column mappings. Uses Swedish defaults if None.
"""
self.mappings = mappings or COLUMN_MAPPINGS
def map(self, headers: list[str]) -> dict[int, str]:
"""
Map column indices to field names.
Args:
headers: List of column header strings.
Returns:
Dictionary mapping column index to field name.
"""
mapping = {}
for idx, header in enumerate(headers):
normalized = self._normalize(header)
if not normalized.strip():
continue
best_match = None
best_match_len = 0
for field_name, patterns in self.mappings.items():
for pattern in patterns:
if pattern == normalized:
# Exact match gets highest priority
best_match = field_name
best_match_len = len(pattern) + EXACT_MATCH_BONUS
break
elif pattern in normalized and len(pattern) > best_match_len:
# Partial match requires minimum length to avoid false positives
if len(pattern) >= MIN_PATTERN_MATCH_LENGTH:
best_match = field_name
best_match_len = len(pattern)
if best_match_len > EXACT_MATCH_BONUS:
# Found exact match, no need to check other fields
break
if best_match:
mapping[idx] = best_match
return mapping
def _normalize(self, header: str) -> str:
"""Normalize header text for matching."""
return header.lower().strip().replace(".", "").replace("-", " ")

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,423 @@
"""
Merged Cell Handler
Handles detection and extraction of data from tables with merged cells,
a common issue with PP-StructureV3 OCR output.
"""
import re
import logging
from typing import TYPE_CHECKING
from .models import LineItem
if TYPE_CHECKING:
from .html_table_parser import ColumnMapper
logger = logging.getLogger(__name__)
# Minimum positive amount to consider as line item (filters noise like row indices)
MIN_AMOUNT_THRESHOLD = 100
class MergedCellHandler:
    """Handles tables with vertically merged cells from PP-StructureV3."""

    # Patterns shared by several methods, compiled once. Each is wrapped in a
    # single capturing group so the same object serves findall() and split().
    # Product number: a standalone run of exactly 7 digits.
    _PRODUCT_NUMBER_RE = re.compile(r"(\b\d{7}\b)")
    # Price with two decimals, optional thousands groups (123,45 or 1 234,56).
    _PRICE_RE = re.compile(r"(\b\d{1,3}(?:\s?\d{3})*[,\.]\d{2}\b)")
    # Quantity units used when detecting / counting merged rows.
    _COUNTED_QUANTITY_RE = re.compile(r"(\b\d+\s*(?:ST|st|PCS|pcs)\b)")
    # Wider unit list used when splitting a quantity column.
    _QUANTITY_RE = re.compile(r"(\b\d+\s*(?:ST|st|PCS|pcs|M|m|KG|kg)\b)")
    # Amount with 3+ integer digits; skips short values such as "10,0".
    _TOTAL_AMOUNT_RE = re.compile(r"\b(\d{3,}[,\.]\d{2})\b")
    # Optionally signed number that may contain whitespace ("8159", "-2 000").
    _SIGNED_AMOUNT_RE = re.compile(r"(-?\d[\d\s]*(?:[,\.]\d+)?)")
    # Object number such as "0218103-1201" and the text following it.
    _OBJECT_NUMBER_RE = re.compile(r"(\d{7}-\d{4})")
    _HEADER_DESCRIPTION_RE = re.compile(
        r"\d{7}-\d{4}\s+(.+?)(?:\s+(?:Hyra|Avdrag|Belopp))", re.IGNORECASE
    )
    _DISCOUNT_KEYWORDS = ("rabatt", "discount")
    _TOTAL_KEYWORDS = ("totalsumma", "total", "summa", "belopp")
    # Space characters used as thousands separators: space, no-break space,
    # thin space, narrow no-break space.
    _THOUSANDS_SPACES = str.maketrans("", "", " \u00a0\u2009\u202f")

    def __init__(self, mapper: "ColumnMapper"):
        """
        Initialize handler.

        Args:
            mapper: ColumnMapper instance for header keyword detection.
        """
        self.mapper = mapper

    def has_vertically_merged_cells(self, rows: list[list[str]]) -> bool:
        """
        Check if table rows contain vertically merged data in single cells.

        PP-StructureV3 sometimes merges multiple table rows into single cells, e.g.:
        ["Produktnr 1457280 1457280 1060381", "", "Antal 6ST 6ST 1ST", "Pris 127,20 127,20 159,20"]

        Detection: a cell of at least 20 characters that contains two or more
        7-digit product numbers, three or more prices, or two or more
        quantities (number + ST/PCS).
        """
        if not rows:
            return False

        for row in rows:
            for cell in row:
                if not cell or len(cell) < 20:
                    continue

                # Check for multiple product numbers (exactly 7 digits each)
                product_nums = self._PRODUCT_NUMBER_RE.findall(cell)
                if len(product_nums) >= 2:
                    logger.debug(f"has_vertically_merged_cells: found {len(product_nums)} product numbers in cell")
                    return True

                # Check for multiple prices (Swedish format: 123,45 or 1 234,56)
                prices = self._PRICE_RE.findall(cell)
                if len(prices) >= 3:
                    logger.debug(f"has_vertically_merged_cells: found {len(prices)} prices in cell")
                    return True

                # Check for multiple quantity patterns (e.g., "6ST 6ST 1ST")
                quantities = self._COUNTED_QUANTITY_RE.findall(cell)
                if len(quantities) >= 2:
                    logger.debug(f"has_vertically_merged_cells: found {len(quantities)} quantities in cell")
                    return True

        return False

    def split_merged_rows(
        self, rows: list[list[str]]
    ) -> tuple[list[str], list[list[str]]]:
        """
        Split vertically merged cells back into separate rows.

        Handles complex cases where PP-StructureV3 merges content across
        multiple HTML rows. For example, 5 line items might be spread across
        3 HTML rows with content mixed together.

        Strategy:
        1. Merge all row content per column
        2. Detect how many actual data rows exist (product numbers / quantities)
        3. Split each column's content into that many lines

        Returns:
            Tuple of (header, data_rows). When nothing can be split (all rows
            empty, or at most one data row detected) the result is
            ``([], rows)`` with the input rows unchanged.
        """
        if not rows:
            return [], []

        # Filter out completely empty rows
        non_empty_rows = [r for r in rows if any(cell.strip() for cell in r)]
        if not non_empty_rows:
            return [], rows

        # Determine column count
        col_count = max(len(r) for r in non_empty_rows)

        # Merge content from all rows for each column
        merged_columns = []
        for col_idx in range(col_count):
            col_content = [
                row[col_idx].strip()
                for row in non_empty_rows
                if col_idx < len(row) and row[col_idx].strip()
            ]
            merged_columns.append(" ".join(col_content))
        logger.debug(f"split_merged_rows: merged columns = {merged_columns}")

        # Count how many actual data rows we should have
        expected_rows = self._count_expected_rows(merged_columns)
        logger.debug(f"split_merged_rows: expecting {expected_rows} data rows")
        if expected_rows <= 1:
            # Not enough data for splitting
            return [], rows

        # Split each column based on expected row count
        split_columns = []
        for col_text in merged_columns:
            if not col_text.strip():
                split_columns.append([""] * (expected_rows + 1))  # +1 for header
                continue
            split_columns.append(self._split_cell_content_for_rows(col_text, expected_rows))

        # Pad so all columns have the same number of lines
        max_lines = max(len(col) for col in split_columns)
        split_columns = [
            col + [""] * (max_lines - len(col))
            for col in split_columns
        ]
        logger.debug(f"split_merged_rows: split into {max_lines} lines total")

        # First line is header, rest are data rows (blank rows are dropped)
        header = [col[0] for col in split_columns]
        data_rows = []
        for line_idx in range(1, max_lines):
            row = [col[line_idx] for col in split_columns]
            if any(cell.strip() for cell in row):
                data_rows.append(row)

        logger.debug(f"split_merged_rows: header={header}, data_rows count={len(data_rows)}")
        return header, data_rows

    def _count_expected_rows(self, merged_columns: list[str]) -> int:
        """
        Count how many data rows should exist based on content patterns.

        Returns the maximum count found in any single column of:
        - Product numbers (7 digits)
        - Quantity patterns (number + ST/PCS)
        """
        max_count = 0
        for col_text in merged_columns:
            if not col_text:
                continue
            # Count product numbers (most reliable indicator)
            product_nums = self._PRODUCT_NUMBER_RE.findall(col_text)
            max_count = max(max_count, len(product_nums))
            # Count quantities (e.g., "6ST 6ST 1ST 1ST 1ST")
            quantities = self._COUNTED_QUANTITY_RE.findall(col_text)
            max_count = max(max_count, len(quantities))
        return max_count

    def _has_discount_and_total(self, cell: str) -> bool:
        """Return True if the cell text mentions both a discount and a total keyword."""
        cell_lower = cell.lower()
        has_discount = any(kw in cell_lower for kw in self._DISCOUNT_KEYWORDS)
        has_total = any(kw in cell_lower for kw in self._TOTAL_KEYWORDS)
        return has_discount and has_total

    def _split_cell_content_for_rows(self, cell: str, expected_rows: int) -> list[str]:
        """
        Split cell content knowing how many data rows we expect.

        This is smarter than split_cell_content because it knows the target count.

        Returns:
            ``[header, value_1, ..., value_n]`` with ``n == expected_rows`` when
            one of the strategies applies, otherwise ``[cell]``.
        """
        cell = cell.strip()

        # Try product number split first
        products = self._PRODUCT_NUMBER_RE.findall(cell)
        if len(products) == expected_rows:
            # split() with a capturing group yields
            # [text, number, text, number, ..., text]
            parts = self._PRODUCT_NUMBER_RE.split(cell)
            header = parts[0].strip() if parts else ""
            values = []
            for prod_num, trailing in zip(parts[1::2], parts[2::2]):
                prod_num = prod_num.strip()
                # Keep description text after the number, truncated at any
                # further 7-digit run embedded in it
                desc = trailing.strip()
                desc_clean = re.split(r"\d{7}", desc)[0].strip() if desc else ""
                values.append(f"{prod_num} {desc_clean}" if desc_clean else prod_num)
            if len(values) == expected_rows:
                return [header] + values

        # Try quantity split
        quantities = self._QUANTITY_RE.findall(cell)
        if len(quantities) == expected_rows:
            parts = self._QUANTITY_RE.split(cell)
            header = parts[0].strip() if parts else ""
            values = [p.strip() for p in parts[1:] if p.strip() and self._QUANTITY_RE.match(p)]
            if len(values) == expected_rows:
                return [header] + values

        # Try amount split for discount+totalsumma columns
        if self._has_discount_and_total(cell):
            # Extract only amounts (3+ digit numbers), skip discount percentages
            amounts = self._TOTAL_AMOUNT_RE.findall(cell)
            if len(amounts) >= expected_rows:
                # Keep the first expected_rows amounts; any extra trailing
                # amounts are ignored
                return ["Totalsumma"] + amounts[:expected_rows]

        # Try price split
        prices = self._PRICE_RE.findall(cell)
        if len(prices) >= expected_rows:
            parts = self._PRICE_RE.split(cell)
            header = parts[0].strip() if parts else ""
            values = [p.strip() for p in parts[1:] if p.strip() and self._PRICE_RE.match(p)]
            if len(values) >= expected_rows:
                return [header] + values[:expected_rows]

        # Fall back to original single-value behavior
        return [cell]

    def split_cell_content(self, cell: str) -> list[str]:
        """
        Split a cell containing merged multi-line content.

        Strategies, tried in order:
        1. Product number patterns (7 digits)
        2. Quantity patterns (number + unit)
        3. Discount+amount header with 3+ digit amounts
        4. Price patterns (with decimal)

        Returns:
            ``[header, value_1, ...]`` when a strategy finds at least two
            values, otherwise ``[cell]``.
        """
        cell = cell.strip()

        # Strategy 1: Split by product numbers (common pattern: "Produktnr 1234567 1234568")
        products = self._PRODUCT_NUMBER_RE.findall(cell)
        if len(products) >= 2:
            # Extract header (text before first product number) and values
            parts = self._PRODUCT_NUMBER_RE.split(cell)
            header = parts[0].strip() if parts else ""
            values = [p for p in parts[1:] if p.strip() and re.match(r"\d{7}", p)]
            return [header] + values

        # Strategy 2: Split by quantities (e.g., "Antal 6ST 6ST 1ST")
        quantities = self._QUANTITY_RE.findall(cell)
        if len(quantities) >= 2:
            parts = self._QUANTITY_RE.split(cell)
            header = parts[0].strip() if parts else ""
            values = [p.strip() for p in parts[1:] if p.strip() and self._QUANTITY_RE.match(p)]
            return [header] + values

        # Strategy 3: Handle interleaved discount+amount
        # (e.g., "Rabatt i% Totalsumma 10,0 686,88 10,0 686,88")
        if self._has_discount_and_total(cell):
            # Only 3+ digit amounts are kept, which drops the short discount values
            amounts = self._TOTAL_AMOUNT_RE.findall(cell)
            if len(amounts) >= 2:
                # Return header as "Totalsumma" (amount header) so it maps to amount field, not deduction
                # This avoids the "Rabatt" keyword causing is_deduction=True
                return ["Totalsumma"] + amounts

        # Strategy 4: Split by prices (e.g., "Pris 127,20 127,20 159,20")
        prices = self._PRICE_RE.findall(cell)
        if len(prices) >= 2:
            parts = self._PRICE_RE.split(cell)
            header = parts[0].strip() if parts else ""
            values = [p.strip() for p in parts[1:] if p.strip() and self._PRICE_RE.match(p)]
            return [header] + values

        # No pattern detected, return as single value
        return [cell]

    def has_merged_header(self, header: list[str] | None) -> bool:
        """
        Check if header appears to be a merged cell containing multiple column names.

        This happens when OCR merges table headers into a single cell, e.g.:
        "Specifikation 0218103-1201 2 rum och kök Hyra Avdrag" instead of separate columns.

        Also handles cases where PP-StructureV3 produces headers like:
        ["Specifikation ... Hyra Avdrag", "", "", ""] with empty trailing cells.

        Returns:
            True when exactly one header cell is non-empty and it contains
            keywords of at least two different mapped fields.
        """
        if not header:
            return False

        # Filter out empty cells to find the actual content
        non_empty_cells = [h for h in header if h.strip()]

        # Only a single non-empty cell can be a merged header
        if len(non_empty_cells) != 1:
            return False

        header_text = non_empty_cells[0].lower()
        # Count how many field types have a keyword in this single cell.
        # Empty patterns are ignored: "" is a substring of every string and
        # would otherwise count its field for every header.
        keyword_count = sum(
            1
            for patterns in self.mapper.mappings.values()
            if any(pattern and pattern in header_text for pattern in patterns)
        )
        logger.debug(f"has_merged_header: header_text='{header_text}', keyword_count={keyword_count}")
        return keyword_count >= 2

    def extract_from_merged_cells(
        self, header: list[str], rows: list[list[str]]
    ) -> list[LineItem]:
        """
        Extract line items from tables with merged cells.

        For poorly OCR'd tables like:
        Header: ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        Row 1: ["", "", "", "8159"]    <- amount row
        Row 2: ["", "", "", "-2 000"]  <- deduction row (separate line item)

        Or:
        Row: ["", "", "", "8159 -2 000"]  <- both in same row -> 2 line items

        Each amount becomes its own line item. Negative amounts are marked as
        is_deduction=True. Positive amounts below MIN_AMOUNT_THRESHOLD, or
        that cannot be parsed as a number, are skipped. The header's object
        number and description are attached to the first item only.
        """
        items = []

        # Try to parse header cell for description info
        header_text = " ".join(h for h in header if h.strip()) if header else ""
        logger.debug(f"extract_from_merged_cells: header_text='{header_text}'")
        logger.debug(f"extract_from_merged_cells: rows={rows}")

        description = None
        article_number = None

        # Look for object number pattern (e.g., "0218103-1201")
        obj_match = self._OBJECT_NUMBER_RE.search(header_text)
        if obj_match:
            article_number = obj_match.group(1)

        # Look for description after object number
        desc_match = self._HEADER_DESCRIPTION_RE.search(header_text)
        if desc_match:
            description = desc_match.group(1).strip()

        row_index = 0
        for row in rows:
            # Combine all non-empty cells in the row
            row_text = " ".join(cell.strip() for cell in row if cell.strip())
            logger.debug(f"extract_from_merged_cells: row text='{row_text}'")
            if not row_text:
                continue

            # Find all amounts in the row
            amounts = self._SIGNED_AMOUNT_RE.findall(row_text)
            logger.debug(f"extract_from_merged_cells: amounts={amounts}")

            for amt_str in amounts:
                # Remove thousands separators. The pattern matches any
                # whitespace, so no-break spaces (common in formatted numbers)
                # must be removed as well as plain spaces.
                cleaned = amt_str.translate(self._THOUSANDS_SPACES).strip()
                if not cleaned or cleaned == "-":
                    continue

                is_deduction = cleaned.startswith("-")

                # Skip small positive numbers that are likely not amounts
                # (e.g., row indices, small percentages)
                if not is_deduction:
                    try:
                        value = float(cleaned.replace(",", "."))
                    except ValueError:
                        continue
                    if value < MIN_AMOUNT_THRESHOLD:
                        continue

                # Create a line item for each amount
                item = LineItem(
                    row_index=row_index,
                    description=description if row_index == 0 else "Avdrag" if is_deduction else None,
                    article_number=article_number if row_index == 0 else None,
                    amount=cleaned,
                    is_deduction=is_deduction,
                    confidence=0.7,
                )
                items.append(item)
                row_index += 1
                logger.debug(f"extract_from_merged_cells: created item amount={cleaned}, is_deduction={is_deduction}")

        return items

View File

@@ -0,0 +1,61 @@
"""
Line Items Data Models
Dataclasses for line item extraction results.
"""
from dataclasses import dataclass
from decimal import Decimal, InvalidOperation
@dataclass
class LineItem:
"""Single line item from invoice."""
row_index: int
description: str | None = None
quantity: str | None = None
unit: str | None = None
unit_price: str | None = None
amount: str | None = None
article_number: str | None = None
vat_rate: str | None = None
is_deduction: bool = False # True if this row is a deduction/discount
confidence: float = 0.9
@dataclass
class LineItemsResult:
    """Result of line items extraction."""

    items: list[LineItem]
    header_row: list[str]
    raw_html: str
    is_reversed: bool = False

    @property
    def total_amount(self) -> str | None:
        """Calculate total amount from line items (deduction rows have negative amounts).

        Amounts are parsed in Swedish number format ("1 234,56"); any
        whitespace used as thousands separator, including no-break spaces,
        is ignored. Items whose amount is missing, unparseable or not a
        finite number are skipped.

        Returns:
            The sum formatted in Swedish format with two decimals
            (e.g. "1 234,56"), or None when there are no items or the
            sum is zero.
        """
        if not self.items:
            return None

        total = Decimal("0")
        for item in self.items:
            if not item.amount:
                continue
            # split()/join drops every Unicode whitespace character, so
            # "1 234,56" and "1\u00a0234,56" both become "1234.56"
            normalized = "".join(item.amount.split()).replace(",", ".")
            try:
                value = Decimal(normalized)
            except InvalidOperation:
                # Best effort: an unparseable amount does not invalidate the rest
                continue
            # Decimal accepts "NaN"/"Infinity"; those would poison the sum
            if value.is_finite():
                total += value

        if total == 0:
            return None

        # "1,234.56" -> "1 234.56" -> "1 234,56"
        return f"{total:,.2f}".replace(",", " ").replace(".", ",")

View File

@@ -158,36 +158,36 @@ class TableDetector:
return tables
# Log raw result type for debugging
logger.info(f"PP-StructureV3 raw results type: {type(results).__name__}")
logger.debug(f"PP-StructureV3 raw results type: {type(results).__name__}")
# Handle case where results is a single dict-like object (PaddleX 3.x)
# rather than a list of results
if hasattr(results, "get") and not isinstance(results, list):
# Single result object - wrap in list for uniform processing
logger.info("Results is dict-like, wrapping in list")
logger.debug("Results is dict-like, wrapping in list")
results = [results]
elif hasattr(results, "__iter__") and not isinstance(results, (list, tuple)):
# Iterator or generator - convert to list
try:
results = list(results)
logger.info(f"Converted iterator to list with {len(results)} items")
logger.debug(f"Converted iterator to list with {len(results)} items")
except Exception as e:
logger.warning(f"Failed to convert results to list: {e}")
return tables
logger.info(f"Processing {len(results)} result(s)")
logger.debug(f"Processing {len(results)} result(s)")
for i, result in enumerate(results):
try:
result_type = type(result).__name__
has_get = hasattr(result, "get")
has_layout = hasattr(result, "layout_elements")
logger.info(f"Result[{i}]: type={result_type}, has_get={has_get}, has_layout_elements={has_layout}")
logger.debug(f"Result[{i}]: type={result_type}, has_get={has_get}, has_layout_elements={has_layout}")
# Try PaddleX 3.x API first (dict-like with table_res_list)
if has_get:
parsed = self._parse_paddlex_result(result)
logger.info(f"Result[{i}]: parsed {len(parsed)} tables via PaddleX path")
logger.debug(f"Result[{i}]: parsed {len(parsed)} tables via PaddleX path")
tables.extend(parsed)
continue
@@ -201,14 +201,14 @@ class TableDetector:
if table_result and table_result.confidence >= self.config.min_confidence:
tables.append(table_result)
legacy_count += 1
logger.info(f"Result[{i}]: parsed {legacy_count} tables via legacy path")
logger.debug(f"Result[{i}]: parsed {legacy_count} tables via legacy path")
else:
logger.warning(f"Result[{i}]: no recognized API (not dict-like and no layout_elements)")
except Exception as e:
logger.warning(f"Failed to parse result: {type(result).__name__}, error: {e}")
continue
logger.info(f"Total tables detected: {len(tables)}")
logger.debug(f"Total tables detected: {len(tables)}")
return tables
def _parse_paddlex_result(self, result: Any) -> list[TableDetectionResult]:
@@ -223,7 +223,7 @@ class TableDetector:
result_keys = list(result.keys())
elif hasattr(result, "__dict__"):
result_keys = list(result.__dict__.keys())
logger.info(f"Parsing PaddleX result: type={result_type}, keys={result_keys}")
logger.debug(f"Parsing PaddleX result: type={result_type}, keys={result_keys}")
# Get table results from PaddleX 3.x API
# Handle both dict.get() and attribute access
@@ -234,8 +234,8 @@ class TableDetector:
table_res_list = getattr(result, "table_res_list", None)
parsing_res_list = getattr(result, "parsing_res_list", [])
logger.info(f"table_res_list: {type(table_res_list).__name__}, count={len(table_res_list) if table_res_list else 0}")
logger.info(f"parsing_res_list: {type(parsing_res_list).__name__}, count={len(parsing_res_list) if parsing_res_list else 0}")
logger.debug(f"table_res_list: {type(table_res_list).__name__}, count={len(table_res_list) if table_res_list else 0}")
logger.debug(f"parsing_res_list: {type(parsing_res_list).__name__}, count={len(parsing_res_list) if parsing_res_list else 0}")
if not table_res_list:
# Log available keys/attributes for debugging
@@ -330,7 +330,7 @@ class TableDetector:
# Default confidence for PaddleX 3.x results
confidence = 0.9
logger.info(f"Table {i}: html_len={len(html)}, cells={len(cells)}")
logger.debug(f"Table {i}: html_len={len(html)}, cells={len(cells)}")
tables.append(TableDetectionResult(
bbox=(float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3])),
html=html,
@@ -467,14 +467,14 @@ class TableDetector:
if not pdf_path.exists():
raise FileNotFoundError(f"PDF not found: {pdf_path}")
logger.info(f"detect_from_pdf: {pdf_path}, page={page_number}, dpi={dpi}")
logger.debug(f"detect_from_pdf: {pdf_path}, page={page_number}, dpi={dpi}")
# Render specific page
for page_no, image_bytes in render_pdf_to_images(str(pdf_path), dpi=dpi):
if page_no == page_number:
image = Image.open(io.BytesIO(image_bytes))
image_array = np.array(image)
logger.info(f"detect_from_pdf: rendered page {page_no}, image shape={image_array.shape}")
logger.debug(f"detect_from_pdf: rendered page {page_no}, image shape={image_array.shape}")
return self.detect(image_array)
raise ValueError(f"Page {page_number} not found in PDF")

View File

@@ -15,6 +15,11 @@ import logging
logger = logging.getLogger(__name__)
# Configuration constants
DEFAULT_ROW_TOLERANCE = 15.0 # Max vertical distance (pixels) to consider same row
MIN_ITEMS_FOR_VALID_EXTRACTION = 2 # Minimum items required for valid extraction
MIN_TEXT_ELEMENTS_FOR_EXTRACTION = 5 # Minimum text elements needed to attempt extraction
@dataclass
class TextElement:
@@ -65,7 +70,10 @@ class TextLineItemsResult:
extraction_method: str = "text_spatial"
# Swedish amount pattern: 1 234,56 or 1234.56 or 1,234.56
# Amount pattern matches Swedish, US, and simple numeric formats
# Handles: "1 234,56", "1,234.56", "1234.56", "100 kr", "50:-", "-100,00"
# Does NOT handle: amounts with more than 2 decimal places, scientific notation
# See tests in test_text_line_items_extractor.py::TestAmountPattern
AMOUNT_PATTERN = re.compile(
r"(?<![0-9])(?:"
r"-?\d{1,3}(?:\s\d{3})*(?:,\d{2})?" # Swedish: 1 234,56
@@ -128,17 +136,17 @@ class TextLineItemsExtractor:
def __init__(
self,
row_tolerance: float = 15.0, # Max vertical distance to consider same row
min_items_for_valid: int = 2, # Minimum items to consider extraction valid
row_tolerance: float = DEFAULT_ROW_TOLERANCE,
min_items_for_valid: int = MIN_ITEMS_FOR_VALID_EXTRACTION,
):
"""
Initialize extractor.
Args:
row_tolerance: Maximum vertical distance (pixels) between elements
to consider them on the same row.
to consider them on the same row. Default: 15.0
min_items_for_valid: Minimum number of line items required for
extraction to be considered successful.
extraction to be considered successful. Default: 2
"""
self.row_tolerance = row_tolerance
self.min_items_for_valid = min_items_for_valid
@@ -161,10 +169,13 @@ class TextLineItemsExtractor:
# Extract text elements from parsing results
text_elements = self._extract_text_elements(parsing_res_list)
logger.info(f"TextLineItemsExtractor: found {len(text_elements)} text elements")
logger.debug(f"TextLineItemsExtractor: found {len(text_elements)} text elements")
if len(text_elements) < 5: # Need at least a few elements
logger.debug("Too few text elements for line item extraction")
if len(text_elements) < MIN_TEXT_ELEMENTS_FOR_EXTRACTION:
logger.debug(
f"Too few text elements ({len(text_elements)}) for line item extraction, "
f"need at least {MIN_TEXT_ELEMENTS_FOR_EXTRACTION}"
)
return None
return self.extract_from_text_elements(text_elements)
@@ -183,11 +194,11 @@ class TextLineItemsExtractor:
"""
# Group elements by row
rows = self._group_by_row(text_elements)
logger.info(f"TextLineItemsExtractor: grouped into {len(rows)} rows")
logger.debug(f"TextLineItemsExtractor: grouped into {len(rows)} rows")
# Find the line items section
item_rows = self._identify_line_item_rows(rows)
logger.info(f"TextLineItemsExtractor: identified {len(item_rows)} potential item rows")
logger.debug(f"TextLineItemsExtractor: identified {len(item_rows)} potential item rows")
if len(item_rows) < self.min_items_for_valid:
logger.debug(f"Found only {len(item_rows)} item rows, need at least {self.min_items_for_valid}")
@@ -195,7 +206,7 @@ class TextLineItemsExtractor:
# Extract structured items
items = self._parse_line_items(item_rows)
logger.info(f"TextLineItemsExtractor: extracted {len(items)} line items")
logger.debug(f"TextLineItemsExtractor: extracted {len(items)} line items")
if len(items) < self.min_items_for_valid:
return None
@@ -209,7 +220,11 @@ class TextLineItemsExtractor:
def _extract_text_elements(
self, parsing_res_list: list[dict[str, Any]]
) -> list[TextElement]:
"""Extract TextElement objects from parsing_res_list."""
"""Extract TextElement objects from parsing_res_list.
Handles both dict and LayoutBlock object formats from PP-StructureV3.
Gracefully skips invalid elements with appropriate logging.
"""
elements = []
for elem in parsing_res_list:
@@ -220,11 +235,15 @@ class TextLineItemsExtractor:
bbox = elem.get("bbox", [])
# Try both 'text' and 'content' keys
text = elem.get("text", "") or elem.get("content", "")
else:
elif hasattr(elem, "label"):
label = getattr(elem, "label", "")
bbox = getattr(elem, "bbox", [])
# LayoutBlock objects use 'content' attribute
text = getattr(elem, "content", "") or getattr(elem, "text", "")
else:
# Element is neither dict nor has expected attributes
logger.debug(f"Skipping element with unexpected type: {type(elem).__name__}")
continue
# Only process text elements (skip images, tables, etc.)
if label not in ("text", "paragraph_title", "aside_text"):
@@ -232,6 +251,7 @@ class TextLineItemsExtractor:
# Validate bbox
if not self._valid_bbox(bbox):
logger.debug(f"Skipping element with invalid bbox: {bbox}")
continue
# Clean text
@@ -250,8 +270,13 @@ class TextLineItemsExtractor:
),
)
)
except (KeyError, TypeError, ValueError, AttributeError) as e:
# Expected format issues - log at debug level
logger.debug(f"Skipping element due to format issue: {e}")
continue
except Exception as e:
logger.debug(f"Failed to parse element: {e}")
# Unexpected errors - log at warning level for visibility
logger.warning(f"Unexpected error parsing element: {type(e).__name__}: {e}")
continue
return elements
@@ -270,6 +295,7 @@ class TextLineItemsExtractor:
Group text elements into rows based on vertical position.
Elements within row_tolerance of each other are considered same row.
Uses dynamic average center_y to handle varying element heights more accurately.
"""
if not elements:
return []
@@ -277,22 +303,22 @@ class TextLineItemsExtractor:
# Sort by vertical position
sorted_elements = sorted(elements, key=lambda e: e.center_y)
rows = []
current_row = [sorted_elements[0]]
current_y = sorted_elements[0].center_y
rows: list[list[TextElement]] = []
current_row: list[TextElement] = [sorted_elements[0]]
for elem in sorted_elements[1:]:
if abs(elem.center_y - current_y) <= self.row_tolerance:
# Same row
# Calculate dynamic average center_y for current row
avg_center_y = sum(e.center_y for e in current_row) / len(current_row)
if abs(elem.center_y - avg_center_y) <= self.row_tolerance:
# Same row - add element and recalculate average on next iteration
current_row.append(elem)
else:
# New row
if current_row:
# Sort row by horizontal position
current_row.sort(key=lambda e: e.center_x)
rows.append(current_row)
# New row - finalize current row
# Sort row by horizontal position (left to right)
current_row.sort(key=lambda e: e.center_x)
rows.append(current_row)
current_row = [elem]
current_y = elem.center_y
# Don't forget last row
if current_row: