refactor: split line_items_extractor into smaller modules with comprehensive tests
- Extract models.py (LineItem, LineItemsResult dataclasses)
- Extract html_table_parser.py (ColumnMapper, HTMLTableParser)
- Extract merged_cell_handler.py (MergedCellHandler for PP-StructureV3 merged cells)
- Reduce line_items_extractor.py from 971 to 396 lines
- Add constants for magic numbers (MIN_AMOUNT_THRESHOLD, ROW_GROUPING_THRESHOLD, etc.)
- Fix row grouping algorithm in text_line_items_extractor.py
- Demote INFO logs to DEBUG level in structure_detector.py
- Add 209 tests achieving 85%+ coverage on main modules

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
204
packages/backend/backend/table/html_table_parser.py
Normal file
204
packages/backend/backend/table/html_table_parser.py
Normal file
@@ -0,0 +1,204 @@
|
||||
"""
|
||||
HTML Table Parser
|
||||
|
||||
Parses HTML tables into structured data and maps columns to field names.
|
||||
"""
|
||||
|
||||
from html.parser import HTMLParser
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration constants
# Minimum pattern length to avoid false positives from short substrings
MIN_PATTERN_MATCH_LENGTH = 3
# Exact match bonus for column mapping priority
EXACT_MATCH_BONUS = 100

# Swedish column name mappings
# Extended to support multiple invoice types: product invoices, rental invoices, utility bills
# Keys are the field names returned by ColumnMapper.map(); values are the
# (lowercase) header substrings that identify the column.
COLUMN_MAPPINGS = {
    "article_number": [
        "art nummer",
        "artikelnummer",
        "artikel",
        "artnr",
        "art.nr",
        "art nr",
        "objektnummer",  # Rental: property reference
        "objekt",
    ],
    "description": [
        "beskrivning",
        "produktbeskrivning",
        "produkt",
        "tjänst",
        "text",
        "benämning",
        "vara/tjänst",
        "vara",
        # Rental invoice specific
        "specifikation",
        "spec",
        "hyresperiod",  # Rental period
        "period",
        "typ",  # Type of charge
        # Utility bills
        "förbrukning",  # Consumption
        "avläsning",  # Meter reading
    ],
    "quantity": ["antal", "qty", "st", "pcs", "kvantitet", "m²", "kvm"],
    "unit": ["enhet", "unit"],
    "unit_price": ["á-pris", "a-pris", "pris", "styckpris", "enhetspris", "à pris"],
    "amount": [
        "belopp",
        "summa",
        "total",
        "netto",
        "rad summa",
        # Rental specific
        "hyra",  # Rent
        "avgift",  # Fee
        "kostnad",  # Cost
        "debitering",  # Charge
        "totalt",  # Total
    ],
    "vat_rate": ["moms", "moms%", "vat", "skatt", "moms %"],
    # Additional field for rental: deductions/adjustments
    "deduction": [
        "avdrag",  # Deduction
        "rabatt",  # Discount
        "kredit",  # Credit
    ],
}

# Keywords that indicate NOT a line items table
# (summary/footer content: shipping, invoice fee, payment details, due date).
SUMMARY_KEYWORDS = [
    "frakt",
    "faktura.avg",
    "fakturavg",
    "exkl.moms",
    "att betala",
    "öresavr",
    "bankgiro",
    "plusgiro",
    "ocr",
    "forfallodatum",
    "förfallodatum",
]
|
||||
|
||||
|
||||
class _TableHTMLParser(HTMLParser):
    """Internal HTML parser that collects table cells into rows.

    Tracks whether parsing is currently inside a cell (<td>/<th>) and
    inside <thead>, so that header rows can be kept separate from data
    rows. Results are exposed via `header_row` and `rows`.
    """

    def __init__(self):
        super().__init__()
        # Results read by HTMLTableParser.parse().
        self.rows: list[list[str]] = []
        self.current_row: list[str] = []
        self.current_cell: str = ""
        self.in_td = False
        self.in_thead = False
        self.header_row: list[str] = []

    def handle_starttag(self, tag, attrs):
        if tag in ("td", "th"):
            # Entering a cell: start accumulating its text content.
            self.in_td = True
            self.current_cell = ""
        elif tag == "tr":
            # New row: discard any cells from a previous, unclosed row.
            self.current_row = []
        elif tag == "thead":
            self.in_thead = True

    def handle_endtag(self, tag):
        if tag in ("td", "th"):
            # Leaving a cell: commit its stripped text to the current row.
            self.in_td = False
            self.current_row.append(self.current_cell.strip())
        elif tag == "tr":
            if self.current_row:
                # Rows inside <thead> become the header; all others are data.
                if self.in_thead:
                    self.header_row = self.current_row
                else:
                    self.rows.append(self.current_row)
        elif tag == "thead":
            self.in_thead = False

    def handle_data(self, data):
        # Text may arrive in several chunks per cell; accumulate it.
        if self.in_td:
            self.current_cell += data
|
||||
|
||||
|
||||
class HTMLTableParser:
    """Parse HTML tables into structured data."""

    def parse(self, html: str) -> tuple[list[str], list[list[str]]]:
        """
        Parse an HTML table and return its header and data rows.

        Args:
            html: HTML string containing table.

        Returns:
            Tuple of (header_row, data_rows).
        """
        collector = _TableHTMLParser()
        collector.feed(html)
        return collector.header_row, collector.rows
|
||||
|
||||
|
||||
class ColumnMapper:
    """Map column headers to field names."""

    def __init__(self, mappings: dict[str, list[str]] | None = None):
        """
        Initialize column mapper.

        Args:
            mappings: Custom column mappings. Uses Swedish defaults if None.
        """
        self.mappings = mappings or COLUMN_MAPPINGS

    def map(self, headers: list[str]) -> dict[int, str]:
        """
        Map column indices to field names.

        Both the header AND the pattern are normalized before comparison.
        Previously only the header side was normalized, so patterns
        containing "." or "-" (e.g. "art.nr", "á-pris", or punctuation in
        user-supplied custom mappings) could never produce an exact match.

        Args:
            headers: List of column header strings.

        Returns:
            Dictionary mapping column index to field name.
        """
        mapping = {}
        for idx, header in enumerate(headers):
            normalized = self._normalize(header)

            if not normalized.strip():
                continue

            best_match = None
            best_match_len = 0

            for field_name, patterns in self.mappings.items():
                for raw_pattern in patterns:
                    # Normalize the pattern the same way as the header so
                    # punctuation differences cannot prevent a match.
                    pattern = self._normalize(raw_pattern)
                    if pattern == normalized:
                        # Exact match gets highest priority
                        best_match = field_name
                        best_match_len = len(pattern) + EXACT_MATCH_BONUS
                        break
                    elif pattern in normalized and len(pattern) > best_match_len:
                        # Partial match requires minimum length to avoid false positives
                        if len(pattern) >= MIN_PATTERN_MATCH_LENGTH:
                            best_match = field_name
                            best_match_len = len(pattern)

                if best_match_len > EXACT_MATCH_BONUS:
                    # Found exact match, no need to check other fields
                    break

            if best_match:
                mapping[idx] = best_match

        return mapping

    def _normalize(self, header: str) -> str:
        """Normalize text for matching: lowercase, strip, drop ".", map "-" to space."""
        return header.lower().strip().replace(".", "").replace("-", " ")
|
||||
File diff suppressed because it is too large
Load Diff
423
packages/backend/backend/table/merged_cell_handler.py
Normal file
423
packages/backend/backend/table/merged_cell_handler.py
Normal file
@@ -0,0 +1,423 @@
|
||||
"""
|
||||
Merged Cell Handler
|
||||
|
||||
Handles detection and extraction of data from tables with merged cells,
|
||||
a common issue with PP-StructureV3 OCR output.
|
||||
"""
|
||||
|
||||
import re
|
||||
import logging
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from .models import LineItem
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .html_table_parser import ColumnMapper
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Minimum positive amount to consider as line item (filters noise like row indices)
|
||||
MIN_AMOUNT_THRESHOLD = 100
|
||||
|
||||
|
||||
class MergedCellHandler:
    """Handles tables with vertically merged cells from PP-StructureV3."""

    def __init__(self, mapper: "ColumnMapper"):
        """
        Initialize handler.

        Args:
            mapper: ColumnMapper instance for header keyword detection.
                Only its `mappings` dict is read (see has_merged_header).
        """
        self.mapper = mapper
|
||||
|
||||
def has_vertically_merged_cells(self, rows: list[list[str]]) -> bool:
|
||||
"""
|
||||
Check if table rows contain vertically merged data in single cells.
|
||||
|
||||
PP-StructureV3 sometimes merges multiple table rows into single cells, e.g.:
|
||||
["Produktnr 1457280 1457280 1060381", "", "Antal 6ST 6ST 1ST", "Pris 127,20 127,20 159,20"]
|
||||
|
||||
Detection: cells contain repeating patterns of numbers or keywords suggesting multiple lines.
|
||||
"""
|
||||
if not rows:
|
||||
return False
|
||||
|
||||
for row in rows:
|
||||
for cell in row:
|
||||
if not cell or len(cell) < 20:
|
||||
continue
|
||||
|
||||
# Check for multiple product numbers (7+ digit patterns)
|
||||
product_nums = re.findall(r"\b\d{7}\b", cell)
|
||||
if len(product_nums) >= 2:
|
||||
logger.debug(f"has_vertically_merged_cells: found {len(product_nums)} product numbers in cell")
|
||||
return True
|
||||
|
||||
# Check for multiple prices (Swedish format: 123,45 or 1 234,56)
|
||||
prices = re.findall(r"\b\d{1,3}(?:\s?\d{3})*[,\.]\d{2}\b", cell)
|
||||
if len(prices) >= 3:
|
||||
logger.debug(f"has_vertically_merged_cells: found {len(prices)} prices in cell")
|
||||
return True
|
||||
|
||||
# Check for multiple quantity patterns (e.g., "6ST 6ST 1ST")
|
||||
quantities = re.findall(r"\b\d+\s*(?:ST|st|PCS|pcs)\b", cell)
|
||||
if len(quantities) >= 2:
|
||||
logger.debug(f"has_vertically_merged_cells: found {len(quantities)} quantities in cell")
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
    def split_merged_rows(
        self, rows: list[list[str]]
    ) -> tuple[list[str], list[list[str]]]:
        """
        Split vertically merged cells back into separate rows.

        Handles complex cases where PP-StructureV3 merges content across
        multiple HTML rows. For example, 5 line items might be spread across
        3 HTML rows with content mixed together.

        Strategy:
        1. Merge all row content per column
        2. Detect how many actual data rows exist (by counting product numbers)
        3. Split each column's content into that many lines

        Returns header and data rows. When splitting is not possible
        (all rows empty, or fewer than two expected data rows), returns
        ([], rows) with the input rows unchanged.
        """
        if not rows:
            return [], []

        # Filter out completely empty rows
        non_empty_rows = [r for r in rows if any(cell.strip() for cell in r)]
        if not non_empty_rows:
            return [], rows

        # Determine column count
        col_count = max(len(r) for r in non_empty_rows)

        # Merge content from all rows for each column
        merged_columns = []
        for col_idx in range(col_count):
            col_content = []
            for row in non_empty_rows:
                # Rows may be ragged, so guard the index.
                if col_idx < len(row) and row[col_idx].strip():
                    col_content.append(row[col_idx].strip())
            merged_columns.append(" ".join(col_content))

        logger.debug(f"split_merged_rows: merged columns = {merged_columns}")

        # Count how many actual data rows we should have
        # Use the column with most product numbers as reference
        expected_rows = self._count_expected_rows(merged_columns)
        logger.debug(f"split_merged_rows: expecting {expected_rows} data rows")

        if expected_rows <= 1:
            # Not enough data for splitting
            return [], rows

        # Split each column based on expected row count
        split_columns = []
        for col_idx, col_text in enumerate(merged_columns):
            if not col_text.strip():
                split_columns.append([""] * (expected_rows + 1))  # +1 for header
                continue
            lines = self._split_cell_content_for_rows(col_text, expected_rows)
            split_columns.append(lines)

        # Ensure all columns have same number of lines (immutable approach)
        max_lines = max(len(col) for col in split_columns)
        split_columns = [
            col + [""] * (max_lines - len(col))
            for col in split_columns
        ]

        logger.debug(f"split_merged_rows: split into {max_lines} lines total")

        # First line is header, rest are data rows
        header = [col[0] for col in split_columns]
        data_rows = []
        for line_idx in range(1, max_lines):
            row = [col[line_idx] if line_idx < len(col) else "" for col in split_columns]
            # Drop lines that ended up entirely blank after padding.
            if any(cell.strip() for cell in row):
                data_rows.append(row)

        logger.debug(f"split_merged_rows: header={header}, data_rows count={len(data_rows)}")
        return header, data_rows
|
||||
|
||||
def _count_expected_rows(self, merged_columns: list[str]) -> int:
|
||||
"""
|
||||
Count how many data rows should exist based on content patterns.
|
||||
|
||||
Returns the maximum count found from:
|
||||
- Product numbers (7 digits)
|
||||
- Quantity patterns (number + ST/PCS)
|
||||
- Amount patterns (in columns likely to be totals)
|
||||
"""
|
||||
max_count = 0
|
||||
|
||||
for col_text in merged_columns:
|
||||
if not col_text:
|
||||
continue
|
||||
|
||||
# Count product numbers (most reliable indicator)
|
||||
product_nums = re.findall(r"\b\d{7}\b", col_text)
|
||||
max_count = max(max_count, len(product_nums))
|
||||
|
||||
# Count quantities (e.g., "6ST 6ST 1ST 1ST 1ST")
|
||||
quantities = re.findall(r"\b\d+\s*(?:ST|st|PCS|pcs)\b", col_text)
|
||||
max_count = max(max_count, len(quantities))
|
||||
|
||||
return max_count
|
||||
|
||||
    def _split_cell_content_for_rows(self, cell: str, expected_rows: int) -> list[str]:
        """
        Split cell content knowing how many data rows we expect.

        This is smarter than split_cell_content because it knows the target count.

        Returns [header] + values when a strategy produces the expected
        number of values, otherwise [cell] unchanged (single value).
        """
        cell = cell.strip()

        # Try product number split first
        product_pattern = re.compile(r"(\b\d{7}\b)")
        products = product_pattern.findall(cell)
        if len(products) == expected_rows:
            # split() with a capture group interleaves text and matches:
            # even indices are surrounding text, odd indices are the numbers.
            parts = product_pattern.split(cell)
            header = parts[0].strip() if parts else ""
            # Include description text after each product number
            values = []
            for i in range(1, len(parts), 2):  # Odd indices are product numbers
                if i < len(parts):
                    prod_num = parts[i].strip()
                    # Check if there's description text after
                    desc = parts[i + 1].strip() if i + 1 < len(parts) else ""
                    # If description looks like text (not another pattern), include it
                    if desc and not re.match(r"^\d{7}$", desc):
                        # Truncate at next product number pattern if any
                        desc_clean = re.split(r"\d{7}", desc)[0].strip()
                        if desc_clean:
                            values.append(f"{prod_num} {desc_clean}")
                        else:
                            values.append(prod_num)
                    else:
                        values.append(prod_num)
            if len(values) == expected_rows:
                return [header] + values

        # Try quantity split
        qty_pattern = re.compile(r"(\b\d+\s*(?:ST|st|PCS|pcs|M|m|KG|kg)\b)")
        quantities = qty_pattern.findall(cell)
        if len(quantities) == expected_rows:
            parts = qty_pattern.split(cell)
            header = parts[0].strip() if parts else ""
            values = [p.strip() for p in parts[1:] if p.strip() and qty_pattern.match(p)]
            if len(values) == expected_rows:
                return [header] + values

        # Try amount split for discount+totalsumma columns
        cell_lower = cell.lower()
        has_discount = any(kw in cell_lower for kw in ["rabatt", "discount"])
        has_total = any(kw in cell_lower for kw in ["totalsumma", "total", "summa", "belopp"])

        if has_discount and has_total:
            # Extract only amounts (3+ digit numbers), skip discount percentages
            amount_pattern = re.compile(r"\b(\d{3,}[,\.]\d{2})\b")
            amounts = amount_pattern.findall(cell)
            if len(amounts) >= expected_rows:
                # Take the FIRST expected_rows amounts as the totals.
                # NOTE(review): an earlier comment said "last", but the
                # slice below takes the first ones — confirm which is intended.
                return ["Totalsumma"] + amounts[:expected_rows]

        # Try price split
        price_pattern = re.compile(r"(\b\d{1,3}(?:\s?\d{3})*[,\.]\d{2}\b)")
        prices = price_pattern.findall(cell)
        if len(prices) >= expected_rows:
            parts = price_pattern.split(cell)
            header = parts[0].strip() if parts else ""
            values = [p.strip() for p in parts[1:] if p.strip() and price_pattern.match(p)]
            if len(values) >= expected_rows:
                return [header] + values[:expected_rows]

        # Fall back to original single-value behavior
        return [cell]
|
||||
|
||||
def split_cell_content(self, cell: str) -> list[str]:
|
||||
"""
|
||||
Split a cell containing merged multi-line content.
|
||||
|
||||
Strategies:
|
||||
1. Look for product number patterns (7 digits)
|
||||
2. Look for quantity patterns (number + ST/PCS)
|
||||
3. Look for price patterns (with decimal)
|
||||
4. Handle interleaved discount+amount patterns
|
||||
"""
|
||||
cell = cell.strip()
|
||||
|
||||
# Strategy 1: Split by product numbers (common pattern: "Produktnr 1234567 1234568")
|
||||
product_pattern = re.compile(r"(\b\d{7}\b)")
|
||||
products = product_pattern.findall(cell)
|
||||
if len(products) >= 2:
|
||||
# Extract header (text before first product number) and values
|
||||
parts = product_pattern.split(cell)
|
||||
header = parts[0].strip() if parts else ""
|
||||
values = [p for p in parts[1:] if p.strip() and re.match(r"\d{7}", p)]
|
||||
return [header] + values
|
||||
|
||||
# Strategy 2: Split by quantities (e.g., "Antal 6ST 6ST 1ST")
|
||||
qty_pattern = re.compile(r"(\b\d+\s*(?:ST|st|PCS|pcs|M|m|KG|kg)\b)")
|
||||
quantities = qty_pattern.findall(cell)
|
||||
if len(quantities) >= 2:
|
||||
parts = qty_pattern.split(cell)
|
||||
header = parts[0].strip() if parts else ""
|
||||
values = [p.strip() for p in parts[1:] if p.strip() and qty_pattern.match(p)]
|
||||
return [header] + values
|
||||
|
||||
# Strategy 3: Handle interleaved discount+amount (e.g., "Rabatt i% Totalsumma 10,0 686,88 10,0 686,88")
|
||||
# Check if header contains two keywords indicating merged columns
|
||||
cell_lower = cell.lower()
|
||||
has_discount_header = any(kw in cell_lower for kw in ["rabatt", "discount"])
|
||||
has_amount_header = any(kw in cell_lower for kw in ["totalsumma", "summa", "belopp", "total"])
|
||||
|
||||
if has_discount_header and has_amount_header:
|
||||
# Extract all numbers and pair them (discount, amount, discount, amount, ...)
|
||||
# Pattern for amounts: 3+ digit numbers with decimals (e.g., 686,88)
|
||||
amount_pattern = re.compile(r"\b(\d{3,}[,\.]\d{2})\b")
|
||||
amounts = amount_pattern.findall(cell)
|
||||
|
||||
if len(amounts) >= 2:
|
||||
# Return header as "Totalsumma" (amount header) so it maps to amount field, not deduction
|
||||
# This avoids the "Rabatt" keyword causing is_deduction=True
|
||||
header = "Totalsumma"
|
||||
return [header] + amounts
|
||||
|
||||
# Strategy 4: Split by prices (e.g., "Pris 127,20 127,20 159,20")
|
||||
price_pattern = re.compile(r"(\b\d{1,3}(?:\s?\d{3})*[,\.]\d{2}\b)")
|
||||
prices = price_pattern.findall(cell)
|
||||
if len(prices) >= 2:
|
||||
parts = price_pattern.split(cell)
|
||||
header = parts[0].strip() if parts else ""
|
||||
values = [p.strip() for p in parts[1:] if p.strip() and price_pattern.match(p)]
|
||||
return [header] + values
|
||||
|
||||
# No pattern detected, return as single value
|
||||
return [cell]
|
||||
|
||||
def has_merged_header(self, header: list[str] | None) -> bool:
|
||||
"""
|
||||
Check if header appears to be a merged cell containing multiple column names.
|
||||
|
||||
This happens when OCR merges table headers into a single cell, e.g.:
|
||||
"Specifikation 0218103-1201 2 rum och kök Hyra Avdrag" instead of separate columns.
|
||||
|
||||
Also handles cases where PP-StructureV3 produces headers like:
|
||||
["Specifikation ... Hyra Avdrag", "", "", ""] with empty trailing cells.
|
||||
"""
|
||||
if header is None or not header:
|
||||
return False
|
||||
|
||||
# Filter out empty cells to find the actual content
|
||||
non_empty_cells = [h for h in header if h.strip()]
|
||||
|
||||
# Check if we have a single non-empty cell that contains multiple keywords
|
||||
if len(non_empty_cells) == 1:
|
||||
header_text = non_empty_cells[0].lower()
|
||||
# Count how many column keywords are in this single cell
|
||||
keyword_count = 0
|
||||
for patterns in self.mapper.mappings.values():
|
||||
for pattern in patterns:
|
||||
if pattern in header_text:
|
||||
keyword_count += 1
|
||||
break # Only count once per field type
|
||||
|
||||
logger.debug(f"has_merged_header: header_text='{header_text}', keyword_count={keyword_count}")
|
||||
return keyword_count >= 2
|
||||
|
||||
return False
|
||||
|
||||
    def extract_from_merged_cells(
        self, header: list[str], rows: list[list[str]]
    ) -> list[LineItem]:
        """
        Extract line items from tables with merged cells.

        For poorly OCR'd tables like:
        Header: ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        Row 1: ["", "", "", "8159"] <- amount row
        Row 2: ["", "", "", "-2 000"] <- deduction row (separate line item)

        Or:
        Row: ["", "", "", "8159 -2 000"] <- both in same row -> 2 line items

        Each amount becomes its own line item. Negative amounts are marked as is_deduction=True.
        Items get confidence 0.7 to reflect the degraded extraction path.
        """
        items = []

        # Amount pattern for Swedish format - match numbers like "8159" or "8 159" or "-2000" or "-2 000"
        # The greedy [\d\s]* can capture trailing whitespace; it is stripped below.
        amount_pattern = re.compile(
            r"(-?\d[\d\s]*(?:[,\.]\d+)?)"
        )

        # Try to parse header cell for description info
        header_text = " ".join(h for h in header if h.strip()) if header else ""
        logger.debug(f"extract_from_merged_cells: header_text='{header_text}'")
        logger.debug(f"extract_from_merged_cells: rows={rows}")

        # Extract description from header
        description = None
        article_number = None

        # Look for object number pattern (e.g., "0218103-1201")
        obj_match = re.search(r"(\d{7}-\d{4})", header_text)
        if obj_match:
            article_number = obj_match.group(1)

        # Look for description after object number
        desc_match = re.search(r"\d{7}-\d{4}\s+(.+?)(?:\s+(?:Hyra|Avdrag|Belopp))", header_text, re.IGNORECASE)
        if desc_match:
            description = desc_match.group(1).strip()

        # row_index counts emitted items, not input rows: multiple amounts
        # found in one input row each become their own line item.
        row_index = 0
        for row in rows:
            # Combine all non-empty cells in the row
            row_text = " ".join(cell.strip() for cell in row if cell.strip())
            logger.debug(f"extract_from_merged_cells: row text='{row_text}'")

            if not row_text:
                continue

            # Find all amounts in the row
            amounts = amount_pattern.findall(row_text)
            logger.debug(f"extract_from_merged_cells: amounts={amounts}")

            for amt_str in amounts:
                # Clean the amount string
                cleaned = amt_str.replace(" ", "").strip()
                if not cleaned or cleaned == "-":
                    continue

                is_deduction = cleaned.startswith("-")

                # Skip small positive numbers that are likely not amounts
                # (e.g., row indices, small percentages)
                if not is_deduction:
                    try:
                        val = float(cleaned.replace(",", "."))
                        if val < MIN_AMOUNT_THRESHOLD:
                            continue
                    except ValueError:
                        continue

                # Create a line item for each amount. Header-derived
                # description/article number go to the first item only;
                # later deduction items are labelled "Avdrag".
                item = LineItem(
                    row_index=row_index,
                    description=description if row_index == 0 else "Avdrag" if is_deduction else None,
                    article_number=article_number if row_index == 0 else None,
                    amount=cleaned,
                    is_deduction=is_deduction,
                    confidence=0.7,
                )
                items.append(item)
                row_index += 1
                logger.debug(f"extract_from_merged_cells: created item amount={cleaned}, is_deduction={is_deduction}")

        return items
|
||||
61
packages/backend/backend/table/models.py
Normal file
61
packages/backend/backend/table/models.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""
|
||||
Line Items Data Models
|
||||
|
||||
Dataclasses for line item extraction results.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from decimal import Decimal, InvalidOperation
|
||||
|
||||
|
||||
@dataclass
|
||||
class LineItem:
|
||||
"""Single line item from invoice."""
|
||||
|
||||
row_index: int
|
||||
description: str | None = None
|
||||
quantity: str | None = None
|
||||
unit: str | None = None
|
||||
unit_price: str | None = None
|
||||
amount: str | None = None
|
||||
article_number: str | None = None
|
||||
vat_rate: str | None = None
|
||||
is_deduction: bool = False # True if this row is a deduction/discount
|
||||
confidence: float = 0.9
|
||||
|
||||
|
||||
@dataclass
|
||||
class LineItemsResult:
|
||||
"""Result of line items extraction."""
|
||||
|
||||
items: list[LineItem]
|
||||
header_row: list[str]
|
||||
raw_html: str
|
||||
is_reversed: bool = False
|
||||
|
||||
@property
|
||||
def total_amount(self) -> str | None:
|
||||
"""Calculate total amount from line items (deduction rows have negative amounts)."""
|
||||
if not self.items:
|
||||
return None
|
||||
|
||||
total = Decimal("0")
|
||||
for item in self.items:
|
||||
if item.amount:
|
||||
try:
|
||||
# Parse Swedish number format (1 234,56)
|
||||
amount_str = item.amount.replace(" ", "").replace(",", ".")
|
||||
total += Decimal(amount_str)
|
||||
except InvalidOperation:
|
||||
pass
|
||||
|
||||
if total == 0:
|
||||
return None
|
||||
|
||||
# Format back to Swedish format
|
||||
formatted = f"{total:,.2f}".replace(",", " ").replace(".", ",")
|
||||
# Fix the space/comma swap
|
||||
parts = formatted.rsplit(",", 1)
|
||||
if len(parts) == 2:
|
||||
return parts[0].replace(" ", " ") + "," + parts[1]
|
||||
return formatted
|
||||
@@ -158,36 +158,36 @@ class TableDetector:
|
||||
return tables
|
||||
|
||||
# Log raw result type for debugging
|
||||
logger.info(f"PP-StructureV3 raw results type: {type(results).__name__}")
|
||||
logger.debug(f"PP-StructureV3 raw results type: {type(results).__name__}")
|
||||
|
||||
# Handle case where results is a single dict-like object (PaddleX 3.x)
|
||||
# rather than a list of results
|
||||
if hasattr(results, "get") and not isinstance(results, list):
|
||||
# Single result object - wrap in list for uniform processing
|
||||
logger.info("Results is dict-like, wrapping in list")
|
||||
logger.debug("Results is dict-like, wrapping in list")
|
||||
results = [results]
|
||||
elif hasattr(results, "__iter__") and not isinstance(results, (list, tuple)):
|
||||
# Iterator or generator - convert to list
|
||||
try:
|
||||
results = list(results)
|
||||
logger.info(f"Converted iterator to list with {len(results)} items")
|
||||
logger.debug(f"Converted iterator to list with {len(results)} items")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to convert results to list: {e}")
|
||||
return tables
|
||||
|
||||
logger.info(f"Processing {len(results)} result(s)")
|
||||
logger.debug(f"Processing {len(results)} result(s)")
|
||||
|
||||
for i, result in enumerate(results):
|
||||
try:
|
||||
result_type = type(result).__name__
|
||||
has_get = hasattr(result, "get")
|
||||
has_layout = hasattr(result, "layout_elements")
|
||||
logger.info(f"Result[{i}]: type={result_type}, has_get={has_get}, has_layout_elements={has_layout}")
|
||||
logger.debug(f"Result[{i}]: type={result_type}, has_get={has_get}, has_layout_elements={has_layout}")
|
||||
|
||||
# Try PaddleX 3.x API first (dict-like with table_res_list)
|
||||
if has_get:
|
||||
parsed = self._parse_paddlex_result(result)
|
||||
logger.info(f"Result[{i}]: parsed {len(parsed)} tables via PaddleX path")
|
||||
logger.debug(f"Result[{i}]: parsed {len(parsed)} tables via PaddleX path")
|
||||
tables.extend(parsed)
|
||||
continue
|
||||
|
||||
@@ -201,14 +201,14 @@ class TableDetector:
|
||||
if table_result and table_result.confidence >= self.config.min_confidence:
|
||||
tables.append(table_result)
|
||||
legacy_count += 1
|
||||
logger.info(f"Result[{i}]: parsed {legacy_count} tables via legacy path")
|
||||
logger.debug(f"Result[{i}]: parsed {legacy_count} tables via legacy path")
|
||||
else:
|
||||
logger.warning(f"Result[{i}]: no recognized API (not dict-like and no layout_elements)")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse result: {type(result).__name__}, error: {e}")
|
||||
continue
|
||||
|
||||
logger.info(f"Total tables detected: {len(tables)}")
|
||||
logger.debug(f"Total tables detected: {len(tables)}")
|
||||
return tables
|
||||
|
||||
def _parse_paddlex_result(self, result: Any) -> list[TableDetectionResult]:
|
||||
@@ -223,7 +223,7 @@ class TableDetector:
|
||||
result_keys = list(result.keys())
|
||||
elif hasattr(result, "__dict__"):
|
||||
result_keys = list(result.__dict__.keys())
|
||||
logger.info(f"Parsing PaddleX result: type={result_type}, keys={result_keys}")
|
||||
logger.debug(f"Parsing PaddleX result: type={result_type}, keys={result_keys}")
|
||||
|
||||
# Get table results from PaddleX 3.x API
|
||||
# Handle both dict.get() and attribute access
|
||||
@@ -234,8 +234,8 @@ class TableDetector:
|
||||
table_res_list = getattr(result, "table_res_list", None)
|
||||
parsing_res_list = getattr(result, "parsing_res_list", [])
|
||||
|
||||
logger.info(f"table_res_list: {type(table_res_list).__name__}, count={len(table_res_list) if table_res_list else 0}")
|
||||
logger.info(f"parsing_res_list: {type(parsing_res_list).__name__}, count={len(parsing_res_list) if parsing_res_list else 0}")
|
||||
logger.debug(f"table_res_list: {type(table_res_list).__name__}, count={len(table_res_list) if table_res_list else 0}")
|
||||
logger.debug(f"parsing_res_list: {type(parsing_res_list).__name__}, count={len(parsing_res_list) if parsing_res_list else 0}")
|
||||
|
||||
if not table_res_list:
|
||||
# Log available keys/attributes for debugging
|
||||
@@ -330,7 +330,7 @@ class TableDetector:
|
||||
# Default confidence for PaddleX 3.x results
|
||||
confidence = 0.9
|
||||
|
||||
logger.info(f"Table {i}: html_len={len(html)}, cells={len(cells)}")
|
||||
logger.debug(f"Table {i}: html_len={len(html)}, cells={len(cells)}")
|
||||
tables.append(TableDetectionResult(
|
||||
bbox=(float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3])),
|
||||
html=html,
|
||||
@@ -467,14 +467,14 @@ class TableDetector:
|
||||
if not pdf_path.exists():
|
||||
raise FileNotFoundError(f"PDF not found: {pdf_path}")
|
||||
|
||||
logger.info(f"detect_from_pdf: {pdf_path}, page={page_number}, dpi={dpi}")
|
||||
logger.debug(f"detect_from_pdf: {pdf_path}, page={page_number}, dpi={dpi}")
|
||||
|
||||
# Render specific page
|
||||
for page_no, image_bytes in render_pdf_to_images(str(pdf_path), dpi=dpi):
|
||||
if page_no == page_number:
|
||||
image = Image.open(io.BytesIO(image_bytes))
|
||||
image_array = np.array(image)
|
||||
logger.info(f"detect_from_pdf: rendered page {page_no}, image shape={image_array.shape}")
|
||||
logger.debug(f"detect_from_pdf: rendered page {page_no}, image shape={image_array.shape}")
|
||||
return self.detect(image_array)
|
||||
|
||||
raise ValueError(f"Page {page_number} not found in PDF")
|
||||
|
||||
@@ -15,6 +15,11 @@ import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration constants
|
||||
DEFAULT_ROW_TOLERANCE = 15.0 # Max vertical distance (pixels) to consider same row
|
||||
MIN_ITEMS_FOR_VALID_EXTRACTION = 2 # Minimum items required for valid extraction
|
||||
MIN_TEXT_ELEMENTS_FOR_EXTRACTION = 5 # Minimum text elements needed to attempt extraction
|
||||
|
||||
|
||||
@dataclass
|
||||
class TextElement:
|
||||
@@ -65,7 +70,10 @@ class TextLineItemsResult:
|
||||
extraction_method: str = "text_spatial"
|
||||
|
||||
|
||||
# Swedish amount pattern: 1 234,56 or 1234.56 or 1,234.56
|
||||
# Amount pattern matches Swedish, US, and simple numeric formats
|
||||
# Handles: "1 234,56", "1,234.56", "1234.56", "100 kr", "50:-", "-100,00"
|
||||
# Does NOT handle: amounts with more than 2 decimal places, scientific notation
|
||||
# See tests in test_text_line_items_extractor.py::TestAmountPattern
|
||||
AMOUNT_PATTERN = re.compile(
|
||||
r"(?<![0-9])(?:"
|
||||
r"-?\d{1,3}(?:\s\d{3})*(?:,\d{2})?" # Swedish: 1 234,56
|
||||
@@ -128,17 +136,17 @@ class TextLineItemsExtractor:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
row_tolerance: float = 15.0, # Max vertical distance to consider same row
|
||||
min_items_for_valid: int = 2, # Minimum items to consider extraction valid
|
||||
row_tolerance: float = DEFAULT_ROW_TOLERANCE,
|
||||
min_items_for_valid: int = MIN_ITEMS_FOR_VALID_EXTRACTION,
|
||||
):
|
||||
"""
|
||||
Initialize extractor.
|
||||
|
||||
Args:
|
||||
row_tolerance: Maximum vertical distance (pixels) between elements
|
||||
to consider them on the same row.
|
||||
to consider them on the same row. Default: 15.0
|
||||
min_items_for_valid: Minimum number of line items required for
|
||||
extraction to be considered successful.
|
||||
extraction to be considered successful. Default: 2
|
||||
"""
|
||||
self.row_tolerance = row_tolerance
|
||||
self.min_items_for_valid = min_items_for_valid
|
||||
@@ -161,10 +169,13 @@ class TextLineItemsExtractor:
|
||||
|
||||
# Extract text elements from parsing results
|
||||
text_elements = self._extract_text_elements(parsing_res_list)
|
||||
logger.info(f"TextLineItemsExtractor: found {len(text_elements)} text elements")
|
||||
logger.debug(f"TextLineItemsExtractor: found {len(text_elements)} text elements")
|
||||
|
||||
if len(text_elements) < 5: # Need at least a few elements
|
||||
logger.debug("Too few text elements for line item extraction")
|
||||
if len(text_elements) < MIN_TEXT_ELEMENTS_FOR_EXTRACTION:
|
||||
logger.debug(
|
||||
f"Too few text elements ({len(text_elements)}) for line item extraction, "
|
||||
f"need at least {MIN_TEXT_ELEMENTS_FOR_EXTRACTION}"
|
||||
)
|
||||
return None
|
||||
|
||||
return self.extract_from_text_elements(text_elements)
|
||||
@@ -183,11 +194,11 @@ class TextLineItemsExtractor:
|
||||
"""
|
||||
# Group elements by row
|
||||
rows = self._group_by_row(text_elements)
|
||||
logger.info(f"TextLineItemsExtractor: grouped into {len(rows)} rows")
|
||||
logger.debug(f"TextLineItemsExtractor: grouped into {len(rows)} rows")
|
||||
|
||||
# Find the line items section
|
||||
item_rows = self._identify_line_item_rows(rows)
|
||||
logger.info(f"TextLineItemsExtractor: identified {len(item_rows)} potential item rows")
|
||||
logger.debug(f"TextLineItemsExtractor: identified {len(item_rows)} potential item rows")
|
||||
|
||||
if len(item_rows) < self.min_items_for_valid:
|
||||
logger.debug(f"Found only {len(item_rows)} item rows, need at least {self.min_items_for_valid}")
|
||||
@@ -195,7 +206,7 @@ class TextLineItemsExtractor:
|
||||
|
||||
# Extract structured items
|
||||
items = self._parse_line_items(item_rows)
|
||||
logger.info(f"TextLineItemsExtractor: extracted {len(items)} line items")
|
||||
logger.debug(f"TextLineItemsExtractor: extracted {len(items)} line items")
|
||||
|
||||
if len(items) < self.min_items_for_valid:
|
||||
return None
|
||||
@@ -209,7 +220,11 @@ class TextLineItemsExtractor:
|
||||
def _extract_text_elements(
|
||||
self, parsing_res_list: list[dict[str, Any]]
|
||||
) -> list[TextElement]:
|
||||
"""Extract TextElement objects from parsing_res_list."""
|
||||
"""Extract TextElement objects from parsing_res_list.
|
||||
|
||||
Handles both dict and LayoutBlock object formats from PP-StructureV3.
|
||||
Gracefully skips invalid elements with appropriate logging.
|
||||
"""
|
||||
elements = []
|
||||
|
||||
for elem in parsing_res_list:
|
||||
@@ -220,11 +235,15 @@ class TextLineItemsExtractor:
|
||||
bbox = elem.get("bbox", [])
|
||||
# Try both 'text' and 'content' keys
|
||||
text = elem.get("text", "") or elem.get("content", "")
|
||||
else:
|
||||
elif hasattr(elem, "label"):
|
||||
label = getattr(elem, "label", "")
|
||||
bbox = getattr(elem, "bbox", [])
|
||||
# LayoutBlock objects use 'content' attribute
|
||||
text = getattr(elem, "content", "") or getattr(elem, "text", "")
|
||||
else:
|
||||
# Element is neither dict nor has expected attributes
|
||||
logger.debug(f"Skipping element with unexpected type: {type(elem).__name__}")
|
||||
continue
|
||||
|
||||
# Only process text elements (skip images, tables, etc.)
|
||||
if label not in ("text", "paragraph_title", "aside_text"):
|
||||
@@ -232,6 +251,7 @@ class TextLineItemsExtractor:
|
||||
|
||||
# Validate bbox
|
||||
if not self._valid_bbox(bbox):
|
||||
logger.debug(f"Skipping element with invalid bbox: {bbox}")
|
||||
continue
|
||||
|
||||
# Clean text
|
||||
@@ -250,8 +270,13 @@ class TextLineItemsExtractor:
|
||||
),
|
||||
)
|
||||
)
|
||||
except (KeyError, TypeError, ValueError, AttributeError) as e:
|
||||
# Expected format issues - log at debug level
|
||||
logger.debug(f"Skipping element due to format issue: {e}")
|
||||
continue
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to parse element: {e}")
|
||||
# Unexpected errors - log at warning level for visibility
|
||||
logger.warning(f"Unexpected error parsing element: {type(e).__name__}: {e}")
|
||||
continue
|
||||
|
||||
return elements
|
||||
@@ -270,6 +295,7 @@ class TextLineItemsExtractor:
|
||||
Group text elements into rows based on vertical position.
|
||||
|
||||
Elements within row_tolerance of each other are considered same row.
|
||||
Uses dynamic average center_y to handle varying element heights more accurately.
|
||||
"""
|
||||
if not elements:
|
||||
return []
|
||||
@@ -277,22 +303,22 @@ class TextLineItemsExtractor:
|
||||
# Sort by vertical position
|
||||
sorted_elements = sorted(elements, key=lambda e: e.center_y)
|
||||
|
||||
rows = []
|
||||
current_row = [sorted_elements[0]]
|
||||
current_y = sorted_elements[0].center_y
|
||||
rows: list[list[TextElement]] = []
|
||||
current_row: list[TextElement] = [sorted_elements[0]]
|
||||
|
||||
for elem in sorted_elements[1:]:
|
||||
if abs(elem.center_y - current_y) <= self.row_tolerance:
|
||||
# Same row
|
||||
# Calculate dynamic average center_y for current row
|
||||
avg_center_y = sum(e.center_y for e in current_row) / len(current_row)
|
||||
|
||||
if abs(elem.center_y - avg_center_y) <= self.row_tolerance:
|
||||
# Same row - add element and recalculate average on next iteration
|
||||
current_row.append(elem)
|
||||
else:
|
||||
# New row
|
||||
if current_row:
|
||||
# Sort row by horizontal position
|
||||
current_row.sort(key=lambda e: e.center_x)
|
||||
rows.append(current_row)
|
||||
# New row - finalize current row
|
||||
# Sort row by horizontal position (left to right)
|
||||
current_row.sort(key=lambda e: e.center_x)
|
||||
rows.append(current_row)
|
||||
current_row = [elem]
|
||||
current_y = elem.center_y
|
||||
|
||||
# Don't forget last row
|
||||
if current_row:
|
||||
|
||||
Reference in New Issue
Block a user