- Extract models.py (LineItem, LineItemsResult dataclasses) - Extract html_table_parser.py (ColumnMapper, HtmlTableParser) - Extract merged_cell_handler.py (MergedCellHandler for PP-StructureV3 merged cells) - Reduce line_items_extractor.py from 971 to 396 lines - Add constants for magic numbers (MIN_AMOUNT_THRESHOLD, ROW_GROUPING_THRESHOLD, etc.) - Fix row grouping algorithm in text_line_items_extractor.py - Demote INFO logs to DEBUG level in structure_detector.py - Add 209 tests achieving 85%+ coverage on main modules Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
205 lines
5.6 KiB
Python
205 lines
5.6 KiB
Python
"""
HTML Table Parser

Parses HTML tables into structured data and maps columns to field names.
"""
|
|
|
|
from html.parser import HTMLParser
|
|
import logging
|
|
|
|
logger = logging.getLogger(__name__)

# Configuration constants

# Minimum pattern length for a partial (substring) match; shorter patterns
# must match the header exactly, which avoids false positives such as "st".
MIN_PATTERN_MATCH_LENGTH = 3

# Exact match bonus for column mapping priority
EXACT_MATCH_BONUS = 100
|
|
|
|
# Swedish column name mappings
# Extended to support multiple invoice types: product invoices, rental
# invoices, utility bills.
#
# Keys are the canonical field names; values are the lower-case header
# patterns recognised for that field. Key order and pattern order are
# significant: when two partial matches have the same length, the one that
# comes first here wins.
COLUMN_MAPPINGS = {
    "article_number": [
        "art nummer",
        "artikelnummer",
        "artikel",
        "artnr",
        "art.nr",
        "art nr",
        "objektnummer",  # Rental: property reference
        "objekt",
    ],
    "description": [
        "beskrivning",
        "produktbeskrivning",
        "produkt",
        "tjänst",
        "text",
        "benämning",
        "vara/tjänst",
        "vara",
        # Rental invoice specific
        "specifikation",
        "spec",
        "hyresperiod",  # Rental period
        "period",
        "typ",  # Type of charge
        # Utility bills
        "förbrukning",  # Consumption
        "avläsning",  # Meter reading
    ],
    "quantity": ["antal", "qty", "st", "pcs", "kvantitet", "m²", "kvm"],
    "unit": ["enhet", "unit"],
    "unit_price": ["á-pris", "a-pris", "pris", "styckpris", "enhetspris", "à pris"],
    "amount": [
        "belopp",
        "summa",
        "total",
        "netto",
        "rad summa",
        # Rental specific
        "hyra",  # Rent
        "avgift",  # Fee
        "kostnad",  # Cost
        "debitering",  # Charge
        "totalt",  # Total
    ],
    "vat_rate": ["moms", "moms%", "vat", "skatt", "moms %"],
    # Additional field for rental: deductions/adjustments
    "deduction": [
        "avdrag",  # Deduction
        "rabatt",  # Discount
        "kredit",  # Credit
    ],
}
|
|
|
|
# Keywords that indicate NOT a line items table
# (shipping, invoice fees, payment details and similar summary rows).
SUMMARY_KEYWORDS = [
    "frakt",
    "faktura.avg",
    "fakturavg",
    "exkl.moms",
    "att betala",
    "öresavr",
    "bankgiro",
    "plusgiro",
    "ocr",
    "forfallodatum",
    "förfallodatum",
]
|
|
|
|
|
|
class _TableHTMLParser(HTMLParser):
|
|
"""Internal HTML parser for tables."""
|
|
|
|
def __init__(self):
|
|
super().__init__()
|
|
self.rows: list[list[str]] = []
|
|
self.current_row: list[str] = []
|
|
self.current_cell: str = ""
|
|
self.in_td = False
|
|
self.in_thead = False
|
|
self.header_row: list[str] = []
|
|
|
|
def handle_starttag(self, tag, attrs):
|
|
if tag == "tr":
|
|
self.current_row = []
|
|
elif tag in ("td", "th"):
|
|
self.in_td = True
|
|
self.current_cell = ""
|
|
elif tag == "thead":
|
|
self.in_thead = True
|
|
|
|
def handle_endtag(self, tag):
|
|
if tag in ("td", "th"):
|
|
self.in_td = False
|
|
self.current_row.append(self.current_cell.strip())
|
|
elif tag == "tr":
|
|
if self.current_row:
|
|
if self.in_thead:
|
|
self.header_row = self.current_row
|
|
else:
|
|
self.rows.append(self.current_row)
|
|
elif tag == "thead":
|
|
self.in_thead = False
|
|
|
|
def handle_data(self, data):
|
|
if self.in_td:
|
|
self.current_cell += data
|
|
|
|
|
|
class HTMLTableParser:
    """Parse HTML tables into structured data."""

    def parse(self, html: str) -> tuple[list[str], list[list[str]]]:
        """
        Parse HTML table and return header and rows.

        Args:
            html: HTML string containing table. Empty input yields an
                empty result.

        Returns:
            Tuple of (header_row, data_rows). ``header_row`` holds the cells
            of the header row found inside ``<thead>`` (empty list if there
            is none); ``data_rows`` holds the remaining rows, each a list of
            stripped cell texts.
        """
        if not html:
            return [], []

        parser = _TableHTMLParser()
        parser.feed(html)
        # feed() may hold back trailing input while waiting for more data;
        # close() forces it to be processed as end-of-file.
        parser.close()
        return parser.header_row, parser.rows
|
|
|
|
|
|
class ColumnMapper:
|
|
"""Map column headers to field names."""
|
|
|
|
def __init__(self, mappings: dict[str, list[str]] | None = None):
|
|
"""
|
|
Initialize column mapper.
|
|
|
|
Args:
|
|
mappings: Custom column mappings. Uses Swedish defaults if None.
|
|
"""
|
|
self.mappings = mappings or COLUMN_MAPPINGS
|
|
|
|
def map(self, headers: list[str]) -> dict[int, str]:
|
|
"""
|
|
Map column indices to field names.
|
|
|
|
Args:
|
|
headers: List of column header strings.
|
|
|
|
Returns:
|
|
Dictionary mapping column index to field name.
|
|
"""
|
|
mapping = {}
|
|
for idx, header in enumerate(headers):
|
|
normalized = self._normalize(header)
|
|
|
|
if not normalized.strip():
|
|
continue
|
|
|
|
best_match = None
|
|
best_match_len = 0
|
|
|
|
for field_name, patterns in self.mappings.items():
|
|
for pattern in patterns:
|
|
if pattern == normalized:
|
|
# Exact match gets highest priority
|
|
best_match = field_name
|
|
best_match_len = len(pattern) + EXACT_MATCH_BONUS
|
|
break
|
|
elif pattern in normalized and len(pattern) > best_match_len:
|
|
# Partial match requires minimum length to avoid false positives
|
|
if len(pattern) >= MIN_PATTERN_MATCH_LENGTH:
|
|
best_match = field_name
|
|
best_match_len = len(pattern)
|
|
|
|
if best_match_len > EXACT_MATCH_BONUS:
|
|
# Found exact match, no need to check other fields
|
|
break
|
|
|
|
if best_match:
|
|
mapping[idx] = best_match
|
|
|
|
return mapping
|
|
|
|
def _normalize(self, header: str) -> str:
|
|
"""Normalize header text for matching."""
|
|
return header.lower().strip().replace(".", "").replace("-", " ")
|