Files
invoice-master-poc-v2/packages/backend/backend/table/html_table_parser.py
Yaojia Wang 8723ef4653 refactor: split line_items_extractor into smaller modules with comprehensive tests
- Extract models.py (LineItem, LineItemsResult dataclasses)
- Extract html_table_parser.py (ColumnMapper, HtmlTableParser)
- Extract merged_cell_handler.py (MergedCellHandler for PP-StructureV3 merged cells)
- Reduce line_items_extractor.py from 971 to 396 lines
- Add constants for magic numbers (MIN_AMOUNT_THRESHOLD, ROW_GROUPING_THRESHOLD, etc.)
- Fix row grouping algorithm in text_line_items_extractor.py
- Demote INFO logs to DEBUG level in structure_detector.py
- Add 209 tests achieving 85%+ coverage on main modules

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 23:02:00 +01:00

205 lines
5.6 KiB
Python

"""
HTML Table Parser
Parses HTML tables into structured data and maps columns to field names.
"""
from html.parser import HTMLParser
import logging
logger = logging.getLogger(__name__)
# Configuration constants
# Minimum pattern length to avoid false positives from short substrings
MIN_PATTERN_MATCH_LENGTH = 3
# Exact match bonus for column mapping priority
EXACT_MATCH_BONUS = 100
# Swedish column name mappings
# Extended to support multiple invoice types: product invoices, rental invoices, utility bills
COLUMN_MAPPINGS = {
"article_number": [
"art nummer",
"artikelnummer",
"artikel",
"artnr",
"art.nr",
"art nr",
"objektnummer", # Rental: property reference
"objekt",
],
"description": [
"beskrivning",
"produktbeskrivning",
"produkt",
"tjänst",
"text",
"benämning",
"vara/tjänst",
"vara",
# Rental invoice specific
"specifikation",
"spec",
"hyresperiod", # Rental period
"period",
"typ", # Type of charge
# Utility bills
"förbrukning", # Consumption
"avläsning", # Meter reading
],
"quantity": ["antal", "qty", "st", "pcs", "kvantitet", "", "kvm"],
"unit": ["enhet", "unit"],
"unit_price": ["á-pris", "a-pris", "pris", "styckpris", "enhetspris", "à pris"],
"amount": [
"belopp",
"summa",
"total",
"netto",
"rad summa",
# Rental specific
"hyra", # Rent
"avgift", # Fee
"kostnad", # Cost
"debitering", # Charge
"totalt", # Total
],
"vat_rate": ["moms", "moms%", "vat", "skatt", "moms %"],
# Additional field for rental: deductions/adjustments
"deduction": [
"avdrag", # Deduction
"rabatt", # Discount
"kredit", # Credit
],
}
# Keywords that indicate NOT a line items table
SUMMARY_KEYWORDS = [
"frakt",
"faktura.avg",
"fakturavg",
"exkl.moms",
"att betala",
"öresavr",
"bankgiro",
"plusgiro",
"ocr",
"forfallodatum",
"förfallodatum",
]
class _TableHTMLParser(HTMLParser):
"""Internal HTML parser for tables."""
def __init__(self):
super().__init__()
self.rows: list[list[str]] = []
self.current_row: list[str] = []
self.current_cell: str = ""
self.in_td = False
self.in_thead = False
self.header_row: list[str] = []
def handle_starttag(self, tag, attrs):
if tag == "tr":
self.current_row = []
elif tag in ("td", "th"):
self.in_td = True
self.current_cell = ""
elif tag == "thead":
self.in_thead = True
def handle_endtag(self, tag):
if tag in ("td", "th"):
self.in_td = False
self.current_row.append(self.current_cell.strip())
elif tag == "tr":
if self.current_row:
if self.in_thead:
self.header_row = self.current_row
else:
self.rows.append(self.current_row)
elif tag == "thead":
self.in_thead = False
def handle_data(self, data):
if self.in_td:
self.current_cell += data
class HTMLTableParser:
"""Parse HTML tables into structured data."""
def parse(self, html: str) -> tuple[list[str], list[list[str]]]:
"""
Parse HTML table and return header and rows.
Args:
html: HTML string containing table.
Returns:
Tuple of (header_row, data_rows).
"""
parser = _TableHTMLParser()
parser.feed(html)
return parser.header_row, parser.rows
class ColumnMapper:
"""Map column headers to field names."""
def __init__(self, mappings: dict[str, list[str]] | None = None):
"""
Initialize column mapper.
Args:
mappings: Custom column mappings. Uses Swedish defaults if None.
"""
self.mappings = mappings or COLUMN_MAPPINGS
def map(self, headers: list[str]) -> dict[int, str]:
"""
Map column indices to field names.
Args:
headers: List of column header strings.
Returns:
Dictionary mapping column index to field name.
"""
mapping = {}
for idx, header in enumerate(headers):
normalized = self._normalize(header)
if not normalized.strip():
continue
best_match = None
best_match_len = 0
for field_name, patterns in self.mappings.items():
for pattern in patterns:
if pattern == normalized:
# Exact match gets highest priority
best_match = field_name
best_match_len = len(pattern) + EXACT_MATCH_BONUS
break
elif pattern in normalized and len(pattern) > best_match_len:
# Partial match requires minimum length to avoid false positives
if len(pattern) >= MIN_PATTERN_MATCH_LENGTH:
best_match = field_name
best_match_len = len(pattern)
if best_match_len > EXACT_MATCH_BONUS:
# Found exact match, no need to check other fields
break
if best_match:
mapping[idx] = best_match
return mapping
def _normalize(self, header: str) -> str:
"""Normalize header text for matching."""
return header.lower().strip().replace(".", "").replace("-", " ")