refactor: split line_items_extractor into smaller modules with comprehensive tests

- Extract models.py (LineItem, LineItemsResult dataclasses)
- Extract html_table_parser.py (ColumnMapper, HtmlTableParser)
- Extract merged_cell_handler.py (MergedCellHandler for PP-StructureV3 merged cells)
- Reduce line_items_extractor.py from 971 to 396 lines
- Add constants for magic numbers (MIN_AMOUNT_THRESHOLD, ROW_GROUPING_THRESHOLD, etc.)
- Fix row grouping algorithm in text_line_items_extractor.py
- Demote INFO logs to DEBUG level in structure_detector.py
- Add 209 tests achieving 85%+ coverage on main modules

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Yaojia Wang
2026-02-03 23:02:00 +01:00
parent c2c8f2dd04
commit 8723ef4653
11 changed files with 2230 additions and 841 deletions

View File

@@ -0,0 +1,448 @@
"""
Tests for Merged Cell Handler
Tests the detection and extraction of data from tables with merged cells,
a common issue with PP-StructureV3 OCR output.
"""
import pytest
from backend.table.merged_cell_handler import MergedCellHandler, MIN_AMOUNT_THRESHOLD
from backend.table.html_table_parser import ColumnMapper
@pytest.fixture
def handler():
    """Provide a MergedCellHandler built around a default ColumnMapper."""
    column_mapper = ColumnMapper()
    return MergedCellHandler(column_mapper)
class TestHasVerticallyMergedCells:
    """Checks for the has_vertically_merged_cells detector."""

    def test_empty_rows_returns_false(self, handler):
        """An empty row list is reported as not merged."""
        detected = handler.has_vertically_merged_cells([])
        assert detected is False

    def test_short_cells_ignored(self, handler):
        """Cells under the 20-character length cut-off do not trigger detection."""
        table = [["Short cell", "Also short"]]
        detected = handler.has_vertically_merged_cells(table)
        assert detected is False

    def test_detects_multiple_product_numbers(self, handler):
        """Several 7-digit product numbers in one cell count as merged."""
        table = [["Produktnr 1457280 1457281 1060381 and more text here"]]
        detected = handler.has_vertically_merged_cells(table)
        assert detected is True

    def test_single_product_number_not_merged(self, handler):
        """A lone product number is not enough to flag the cell."""
        table = [["Produktnr 1457280 and more text here for length"]]
        detected = handler.has_vertically_merged_cells(table)
        assert detected is False

    def test_detects_multiple_prices(self, handler):
        """Three or more Swedish-format prices in one cell count as merged."""
        table = [["Pris 127,20 234,56 159,20 total amounts"]]
        detected = handler.has_vertically_merged_cells(table)
        assert detected is True

    def test_two_prices_not_merged(self, handler):
        """Two prices stay below the 3+ requirement and are not flagged."""
        table = [["Pris 127,20 234,56 total amount here"]]
        detected = handler.has_vertically_merged_cells(table)
        assert detected is False

    def test_detects_multiple_quantities(self, handler):
        """Several quantity tokens in one cell count as merged."""
        table = [["Antal 6ST 6ST 1ST more text here"]]
        detected = handler.has_vertically_merged_cells(table)
        assert detected is True

    def test_single_quantity_not_merged(self, handler):
        """A lone quantity token is not enough to flag the cell."""
        table = [["Antal 6ST and more text here for length"]]
        detected = handler.has_vertically_merged_cells(table)
        assert detected is False

    def test_empty_cell_skipped(self, handler):
        """Empty-string and None cells are passed over without error."""
        table = [["", None, "Valid but short"]]
        detected = handler.has_vertically_merged_cells(table)
        assert detected is False

    def test_multiple_rows_checked(self, handler):
        """Merged content in a later row is still found."""
        ordinary_row = ["Normal row with nothing special"]
        merged_row = ["Produktnr 1457280 1457281 1060381 merged content"]
        detected = handler.has_vertically_merged_cells([ordinary_row, merged_row])
        assert detected is True
class TestSplitMergedRows:
    """Checks for split_merged_rows, which returns a (header, data) pair."""

    def test_empty_rows_returns_empty(self, handler):
        """No input rows yields an empty header and empty data."""
        header_cells, data_rows = handler.split_merged_rows([])
        assert header_cells == []
        assert data_rows == []

    def test_all_empty_rows_returns_original(self, handler):
        """Rows made only of empty cells come back unchanged with no header."""
        blank_rows = [["", ""], ["", ""]]
        header_cells, data_rows = handler.split_merged_rows(blank_rows)
        assert header_cells == []
        assert data_rows == blank_rows

    def test_splits_by_product_numbers(self, handler):
        """Two product numbers in a merged row produce two data rows."""
        merged = [
            ["Produktnr 1234567 1234568", "Antal 2ST 3ST", "Pris 100,00 200,00"],
        ]
        header_cells, data_rows = handler.split_merged_rows(merged)
        assert len(header_cells) == 3
        assert header_cells[0] == "Produktnr"
        assert len(data_rows) == 2

    def test_splits_by_quantities(self, handler):
        """Quantity patterns alone are enough to drive a split."""
        merged = [
            ["Description text", "Antal 5ST 10ST", "Belopp 500,00 1000,00"],
        ]
        _, data_rows = handler.split_merged_rows(merged)
        # Should detect 2 quantities and split accordingly
        assert len(data_rows) >= 1

    def test_single_row_not_split(self, handler):
        """A row describing a single item is returned as-is."""
        single_item = [
            ["Produktnr 1234567", "Antal 2ST", "Pris 100,00"],
        ]
        header_cells, data_rows = handler.split_merged_rows(single_item)
        # Only 1 product number, so expected_rows <= 1
        assert header_cells == []
        assert data_rows == single_item

    def test_handles_missing_columns(self, handler):
        """Rows with differing column counts still yield two lists."""
        ragged = [
            ["Produktnr 1234567 1234568", ""],
            ["Antal 2ST 3ST"],
        ]
        header_cells, data_rows = handler.split_merged_rows(ragged)
        # Should handle gracefully
        assert isinstance(header_cells, list)
        assert isinstance(data_rows, list)
class TestCountExpectedRows:
    """Checks for the _count_expected_rows helper."""

    def test_counts_product_numbers(self, handler):
        """Three product numbers in a column give a count of three."""
        cells = ["Produktnr 1234567 1234568 1234569", "Other"]
        assert handler._count_expected_rows(cells) == 3

    def test_counts_quantities(self, handler):
        """Four quantity tokens in a column give a count of four."""
        cells = ["Nothing here", "Antal 5ST 10ST 15ST 20ST"]
        assert handler._count_expected_rows(cells) == 4

    def test_returns_max_count(self, handler):
        """The largest per-column count wins."""
        two_products = "Produktnr 1234567 1234568"
        three_quantities = "Antal 5ST 10ST 15ST"
        expected = handler._count_expected_rows([two_products, three_quantities])
        assert expected == 3

    def test_empty_columns_return_zero(self, handler):
        """Empty, None and pattern-free columns give a count of zero."""
        cells = ["", None, "Short"]
        assert handler._count_expected_rows(cells) == 0
class TestSplitCellContentForRows:
    """Checks for the _split_cell_content_for_rows helper."""

    def test_splits_by_product_numbers(self, handler):
        """Product numbers split into a header part plus one part per number."""
        parts = handler._split_cell_content_for_rows("Produktnr 1234567 1234568", 2)
        assert len(parts) == 3  # header + 2 values
        assert parts[0] == "Produktnr"
        assert "1234567" in parts[1]
        assert "1234568" in parts[2]

    def test_splits_by_quantities(self, handler):
        """Quantity tokens split into a header part plus one part per token."""
        parts = handler._split_cell_content_for_rows("Antal 5ST 10ST", 2)
        assert len(parts) == 3  # header + 2 values
        assert parts[0] == "Antal"

    def test_splits_discount_totalsumma(self, handler):
        """A combined discount/Totalsumma cell is headed by 'Totalsumma'."""
        parts = handler._split_cell_content_for_rows(
            "Rabatt i% Totalsumma 686,88 123,45", 2
        )
        assert parts[0] == "Totalsumma"
        assert "686,88" in parts[1]
        assert "123,45" in parts[2]

    def test_splits_by_prices(self, handler):
        """Price patterns produce at least two parts."""
        parts = handler._split_cell_content_for_rows("Pris 127,20 234,56", 2)
        assert len(parts) >= 2

    def test_fallback_returns_original(self, handler):
        """A cell with no recognised pattern comes back as a one-element list."""
        parts = handler._split_cell_content_for_rows("No patterns here", 2)
        assert parts == ["No patterns here"]

    def test_product_number_with_description(self, handler):
        """Product numbers followed by description text still give three parts."""
        parts = handler._split_cell_content_for_rows(
            "Art 1234567 Widget A 1234568 Widget B", 2
        )
        assert len(parts) == 3
class TestSplitCellContent:
    """Checks for the split_cell_content method."""

    def test_splits_by_product_numbers(self, handler):
        """Each product number appears as its own element after the header."""
        parts = handler.split_cell_content("Produktnr 1234567 1234568 1234569")
        assert parts[0] == "Produktnr"
        for product_number in ("1234567", "1234568", "1234569"):
            assert product_number in parts

    def test_splits_by_quantities(self, handler):
        """Repeated quantity tokens are split out after the header."""
        parts = handler.split_cell_content("Antal 6ST 6ST 1ST")
        assert parts[0] == "Antal"
        assert len(parts) >= 3

    def test_splits_discount_amount_interleaved(self, handler):
        """Interleaved discount/amount values keep the amounts under 'Totalsumma'."""
        parts = handler.split_cell_content(
            "Rabatt i% Totalsumma 10,0 686,88 10,0 123,45"
        )
        # Should extract amounts (3+ digit numbers with decimals)
        assert parts[0] == "Totalsumma"
        for amount in ("686,88", "123,45"):
            assert amount in parts

    def test_splits_by_prices(self, handler):
        """A price cell is headed by 'Pris' after splitting."""
        parts = handler.split_cell_content("Pris 127,20 127,20 159,20")
        assert parts[0] == "Pris"

    def test_single_value_not_split(self, handler):
        """Plain text without patterns is returned as a one-element list."""
        text = "Single value"
        assert handler.split_cell_content(text) == ["Single value"]

    def test_single_product_not_split(self, handler):
        """A cell holding one product number is left whole."""
        text = "Produktnr 1234567"
        assert handler.split_cell_content(text) == ["Produktnr 1234567"]
class TestHasMergedHeader:
    """Checks for the has_merged_header method."""

    def test_none_header_returns_false(self, handler):
        """A None header is reported as not merged."""
        verdict = handler.has_merged_header(None)
        assert verdict is False

    def test_empty_header_returns_false(self, handler):
        """An empty header list is reported as not merged."""
        verdict = handler.has_merged_header([])
        assert verdict is False

    def test_multiple_non_empty_cells_returns_false(self, handler):
        """A header with several populated cells is not merged."""
        columns = ["Beskrivning", "Antal", "Belopp"]
        verdict = handler.has_merged_header(columns)
        assert verdict is False

    def test_single_cell_with_keywords_returns_true(self, handler):
        """One cell carrying several header keywords is merged."""
        columns = ["Specifikation 0218103-1201 rum och kök Hyra Avdrag"]
        verdict = handler.has_merged_header(columns)
        assert verdict is True

    def test_single_cell_one_keyword_returns_false(self, handler):
        """One cell carrying a single keyword is not merged."""
        columns = ["Beskrivning only"]
        verdict = handler.has_merged_header(columns)
        assert verdict is False

    def test_ignores_empty_trailing_cells(self, handler):
        """Empty cells after the populated one do not change the verdict."""
        columns = ["Specifikation Hyra Avdrag", "", "", ""]
        verdict = handler.has_merged_header(columns)
        assert verdict is True
class TestExtractFromMergedCells:
    """Checks for the extract_from_merged_cells method."""

    def test_extracts_single_amount(self, handler):
        """One amount yields one item carrying header-derived metadata."""
        merged_header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        extracted = handler.extract_from_merged_cells(
            merged_header, [["", "", "", "8159"]]
        )
        assert len(extracted) == 1
        only_item = extracted[0]
        assert only_item.amount == "8159"
        assert only_item.is_deduction is False
        assert only_item.article_number == "0218103-1201"
        assert only_item.description == "2 rum och kök"

    def test_extracts_deduction(self, handler):
        """A negative amount is extracted and flagged as a deduction."""
        extracted = handler.extract_from_merged_cells(
            ["Specifikation"], [["", "", "", "-2000"]]
        )
        assert len(extracted) == 1
        only_item = extracted[0]
        assert only_item.amount == "-2000"
        assert only_item.is_deduction is True
        # First item (row_index=0) gets description from header, not "Avdrag"
        # "Avdrag" is only set for subsequent deduction items
        assert only_item.description is None

    def test_extracts_multiple_amounts_same_row(self, handler):
        """Two amounts sharing one cell become two items, in order."""
        merged_header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        extracted = handler.extract_from_merged_cells(
            merged_header, [["", "", "", "8159 -2000"]]
        )
        assert len(extracted) == 2
        assert extracted[0].amount == "8159"
        assert extracted[1].amount == "-2000"

    def test_extracts_amounts_from_multiple_rows(self, handler):
        """Amounts spread over two rows become two items."""
        body = [
            ["", "", "", "8159"],
            ["", "", "", "-2000"],
        ]
        extracted = handler.extract_from_merged_cells(["Specifikation"], body)
        assert len(extracted) == 2

    def test_skips_small_amounts(self, handler):
        """An amount under the minimum threshold produces no item."""
        body = [["", "", "", "50"]]  # Below MIN_AMOUNT_THRESHOLD (100)
        extracted = handler.extract_from_merged_cells(["Specifikation"], body)
        assert len(extracted) == 0

    def test_skips_empty_rows(self, handler):
        """A row of empty cells produces no item."""
        body = [["", "", "", ""]]
        extracted = handler.extract_from_merged_cells(["Specifikation"], body)
        assert len(extracted) == 0

    def test_handles_swedish_format_with_spaces(self, handler):
        """A space-separated thousands amount is normalised to plain digits."""
        body = [["", "", "", "8 159"]]
        extracted = handler.extract_from_merged_cells(["Specifikation"], body)
        assert len(extracted) == 1
        assert extracted[0].amount == "8159"

    def test_confidence_is_lower_for_merged(self, handler):
        """Items from merged-cell extraction carry a confidence of 0.7."""
        body = [["", "", "", "8159"]]
        extracted = handler.extract_from_merged_cells(["Specifikation"], body)
        assert extracted[0].confidence == 0.7

    def test_empty_header_still_extracts(self, handler):
        """With no header cells, the amount is extracted without metadata."""
        extracted = handler.extract_from_merged_cells([], [["", "", "", "8159"]])
        assert len(extracted) == 1
        only_item = extracted[0]
        assert only_item.description is None
        assert only_item.article_number is None

    def test_row_index_increments(self, handler):
        """Each extracted item gets the next row_index, starting at zero."""
        # Use separate rows to avoid regex grouping issues
        body = [
            ["", "", "", "8159"],
            ["", "", "", "5000"],
            ["", "", "", "-2000"],
        ]
        extracted = handler.extract_from_merged_cells(["Specifikation"], body)
        # Should have 3 items from 3 rows
        assert len(extracted) == 3
        for expected_index in range(3):
            assert extracted[expected_index].row_index == expected_index
class TestMinAmountThreshold:
    """Checks around the MIN_AMOUNT_THRESHOLD constant."""

    def test_threshold_value(self):
        """The constant is pinned at 100."""
        assert MIN_AMOUNT_THRESHOLD == 100

    def test_amounts_at_threshold_included(self, handler):
        """An amount equal to the threshold is kept."""
        body = [["", "", "", "100"]]  # Exactly at threshold
        extracted = handler.extract_from_merged_cells(["Specifikation"], body)
        assert len(extracted) == 1
        assert extracted[0].amount == "100"

    def test_amounts_below_threshold_excluded(self, handler):
        """An amount just under the threshold is dropped."""
        body = [["", "", "", "99"]]  # Below threshold
        extracted = handler.extract_from_merged_cells(["Specifikation"], body)
        assert len(extracted) == 0