- Extract models.py (LineItem, LineItemsResult dataclasses)
- Extract html_table_parser.py (ColumnMapper, HtmlTableParser)
- Extract merged_cell_handler.py (MergedCellHandler for PP-StructureV3 merged cells)
- Reduce line_items_extractor.py from 971 to 396 lines
- Add constants for magic numbers (MIN_AMOUNT_THRESHOLD, ROW_GROUPING_THRESHOLD, etc.)
- Fix row grouping algorithm in text_line_items_extractor.py
- Demote INFO logs to DEBUG level in structure_detector.py
- Add 209 tests achieving 85%+ coverage on main modules

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
449 lines
16 KiB
Python
449 lines
16 KiB
Python
"""
Tests for Merged Cell Handler

Tests the detection and extraction of data from tables with merged cells,
a common issue with PP-StructureV3 OCR output.
"""
|
|
|
|
import pytest
|
|
from backend.table.merged_cell_handler import MergedCellHandler, MIN_AMOUNT_THRESHOLD
|
|
from backend.table.html_table_parser import ColumnMapper
|
|
|
|
|
|
@pytest.fixture
def handler():
    """Provide a MergedCellHandler wired to a default-constructed ColumnMapper."""
    column_mapper = ColumnMapper()
    return MergedCellHandler(column_mapper)
|
|
|
|
|
|
class TestHasVerticallyMergedCells:
    """Checks for the has_vertically_merged_cells detector."""

    def test_empty_rows_returns_false(self, handler):
        """No rows at all must be reported as not merged."""
        detected = handler.has_vertically_merged_cells([])
        assert detected is False

    def test_short_cells_ignored(self, handler):
        """Cells shorter than 20 chars must not trigger detection."""
        table_rows = [["Short cell", "Also short"]]
        detected = handler.has_vertically_merged_cells(table_rows)
        assert detected is False

    def test_detects_multiple_product_numbers(self, handler):
        """Several 7-digit product numbers in one cell count as merged."""
        merged_cell = "Produktnr 1457280 1457281 1060381 and more text here"
        detected = handler.has_vertically_merged_cells([[merged_cell]])
        assert detected is True

    def test_single_product_number_not_merged(self, handler):
        """A lone product number must not trigger detection."""
        plain_cell = "Produktnr 1457280 and more text here for length"
        detected = handler.has_vertically_merged_cells([[plain_cell]])
        assert detected is False

    def test_detects_multiple_prices(self, handler):
        """Three or more Swedish-format prices in one cell count as merged."""
        merged_cell = "Pris 127,20 234,56 159,20 total amounts"
        detected = handler.has_vertically_merged_cells([[merged_cell]])
        assert detected is True

    def test_two_prices_not_merged(self, handler):
        """Only two prices are not enough to trigger detection (needs 3+)."""
        plain_cell = "Pris 127,20 234,56 total amount here"
        detected = handler.has_vertically_merged_cells([[plain_cell]])
        assert detected is False

    def test_detects_multiple_quantities(self, handler):
        """Repeated quantity patterns in one cell count as merged."""
        merged_cell = "Antal 6ST 6ST 1ST more text here"
        detected = handler.has_vertically_merged_cells([[merged_cell]])
        assert detected is True

    def test_single_quantity_not_merged(self, handler):
        """A lone quantity must not trigger detection."""
        plain_cell = "Antal 6ST and more text here for length"
        detected = handler.has_vertically_merged_cells([[plain_cell]])
        assert detected is False

    def test_empty_cell_skipped(self, handler):
        """Empty-string and None cells are skipped without error."""
        table_rows = [["", None, "Valid but short"]]
        detected = handler.has_vertically_merged_cells(table_rows)
        assert detected is False

    def test_multiple_rows_checked(self, handler):
        """Merged content in a later row is still found."""
        ordinary_row = ["Normal row with nothing special"]
        merged_row = ["Produktnr 1457280 1457281 1060381 merged content"]
        detected = handler.has_vertically_merged_cells([ordinary_row, merged_row])
        assert detected is True
|
|
|
|
|
|
class TestSplitMergedRows:
    """Checks for split_merged_rows, which returns a (header, data) pair."""

    def test_empty_rows_returns_empty(self, handler):
        """An empty input yields an empty header and empty data."""
        header_row, data_rows = handler.split_merged_rows([])
        assert header_row == []
        assert data_rows == []

    def test_all_empty_rows_returns_original(self, handler):
        """Rows of only empty cells come back unchanged with no header."""
        blank_rows = [["", ""], ["", ""]]
        header_row, data_rows = handler.split_merged_rows(blank_rows)
        assert header_row == []
        assert data_rows == blank_rows

    def test_splits_by_product_numbers(self, handler):
        """Two product numbers in a merged row produce two data rows."""
        merged_row = [
            "Produktnr 1234567 1234568",
            "Antal 2ST 3ST",
            "Pris 100,00 200,00",
        ]
        header_row, data_rows = handler.split_merged_rows([merged_row])

        assert len(header_row) == 3
        assert header_row[0] == "Produktnr"
        assert len(data_rows) == 2

    def test_splits_by_quantities(self, handler):
        """A row whose quantity column is merged yields at least one data row."""
        merged_row = [
            "Description text",
            "Antal 5ST 10ST",
            "Belopp 500,00 1000,00",
        ]
        _header_row, data_rows = handler.split_merged_rows([merged_row])

        # Should detect 2 quantities and split accordingly
        assert len(data_rows) >= 1

    def test_single_row_not_split(self, handler):
        """A row describing a single item is passed through untouched."""
        single_item_rows = [
            ["Produktnr 1234567", "Antal 2ST", "Pris 100,00"],
        ]
        header_row, data_rows = handler.split_merged_rows(single_item_rows)

        # Only 1 product number, so expected_rows <= 1
        assert header_row == []
        assert data_rows == single_item_rows

    def test_handles_missing_columns(self, handler):
        """Rows with differing column counts still return two lists."""
        ragged_rows = [
            ["Produktnr 1234567 1234568", ""],
            ["Antal 2ST 3ST"],
        ]
        header_row, data_rows = handler.split_merged_rows(ragged_rows)

        # Should handle gracefully
        assert isinstance(header_row, list)
        assert isinstance(data_rows, list)
|
|
|
|
|
|
class TestCountExpectedRows:
    """Checks for the private _count_expected_rows helper."""

    def test_counts_product_numbers(self, handler):
        """Three product numbers in a column give an expected count of 3."""
        column_cells = ["Produktnr 1234567 1234568 1234569", "Other"]
        expected_rows = handler._count_expected_rows(column_cells)
        assert expected_rows == 3

    def test_counts_quantities(self, handler):
        """Four quantity patterns in a column give an expected count of 4."""
        column_cells = ["Nothing here", "Antal 5ST 10ST 15ST 20ST"]
        expected_rows = handler._count_expected_rows(column_cells)
        assert expected_rows == 4

    def test_returns_max_count(self, handler):
        """The largest per-column count wins."""
        product_column = "Produktnr 1234567 1234568"  # 2 products
        quantity_column = "Antal 5ST 10ST 15ST"  # 3 quantities
        expected_rows = handler._count_expected_rows(
            [product_column, quantity_column]
        )
        assert expected_rows == 3

    def test_empty_columns_return_zero(self, handler):
        """Empty, None and pattern-free columns give a count of 0."""
        column_cells = ["", None, "Short"]
        expected_rows = handler._count_expected_rows(column_cells)
        assert expected_rows == 0
|
|
|
|
|
|
class TestSplitCellContentForRows:
    """Checks for the private _split_cell_content_for_rows helper."""

    def test_splits_by_product_numbers(self, handler):
        """Product numbers are split into a header plus one entry per number."""
        parts = handler._split_cell_content_for_rows(
            "Produktnr 1234567 1234568", 2
        )

        assert len(parts) == 3  # header + 2 values
        assert parts[0] == "Produktnr"
        assert "1234567" in parts[1]
        assert "1234568" in parts[2]

    def test_splits_by_quantities(self, handler):
        """Quantities are split into a header plus one entry per quantity."""
        parts = handler._split_cell_content_for_rows("Antal 5ST 10ST", 2)

        assert len(parts) == 3  # header + 2 values
        assert parts[0] == "Antal"

    def test_splits_discount_totalsumma(self, handler):
        """A combined discount/Totalsumma cell is headed by "Totalsumma"."""
        parts = handler._split_cell_content_for_rows(
            "Rabatt i% Totalsumma 686,88 123,45", 2
        )

        assert parts[0] == "Totalsumma"
        assert "686,88" in parts[1]
        assert "123,45" in parts[2]

    def test_splits_by_prices(self, handler):
        """A cell with two prices yields at least two entries."""
        parts = handler._split_cell_content_for_rows("Pris 127,20 234,56", 2)

        assert len(parts) >= 2

    def test_fallback_returns_original(self, handler):
        """A cell without any known pattern comes back as a one-item list."""
        parts = handler._split_cell_content_for_rows("No patterns here", 2)

        assert parts == ["No patterns here"]

    def test_product_number_with_description(self, handler):
        """Product numbers followed by description text still give 3 entries."""
        parts = handler._split_cell_content_for_rows(
            "Art 1234567 Widget A 1234568 Widget B", 2
        )

        assert len(parts) == 3
|
|
|
|
|
|
class TestSplitCellContent:
    """Checks for the public split_cell_content method."""

    def test_splits_by_product_numbers(self, handler):
        """Each product number becomes its own entry after the header."""
        parts = handler.split_cell_content("Produktnr 1234567 1234568 1234569")

        assert parts[0] == "Produktnr"
        assert "1234567" in parts
        assert "1234568" in parts
        assert "1234569" in parts

    def test_splits_by_quantities(self, handler):
        """Multiple quantities are split out behind the "Antal" header."""
        parts = handler.split_cell_content("Antal 6ST 6ST 1ST")

        assert parts[0] == "Antal"
        assert len(parts) >= 3

    def test_splits_discount_amount_interleaved(self, handler):
        """Interleaved discount/amount values keep only the amounts."""
        parts = handler.split_cell_content(
            "Rabatt i% Totalsumma 10,0 686,88 10,0 123,45"
        )

        # Should extract amounts (3+ digit numbers with decimals)
        assert parts[0] == "Totalsumma"
        assert "686,88" in parts
        assert "123,45" in parts

    def test_splits_by_prices(self, handler):
        """A multi-price cell is headed by "Pris"."""
        parts = handler.split_cell_content("Pris 127,20 127,20 159,20")

        assert parts[0] == "Pris"

    def test_single_value_not_split(self, handler):
        """Plain text without patterns is returned as a one-item list."""
        parts = handler.split_cell_content("Single value")

        assert parts == ["Single value"]

    def test_single_product_not_split(self, handler):
        """A cell with just one product number is left whole."""
        parts = handler.split_cell_content("Produktnr 1234567")

        assert parts == ["Produktnr 1234567"]
|
|
|
|
|
|
class TestHasMergedHeader:
    """Checks for the has_merged_header predicate."""

    def test_none_header_returns_false(self, handler):
        """A None header is not a merged header."""
        outcome = handler.has_merged_header(None)
        assert outcome is False

    def test_empty_header_returns_false(self, handler):
        """An empty header list is not a merged header."""
        outcome = handler.has_merged_header([])
        assert outcome is False

    def test_multiple_non_empty_cells_returns_false(self, handler):
        """A header with several populated cells is not merged."""
        header_cells = ["Beskrivning", "Antal", "Belopp"]
        outcome = handler.has_merged_header(header_cells)
        assert outcome is False

    def test_single_cell_with_keywords_returns_true(self, handler):
        """One cell carrying several header keywords is a merged header."""
        header_cells = ["Specifikation 0218103-1201 rum och kök Hyra Avdrag"]
        outcome = handler.has_merged_header(header_cells)
        assert outcome is True

    def test_single_cell_one_keyword_returns_false(self, handler):
        """One cell with only a single keyword is not a merged header."""
        header_cells = ["Beskrivning only"]
        outcome = handler.has_merged_header(header_cells)
        assert outcome is False

    def test_ignores_empty_trailing_cells(self, handler):
        """Empty cells after the merged cell do not change the verdict."""
        header_cells = ["Specifikation Hyra Avdrag", "", "", ""]
        outcome = handler.has_merged_header(header_cells)
        assert outcome is True
|
|
|
|
|
|
class TestExtractFromMergedCells:
    """Checks for extract_from_merged_cells(header, rows)."""

    def test_extracts_single_amount(self, handler):
        """One amount yields one item with fields taken from the header."""
        merged_header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        amount_rows = [["", "", "", "8159"]]

        items = handler.extract_from_merged_cells(merged_header, amount_rows)

        assert len(items) == 1
        only_item = items[0]
        assert only_item.amount == "8159"
        assert only_item.is_deduction is False
        assert only_item.article_number == "0218103-1201"
        assert only_item.description == "2 rum och kök"

    def test_extracts_deduction(self, handler):
        """A negative amount is flagged as a deduction."""
        merged_header = ["Specifikation"]
        amount_rows = [["", "", "", "-2000"]]

        items = handler.extract_from_merged_cells(merged_header, amount_rows)

        assert len(items) == 1
        only_item = items[0]
        assert only_item.amount == "-2000"
        assert only_item.is_deduction is True
        # First item (row_index=0) gets description from header, not "Avdrag"
        # "Avdrag" is only set for subsequent deduction items
        assert only_item.description is None

    def test_extracts_multiple_amounts_same_row(self, handler):
        """Two amounts in one cell produce two items in order."""
        merged_header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        amount_rows = [["", "", "", "8159 -2000"]]

        items = handler.extract_from_merged_cells(merged_header, amount_rows)

        assert len(items) == 2
        assert items[0].amount == "8159"
        assert items[1].amount == "-2000"

    def test_extracts_amounts_from_multiple_rows(self, handler):
        """Amounts spread over two rows produce two items."""
        merged_header = ["Specifikation"]
        amount_rows = [
            ["", "", "", "8159"],
            ["", "", "", "-2000"],
        ]

        items = handler.extract_from_merged_cells(merged_header, amount_rows)

        assert len(items) == 2

    def test_skips_small_amounts(self, handler):
        """An amount under the minimum threshold is dropped."""
        merged_header = ["Specifikation"]
        amount_rows = [["", "", "", "50"]]  # Below MIN_AMOUNT_THRESHOLD (100)

        items = handler.extract_from_merged_cells(merged_header, amount_rows)

        assert len(items) == 0

    def test_skips_empty_rows(self, handler):
        """A row of only empty cells produces no items."""
        merged_header = ["Specifikation"]
        amount_rows = [["", "", "", ""]]

        items = handler.extract_from_merged_cells(merged_header, amount_rows)

        assert len(items) == 0

    def test_handles_swedish_format_with_spaces(self, handler):
        """A space-grouped number ("8 159") is read as a single amount."""
        merged_header = ["Specifikation"]
        amount_rows = [["", "", "", "8 159"]]

        items = handler.extract_from_merged_cells(merged_header, amount_rows)

        assert len(items) == 1
        assert items[0].amount == "8159"

    def test_confidence_is_lower_for_merged(self, handler):
        """Items extracted from merged cells carry a confidence of 0.7."""
        merged_header = ["Specifikation"]
        amount_rows = [["", "", "", "8159"]]

        items = handler.extract_from_merged_cells(merged_header, amount_rows)

        assert items[0].confidence == 0.7

    def test_empty_header_still_extracts(self, handler):
        """An empty header still yields an item, with no header-derived fields."""
        merged_header = []
        amount_rows = [["", "", "", "8159"]]

        items = handler.extract_from_merged_cells(merged_header, amount_rows)

        assert len(items) == 1
        only_item = items[0]
        assert only_item.description is None
        assert only_item.article_number is None

    def test_row_index_increments(self, handler):
        """Each extracted item gets the next row_index, starting at 0."""
        merged_header = ["Specifikation"]
        # Use separate rows to avoid regex grouping issues
        amount_rows = [
            ["", "", "", "8159"],
            ["", "", "", "5000"],
            ["", "", "", "-2000"],
        ]

        items = handler.extract_from_merged_cells(merged_header, amount_rows)

        # Should have 3 items from 3 rows
        assert len(items) == 3
        for expected_index in (0, 1, 2):
            assert items[expected_index].row_index == expected_index
|
|
|
|
|
|
class TestMinAmountThreshold:
    """Checks for the MIN_AMOUNT_THRESHOLD constant and its boundary."""

    def test_threshold_value(self):
        """The constant is pinned at 100."""
        expected_threshold = 100
        assert MIN_AMOUNT_THRESHOLD == expected_threshold

    def test_amounts_at_threshold_included(self, handler):
        """An amount equal to the threshold is kept."""
        merged_header = ["Specifikation"]
        amount_rows = [["", "", "", "100"]]  # Exactly at threshold

        items = handler.extract_from_merged_cells(merged_header, amount_rows)

        assert len(items) == 1
        assert items[0].amount == "100"

    def test_amounts_below_threshold_excluded(self, handler):
        """An amount one below the threshold is dropped."""
        merged_header = ["Specifikation"]
        amount_rows = [["", "", "", "99"]]  # Below threshold

        items = handler.extract_from_merged_cells(merged_header, amount_rows)

        assert len(items) == 0
|