"""
Tests for Merged Cell Handler

Tests the detection and extraction of data from tables with merged cells,
a common issue with PP-StructureV3 OCR output.
"""

import pytest

from backend.table.merged_cell_handler import MergedCellHandler, MIN_AMOUNT_THRESHOLD
from backend.table.html_table_parser import ColumnMapper


@pytest.fixture
def handler():
    """Create a MergedCellHandler with default ColumnMapper."""
    return MergedCellHandler(ColumnMapper())


class TestHasVerticallyMergedCells:
    """Tests for has_vertically_merged_cells detection."""

    def test_empty_rows_returns_false(self, handler):
        """Test empty rows returns False."""
        assert handler.has_vertically_merged_cells([]) is False

    def test_short_cells_ignored(self, handler):
        """Test cells shorter than 20 chars are ignored."""
        rows = [["Short cell", "Also short"]]
        assert handler.has_vertically_merged_cells(rows) is False

    def test_detects_multiple_product_numbers(self, handler):
        """Test detection of multiple 7-digit product numbers in cell."""
        rows = [["Produktnr 1457280 1457281 1060381 and more text here"]]
        assert handler.has_vertically_merged_cells(rows) is True

    def test_single_product_number_not_merged(self, handler):
        """Test single product number doesn't trigger detection."""
        rows = [["Produktnr 1457280 and more text here for length"]]
        assert handler.has_vertically_merged_cells(rows) is False

    def test_detects_multiple_prices(self, handler):
        """Test detection of 3+ prices in cell (Swedish format)."""
        rows = [["Pris 127,20 234,56 159,20 total amounts"]]
        assert handler.has_vertically_merged_cells(rows) is True

    def test_two_prices_not_merged(self, handler):
        """Test two prices doesn't trigger detection (needs 3+)."""
        rows = [["Pris 127,20 234,56 total amount here"]]
        assert handler.has_vertically_merged_cells(rows) is False

    def test_detects_multiple_quantities(self, handler):
        """Test detection of multiple quantity patterns."""
        rows = [["Antal 6ST 6ST 1ST more text here"]]
        assert handler.has_vertically_merged_cells(rows) is True

    def test_single_quantity_not_merged(self, handler):
        """Test single quantity doesn't trigger detection."""
        rows = [["Antal 6ST and more text here for length"]]
        assert handler.has_vertically_merged_cells(rows) is False

    def test_empty_cell_skipped(self, handler):
        """Test empty cells are skipped."""
        rows = [["", None, "Valid but short"]]
        assert handler.has_vertically_merged_cells(rows) is False

    def test_multiple_rows_checked(self, handler):
        """Test all rows are checked for merged content."""
        rows = [
            ["Normal row with nothing special"],
            ["Produktnr 1457280 1457281 1060381 merged content"],
        ]
        assert handler.has_vertically_merged_cells(rows) is True


class TestSplitMergedRows:
    """Tests for split_merged_rows method."""

    def test_empty_rows_returns_empty(self, handler):
        """Test empty rows returns empty result."""
        header, data = handler.split_merged_rows([])
        assert header == []
        assert data == []

    def test_all_empty_rows_returns_original(self, handler):
        """Test all empty rows returns original rows."""
        rows = [["", ""], ["", ""]]
        header, data = handler.split_merged_rows(rows)
        assert header == []
        assert data == rows

    def test_splits_by_product_numbers(self, handler):
        """Test splitting rows by product numbers."""
        rows = [
            ["Produktnr 1234567 1234568", "Antal 2ST 3ST", "Pris 100,00 200,00"],
        ]
        header, data = handler.split_merged_rows(rows)
        assert len(header) == 3
        assert header[0] == "Produktnr"
        assert len(data) == 2

    def test_splits_by_quantities(self, handler):
        """Test splitting rows by quantity patterns."""
        rows = [
            ["Description text", "Antal 5ST 10ST", "Belopp 500,00 1000,00"],
        ]
        # Only the data rows are checked here; the header is not asserted on.
        _header, data = handler.split_merged_rows(rows)
        # Should detect 2 quantities and split accordingly
        assert len(data) >= 1

    def test_single_row_not_split(self, handler):
        """Test single item row is not split."""
        rows = [
            ["Produktnr 1234567", "Antal 2ST", "Pris 100,00"],
        ]
        header, data = handler.split_merged_rows(rows)
        # Only 1 product number, so expected_rows <= 1
        assert header == []
        assert data == rows

    def test_handles_missing_columns(self, handler):
        """Test handles rows with different column counts."""
        rows = [
            ["Produktnr 1234567 1234568", ""],
            ["Antal 2ST 3ST"],
        ]
        header, data = handler.split_merged_rows(rows)
        # Should handle gracefully
        assert isinstance(header, list)
        assert isinstance(data, list)


class TestCountExpectedRows:
    """Tests for _count_expected_rows helper."""

    def test_counts_product_numbers(self, handler):
        """Test counting product numbers."""
        columns = ["Produktnr 1234567 1234568 1234569", "Other"]
        count = handler._count_expected_rows(columns)
        assert count == 3

    def test_counts_quantities(self, handler):
        """Test counting quantity patterns."""
        columns = ["Nothing here", "Antal 5ST 10ST 15ST 20ST"]
        count = handler._count_expected_rows(columns)
        assert count == 4

    def test_returns_max_count(self, handler):
        """Test returns maximum count across columns."""
        columns = [
            "Produktnr 1234567 1234568",  # 2 products
            "Antal 5ST 10ST 15ST",  # 3 quantities
        ]
        count = handler._count_expected_rows(columns)
        assert count == 3

    def test_empty_columns_return_zero(self, handler):
        """Test empty columns return 0."""
        columns = ["", None, "Short"]
        count = handler._count_expected_rows(columns)
        assert count == 0


class TestSplitCellContentForRows:
    """Tests for _split_cell_content_for_rows helper."""

    def test_splits_by_product_numbers(self, handler):
        """Test splitting by product numbers with expected count."""
        cell = "Produktnr 1234567 1234568"
        result = handler._split_cell_content_for_rows(cell, 2)
        assert len(result) == 3  # header + 2 values
        assert result[0] == "Produktnr"
        assert "1234567" in result[1]
        assert "1234568" in result[2]

    def test_splits_by_quantities(self, handler):
        """Test splitting by quantity patterns."""
        cell = "Antal 5ST 10ST"
        result = handler._split_cell_content_for_rows(cell, 2)
        assert len(result) == 3  # header + 2 values
        assert result[0] == "Antal"

    def test_splits_discount_totalsumma(self, handler):
        """Test splitting discount+totalsumma columns."""
        cell = "Rabatt i% Totalsumma 686,88 123,45"
        result = handler._split_cell_content_for_rows(cell, 2)
        assert result[0] == "Totalsumma"
        assert "686,88" in result[1]
        assert "123,45" in result[2]

    def test_splits_by_prices(self, handler):
        """Test splitting by price patterns."""
        cell = "Pris 127,20 234,56"
        result = handler._split_cell_content_for_rows(cell, 2)
        assert len(result) >= 2

    def test_fallback_returns_original(self, handler):
        """Test fallback returns original cell."""
        cell = "No patterns here"
        result = handler._split_cell_content_for_rows(cell, 2)
        assert result == ["No patterns here"]

    def test_product_number_with_description(self, handler):
        """Test product numbers include trailing description text."""
        cell = "Art 1234567 Widget A 1234568 Widget B"
        result = handler._split_cell_content_for_rows(cell, 2)
        assert len(result) == 3


class TestSplitCellContent:
    """Tests for split_cell_content method."""

    def test_splits_by_product_numbers(self, handler):
        """Test splitting by multiple product numbers."""
        cell = "Produktnr 1234567 1234568 1234569"
        result = handler.split_cell_content(cell)
        assert result[0] == "Produktnr"
        assert "1234567" in result
        assert "1234568" in result
        assert "1234569" in result

    def test_splits_by_quantities(self, handler):
        """Test splitting by multiple quantities."""
        cell = "Antal 6ST 6ST 1ST"
        result = handler.split_cell_content(cell)
        assert result[0] == "Antal"
        assert len(result) >= 3

    def test_splits_discount_amount_interleaved(self, handler):
        """Test splitting interleaved discount+amount patterns."""
        cell = "Rabatt i% Totalsumma 10,0 686,88 10,0 123,45"
        result = handler.split_cell_content(cell)
        # Should extract amounts (3+ digit numbers with decimals)
        assert result[0] == "Totalsumma"
        assert "686,88" in result
        assert "123,45" in result

    def test_splits_by_prices(self, handler):
        """Test splitting by prices."""
        cell = "Pris 127,20 127,20 159,20"
        result = handler.split_cell_content(cell)
        assert result[0] == "Pris"

    def test_single_value_not_split(self, handler):
        """Test single value is not split."""
        cell = "Single value"
        result = handler.split_cell_content(cell)
        assert result == ["Single value"]

    def test_single_product_not_split(self, handler):
        """Test single product number is not split."""
        cell = "Produktnr 1234567"
        result = handler.split_cell_content(cell)
        assert result == ["Produktnr 1234567"]


class TestHasMergedHeader:
    """Tests for has_merged_header method."""

    def test_none_header_returns_false(self, handler):
        """Test None header returns False."""
        assert handler.has_merged_header(None) is False

    def test_empty_header_returns_false(self, handler):
        """Test empty header returns False."""
        assert handler.has_merged_header([]) is False

    def test_multiple_non_empty_cells_returns_false(self, handler):
        """Test multiple non-empty cells returns False."""
        header = ["Beskrivning", "Antal", "Belopp"]
        assert handler.has_merged_header(header) is False

    def test_single_cell_with_keywords_returns_true(self, handler):
        """Test single cell with multiple keywords returns True."""
        header = ["Specifikation 0218103-1201 rum och kök Hyra Avdrag"]
        assert handler.has_merged_header(header) is True

    def test_single_cell_one_keyword_returns_false(self, handler):
        """Test single cell with only one keyword returns False."""
        header = ["Beskrivning only"]
        assert handler.has_merged_header(header) is False

    def test_ignores_empty_trailing_cells(self, handler):
        """Test ignores empty trailing cells."""
        header = ["Specifikation Hyra Avdrag", "", "", ""]
        assert handler.has_merged_header(header) is True


class TestExtractFromMergedCells:
    """Tests for extract_from_merged_cells method."""

    def test_extracts_single_amount(self, handler):
        """Test extracting a single amount."""
        header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        rows = [["", "", "", "8159"]]
        items = handler.extract_from_merged_cells(header, rows)
        assert len(items) == 1
        assert items[0].amount == "8159"
        assert items[0].is_deduction is False
        assert items[0].article_number == "0218103-1201"
        assert items[0].description == "2 rum och kök"

    def test_extracts_deduction(self, handler):
        """Test extracting a deduction (negative amount)."""
        header = ["Specifikation"]
        rows = [["", "", "", "-2000"]]
        items = handler.extract_from_merged_cells(header, rows)
        assert len(items) == 1
        assert items[0].amount == "-2000"
        assert items[0].is_deduction is True
        # First item (row_index=0) gets description from header, not "Avdrag"
        # "Avdrag" is only set for subsequent deduction items
        assert items[0].description is None

    def test_extracts_multiple_amounts_same_row(self, handler):
        """Test extracting multiple amounts from same row."""
        header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        rows = [["", "", "", "8159 -2000"]]
        items = handler.extract_from_merged_cells(header, rows)
        assert len(items) == 2
        assert items[0].amount == "8159"
        assert items[1].amount == "-2000"

    def test_extracts_amounts_from_multiple_rows(self, handler):
        """Test extracting amounts from multiple rows."""
        header = ["Specifikation"]
        rows = [
            ["", "", "", "8159"],
            ["", "", "", "-2000"],
        ]
        items = handler.extract_from_merged_cells(header, rows)
        assert len(items) == 2

    def test_skips_small_amounts(self, handler):
        """Test skipping small amounts below threshold."""
        header = ["Specifikation"]
        rows = [["", "", "", "50"]]  # Below MIN_AMOUNT_THRESHOLD (100)
        items = handler.extract_from_merged_cells(header, rows)
        assert len(items) == 0

    def test_skips_empty_rows(self, handler):
        """Test skipping empty rows."""
        header = ["Specifikation"]
        rows = [["", "", "", ""]]
        items = handler.extract_from_merged_cells(header, rows)
        assert len(items) == 0

    def test_handles_swedish_format_with_spaces(self, handler):
        """Test handling Swedish number format with spaces."""
        header = ["Specifikation"]
        rows = [["", "", "", "8 159"]]
        items = handler.extract_from_merged_cells(header, rows)
        assert len(items) == 1
        assert items[0].amount == "8159"

    def test_confidence_is_lower_for_merged(self, handler):
        """Test confidence is 0.7 for merged cell extraction."""
        header = ["Specifikation"]
        rows = [["", "", "", "8159"]]
        items = handler.extract_from_merged_cells(header, rows)
        # Check the count first so an empty result fails as an assertion
        # instead of raising IndexError on the lookup below.
        assert len(items) == 1
        # Compare floats with a tolerance rather than exact equality.
        assert items[0].confidence == pytest.approx(0.7)

    def test_empty_header_still_extracts(self, handler):
        """Test extraction works with empty header."""
        header = []
        rows = [["", "", "", "8159"]]
        items = handler.extract_from_merged_cells(header, rows)
        assert len(items) == 1
        assert items[0].description is None
        assert items[0].article_number is None

    def test_row_index_increments(self, handler):
        """Test row_index increments for each item."""
        header = ["Specifikation"]
        # Use separate rows to avoid regex grouping issues
        rows = [
            ["", "", "", "8159"],
            ["", "", "", "5000"],
            ["", "", "", "-2000"],
        ]
        items = handler.extract_from_merged_cells(header, rows)
        # Should have 3 items from 3 rows
        assert len(items) == 3
        assert items[0].row_index == 0
        assert items[1].row_index == 1
        assert items[2].row_index == 2


class TestMinAmountThreshold:
    """Tests for MIN_AMOUNT_THRESHOLD constant."""

    def test_threshold_value(self):
        """Test the threshold constant value."""
        assert MIN_AMOUNT_THRESHOLD == 100

    def test_amounts_at_threshold_included(self, handler):
        """Test amounts exactly at threshold are included."""
        header = ["Specifikation"]
        rows = [["", "", "", "100"]]  # Exactly at threshold
        items = handler.extract_from_merged_cells(header, rows)
        assert len(items) == 1
        assert items[0].amount == "100"

    def test_amounts_below_threshold_excluded(self, handler):
        """Test amounts below threshold are excluded."""
        header = ["Specifikation"]
        rows = [["", "", "", "99"]]  # Below threshold
        items = handler.extract_from_merged_cells(header, rows)
        assert len(items) == 0