""" Tests for Line Items Extractor Tests extraction of structured line items from HTML tables. """ import pytest from backend.table.line_items_extractor import ( LineItem, LineItemsResult, LineItemsExtractor, ColumnMapper, HTMLTableParser, ) class TestLineItem: """Tests for LineItem dataclass.""" def test_create_line_item_with_all_fields(self): """Test creating a line item with all fields populated.""" item = LineItem( row_index=0, description="Samfällighetsavgift", quantity="1", unit="st", unit_price="6888,00", amount="6888,00", article_number="3035", vat_rate="25", confidence=0.95, ) assert item.description == "Samfällighetsavgift" assert item.quantity == "1" assert item.amount == "6888,00" assert item.article_number == "3035" def test_create_line_item_with_minimal_fields(self): """Test creating a line item with only required fields.""" item = LineItem( row_index=0, description="Test item", amount="100,00", ) assert item.description == "Test item" assert item.amount == "100,00" assert item.quantity is None assert item.unit_price is None class TestHTMLTableParser: """Tests for HTML table parsing.""" def test_parse_simple_table(self): """Test parsing a simple HTML table.""" html = """
AB
12
""" parser = HTMLTableParser() header, rows = parser.parse(html) assert header == [] # No thead assert len(rows) == 2 assert rows[0] == ["A", "B"] assert rows[1] == ["1", "2"] def test_parse_table_with_thead(self): """Test parsing a table with explicit thead.""" html = """
NamePrice
Item 1100
""" parser = HTMLTableParser() header, rows = parser.parse(html) assert header == ["Name", "Price"] assert len(rows) == 1 assert rows[0] == ["Item 1", "100"] def test_parse_empty_table(self): """Test parsing an empty table.""" html = "
" parser = HTMLTableParser() header, rows = parser.parse(html) assert header == [] assert rows == [] def test_parse_table_with_empty_cells(self): """Test parsing a table with empty cells.""" html = """
Value
""" parser = HTMLTableParser() header, rows = parser.parse(html) assert rows[0] == ["", "Value", ""] class TestColumnMapper: """Tests for column mapping.""" def test_map_swedish_headers(self): """Test mapping Swedish column headers.""" mapper = ColumnMapper() headers = ["Art nummer", "Produktbeskrivning", "Antal", "Enhet", "A-pris", "Belopp"] mapping = mapper.map(headers) assert mapping[0] == "article_number" assert mapping[1] == "description" assert mapping[2] == "quantity" assert mapping[3] == "unit" assert mapping[4] == "unit_price" assert mapping[5] == "amount" def test_map_merged_headers(self): """Test mapping merged column headers (e.g., 'Moms A-pris').""" mapper = ColumnMapper() headers = ["Belopp", "Moms A-pris", "Enhet Antal", "Vara/tjänst", "Art.nr"] mapping = mapper.map(headers) assert mapping.get(0) == "amount" assert mapping.get(3) == "description" # Vara/tjänst -> description assert mapping.get(4) == "article_number" # Art.nr -> article_number def test_map_empty_headers(self): """Test mapping empty headers.""" mapper = ColumnMapper() headers = ["", "", ""] mapping = mapper.map(headers) assert mapping == {} def test_map_unknown_headers(self): """Test mapping unknown headers.""" mapper = ColumnMapper() headers = ["Foo", "Bar", "Baz"] mapping = mapper.map(headers) assert mapping == {} class TestLineItemsExtractor: """Tests for LineItemsExtractor.""" def test_extract_from_simple_html(self): """Test extracting line items from simple HTML.""" html = """
BeskrivningAntalPrisBelopp
Product A250,00100,00
Product B175,0075,00
""" extractor = LineItemsExtractor() result = extractor.extract(html) assert len(result.items) == 2 assert result.items[0].description == "Product A" assert result.items[0].quantity == "2" assert result.items[0].amount == "100,00" assert result.items[1].description == "Product B" def test_extract_from_reversed_table(self): """Test extracting from table with header at bottom (PP-StructureV3 quirk).""" html = """
6 888,006 888,001Samfällighetsavgift3035
4 811,444 811,441GA:1 Avgift303501
BeloppMoms A-prisEnhet AntalVara/tjänstArt.nr
""" extractor = LineItemsExtractor() result = extractor.extract(html) assert len(result.items) == 2 assert result.items[0].amount == "6 888,00" assert result.items[0].description == "Samfällighetsavgift" assert result.items[1].description == "GA:1 Avgift" def test_extract_from_empty_html(self): """Test extracting from empty HTML.""" extractor = LineItemsExtractor() result = extractor.extract("
") assert result.items == [] def test_extract_returns_result_with_metadata(self): """Test that extraction returns LineItemsResult with metadata.""" html = """
BeskrivningBelopp
Test100
""" extractor = LineItemsExtractor() result = extractor.extract(html) assert isinstance(result, LineItemsResult) assert result.raw_html == html assert result.header_row == ["Beskrivning", "Belopp"] def test_extract_skips_empty_rows(self): """Test that extraction skips rows with no content.""" html = """
BeskrivningBelopp
Real item100
""" extractor = LineItemsExtractor() result = extractor.extract(html) assert len(result.items) == 1 assert result.items[0].description == "Real item" def test_is_line_items_table(self): """Test detection of line items table vs summary table.""" extractor = LineItemsExtractor() # Line items table line_items_headers = ["Art nummer", "Produktbeskrivning", "Antal", "Belopp"] assert extractor.is_line_items_table(line_items_headers) is True # Summary table summary_headers = ["Frakt", "Faktura.avg", "Exkl.moms", "Moms", "Belopp att betala"] assert extractor.is_line_items_table(summary_headers) is False # Payment table payment_headers = ["Bankgiro", "OCR", "Belopp"] assert extractor.is_line_items_table(payment_headers) is False class TestLineItemsExtractorFromPdf: """Tests for PDF extraction.""" def test_extract_from_pdf_no_tables(self): """Test extraction from PDF with no tables returns None.""" from unittest.mock import patch extractor = LineItemsExtractor() # Mock _detect_tables_with_parsing to return no tables and no parsing_res with patch.object(extractor, '_detect_tables_with_parsing') as mock_detect: mock_detect.return_value = ([], []) result = extractor.extract_from_pdf("fake.pdf") assert result is None def test_extract_from_pdf_with_tables(self): """Test extraction from PDF with tables.""" from unittest.mock import patch, MagicMock from backend.table.structure_detector import TableDetectionResult extractor = LineItemsExtractor() # Create mock table detection result with proper thead/tbody structure mock_table = MagicMock(spec=TableDetectionResult) mock_table.html = """
BeskrivningAntalPrisBelopp
Product A2100,00200,00
""" # Mock _detect_tables_with_parsing to return table results with patch.object(extractor, '_detect_tables_with_parsing') as mock_detect: mock_detect.return_value = ([mock_table], []) result = extractor.extract_from_pdf("fake.pdf") assert result is not None assert len(result.items) >= 1 class TestPdfPathValidation: """Tests for PDF path validation.""" def test_detect_tables_with_nonexistent_path(self): """Test that non-existent PDF path returns empty results.""" extractor = LineItemsExtractor() # Create detector and call _detect_tables_with_parsing with non-existent path from unittest.mock import MagicMock from backend.table.structure_detector import TableDetector mock_detector = MagicMock(spec=TableDetector) tables, parsing_res = extractor._detect_tables_with_parsing( mock_detector, "nonexistent.pdf" ) assert tables == [] assert parsing_res == [] def test_detect_tables_with_directory_path(self, tmp_path): """Test that directory path (not file) returns empty results.""" extractor = LineItemsExtractor() from unittest.mock import MagicMock from backend.table.structure_detector import TableDetector mock_detector = MagicMock(spec=TableDetector) # tmp_path is a directory, not a file tables, parsing_res = extractor._detect_tables_with_parsing( mock_detector, str(tmp_path) ) assert tables == [] assert parsing_res == [] def test_detect_tables_validates_file_exists(self, tmp_path): """Test path validation for file existence. This test verifies that the method correctly validates the path exists and is a file before attempting to process it. """ from unittest.mock import patch extractor = LineItemsExtractor() # Create a real file path that exists fake_pdf = tmp_path / "test.pdf" fake_pdf.write_bytes(b"not a real pdf") # Mock render_pdf_to_images to avoid actual PDF processing with patch("shared.pdf.renderer.render_pdf_to_images") as mock_render: # Return empty iterator - simulates file exists but no pages mock_render.return_value = iter([]) from unittest.mock import MagicMock from backend.table.structure_detector import TableDetector mock_detector = MagicMock(spec=TableDetector) mock_detector._ensure_initialized = MagicMock() mock_detector._pipeline = MagicMock() tables, parsing_res = extractor._detect_tables_with_parsing( mock_detector, str(fake_pdf) ) # render_pdf_to_images was called (path validation passed) mock_render.assert_called_once() assert tables == [] assert parsing_res == [] class TestLineItemsResult: """Tests for LineItemsResult dataclass.""" def test_create_result(self): """Test creating a LineItemsResult.""" items = [ LineItem(row_index=0, description="Item 1", amount="100"), LineItem(row_index=1, description="Item 2", amount="200"), ] result = LineItemsResult( items=items, header_row=["Beskrivning", "Belopp"], raw_html="...
", ) assert len(result.items) == 2 assert result.header_row == ["Beskrivning", "Belopp"] assert result.raw_html == "...
" def test_total_amount_calculation(self): """Test calculating total amount from line items.""" items = [ LineItem(row_index=0, description="Item 1", amount="100,00"), LineItem(row_index=1, description="Item 2", amount="200,50"), ] result = LineItemsResult(items=items, header_row=[], raw_html="") # Total should be calculated correctly assert result.total_amount == "300,50" def test_total_amount_with_deduction(self): """Test total amount calculation includes deductions (as separate rows).""" items = [ LineItem(row_index=0, description="Rent", amount="8159", is_deduction=False), LineItem(row_index=1, description="Avdrag", amount="-2000", is_deduction=True), ] result = LineItemsResult(items=items, header_row=[], raw_html="") # Total should be 8159 + (-2000) = 6159 assert result.total_amount == "6 159,00" def test_empty_result(self): """Test empty LineItemsResult.""" result = LineItemsResult(items=[], header_row=[], raw_html="") assert result.items == [] assert result.total_amount is None class TestMergedCellExtraction: """Tests for merged cell extraction (rental invoices).""" def test_has_merged_header_single_cell_with_keywords(self): """Test detection of merged header with multiple keywords.""" extractor = LineItemsExtractor() # Single cell with multiple keywords - should be detected as merged merged_header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"] assert extractor._has_merged_header(merged_header) is True def test_has_merged_header_normal_header(self): """Test normal header is not detected as merged.""" extractor = LineItemsExtractor() # Normal separate headers normal_header = ["Beskrivning", "Antal", "Belopp"] assert extractor._has_merged_header(normal_header) is False def test_has_merged_header_empty(self): """Test empty header.""" extractor = LineItemsExtractor() assert extractor._has_merged_header([]) is False assert extractor._has_merged_header(None) is False def test_has_merged_header_with_empty_trailing_cells(self): """Test merged header detection with empty trailing cells.""" extractor = LineItemsExtractor() # PP-StructureV3 may produce headers with empty trailing cells merged_header_with_empty = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag", "", "", ""] assert extractor._has_merged_header(merged_header_with_empty) is True # Should also work with leading empty cells merged_header_leading_empty = ["", "", "Specifikation 0218103-1201 2 rum och kök Hyra Avdrag", ""] assert extractor._has_merged_header(merged_header_leading_empty) is True def test_extract_from_merged_cells_rental_invoice(self): """Test extracting from merged cells like rental invoice. Each amount becomes a separate row. Negative amounts are marked as is_deduction=True. """ extractor = LineItemsExtractor() header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"] rows = [ ["", "", "", "8159 -2000"], ["", "", "", ""], ] items = extractor._extract_from_merged_cells(header, rows) # Should have 2 items: one for amount, one for deduction assert len(items) == 2 assert items[0].amount == "8159" assert items[0].is_deduction is False assert items[0].article_number == "0218103-1201" assert items[0].description == "2 rum och kök" assert items[1].amount == "-2000" assert items[1].is_deduction is True assert items[1].description == "Avdrag" def test_extract_from_merged_cells_separate_rows(self): """Test extracting when amount and deduction are in separate rows.""" extractor = LineItemsExtractor() header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"] rows = [ ["", "", "", "8159"], # Amount in row 1 ["", "", "", "-2000"], # Deduction in row 2 ] items = extractor._extract_from_merged_cells(header, rows) # Should have 2 items: one for amount, one for deduction assert len(items) == 2 assert items[0].amount == "8159" assert items[0].is_deduction is False assert items[0].article_number == "0218103-1201" assert items[0].description == "2 rum och kök" assert items[1].amount == "-2000" assert items[1].is_deduction is True def test_extract_from_merged_cells_swedish_format(self): """Test extracting Swedish formatted amounts with spaces.""" extractor = LineItemsExtractor() header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"] rows = [ ["", "", "", "8 159"], # Swedish format with space ["", "", "", "-2 000"], # Swedish format with space ] items = extractor._extract_from_merged_cells(header, rows) # Should have 2 items assert len(items) == 2 # Amounts are cleaned (spaces removed) assert items[0].amount == "8159" assert items[0].is_deduction is False assert items[1].amount == "-2000" assert items[1].is_deduction is True def test_extract_merged_cells_via_extract(self): """Test that extract() calls merged cell parsing when needed.""" html = """
Specifikation 0218103-1201 2 rum och kök Hyra Avdrag
8159 -2000
""" extractor = LineItemsExtractor() result = extractor.extract(html) # Should have extracted 2 items via merged cell parsing assert len(result.items) == 2 assert result.items[0].amount == "8159" assert result.items[0].is_deduction is False assert result.items[1].amount == "-2000" assert result.items[1].is_deduction is True class TestTextFallbackExtraction: """Tests for text-based fallback extraction.""" def test_text_fallback_disabled_by_default(self): """Test text fallback can be disabled.""" extractor = LineItemsExtractor(enable_text_fallback=False) assert extractor.enable_text_fallback is False def test_text_fallback_enabled_by_default(self): """Test text fallback is enabled by default.""" extractor = LineItemsExtractor() assert extractor.enable_text_fallback is True def test_try_text_fallback_with_valid_parsing_res(self): """Test text fallback with valid parsing results.""" from unittest.mock import patch, MagicMock from backend.table.text_line_items_extractor import ( TextLineItemsExtractor, TextLineItem, TextLineItemsResult, ) extractor = LineItemsExtractor() # Mock parsing_res_list with text elements parsing_res = [ {"label": "text", "bbox": [0, 100, 200, 120], "text": "Product A"}, {"label": "text", "bbox": [250, 100, 350, 120], "text": "1 234,56"}, {"label": "text", "bbox": [0, 150, 200, 170], "text": "Product B"}, {"label": "text", "bbox": [250, 150, 350, 170], "text": "2 345,67"}, ] # Create mock text extraction result mock_text_result = TextLineItemsResult( items=[ TextLineItem(row_index=0, description="Product A", amount="1 234,56"), TextLineItem(row_index=1, description="Product B", amount="2 345,67"), ], header_row=[], ) with patch.object(TextLineItemsExtractor, 'extract_from_parsing_res', return_value=mock_text_result): result = extractor._try_text_fallback(parsing_res) assert result is not None assert len(result.items) == 2 assert result.items[0].description == "Product A" assert result.items[1].description == "Product B" def test_try_text_fallback_returns_none_on_failure(self): """Test text fallback returns None when extraction fails.""" from unittest.mock import patch extractor = LineItemsExtractor() with patch('backend.table.text_line_items_extractor.TextLineItemsExtractor.extract_from_parsing_res', return_value=None): result = extractor._try_text_fallback([]) assert result is None def test_extract_from_pdf_uses_text_fallback(self): """Test extract_from_pdf uses text fallback when no tables found.""" from unittest.mock import patch, MagicMock from backend.table.text_line_items_extractor import TextLineItem, TextLineItemsResult extractor = LineItemsExtractor(enable_text_fallback=True) # Mock _detect_tables_with_parsing to return no tables but parsing_res mock_text_result = TextLineItemsResult( items=[ TextLineItem(row_index=0, description="Product", amount="100,00"), TextLineItem(row_index=1, description="Product 2", amount="200,00"), ], header_row=[], ) with patch.object(extractor, '_detect_tables_with_parsing') as mock_detect: mock_detect.return_value = ([], [{"label": "text", "text": "test"}]) with patch.object(extractor, '_try_text_fallback', return_value=MagicMock(items=[MagicMock()])) as mock_fallback: result = extractor.extract_from_pdf("fake.pdf") # Text fallback should be called mock_fallback.assert_called_once() def test_extract_from_pdf_skips_fallback_when_disabled(self): """Test extract_from_pdf skips text fallback when disabled.""" from unittest.mock import patch extractor = LineItemsExtractor(enable_text_fallback=False) with patch.object(extractor, '_detect_tables_with_parsing') as mock_detect: mock_detect.return_value = ([], [{"label": "text", "text": "test"}]) result = extractor.extract_from_pdf("fake.pdf") # Should return None, not use text fallback assert result is None class TestVerticallyMergedCellExtraction: """Tests for vertically merged cell extraction.""" def test_detects_vertically_merged_cells(self): """Test detection of vertically merged cells in rows.""" extractor = LineItemsExtractor() # Rows with multiple product numbers in single cell rows = [["Produktnr 1457280 1457281 1060381 merged text here"]] assert extractor._has_vertically_merged_cells(rows) is True def test_splits_vertically_merged_rows(self): """Test splitting vertically merged rows.""" extractor = LineItemsExtractor() rows = [ ["Produktnr 1234567 1234568", "Antal 2ST 3ST"], ] header, data = extractor._split_merged_rows(rows) # Should split into header + data rows assert isinstance(header, list) assert isinstance(data, list) class TestDeductionDetection: """Tests for deduction/discount detection.""" def test_detects_deduction_by_keyword_avdrag(self): """Test detection of deduction by 'avdrag' keyword.""" html = """
BeskrivningBelopp
Hyresavdrag januari-500,00
""" extractor = LineItemsExtractor() result = extractor.extract(html) assert len(result.items) == 1 assert result.items[0].is_deduction is True def test_detects_deduction_by_keyword_rabatt(self): """Test detection of deduction by 'rabatt' keyword.""" html = """
BeskrivningBelopp
Rabatt 10%-100,00
""" extractor = LineItemsExtractor() result = extractor.extract(html) assert len(result.items) == 1 assert result.items[0].is_deduction is True def test_detects_deduction_by_negative_amount(self): """Test detection of deduction by negative amount.""" html = """
BeskrivningBelopp
Some credit-250,00
""" extractor = LineItemsExtractor() result = extractor.extract(html) assert len(result.items) == 1 assert result.items[0].is_deduction is True def test_normal_item_not_deduction(self): """Test normal item is not marked as deduction.""" html = """
BeskrivningBelopp
Normal product500,00
""" extractor = LineItemsExtractor() result = extractor.extract(html) assert len(result.items) == 1 assert result.items[0].is_deduction is False class TestHeaderDetection: """Tests for header row detection.""" def test_detect_header_at_bottom(self): """Test detecting header at bottom of table (reversed).""" extractor = LineItemsExtractor() rows = [ ["100,00", "Product A", "1"], ["200,00", "Product B", "2"], ["Belopp", "Beskrivning", "Antal"], # Header at bottom ] header_idx, header, is_at_end = extractor._detect_header_row(rows) assert header_idx == 2 assert is_at_end is True assert "Belopp" in header def test_detect_header_at_top(self): """Test detecting header at top of table.""" extractor = LineItemsExtractor() rows = [ ["Belopp", "Beskrivning", "Antal"], # Header at top ["100,00", "Product A", "1"], ["200,00", "Product B", "2"], ] header_idx, header, is_at_end = extractor._detect_header_row(rows) assert header_idx == 0 assert is_at_end is False assert "Belopp" in header def test_no_header_detected(self): """Test when no header is detected.""" extractor = LineItemsExtractor() rows = [ ["100,00", "Product A", "1"], ["200,00", "Product B", "2"], ] header_idx, header, is_at_end = extractor._detect_header_row(rows) assert header_idx == -1 assert header == [] assert is_at_end is False