""" Tests for Line Items Extractor Tests extraction of structured line items from HTML tables. """ import pytest from backend.table.line_items_extractor import ( LineItem, LineItemsResult, LineItemsExtractor, ColumnMapper, HTMLTableParser, ) class TestLineItem: """Tests for LineItem dataclass.""" def test_create_line_item_with_all_fields(self): """Test creating a line item with all fields populated.""" item = LineItem( row_index=0, description="Samfällighetsavgift", quantity="1", unit="st", unit_price="6888,00", amount="6888,00", article_number="3035", vat_rate="25", confidence=0.95, ) assert item.description == "Samfällighetsavgift" assert item.quantity == "1" assert item.amount == "6888,00" assert item.article_number == "3035" def test_create_line_item_with_minimal_fields(self): """Test creating a line item with only required fields.""" item = LineItem( row_index=0, description="Test item", amount="100,00", ) assert item.description == "Test item" assert item.amount == "100,00" assert item.quantity is None assert item.unit_price is None class TestHTMLTableParser: """Tests for HTML table parsing.""" def test_parse_simple_table(self): """Test parsing a simple HTML table.""" html = """
AB
12
""" parser = HTMLTableParser() header, rows = parser.parse(html) assert header == [] # No thead assert len(rows) == 2 assert rows[0] == ["A", "B"] assert rows[1] == ["1", "2"] def test_parse_table_with_thead(self): """Test parsing a table with explicit thead.""" html = """
NamePrice
Item 1100
""" parser = HTMLTableParser() header, rows = parser.parse(html) assert header == ["Name", "Price"] assert len(rows) == 1 assert rows[0] == ["Item 1", "100"] def test_parse_empty_table(self): """Test parsing an empty table.""" html = "
" parser = HTMLTableParser() header, rows = parser.parse(html) assert header == [] assert rows == [] def test_parse_table_with_empty_cells(self): """Test parsing a table with empty cells.""" html = """
Value
""" parser = HTMLTableParser() header, rows = parser.parse(html) assert rows[0] == ["", "Value", ""] class TestColumnMapper: """Tests for column mapping.""" def test_map_swedish_headers(self): """Test mapping Swedish column headers.""" mapper = ColumnMapper() headers = ["Art nummer", "Produktbeskrivning", "Antal", "Enhet", "A-pris", "Belopp"] mapping = mapper.map(headers) assert mapping[0] == "article_number" assert mapping[1] == "description" assert mapping[2] == "quantity" assert mapping[3] == "unit" assert mapping[4] == "unit_price" assert mapping[5] == "amount" def test_map_merged_headers(self): """Test mapping merged column headers (e.g., 'Moms A-pris').""" mapper = ColumnMapper() headers = ["Belopp", "Moms A-pris", "Enhet Antal", "Vara/tjänst", "Art.nr"] mapping = mapper.map(headers) assert mapping.get(0) == "amount" assert mapping.get(3) == "description" # Vara/tjänst -> description assert mapping.get(4) == "article_number" # Art.nr -> article_number def test_map_empty_headers(self): """Test mapping empty headers.""" mapper = ColumnMapper() headers = ["", "", ""] mapping = mapper.map(headers) assert mapping == {} def test_map_unknown_headers(self): """Test mapping unknown headers.""" mapper = ColumnMapper() headers = ["Foo", "Bar", "Baz"] mapping = mapper.map(headers) assert mapping == {} class TestLineItemsExtractor: """Tests for LineItemsExtractor.""" def test_extract_from_simple_html(self): """Test extracting line items from simple HTML.""" html = """
BeskrivningAntalPrisBelopp
Product A250,00100,00
Product B175,0075,00
""" extractor = LineItemsExtractor() result = extractor.extract(html) assert len(result.items) == 2 assert result.items[0].description == "Product A" assert result.items[0].quantity == "2" assert result.items[0].amount == "100,00" assert result.items[1].description == "Product B" def test_extract_from_reversed_table(self): """Test extracting from table with header at bottom (PP-StructureV3 quirk).""" html = """
6 888,006 888,001Samfällighetsavgift3035
4 811,444 811,441GA:1 Avgift303501
BeloppMoms A-prisEnhet AntalVara/tjänstArt.nr
""" extractor = LineItemsExtractor() result = extractor.extract(html) assert len(result.items) == 2 assert result.items[0].amount == "6 888,00" assert result.items[0].description == "Samfällighetsavgift" assert result.items[1].description == "GA:1 Avgift" def test_extract_from_empty_html(self): """Test extracting from empty HTML.""" extractor = LineItemsExtractor() result = extractor.extract("
") assert result.items == [] def test_extract_returns_result_with_metadata(self): """Test that extraction returns LineItemsResult with metadata.""" html = """
BeskrivningBelopp
Test100
""" extractor = LineItemsExtractor() result = extractor.extract(html) assert isinstance(result, LineItemsResult) assert result.raw_html == html assert result.header_row == ["Beskrivning", "Belopp"] def test_extract_skips_empty_rows(self): """Test that extraction skips rows with no content.""" html = """
BeskrivningBelopp
Real item100
""" extractor = LineItemsExtractor() result = extractor.extract(html) assert len(result.items) == 1 assert result.items[0].description == "Real item" def test_is_line_items_table(self): """Test detection of line items table vs summary table.""" extractor = LineItemsExtractor() # Line items table line_items_headers = ["Art nummer", "Produktbeskrivning", "Antal", "Belopp"] assert extractor.is_line_items_table(line_items_headers) is True # Summary table summary_headers = ["Frakt", "Faktura.avg", "Exkl.moms", "Moms", "Belopp att betala"] assert extractor.is_line_items_table(summary_headers) is False # Payment table payment_headers = ["Bankgiro", "OCR", "Belopp"] assert extractor.is_line_items_table(payment_headers) is False class TestLineItemsExtractorFromPdf: """Tests for PDF extraction.""" def test_extract_from_pdf_no_tables(self): """Test extraction from PDF with no tables returns None.""" from unittest.mock import patch extractor = LineItemsExtractor() # Mock _detect_tables_with_parsing to return no tables and no parsing_res with patch.object(extractor, '_detect_tables_with_parsing') as mock_detect: mock_detect.return_value = ([], []) result = extractor.extract_from_pdf("fake.pdf") assert result is None def test_extract_from_pdf_with_tables(self): """Test extraction from PDF with tables.""" from unittest.mock import patch, MagicMock from backend.table.structure_detector import TableDetectionResult extractor = LineItemsExtractor() # Create mock table detection result mock_table = MagicMock(spec=TableDetectionResult) mock_table.html = """
BeskrivningAntalPrisBelopp
Product A2100,00200,00
""" # Mock _detect_tables_with_parsing to return table results with patch.object(extractor, '_detect_tables_with_parsing') as mock_detect: mock_detect.return_value = ([mock_table], []) result = extractor.extract_from_pdf("fake.pdf") assert result is not None assert len(result.items) >= 1 class TestLineItemsResult: """Tests for LineItemsResult dataclass.""" def test_create_result(self): """Test creating a LineItemsResult.""" items = [ LineItem(row_index=0, description="Item 1", amount="100"), LineItem(row_index=1, description="Item 2", amount="200"), ] result = LineItemsResult( items=items, header_row=["Beskrivning", "Belopp"], raw_html="...
", ) assert len(result.items) == 2 assert result.header_row == ["Beskrivning", "Belopp"] assert result.raw_html == "...
" def test_total_amount_calculation(self): """Test calculating total amount from line items.""" items = [ LineItem(row_index=0, description="Item 1", amount="100,00"), LineItem(row_index=1, description="Item 2", amount="200,50"), ] result = LineItemsResult(items=items, header_row=[], raw_html="") # Total should be calculated correctly assert result.total_amount == "300,50" def test_total_amount_with_deduction(self): """Test total amount calculation includes deductions (as separate rows).""" items = [ LineItem(row_index=0, description="Rent", amount="8159", is_deduction=False), LineItem(row_index=1, description="Avdrag", amount="-2000", is_deduction=True), ] result = LineItemsResult(items=items, header_row=[], raw_html="") # Total should be 8159 + (-2000) = 6159 assert result.total_amount == "6 159,00" def test_empty_result(self): """Test empty LineItemsResult.""" result = LineItemsResult(items=[], header_row=[], raw_html="") assert result.items == [] assert result.total_amount is None class TestMergedCellExtraction: """Tests for merged cell extraction (rental invoices).""" def test_has_merged_header_single_cell_with_keywords(self): """Test detection of merged header with multiple keywords.""" extractor = LineItemsExtractor() # Single cell with multiple keywords - should be detected as merged merged_header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"] assert extractor._has_merged_header(merged_header) is True def test_has_merged_header_normal_header(self): """Test normal header is not detected as merged.""" extractor = LineItemsExtractor() # Normal separate headers normal_header = ["Beskrivning", "Antal", "Belopp"] assert extractor._has_merged_header(normal_header) is False def test_has_merged_header_empty(self): """Test empty header.""" extractor = LineItemsExtractor() assert extractor._has_merged_header([]) is False assert extractor._has_merged_header(None) is False def test_has_merged_header_with_empty_trailing_cells(self): """Test merged header detection with empty trailing cells.""" extractor = LineItemsExtractor() # PP-StructureV3 may produce headers with empty trailing cells merged_header_with_empty = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag", "", "", ""] assert extractor._has_merged_header(merged_header_with_empty) is True # Should also work with leading empty cells merged_header_leading_empty = ["", "", "Specifikation 0218103-1201 2 rum och kök Hyra Avdrag", ""] assert extractor._has_merged_header(merged_header_leading_empty) is True def test_extract_from_merged_cells_rental_invoice(self): """Test extracting from merged cells like rental invoice. Each amount becomes a separate row. Negative amounts are marked as is_deduction=True. """ extractor = LineItemsExtractor() header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"] rows = [ ["", "", "", "8159 -2000"], ["", "", "", ""], ] items = extractor._extract_from_merged_cells(header, rows) # Should have 2 items: one for amount, one for deduction assert len(items) == 2 assert items[0].amount == "8159" assert items[0].is_deduction is False assert items[0].article_number == "0218103-1201" assert items[0].description == "2 rum och kök" assert items[1].amount == "-2000" assert items[1].is_deduction is True assert items[1].description == "Avdrag" def test_extract_from_merged_cells_separate_rows(self): """Test extracting when amount and deduction are in separate rows.""" extractor = LineItemsExtractor() header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"] rows = [ ["", "", "", "8159"], # Amount in row 1 ["", "", "", "-2000"], # Deduction in row 2 ] items = extractor._extract_from_merged_cells(header, rows) # Should have 2 items: one for amount, one for deduction assert len(items) == 2 assert items[0].amount == "8159" assert items[0].is_deduction is False assert items[0].article_number == "0218103-1201" assert items[0].description == "2 rum och kök" assert items[1].amount == "-2000" assert items[1].is_deduction is True def test_extract_from_merged_cells_swedish_format(self): """Test extracting Swedish formatted amounts with spaces.""" extractor = LineItemsExtractor() header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"] rows = [ ["", "", "", "8 159"], # Swedish format with space ["", "", "", "-2 000"], # Swedish format with space ] items = extractor._extract_from_merged_cells(header, rows) # Should have 2 items assert len(items) == 2 # Amounts are cleaned (spaces removed) assert items[0].amount == "8159" assert items[0].is_deduction is False assert items[1].amount == "-2000" assert items[1].is_deduction is True def test_extract_merged_cells_via_extract(self): """Test that extract() calls merged cell parsing when needed.""" html = """
Specifikation 0218103-1201 2 rum och kök Hyra Avdrag
8159 -2000
""" extractor = LineItemsExtractor() result = extractor.extract(html) # Should have extracted 2 items via merged cell parsing assert len(result.items) == 2 assert result.items[0].amount == "8159" assert result.items[0].is_deduction is False assert result.items[1].amount == "-2000" assert result.items[1].is_deduction is True