# invoice-master-poc-v2/tests/table/test_line_items_extractor.py

"""
Tests for Line Items Extractor

Tests extraction of structured line items from HTML tables.
"""

import pytest
from backend.table.line_items_extractor import (
    LineItem,
    LineItemsResult,
    LineItemsExtractor,
    ColumnMapper,
    HTMLTableParser,
)


class TestLineItem:
    """Construction checks for LineItem."""

    def test_create_line_item_with_all_fields(self):
        """A fully specified LineItem exposes the values it was built with."""
        field_values = {
            "row_index": 0,
            "description": "Samfällighetsavgift",
            "quantity": "1",
            "unit": "st",
            "unit_price": "6888,00",
            "amount": "6888,00",
            "article_number": "3035",
            "vat_rate": "25",
            "confidence": 0.95,
        }
        line_item = LineItem(**field_values)

        # Only a representative subset of the fields is read back.
        for attr in ("description", "quantity", "amount", "article_number"):
            assert getattr(line_item, attr) == field_values[attr]

    def test_create_line_item_with_minimal_fields(self):
        """With only row_index, description and amount given, the rest stay None."""
        line_item = LineItem(row_index=0, description="Test item", amount="100,00")

        assert line_item.description == "Test item"
        assert line_item.amount == "100,00"
        # Fields that were not supplied are expected to be None.
        assert line_item.quantity is None
        assert line_item.unit_price is None


class TestHTMLTableParser:
    """Checks that HTMLTableParser.parse splits markup into a header and rows."""

    def test_parse_simple_table(self):
        """A table without <thead> yields an empty header and every <tr> as a row."""
        markup = """
        <html><body><table>
            <tr><td>A</td><td>B</td></tr>
            <tr><td>1</td><td>2</td></tr>
        </table></body></html>
        """
        header, rows = HTMLTableParser().parse(markup)

        assert header == []  # the markup contains no <thead>
        assert len(rows) == 2
        for index, expected_cells in enumerate((["A", "B"], ["1", "2"])):
            assert rows[index] == expected_cells

    def test_parse_table_with_thead(self):
        """An explicit <thead> becomes the header; <tbody> rows are the data rows."""
        markup = """
        <html><body><table>
            <thead><tr><th>Name</th><th>Price</th></tr></thead>
            <tbody><tr><td>Item 1</td><td>100</td></tr></tbody>
        </table></body></html>
        """
        header, rows = HTMLTableParser().parse(markup)

        assert header == ["Name", "Price"]
        assert len(rows) == 1
        assert rows[0] == ["Item 1", "100"]

    def test_parse_empty_table(self):
        """A <table> with no rows parses to an empty header and no rows."""
        markup = "<html><body><table></table></body></html>"
        header, rows = HTMLTableParser().parse(markup)

        assert header == []
        assert rows == []

    def test_parse_table_with_empty_cells(self):
        """Empty <td> cells are kept as empty strings, preserving column positions."""
        markup = """
        <html><body><table>
            <tr><td></td><td>Value</td><td></td></tr>
        </table></body></html>
        """
        _header, rows = HTMLTableParser().parse(markup)

        assert rows[0] == ["", "Value", ""]


class TestColumnMapper:
    """Checks how ColumnMapper.map translates header cells into field names."""

    def test_map_swedish_headers(self):
        """Each Swedish header maps, by column index, to its field name."""
        column_titles = ["Art nummer", "Produktbeskrivning", "Antal", "Enhet", "A-pris", "Belopp"]
        expected_fields = [
            "article_number",
            "description",
            "quantity",
            "unit",
            "unit_price",
            "amount",
        ]

        mapping = ColumnMapper().map(column_titles)

        # Direct indexing on purpose: a missing column should fail loudly.
        for index, field_name in enumerate(expected_fields):
            assert mapping[index] == field_name

    def test_map_merged_headers(self):
        """Unambiguous columns are still mapped when others are merged (e.g. 'Moms A-pris')."""
        column_titles = ["Belopp", "Moms A-pris", "Enhet Antal", "Vara/tjänst", "Art.nr"]
        expected_by_index = {
            0: "amount",
            3: "description",  # Vara/tjänst -> description
            4: "article_number",  # Art.nr -> article_number
        }

        mapping = ColumnMapper().map(column_titles)

        # The merged columns (indices 1 and 2) are deliberately left unchecked.
        for index, field_name in expected_by_index.items():
            assert mapping.get(index) == field_name

    def test_map_empty_headers(self):
        """Blank header cells produce an empty mapping."""
        blank_titles = ["", "", ""]

        assert ColumnMapper().map(blank_titles) == {}

    def test_map_unknown_headers(self):
        """Headers the mapper does not recognise produce an empty mapping."""
        unknown_titles = ["Foo", "Bar", "Baz"]

        assert ColumnMapper().map(unknown_titles) == {}


class TestLineItemsExtractor:
    """Checks LineItemsExtractor.extract and its table-classification helper."""

    def test_extract_from_simple_html(self):
        """A table with <thead>/<tbody> yields one line item per body row."""
        markup = """
        <html><body><table>
            <thead><tr><th>Beskrivning</th><th>Antal</th><th>Pris</th><th>Belopp</th></tr></thead>
            <tbody>
                <tr><td>Product A</td><td>2</td><td>50,00</td><td>100,00</td></tr>
                <tr><td>Product B</td><td>1</td><td>75,00</td><td>75,00</td></tr>
            </tbody>
        </table></body></html>
        """
        result = LineItemsExtractor().extract(markup)

        assert len(result.items) == 2
        first_item = result.items[0]
        assert first_item.description == "Product A"
        assert first_item.quantity == "2"
        assert first_item.amount == "100,00"
        assert result.items[1].description == "Product B"

    def test_extract_from_reversed_table(self):
        """The header may be the last row of the table (PP-StructureV3 quirk)."""
        markup = """
        <html><body><table>
            <tr><td>6 888,00</td><td>6 888,00</td><td>1</td><td>Samfällighetsavgift</td><td>3035</td></tr>
            <tr><td>4 811,44</td><td>4 811,44</td><td>1</td><td>GA:1 Avgift</td><td>303501</td></tr>
            <tr><td>Belopp</td><td>Moms A-pris</td><td>Enhet Antal</td><td>Vara/tjänst</td><td>Art.nr</td></tr>
        </table></body></html>
        """
        result = LineItemsExtractor().extract(markup)

        # The trailing header row must not be counted as an item.
        assert len(result.items) == 2
        assert result.items[0].amount == "6 888,00"
        assert result.items[0].description == "Samfällighetsavgift"
        assert result.items[1].description == "GA:1 Avgift"

    def test_extract_from_empty_html(self):
        """A table with no rows produces a result with no items."""
        empty_markup = "<html><body><table></table></body></html>"

        result = LineItemsExtractor().extract(empty_markup)

        assert result.items == []

    def test_extract_returns_result_with_metadata(self):
        """extract() returns a LineItemsResult carrying the input HTML and header row."""
        markup = """
        <html><body><table>
            <thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
            <tbody><tr><td>Test</td><td>100</td></tr></tbody>
        </table></body></html>
        """
        result = LineItemsExtractor().extract(markup)

        assert isinstance(result, LineItemsResult)
        assert result.raw_html == markup
        assert result.header_row == ["Beskrivning", "Belopp"]

    def test_extract_skips_empty_rows(self):
        """Rows whose cells are all empty do not become line items."""
        markup = """
        <html><body><table>
            <thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
            <tbody>
                <tr><td></td><td></td></tr>
                <tr><td>Real item</td><td>100</td></tr>
                <tr><td></td><td></td></tr>
            </tbody>
        </table></body></html>
        """
        result = LineItemsExtractor().extract(markup)

        assert len(result.items) == 1
        assert result.items[0].description == "Real item"

    def test_is_line_items_table(self):
        """Line-item headers are accepted; summary and payment headers are rejected."""
        extractor = LineItemsExtractor()

        header_cases = (
            # Line items table
            (["Art nummer", "Produktbeskrivning", "Antal", "Belopp"], True),
            # Summary table
            (["Frakt", "Faktura.avg", "Exkl.moms", "Moms", "Belopp att betala"], False),
            # Payment table
            (["Bankgiro", "OCR", "Belopp"], False),
        )
        for headers, expected in header_cases:
            # Identity check keeps the strict bool requirement of the contract.
            assert extractor.is_line_items_table(headers) is expected


class TestLineItemsExtractorFromPdf:
    """Checks extract_from_pdf with the table-detection step mocked out."""

    def test_extract_from_pdf_no_tables(self):
        """When detection reports nothing, extract_from_pdf returns None."""
        from unittest.mock import patch

        extractor = LineItemsExtractor()

        # Stub _detect_tables_with_parsing so it reports no tables and no
        # parsing_res; the PDF path is therefore never actually read.
        with patch.object(
            extractor, "_detect_tables_with_parsing", return_value=([], [])
        ):
            result = extractor.extract_from_pdf("fake.pdf")

        assert result is None

    def test_extract_from_pdf_with_tables(self):
        """A detected table whose HTML holds a line item yields a non-empty result."""
        from unittest.mock import MagicMock, patch
        from backend.table.structure_detector import TableDetectionResult

        extractor = LineItemsExtractor()

        # Stand-in detection result exposing only the HTML of one table.
        table_html = """
        <table>
            <tr><th>Beskrivning</th><th>Antal</th><th>Pris</th><th>Belopp</th></tr>
            <tr><td>Product A</td><td>2</td><td>100,00</td><td>200,00</td></tr>
        </table>
        """
        fake_table = MagicMock(spec=TableDetectionResult)
        fake_table.html = table_html

        # Stub _detect_tables_with_parsing to hand back the fake table.
        with patch.object(
            extractor, "_detect_tables_with_parsing", return_value=([fake_table], [])
        ):
            result = extractor.extract_from_pdf("fake.pdf")

        assert result is not None
        assert len(result.items) >= 1


class TestLineItemsResult:
    """Checks LineItemsResult construction and its total_amount value."""

    def test_create_result(self):
        """A result keeps the items, header row and raw HTML it was given."""
        first = LineItem(row_index=0, description="Item 1", amount="100")
        second = LineItem(row_index=1, description="Item 2", amount="200")

        result = LineItemsResult(
            items=[first, second],
            header_row=["Beskrivning", "Belopp"],
            raw_html="<table>...</table>",
        )

        assert len(result.items) == 2
        assert result.header_row == ["Beskrivning", "Belopp"]
        assert result.raw_html == "<table>...</table>"

    def test_total_amount_calculation(self):
        """total_amount sums comma-decimal amounts and returns a comma-decimal string."""
        line_items = [
            LineItem(row_index=index, description=label, amount=value)
            for index, (label, value) in enumerate(
                [("Item 1", "100,00"), ("Item 2", "200,50")]
            )
        ]
        result = LineItemsResult(items=line_items, header_row=[], raw_html="")

        # 100,00 + 200,50
        assert result.total_amount == "300,50"

    def test_total_amount_with_deduction(self):
        """Deduction rows (negative amounts) are included in the total."""
        rent = LineItem(row_index=0, description="Rent", amount="8159", is_deduction=False)
        deduction = LineItem(
            row_index=1, description="Avdrag", amount="-2000", is_deduction=True
        )
        result = LineItemsResult(items=[rent, deduction], header_row=[], raw_html="")

        # 8159 + (-2000) = 6159; the expected string has a space as the
        # thousands separator and two decimals.
        assert result.total_amount == "6 159,00"

    def test_empty_result(self):
        """With no items there is nothing to sum, so total_amount is None."""
        result = LineItemsResult(items=[], header_row=[], raw_html="")

        assert result.items == []
        assert result.total_amount is None


class TestMergedCellExtraction:
    """Checks merged-cell handling, as seen on rental invoices."""

    # Header text in which several column titles ended up in a single cell.
    _MERGED_HEADER_TEXT = "Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"

    def test_has_merged_header_single_cell_with_keywords(self):
        """One cell holding several keywords is reported as a merged header."""
        extractor = LineItemsExtractor()

        single_cell_header = [self._MERGED_HEADER_TEXT]

        assert extractor._has_merged_header(single_cell_header) is True

    def test_has_merged_header_normal_header(self):
        """Separate per-column headers are not reported as merged."""
        extractor = LineItemsExtractor()

        separate_columns = ["Beskrivning", "Antal", "Belopp"]

        assert extractor._has_merged_header(separate_columns) is False

    def test_has_merged_header_empty(self):
        """Neither an empty list nor None counts as a merged header."""
        extractor = LineItemsExtractor()

        for missing_header in ([], None):
            assert extractor._has_merged_header(missing_header) is False

    def test_has_merged_header_with_empty_trailing_cells(self):
        """Empty cells around the merged cell do not prevent detection."""
        extractor = LineItemsExtractor()

        # PP-StructureV3 may produce headers with empty trailing cells
        padded_after = [self._MERGED_HEADER_TEXT, "", "", ""]
        assert extractor._has_merged_header(padded_after) is True

        # Leading empty cells must be tolerated as well
        padded_before = ["", "", self._MERGED_HEADER_TEXT, ""]
        assert extractor._has_merged_header(padded_before) is True

    def test_extract_from_merged_cells_rental_invoice(self):
        """Two amounts sharing one cell become two items.

        Each amount becomes a separate row. Negative amounts are marked as is_deduction=True.
        """
        extractor = LineItemsExtractor()

        header = [self._MERGED_HEADER_TEXT]
        rows = [
            ["", "", "", "8159 -2000"],
            ["", "", "", ""],
        ]

        items = extractor._extract_from_merged_cells(header, rows)

        # One item for the amount, one for the deduction
        assert len(items) == 2

        charge = items[0]
        assert charge.amount == "8159"
        assert charge.is_deduction is False
        assert charge.article_number == "0218103-1201"
        assert charge.description == "2 rum och kök"

        deduction = items[1]
        assert deduction.amount == "-2000"
        assert deduction.is_deduction is True
        assert deduction.description == "Avdrag"

    def test_extract_from_merged_cells_separate_rows(self):
        """Amount and deduction on separate rows still give two items."""
        extractor = LineItemsExtractor()

        header = [self._MERGED_HEADER_TEXT]
        rows = [
            ["", "", "", "8159"],  # amount on the first row
            ["", "", "", "-2000"],  # deduction on the second row
        ]

        items = extractor._extract_from_merged_cells(header, rows)

        # One item for the amount, one for the deduction
        assert len(items) == 2

        charge = items[0]
        assert charge.amount == "8159"
        assert charge.is_deduction is False
        assert charge.article_number == "0218103-1201"
        assert charge.description == "2 rum och kök"

        deduction = items[1]
        assert deduction.amount == "-2000"
        assert deduction.is_deduction is True

    def test_extract_from_merged_cells_swedish_format(self):
        """Swedish amounts with a space as thousands separator are normalised."""
        extractor = LineItemsExtractor()

        header = [self._MERGED_HEADER_TEXT]
        rows = [
            ["", "", "", "8 159"],  # Swedish format with space
            ["", "", "", "-2 000"],  # Swedish format with space
        ]

        items = extractor._extract_from_merged_cells(header, rows)

        assert len(items) == 2
        # The extracted amounts no longer contain the separating space.
        assert items[0].amount == "8159"
        assert items[0].is_deduction is False
        assert items[1].amount == "-2000"
        assert items[1].is_deduction is True

    def test_extract_merged_cells_via_extract(self):
        """extract() handles a merged header cell, yielding the same two items."""
        markup = """
        <html><body><table>
            <tr><td colspan="4">Specifikation 0218103-1201 2 rum och kök Hyra Avdrag</td></tr>
            <tr><td></td><td></td><td></td><td>8159 -2000</td></tr>
        </table></body></html>
        """
        result = LineItemsExtractor().extract(markup)

        # Two items are expected: the amount and the deduction.
        assert len(result.items) == 2

        charge = result.items[0]
        assert charge.amount == "8159"
        assert charge.is_deduction is False

        deduction = result.items[1]
        assert deduction.amount == "-2000"
        assert deduction.is_deduction is True