refactor: split line_items_extractor into smaller modules with comprehensive tests

- Extract models.py (LineItem, LineItemsResult dataclasses) - Extract html_table_parser.py (ColumnMapper, HtmlTableParser) - Extract merged_cell_handler.py (MergedCellHandler for PP-StructureV3 merged cells) - Reduce line_items_extractor.py from 971 to 396 lines - Add constants for magic numbers (MIN_AMOUNT_THRESHOLD, ROW_GROUPING_THRESHOLD, etc.) - Fix row grouping algorithm in text_line_items_extractor.py - Demote INFO logs to DEBUG level in structure_detector.py - Add 209 tests achieving 85%+ coverage on main modules Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 23:02:00 +01:00
parent c2c8f2dd04
commit 8723ef4653
11 changed files with 2230 additions and 841 deletions
--- a/tests/table/test_line_items_extractor.py
+++ b/tests/table/test_line_items_extractor.py
@@ -272,12 +272,12 @@ class TestLineItemsExtractorFromPdf:

        extractor = LineItemsExtractor()

-        # Create mock table detection result
+        # Create mock table detection result with proper thead/tbody structure
        mock_table = MagicMock(spec=TableDetectionResult)
        mock_table.html = """
        <table>
-            <tr><th>Beskrivning</th><th>Antal</th><th>Pris</th><th>Belopp</th></tr>
-            <tr><td>Product A</td><td>2</td><td>100,00</td><td>200,00</td></tr>
+            <thead><tr><th>Beskrivning</th><th>Antal</th><th>Pris</th><th>Belopp</th></tr></thead>
+            <tbody><tr><td>Product A</td><td>2</td><td>100,00</td><td>200,00</td></tr></tbody>
        </table>
        """

@@ -291,6 +291,78 @@ class TestLineItemsExtractorFromPdf:
            assert len(result.items) >= 1


+class TestPdfPathValidation:
+    """Tests for PDF path validation."""
+
+    def test_detect_tables_with_nonexistent_path(self):
+        """Test that non-existent PDF path returns empty results."""
+        extractor = LineItemsExtractor()
+
+        # Create detector and call _detect_tables_with_parsing with non-existent path
+        from unittest.mock import MagicMock
+        from backend.table.structure_detector import TableDetector
+
+        mock_detector = MagicMock(spec=TableDetector)
+        tables, parsing_res = extractor._detect_tables_with_parsing(
+            mock_detector, "nonexistent.pdf"
+        )
+
+        assert tables == []
+        assert parsing_res == []
+
+    def test_detect_tables_with_directory_path(self, tmp_path):
+        """Test that directory path (not file) returns empty results."""
+        extractor = LineItemsExtractor()
+
+        from unittest.mock import MagicMock
+        from backend.table.structure_detector import TableDetector
+
+        mock_detector = MagicMock(spec=TableDetector)
+
+        # tmp_path is a directory, not a file
+        tables, parsing_res = extractor._detect_tables_with_parsing(
+            mock_detector, str(tmp_path)
+        )
+
+        assert tables == []
+        assert parsing_res == []
+
+    def test_detect_tables_validates_file_exists(self, tmp_path):
+        """Test path validation for file existence.
+
+        This test verifies that the method correctly validates the path exists
+        and is a file before attempting to process it.
+        """
+        from unittest.mock import patch
+
+        extractor = LineItemsExtractor()
+
+        # Create a real file path that exists
+        fake_pdf = tmp_path / "test.pdf"
+        fake_pdf.write_bytes(b"not a real pdf")
+
+        # Mock render_pdf_to_images to avoid actual PDF processing
+        with patch("shared.pdf.renderer.render_pdf_to_images") as mock_render:
+            # Return empty iterator - simulates file exists but no pages
+            mock_render.return_value = iter([])
+
+            from unittest.mock import MagicMock
+            from backend.table.structure_detector import TableDetector
+
+            mock_detector = MagicMock(spec=TableDetector)
+            mock_detector._ensure_initialized = MagicMock()
+            mock_detector._pipeline = MagicMock()
+
+            tables, parsing_res = extractor._detect_tables_with_parsing(
+                mock_detector, str(fake_pdf)
+            )
+
+            # render_pdf_to_images was called (path validation passed)
+            mock_render.assert_called_once()
+            assert tables == []
+            assert parsing_res == []
+
+
 class TestLineItemsResult:
    """Tests for LineItemsResult dataclass."""

@@ -462,3 +534,246 @@ class TestMergedCellExtraction:
        assert result.items[0].is_deduction is False
        assert result.items[1].amount == "-2000"
        assert result.items[1].is_deduction is True
+
+
+class TestTextFallbackExtraction:
+    """Tests for text-based fallback extraction."""
+
+    def test_text_fallback_disabled_by_default(self):
+        """Test text fallback can be disabled."""
+        extractor = LineItemsExtractor(enable_text_fallback=False)
+        assert extractor.enable_text_fallback is False
+
+    def test_text_fallback_enabled_by_default(self):
+        """Test text fallback is enabled by default."""
+        extractor = LineItemsExtractor()
+        assert extractor.enable_text_fallback is True
+
+    def test_try_text_fallback_with_valid_parsing_res(self):
+        """Test text fallback with valid parsing results."""
+        from unittest.mock import patch, MagicMock
+        from backend.table.text_line_items_extractor import (
+            TextLineItemsExtractor,
+            TextLineItem,
+            TextLineItemsResult,
+        )
+
+        extractor = LineItemsExtractor()
+
+        # Mock parsing_res_list with text elements
+        parsing_res = [
+            {"label": "text", "bbox": [0, 100, 200, 120], "text": "Product A"},
+            {"label": "text", "bbox": [250, 100, 350, 120], "text": "1 234,56"},
+            {"label": "text", "bbox": [0, 150, 200, 170], "text": "Product B"},
+            {"label": "text", "bbox": [250, 150, 350, 170], "text": "2 345,67"},
+        ]
+
+        # Create mock text extraction result
+        mock_text_result = TextLineItemsResult(
+            items=[
+                TextLineItem(row_index=0, description="Product A", amount="1 234,56"),
+                TextLineItem(row_index=1, description="Product B", amount="2 345,67"),
+            ],
+            header_row=[],
+        )
+
+        with patch.object(TextLineItemsExtractor, 'extract_from_parsing_res', return_value=mock_text_result):
+            result = extractor._try_text_fallback(parsing_res)
+
+            assert result is not None
+            assert len(result.items) == 2
+            assert result.items[0].description == "Product A"
+            assert result.items[1].description == "Product B"
+
+    def test_try_text_fallback_returns_none_on_failure(self):
+        """Test text fallback returns None when extraction fails."""
+        from unittest.mock import patch
+
+        extractor = LineItemsExtractor()
+
+        with patch('backend.table.text_line_items_extractor.TextLineItemsExtractor.extract_from_parsing_res', return_value=None):
+            result = extractor._try_text_fallback([])
+            assert result is None
+
+    def test_extract_from_pdf_uses_text_fallback(self):
+        """Test extract_from_pdf uses text fallback when no tables found."""
+        from unittest.mock import patch, MagicMock
+        from backend.table.text_line_items_extractor import TextLineItem, TextLineItemsResult
+
+        extractor = LineItemsExtractor(enable_text_fallback=True)
+
+        # Mock _detect_tables_with_parsing to return no tables but parsing_res
+        mock_text_result = TextLineItemsResult(
+            items=[
+                TextLineItem(row_index=0, description="Product", amount="100,00"),
+                TextLineItem(row_index=1, description="Product 2", amount="200,00"),
+            ],
+            header_row=[],
+        )
+
+        with patch.object(extractor, '_detect_tables_with_parsing') as mock_detect:
+            mock_detect.return_value = ([], [{"label": "text", "text": "test"}])
+
+            with patch.object(extractor, '_try_text_fallback', return_value=MagicMock(items=[MagicMock()])) as mock_fallback:
+                result = extractor.extract_from_pdf("fake.pdf")
+
+                # Text fallback should be called
+                mock_fallback.assert_called_once()
+
+    def test_extract_from_pdf_skips_fallback_when_disabled(self):
+        """Test extract_from_pdf skips text fallback when disabled."""
+        from unittest.mock import patch
+
+        extractor = LineItemsExtractor(enable_text_fallback=False)
+
+        with patch.object(extractor, '_detect_tables_with_parsing') as mock_detect:
+            mock_detect.return_value = ([], [{"label": "text", "text": "test"}])
+
+            result = extractor.extract_from_pdf("fake.pdf")
+
+            # Should return None, not use text fallback
+            assert result is None
+
+
+class TestVerticallyMergedCellExtraction:
+    """Tests for vertically merged cell extraction."""
+
+    def test_detects_vertically_merged_cells(self):
+        """Test detection of vertically merged cells in rows."""
+        extractor = LineItemsExtractor()
+
+        # Rows with multiple product numbers in single cell
+        rows = [["Produktnr 1457280 1457281 1060381 merged text here"]]
+        assert extractor._has_vertically_merged_cells(rows) is True
+
+    def test_splits_vertically_merged_rows(self):
+        """Test splitting vertically merged rows."""
+        extractor = LineItemsExtractor()
+
+        rows = [
+            ["Produktnr 1234567 1234568", "Antal 2ST 3ST"],
+        ]
+        header, data = extractor._split_merged_rows(rows)
+
+        # Should split into header + data rows
+        assert isinstance(header, list)
+        assert isinstance(data, list)
+
+
+class TestDeductionDetection:
+    """Tests for deduction/discount detection."""
+
+    def test_detects_deduction_by_keyword_avdrag(self):
+        """Test detection of deduction by 'avdrag' keyword."""
+        html = """
+        <html><body><table>
+            <thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
+            <tbody>
+                <tr><td>Hyresavdrag januari</td><td>-500,00</td></tr>
+            </tbody>
+        </table></body></html>
+        """
+        extractor = LineItemsExtractor()
+        result = extractor.extract(html)
+
+        assert len(result.items) == 1
+        assert result.items[0].is_deduction is True
+
+    def test_detects_deduction_by_keyword_rabatt(self):
+        """Test detection of deduction by 'rabatt' keyword."""
+        html = """
+        <html><body><table>
+            <thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
+            <tbody>
+                <tr><td>Rabatt 10%</td><td>-100,00</td></tr>
+            </tbody>
+        </table></body></html>
+        """
+        extractor = LineItemsExtractor()
+        result = extractor.extract(html)
+
+        assert len(result.items) == 1
+        assert result.items[0].is_deduction is True
+
+    def test_detects_deduction_by_negative_amount(self):
+        """Test detection of deduction by negative amount."""
+        html = """
+        <html><body><table>
+            <thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
+            <tbody>
+                <tr><td>Some credit</td><td>-250,00</td></tr>
+            </tbody>
+        </table></body></html>
+        """
+        extractor = LineItemsExtractor()
+        result = extractor.extract(html)
+
+        assert len(result.items) == 1
+        assert result.items[0].is_deduction is True
+
+    def test_normal_item_not_deduction(self):
+        """Test normal item is not marked as deduction."""
+        html = """
+        <html><body><table>
+            <thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
+            <tbody>
+                <tr><td>Normal product</td><td>500,00</td></tr>
+            </tbody>
+        </table></body></html>
+        """
+        extractor = LineItemsExtractor()
+        result = extractor.extract(html)
+
+        assert len(result.items) == 1
+        assert result.items[0].is_deduction is False
+
+
+class TestHeaderDetection:
+    """Tests for header row detection."""
+
+    def test_detect_header_at_bottom(self):
+        """Test detecting header at bottom of table (reversed)."""
+        extractor = LineItemsExtractor()
+
+        rows = [
+            ["100,00", "Product A", "1"],
+            ["200,00", "Product B", "2"],
+            ["Belopp", "Beskrivning", "Antal"],  # Header at bottom
+        ]
+
+        header_idx, header, is_at_end = extractor._detect_header_row(rows)
+
+        assert header_idx == 2
+        assert is_at_end is True
+        assert "Belopp" in header
+
+    def test_detect_header_at_top(self):
+        """Test detecting header at top of table."""
+        extractor = LineItemsExtractor()
+
+        rows = [
+            ["Belopp", "Beskrivning", "Antal"],  # Header at top
+            ["100,00", "Product A", "1"],
+            ["200,00", "Product B", "2"],
+        ]
+
+        header_idx, header, is_at_end = extractor._detect_header_row(rows)
+
+        assert header_idx == 0
+        assert is_at_end is False
+        assert "Belopp" in header
+
+    def test_no_header_detected(self):
+        """Test when no header is detected."""
+        extractor = LineItemsExtractor()
+
+        rows = [
+            ["100,00", "Product A", "1"],
+            ["200,00", "Product B", "2"],
+        ]
+
+        header_idx, header, is_at_end = extractor._detect_header_row(rows)
+
+        assert header_idx == -1
+        assert header == []
+        assert is_at_end is False