refactor: split line_items_extractor into smaller modules with comprehensive tests
- Extract models.py (LineItem, LineItemsResult dataclasses) - Extract html_table_parser.py (ColumnMapper, HtmlTableParser) - Extract merged_cell_handler.py (MergedCellHandler for PP-StructureV3 merged cells) - Reduce line_items_extractor.py from 971 to 396 lines - Add constants for magic numbers (MIN_AMOUNT_THRESHOLD, ROW_GROUPING_THRESHOLD, etc.) - Fix row grouping algorithm in text_line_items_extractor.py - Demote INFO logs to DEBUG level in structure_detector.py - Add 209 tests achieving 85%+ coverage on main modules Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -272,12 +272,12 @@ class TestLineItemsExtractorFromPdf:
|
||||
|
||||
extractor = LineItemsExtractor()
|
||||
|
||||
# Create mock table detection result
|
||||
# Create mock table detection result with proper thead/tbody structure
|
||||
mock_table = MagicMock(spec=TableDetectionResult)
|
||||
mock_table.html = """
|
||||
<table>
|
||||
<tr><th>Beskrivning</th><th>Antal</th><th>Pris</th><th>Belopp</th></tr>
|
||||
<tr><td>Product A</td><td>2</td><td>100,00</td><td>200,00</td></tr>
|
||||
<thead><tr><th>Beskrivning</th><th>Antal</th><th>Pris</th><th>Belopp</th></tr></thead>
|
||||
<tbody><tr><td>Product A</td><td>2</td><td>100,00</td><td>200,00</td></tr></tbody>
|
||||
</table>
|
||||
"""
|
||||
|
||||
@@ -291,6 +291,78 @@ class TestLineItemsExtractorFromPdf:
|
||||
assert len(result.items) >= 1
|
||||
|
||||
|
||||
class TestPdfPathValidation:
|
||||
"""Tests for PDF path validation."""
|
||||
|
||||
def test_detect_tables_with_nonexistent_path(self):
|
||||
"""Test that non-existent PDF path returns empty results."""
|
||||
extractor = LineItemsExtractor()
|
||||
|
||||
# Create detector and call _detect_tables_with_parsing with non-existent path
|
||||
from unittest.mock import MagicMock
|
||||
from backend.table.structure_detector import TableDetector
|
||||
|
||||
mock_detector = MagicMock(spec=TableDetector)
|
||||
tables, parsing_res = extractor._detect_tables_with_parsing(
|
||||
mock_detector, "nonexistent.pdf"
|
||||
)
|
||||
|
||||
assert tables == []
|
||||
assert parsing_res == []
|
||||
|
||||
def test_detect_tables_with_directory_path(self, tmp_path):
|
||||
"""Test that directory path (not file) returns empty results."""
|
||||
extractor = LineItemsExtractor()
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
from backend.table.structure_detector import TableDetector
|
||||
|
||||
mock_detector = MagicMock(spec=TableDetector)
|
||||
|
||||
# tmp_path is a directory, not a file
|
||||
tables, parsing_res = extractor._detect_tables_with_parsing(
|
||||
mock_detector, str(tmp_path)
|
||||
)
|
||||
|
||||
assert tables == []
|
||||
assert parsing_res == []
|
||||
|
||||
def test_detect_tables_validates_file_exists(self, tmp_path):
|
||||
"""Test path validation for file existence.
|
||||
|
||||
This test verifies that the method correctly validates the path exists
|
||||
and is a file before attempting to process it.
|
||||
"""
|
||||
from unittest.mock import patch
|
||||
|
||||
extractor = LineItemsExtractor()
|
||||
|
||||
# Create a real file path that exists
|
||||
fake_pdf = tmp_path / "test.pdf"
|
||||
fake_pdf.write_bytes(b"not a real pdf")
|
||||
|
||||
# Mock render_pdf_to_images to avoid actual PDF processing
|
||||
with patch("shared.pdf.renderer.render_pdf_to_images") as mock_render:
|
||||
# Return empty iterator - simulates file exists but no pages
|
||||
mock_render.return_value = iter([])
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
from backend.table.structure_detector import TableDetector
|
||||
|
||||
mock_detector = MagicMock(spec=TableDetector)
|
||||
mock_detector._ensure_initialized = MagicMock()
|
||||
mock_detector._pipeline = MagicMock()
|
||||
|
||||
tables, parsing_res = extractor._detect_tables_with_parsing(
|
||||
mock_detector, str(fake_pdf)
|
||||
)
|
||||
|
||||
# render_pdf_to_images was called (path validation passed)
|
||||
mock_render.assert_called_once()
|
||||
assert tables == []
|
||||
assert parsing_res == []
|
||||
|
||||
|
||||
class TestLineItemsResult:
|
||||
"""Tests for LineItemsResult dataclass."""
|
||||
|
||||
@@ -462,3 +534,246 @@ class TestMergedCellExtraction:
|
||||
assert result.items[0].is_deduction is False
|
||||
assert result.items[1].amount == "-2000"
|
||||
assert result.items[1].is_deduction is True
|
||||
|
||||
|
||||
class TestTextFallbackExtraction:
|
||||
"""Tests for text-based fallback extraction."""
|
||||
|
||||
def test_text_fallback_disabled_by_default(self):
|
||||
"""Test text fallback can be disabled."""
|
||||
extractor = LineItemsExtractor(enable_text_fallback=False)
|
||||
assert extractor.enable_text_fallback is False
|
||||
|
||||
def test_text_fallback_enabled_by_default(self):
|
||||
"""Test text fallback is enabled by default."""
|
||||
extractor = LineItemsExtractor()
|
||||
assert extractor.enable_text_fallback is True
|
||||
|
||||
def test_try_text_fallback_with_valid_parsing_res(self):
|
||||
"""Test text fallback with valid parsing results."""
|
||||
from unittest.mock import patch, MagicMock
|
||||
from backend.table.text_line_items_extractor import (
|
||||
TextLineItemsExtractor,
|
||||
TextLineItem,
|
||||
TextLineItemsResult,
|
||||
)
|
||||
|
||||
extractor = LineItemsExtractor()
|
||||
|
||||
# Mock parsing_res_list with text elements
|
||||
parsing_res = [
|
||||
{"label": "text", "bbox": [0, 100, 200, 120], "text": "Product A"},
|
||||
{"label": "text", "bbox": [250, 100, 350, 120], "text": "1 234,56"},
|
||||
{"label": "text", "bbox": [0, 150, 200, 170], "text": "Product B"},
|
||||
{"label": "text", "bbox": [250, 150, 350, 170], "text": "2 345,67"},
|
||||
]
|
||||
|
||||
# Create mock text extraction result
|
||||
mock_text_result = TextLineItemsResult(
|
||||
items=[
|
||||
TextLineItem(row_index=0, description="Product A", amount="1 234,56"),
|
||||
TextLineItem(row_index=1, description="Product B", amount="2 345,67"),
|
||||
],
|
||||
header_row=[],
|
||||
)
|
||||
|
||||
with patch.object(TextLineItemsExtractor, 'extract_from_parsing_res', return_value=mock_text_result):
|
||||
result = extractor._try_text_fallback(parsing_res)
|
||||
|
||||
assert result is not None
|
||||
assert len(result.items) == 2
|
||||
assert result.items[0].description == "Product A"
|
||||
assert result.items[1].description == "Product B"
|
||||
|
||||
def test_try_text_fallback_returns_none_on_failure(self):
|
||||
"""Test text fallback returns None when extraction fails."""
|
||||
from unittest.mock import patch
|
||||
|
||||
extractor = LineItemsExtractor()
|
||||
|
||||
with patch('backend.table.text_line_items_extractor.TextLineItemsExtractor.extract_from_parsing_res', return_value=None):
|
||||
result = extractor._try_text_fallback([])
|
||||
assert result is None
|
||||
|
||||
def test_extract_from_pdf_uses_text_fallback(self):
|
||||
"""Test extract_from_pdf uses text fallback when no tables found."""
|
||||
from unittest.mock import patch, MagicMock
|
||||
from backend.table.text_line_items_extractor import TextLineItem, TextLineItemsResult
|
||||
|
||||
extractor = LineItemsExtractor(enable_text_fallback=True)
|
||||
|
||||
# Mock _detect_tables_with_parsing to return no tables but parsing_res
|
||||
mock_text_result = TextLineItemsResult(
|
||||
items=[
|
||||
TextLineItem(row_index=0, description="Product", amount="100,00"),
|
||||
TextLineItem(row_index=1, description="Product 2", amount="200,00"),
|
||||
],
|
||||
header_row=[],
|
||||
)
|
||||
|
||||
with patch.object(extractor, '_detect_tables_with_parsing') as mock_detect:
|
||||
mock_detect.return_value = ([], [{"label": "text", "text": "test"}])
|
||||
|
||||
with patch.object(extractor, '_try_text_fallback', return_value=MagicMock(items=[MagicMock()])) as mock_fallback:
|
||||
result = extractor.extract_from_pdf("fake.pdf")
|
||||
|
||||
# Text fallback should be called
|
||||
mock_fallback.assert_called_once()
|
||||
|
||||
def test_extract_from_pdf_skips_fallback_when_disabled(self):
|
||||
"""Test extract_from_pdf skips text fallback when disabled."""
|
||||
from unittest.mock import patch
|
||||
|
||||
extractor = LineItemsExtractor(enable_text_fallback=False)
|
||||
|
||||
with patch.object(extractor, '_detect_tables_with_parsing') as mock_detect:
|
||||
mock_detect.return_value = ([], [{"label": "text", "text": "test"}])
|
||||
|
||||
result = extractor.extract_from_pdf("fake.pdf")
|
||||
|
||||
# Should return None, not use text fallback
|
||||
assert result is None
|
||||
|
||||
|
||||
class TestVerticallyMergedCellExtraction:
|
||||
"""Tests for vertically merged cell extraction."""
|
||||
|
||||
def test_detects_vertically_merged_cells(self):
|
||||
"""Test detection of vertically merged cells in rows."""
|
||||
extractor = LineItemsExtractor()
|
||||
|
||||
# Rows with multiple product numbers in single cell
|
||||
rows = [["Produktnr 1457280 1457281 1060381 merged text here"]]
|
||||
assert extractor._has_vertically_merged_cells(rows) is True
|
||||
|
||||
def test_splits_vertically_merged_rows(self):
|
||||
"""Test splitting vertically merged rows."""
|
||||
extractor = LineItemsExtractor()
|
||||
|
||||
rows = [
|
||||
["Produktnr 1234567 1234568", "Antal 2ST 3ST"],
|
||||
]
|
||||
header, data = extractor._split_merged_rows(rows)
|
||||
|
||||
# Should split into header + data rows
|
||||
assert isinstance(header, list)
|
||||
assert isinstance(data, list)
|
||||
|
||||
|
||||
class TestDeductionDetection:
|
||||
"""Tests for deduction/discount detection."""
|
||||
|
||||
def test_detects_deduction_by_keyword_avdrag(self):
|
||||
"""Test detection of deduction by 'avdrag' keyword."""
|
||||
html = """
|
||||
<html><body><table>
|
||||
<thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
|
||||
<tbody>
|
||||
<tr><td>Hyresavdrag januari</td><td>-500,00</td></tr>
|
||||
</tbody>
|
||||
</table></body></html>
|
||||
"""
|
||||
extractor = LineItemsExtractor()
|
||||
result = extractor.extract(html)
|
||||
|
||||
assert len(result.items) == 1
|
||||
assert result.items[0].is_deduction is True
|
||||
|
||||
def test_detects_deduction_by_keyword_rabatt(self):
|
||||
"""Test detection of deduction by 'rabatt' keyword."""
|
||||
html = """
|
||||
<html><body><table>
|
||||
<thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
|
||||
<tbody>
|
||||
<tr><td>Rabatt 10%</td><td>-100,00</td></tr>
|
||||
</tbody>
|
||||
</table></body></html>
|
||||
"""
|
||||
extractor = LineItemsExtractor()
|
||||
result = extractor.extract(html)
|
||||
|
||||
assert len(result.items) == 1
|
||||
assert result.items[0].is_deduction is True
|
||||
|
||||
def test_detects_deduction_by_negative_amount(self):
|
||||
"""Test detection of deduction by negative amount."""
|
||||
html = """
|
||||
<html><body><table>
|
||||
<thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
|
||||
<tbody>
|
||||
<tr><td>Some credit</td><td>-250,00</td></tr>
|
||||
</tbody>
|
||||
</table></body></html>
|
||||
"""
|
||||
extractor = LineItemsExtractor()
|
||||
result = extractor.extract(html)
|
||||
|
||||
assert len(result.items) == 1
|
||||
assert result.items[0].is_deduction is True
|
||||
|
||||
def test_normal_item_not_deduction(self):
|
||||
"""Test normal item is not marked as deduction."""
|
||||
html = """
|
||||
<html><body><table>
|
||||
<thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
|
||||
<tbody>
|
||||
<tr><td>Normal product</td><td>500,00</td></tr>
|
||||
</tbody>
|
||||
</table></body></html>
|
||||
"""
|
||||
extractor = LineItemsExtractor()
|
||||
result = extractor.extract(html)
|
||||
|
||||
assert len(result.items) == 1
|
||||
assert result.items[0].is_deduction is False
|
||||
|
||||
|
||||
class TestHeaderDetection:
|
||||
"""Tests for header row detection."""
|
||||
|
||||
def test_detect_header_at_bottom(self):
|
||||
"""Test detecting header at bottom of table (reversed)."""
|
||||
extractor = LineItemsExtractor()
|
||||
|
||||
rows = [
|
||||
["100,00", "Product A", "1"],
|
||||
["200,00", "Product B", "2"],
|
||||
["Belopp", "Beskrivning", "Antal"], # Header at bottom
|
||||
]
|
||||
|
||||
header_idx, header, is_at_end = extractor._detect_header_row(rows)
|
||||
|
||||
assert header_idx == 2
|
||||
assert is_at_end is True
|
||||
assert "Belopp" in header
|
||||
|
||||
def test_detect_header_at_top(self):
|
||||
"""Test detecting header at top of table."""
|
||||
extractor = LineItemsExtractor()
|
||||
|
||||
rows = [
|
||||
["Belopp", "Beskrivning", "Antal"], # Header at top
|
||||
["100,00", "Product A", "1"],
|
||||
["200,00", "Product B", "2"],
|
||||
]
|
||||
|
||||
header_idx, header, is_at_end = extractor._detect_header_row(rows)
|
||||
|
||||
assert header_idx == 0
|
||||
assert is_at_end is False
|
||||
assert "Belopp" in header
|
||||
|
||||
def test_no_header_detected(self):
|
||||
"""Test when no header is detected."""
|
||||
extractor = LineItemsExtractor()
|
||||
|
||||
rows = [
|
||||
["100,00", "Product A", "1"],
|
||||
["200,00", "Product B", "2"],
|
||||
]
|
||||
|
||||
header_idx, header, is_at_end = extractor._detect_header_row(rows)
|
||||
|
||||
assert header_idx == -1
|
||||
assert header == []
|
||||
assert is_at_end is False
|
||||
|
||||
Reference in New Issue
Block a user