refactor: split line_items_extractor into smaller modules with comprehensive tests

- Extract models.py (LineItem, LineItemsResult dataclasses)
- Extract html_table_parser.py (ColumnMapper, HtmlTableParser)
- Extract merged_cell_handler.py (MergedCellHandler for PP-StructureV3 merged cells)
- Reduce line_items_extractor.py from 971 to 396 lines
- Add constants for magic numbers (MIN_AMOUNT_THRESHOLD, ROW_GROUPING_THRESHOLD, etc.)
- Fix row grouping algorithm in text_line_items_extractor.py
- Demote INFO logs to DEBUG level in structure_detector.py
- Add 209 tests achieving 85%+ coverage on main modules

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Yaojia Wang
2026-02-03 23:02:00 +01:00
parent c2c8f2dd04
commit 8723ef4653
11 changed files with 2230 additions and 841 deletions

View File

@@ -272,12 +272,12 @@ class TestLineItemsExtractorFromPdf:
extractor = LineItemsExtractor()
# Create mock table detection result
# Create mock table detection result with proper thead/tbody structure
mock_table = MagicMock(spec=TableDetectionResult)
mock_table.html = """
<table>
<tr><th>Beskrivning</th><th>Antal</th><th>Pris</th><th>Belopp</th></tr>
<tr><td>Product A</td><td>2</td><td>100,00</td><td>200,00</td></tr>
<thead><tr><th>Beskrivning</th><th>Antal</th><th>Pris</th><th>Belopp</th></tr></thead>
<tbody><tr><td>Product A</td><td>2</td><td>100,00</td><td>200,00</td></tr></tbody>
</table>
"""
@@ -291,6 +291,78 @@ class TestLineItemsExtractorFromPdf:
assert len(result.items) >= 1
class TestPdfPathValidation:
"""Tests for PDF path validation."""
def test_detect_tables_with_nonexistent_path(self):
"""Test that non-existent PDF path returns empty results."""
extractor = LineItemsExtractor()
# Create detector and call _detect_tables_with_parsing with non-existent path
from unittest.mock import MagicMock
from backend.table.structure_detector import TableDetector
mock_detector = MagicMock(spec=TableDetector)
tables, parsing_res = extractor._detect_tables_with_parsing(
mock_detector, "nonexistent.pdf"
)
assert tables == []
assert parsing_res == []
def test_detect_tables_with_directory_path(self, tmp_path):
"""Test that directory path (not file) returns empty results."""
extractor = LineItemsExtractor()
from unittest.mock import MagicMock
from backend.table.structure_detector import TableDetector
mock_detector = MagicMock(spec=TableDetector)
# tmp_path is a directory, not a file
tables, parsing_res = extractor._detect_tables_with_parsing(
mock_detector, str(tmp_path)
)
assert tables == []
assert parsing_res == []
def test_detect_tables_validates_file_exists(self, tmp_path):
"""Test path validation for file existence.
This test verifies that the method correctly validates the path exists
and is a file before attempting to process it.
"""
from unittest.mock import patch
extractor = LineItemsExtractor()
# Create a real file path that exists
fake_pdf = tmp_path / "test.pdf"
fake_pdf.write_bytes(b"not a real pdf")
# Mock render_pdf_to_images to avoid actual PDF processing
with patch("shared.pdf.renderer.render_pdf_to_images") as mock_render:
# Return empty iterator - simulates file exists but no pages
mock_render.return_value = iter([])
from unittest.mock import MagicMock
from backend.table.structure_detector import TableDetector
mock_detector = MagicMock(spec=TableDetector)
mock_detector._ensure_initialized = MagicMock()
mock_detector._pipeline = MagicMock()
tables, parsing_res = extractor._detect_tables_with_parsing(
mock_detector, str(fake_pdf)
)
# render_pdf_to_images was called (path validation passed)
mock_render.assert_called_once()
assert tables == []
assert parsing_res == []
class TestLineItemsResult:
"""Tests for LineItemsResult dataclass."""
@@ -462,3 +534,246 @@ class TestMergedCellExtraction:
assert result.items[0].is_deduction is False
assert result.items[1].amount == "-2000"
assert result.items[1].is_deduction is True
class TestTextFallbackExtraction:
"""Tests for text-based fallback extraction."""
def test_text_fallback_disabled_by_default(self):
"""Test text fallback can be disabled."""
extractor = LineItemsExtractor(enable_text_fallback=False)
assert extractor.enable_text_fallback is False
def test_text_fallback_enabled_by_default(self):
"""Test text fallback is enabled by default."""
extractor = LineItemsExtractor()
assert extractor.enable_text_fallback is True
def test_try_text_fallback_with_valid_parsing_res(self):
"""Test text fallback with valid parsing results."""
from unittest.mock import patch, MagicMock
from backend.table.text_line_items_extractor import (
TextLineItemsExtractor,
TextLineItem,
TextLineItemsResult,
)
extractor = LineItemsExtractor()
# Mock parsing_res_list with text elements
parsing_res = [
{"label": "text", "bbox": [0, 100, 200, 120], "text": "Product A"},
{"label": "text", "bbox": [250, 100, 350, 120], "text": "1 234,56"},
{"label": "text", "bbox": [0, 150, 200, 170], "text": "Product B"},
{"label": "text", "bbox": [250, 150, 350, 170], "text": "2 345,67"},
]
# Create mock text extraction result
mock_text_result = TextLineItemsResult(
items=[
TextLineItem(row_index=0, description="Product A", amount="1 234,56"),
TextLineItem(row_index=1, description="Product B", amount="2 345,67"),
],
header_row=[],
)
with patch.object(TextLineItemsExtractor, 'extract_from_parsing_res', return_value=mock_text_result):
result = extractor._try_text_fallback(parsing_res)
assert result is not None
assert len(result.items) == 2
assert result.items[0].description == "Product A"
assert result.items[1].description == "Product B"
def test_try_text_fallback_returns_none_on_failure(self):
"""Test text fallback returns None when extraction fails."""
from unittest.mock import patch
extractor = LineItemsExtractor()
with patch('backend.table.text_line_items_extractor.TextLineItemsExtractor.extract_from_parsing_res', return_value=None):
result = extractor._try_text_fallback([])
assert result is None
def test_extract_from_pdf_uses_text_fallback(self):
"""Test extract_from_pdf uses text fallback when no tables found."""
from unittest.mock import patch, MagicMock
from backend.table.text_line_items_extractor import TextLineItem, TextLineItemsResult
extractor = LineItemsExtractor(enable_text_fallback=True)
# Mock _detect_tables_with_parsing to return no tables but parsing_res
mock_text_result = TextLineItemsResult(
items=[
TextLineItem(row_index=0, description="Product", amount="100,00"),
TextLineItem(row_index=1, description="Product 2", amount="200,00"),
],
header_row=[],
)
with patch.object(extractor, '_detect_tables_with_parsing') as mock_detect:
mock_detect.return_value = ([], [{"label": "text", "text": "test"}])
with patch.object(extractor, '_try_text_fallback', return_value=MagicMock(items=[MagicMock()])) as mock_fallback:
result = extractor.extract_from_pdf("fake.pdf")
# Text fallback should be called
mock_fallback.assert_called_once()
def test_extract_from_pdf_skips_fallback_when_disabled(self):
"""Test extract_from_pdf skips text fallback when disabled."""
from unittest.mock import patch
extractor = LineItemsExtractor(enable_text_fallback=False)
with patch.object(extractor, '_detect_tables_with_parsing') as mock_detect:
mock_detect.return_value = ([], [{"label": "text", "text": "test"}])
result = extractor.extract_from_pdf("fake.pdf")
# Should return None, not use text fallback
assert result is None
class TestVerticallyMergedCellExtraction:
"""Tests for vertically merged cell extraction."""
def test_detects_vertically_merged_cells(self):
"""Test detection of vertically merged cells in rows."""
extractor = LineItemsExtractor()
# Rows with multiple product numbers in single cell
rows = [["Produktnr 1457280 1457281 1060381 merged text here"]]
assert extractor._has_vertically_merged_cells(rows) is True
def test_splits_vertically_merged_rows(self):
"""Test splitting vertically merged rows."""
extractor = LineItemsExtractor()
rows = [
["Produktnr 1234567 1234568", "Antal 2ST 3ST"],
]
header, data = extractor._split_merged_rows(rows)
# Should split into header + data rows
assert isinstance(header, list)
assert isinstance(data, list)
class TestDeductionDetection:
"""Tests for deduction/discount detection."""
def test_detects_deduction_by_keyword_avdrag(self):
"""Test detection of deduction by 'avdrag' keyword."""
html = """
<html><body><table>
<thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
<tbody>
<tr><td>Hyresavdrag januari</td><td>-500,00</td></tr>
</tbody>
</table></body></html>
"""
extractor = LineItemsExtractor()
result = extractor.extract(html)
assert len(result.items) == 1
assert result.items[0].is_deduction is True
def test_detects_deduction_by_keyword_rabatt(self):
"""Test detection of deduction by 'rabatt' keyword."""
html = """
<html><body><table>
<thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
<tbody>
<tr><td>Rabatt 10%</td><td>-100,00</td></tr>
</tbody>
</table></body></html>
"""
extractor = LineItemsExtractor()
result = extractor.extract(html)
assert len(result.items) == 1
assert result.items[0].is_deduction is True
def test_detects_deduction_by_negative_amount(self):
"""Test detection of deduction by negative amount."""
html = """
<html><body><table>
<thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
<tbody>
<tr><td>Some credit</td><td>-250,00</td></tr>
</tbody>
</table></body></html>
"""
extractor = LineItemsExtractor()
result = extractor.extract(html)
assert len(result.items) == 1
assert result.items[0].is_deduction is True
def test_normal_item_not_deduction(self):
"""Test normal item is not marked as deduction."""
html = """
<html><body><table>
<thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
<tbody>
<tr><td>Normal product</td><td>500,00</td></tr>
</tbody>
</table></body></html>
"""
extractor = LineItemsExtractor()
result = extractor.extract(html)
assert len(result.items) == 1
assert result.items[0].is_deduction is False
class TestHeaderDetection:
"""Tests for header row detection."""
def test_detect_header_at_bottom(self):
"""Test detecting header at bottom of table (reversed)."""
extractor = LineItemsExtractor()
rows = [
["100,00", "Product A", "1"],
["200,00", "Product B", "2"],
["Belopp", "Beskrivning", "Antal"], # Header at bottom
]
header_idx, header, is_at_end = extractor._detect_header_row(rows)
assert header_idx == 2
assert is_at_end is True
assert "Belopp" in header
def test_detect_header_at_top(self):
"""Test detecting header at top of table."""
extractor = LineItemsExtractor()
rows = [
["Belopp", "Beskrivning", "Antal"], # Header at top
["100,00", "Product A", "1"],
["200,00", "Product B", "2"],
]
header_idx, header, is_at_end = extractor._detect_header_row(rows)
assert header_idx == 0
assert is_at_end is False
assert "Belopp" in header
def test_no_header_detected(self):
"""Test when no header is detected."""
extractor = LineItemsExtractor()
rows = [
["100,00", "Product A", "1"],
["200,00", "Product B", "2"],
]
header_idx, header, is_at_end = extractor._detect_header_row(rows)
assert header_idx == -1
assert header == []
assert is_at_end is False