"""
Tests for Line Items Extractor
Tests extraction of structured line items from HTML tables.
"""
import pytest
from backend.table.line_items_extractor import (
LineItem,
LineItemsResult,
LineItemsExtractor,
ColumnMapper,
HTMLTableParser,
)
class TestLineItem:
    """Construction checks for the ``LineItem`` dataclass."""

    def test_create_line_item_with_all_fields(self):
        """Every field passed to the constructor is readable afterwards."""
        field_values = dict(
            row_index=0,
            description="Samfällighetsavgift",
            quantity="1",
            unit="st",
            unit_price="6888,00",
            amount="6888,00",
            article_number="3035",
            vat_rate="25",
            confidence=0.95,
        )

        line_item = LineItem(**field_values)

        assert line_item.description == "Samfällighetsavgift"
        assert line_item.quantity == "1"
        assert line_item.amount == "6888,00"
        assert line_item.article_number == "3035"

    def test_create_line_item_with_minimal_fields(self):
        """Fields that are not supplied come back as ``None``."""
        line_item = LineItem(row_index=0, description="Test item", amount="100,00")

        # Supplied values are stored unchanged.
        assert line_item.description == "Test item"
        assert line_item.amount == "100,00"
        # Omitted values are left unset.
        assert line_item.quantity is None
        assert line_item.unit_price is None
class TestHTMLTableParser:
    """Tests for HTML table parsing.

    ``HTMLTableParser.parse`` is called with an HTML string and is expected
    to return a ``(header, rows)`` pair, where ``header`` is a list of cell
    texts and ``rows`` is a list of such lists.

    NOTE(review): the table markup in these fixtures was rebuilt from the
    values the assertions expect -- confirm it matches the markup the parser
    is meant to handle (e.g. ``<th>`` vs ``<td>`` inside ``<thead>``).
    """

    def test_parse_simple_table(self):
        """Test parsing a simple HTML table without a ``<thead>`` section."""
        html = """
        <table>
            <tr><td>A</td><td>B</td></tr>
            <tr><td>1</td><td>2</td></tr>
        </table>
        """
        parser = HTMLTableParser()
        header, rows = parser.parse(html)

        assert header == []  # No thead
        assert len(rows) == 2
        assert rows[0] == ["A", "B"]
        assert rows[1] == ["1", "2"]

    def test_parse_table_with_thead(self):
        """Test parsing a table with explicit thead."""
        html = """
        <table>
            <thead>
                <tr><th>Name</th><th>Price</th></tr>
            </thead>
            <tbody>
                <tr><td>Item 1</td><td>100</td></tr>
            </tbody>
        </table>
        """
        parser = HTMLTableParser()
        header, rows = parser.parse(html)

        assert header == ["Name", "Price"]
        assert len(rows) == 1
        assert rows[0] == ["Item 1", "100"]

    def test_parse_empty_table(self):
        """Test parsing an empty table."""
        html = "<table></table>"
        parser = HTMLTableParser()
        header, rows = parser.parse(html)

        assert header == []
        assert rows == []

    def test_parse_table_with_empty_cells(self):
        """Test parsing a table with empty cells."""
        html = """
        <table>
            <tr><td></td><td>Value</td><td></td></tr>
        </table>
        """
        parser = HTMLTableParser()
        header, rows = parser.parse(html)

        # Empty cells must be kept as empty strings so columns stay aligned.
        assert rows[0] == ["", "Value", ""]
class TestColumnMapper:
    """Checks that ``ColumnMapper.map`` assigns field names to header columns."""

    def test_map_swedish_headers(self):
        """Each Swedish header maps to the field name at the same index."""
        swedish_headers = [
            "Art nummer",
            "Produktbeskrivning",
            "Antal",
            "Enhet",
            "A-pris",
            "Belopp",
        ]
        expected_fields = [
            "article_number",
            "description",
            "quantity",
            "unit",
            "unit_price",
            "amount",
        ]

        column_map = ColumnMapper().map(swedish_headers)

        for column_index, field_name in enumerate(expected_fields):
            assert column_map[column_index] == field_name

    def test_map_merged_headers(self):
        """Merged headers (e.g. 'Moms A-pris') still yield the unambiguous columns."""
        merged_headers = ["Belopp", "Moms A-pris", "Enhet Antal", "Vara/tjänst", "Art.nr"]

        column_map = ColumnMapper().map(merged_headers)

        # Only the unambiguous columns are checked; indexes 1 and 2 are merged.
        expected_by_index = {
            0: "amount",
            3: "description",  # Vara/tjänst -> description
            4: "article_number",  # Art.nr -> article_number
        }
        for column_index, field_name in expected_by_index.items():
            assert column_map.get(column_index) == field_name

    def test_map_empty_headers(self):
        """Blank header cells produce an empty mapping."""
        column_map = ColumnMapper().map(["", "", ""])

        assert column_map == {}

    def test_map_unknown_headers(self):
        """Headers that match no known field produce an empty mapping."""
        column_map = ColumnMapper().map(["Foo", "Bar", "Baz"])

        assert column_map == {}
class TestLineItemsExtractor:
    """Tests for LineItemsExtractor.

    ``extract`` is called with an HTML table string and its result is
    inspected through ``items``, ``header_row`` and ``raw_html``.

    NOTE(review): the ``<table>`` markup in these fixtures was rebuilt from
    the cell values and the assertions -- confirm the thead/tbody layout
    matches what the extractor is meant to receive.
    """

    def test_extract_from_simple_html(self):
        """Test extracting line items from simple HTML."""
        html = """
        <table>
            <thead>
                <tr><th>Beskrivning</th><th>Antal</th><th>Pris</th><th>Belopp</th></tr>
            </thead>
            <tbody>
                <tr><td>Product A</td><td>2</td><td>50,00</td><td>100,00</td></tr>
                <tr><td>Product B</td><td>1</td><td>75,00</td><td>75,00</td></tr>
            </tbody>
        </table>
        """
        extractor = LineItemsExtractor()
        result = extractor.extract(html)

        assert len(result.items) == 2
        assert result.items[0].description == "Product A"
        assert result.items[0].quantity == "2"
        assert result.items[0].amount == "100,00"
        assert result.items[1].description == "Product B"

    def test_extract_from_reversed_table(self):
        """Test extracting from table with header at bottom (PP-StructureV3 quirk)."""
        # The header row is deliberately the LAST row and there is no <thead>.
        html = """
        <table>
            <tbody>
                <tr><td>6 888,00</td><td>6 888,00</td><td>1</td><td>Samfällighetsavgift</td><td>3035</td></tr>
                <tr><td>4 811,44</td><td>4 811,44</td><td>1</td><td>GA:1 Avgift</td><td>303501</td></tr>
                <tr><td>Belopp</td><td>Moms A-pris</td><td>Enhet Antal</td><td>Vara/tjänst</td><td>Art.nr</td></tr>
            </tbody>
        </table>
        """
        extractor = LineItemsExtractor()
        result = extractor.extract(html)

        assert len(result.items) == 2
        assert result.items[0].amount == "6 888,00"
        assert result.items[0].description == "Samfällighetsavgift"
        assert result.items[1].description == "GA:1 Avgift"

    def test_extract_from_empty_html(self):
        """Test extracting from empty HTML."""
        extractor = LineItemsExtractor()
        result = extractor.extract("")

        assert result.items == []

    def test_extract_returns_result_with_metadata(self):
        """Test that extraction returns LineItemsResult with metadata."""
        html = """
        <table>
            <thead>
                <tr><th>Beskrivning</th><th>Belopp</th></tr>
            </thead>
            <tbody>
                <tr><td>Test</td><td>100</td></tr>
            </tbody>
        </table>
        """
        extractor = LineItemsExtractor()
        result = extractor.extract(html)

        assert isinstance(result, LineItemsResult)
        # The input string must be carried through unchanged.
        assert result.raw_html == html
        assert result.header_row == ["Beskrivning", "Belopp"]

    def test_extract_skips_empty_rows(self):
        """Test that extraction skips rows with no content."""
        html = """
        <table>
            <thead>
                <tr><th>Beskrivning</th><th>Belopp</th></tr>
            </thead>
            <tbody>
                <tr><td></td><td></td></tr>
                <tr><td>Real item</td><td>100</td></tr>
                <tr><td></td><td></td></tr>
            </tbody>
        </table>
        """
        extractor = LineItemsExtractor()
        result = extractor.extract(html)

        assert len(result.items) == 1
        assert result.items[0].description == "Real item"

    def test_is_line_items_table(self):
        """Test detection of line items table vs summary table."""
        extractor = LineItemsExtractor()

        # Line items table
        line_items_headers = ["Art nummer", "Produktbeskrivning", "Antal", "Belopp"]
        assert extractor.is_line_items_table(line_items_headers) is True

        # Summary table
        summary_headers = ["Frakt", "Faktura.avg", "Exkl.moms", "Moms", "Belopp att betala"]
        assert extractor.is_line_items_table(summary_headers) is False

        # Payment table
        payment_headers = ["Bankgiro", "OCR", "Belopp"]
        assert extractor.is_line_items_table(payment_headers) is False
class TestLineItemsExtractorFromPdf:
    """Tests for PDF extraction.

    Both tests replace ``_detect_tables_with_parsing`` on the extractor
    instance, so no PDF is actually opened; ``"fake.pdf"`` is only a label.
    """

    def test_extract_from_pdf_no_tables(self):
        """Test extraction from PDF with no tables returns None."""
        from unittest.mock import patch

        extractor = LineItemsExtractor()

        # Mock _detect_tables_with_parsing to return no tables and no parsing_res
        with patch.object(extractor, '_detect_tables_with_parsing') as mock_detect:
            mock_detect.return_value = ([], [])
            result = extractor.extract_from_pdf("fake.pdf")

        assert result is None

    def test_extract_from_pdf_with_tables(self):
        """Test extraction from PDF with tables."""
        from unittest.mock import patch, MagicMock
        from backend.table.structure_detector import TableDetectionResult

        extractor = LineItemsExtractor()

        # Create mock table detection result with proper thead/tbody structure.
        # NOTE(review): markup rebuilt from the cell values -- confirm it
        # matches what the table detector really emits.
        mock_table = MagicMock(spec=TableDetectionResult)
        mock_table.html = """
        <table>
            <thead>
                <tr><th>Beskrivning</th><th>Antal</th><th>Pris</th><th>Belopp</th></tr>
            </thead>
            <tbody>
                <tr><td>Product A</td><td>2</td><td>100,00</td><td>200,00</td></tr>
            </tbody>
        </table>
        """

        # Mock _detect_tables_with_parsing to return table results
        with patch.object(extractor, '_detect_tables_with_parsing') as mock_detect:
            mock_detect.return_value = ([mock_table], [])
            result = extractor.extract_from_pdf("fake.pdf")

        assert result is not None
        assert len(result.items) >= 1
class TestPdfPathValidation:
    """Checks how ``_detect_tables_with_parsing`` reacts to different PDF paths."""

    def test_detect_tables_with_nonexistent_path(self):
        """A path that does not exist yields two empty lists."""
        line_items_extractor = LineItemsExtractor()

        from unittest.mock import MagicMock
        from backend.table.structure_detector import TableDetector

        detector_stub = MagicMock(spec=TableDetector)

        detected, parsed = line_items_extractor._detect_tables_with_parsing(
            detector_stub, "nonexistent.pdf"
        )

        assert detected == []
        assert parsed == []

    def test_detect_tables_with_directory_path(self, tmp_path):
        """A directory given instead of a file yields two empty lists."""
        line_items_extractor = LineItemsExtractor()

        from unittest.mock import MagicMock
        from backend.table.structure_detector import TableDetector

        detector_stub = MagicMock(spec=TableDetector)
        directory = str(tmp_path)  # pytest's tmp_path is a directory, not a file

        detected, parsed = line_items_extractor._detect_tables_with_parsing(
            detector_stub, directory
        )

        assert detected == []
        assert parsed == []

    def test_detect_tables_validates_file_exists(self, tmp_path):
        """An existing file gets past path validation and reaches rendering.

        The renderer is patched to yield no pages, so the call is expected to
        invoke it exactly once and still return two empty lists.
        """
        from unittest.mock import patch

        line_items_extractor = LineItemsExtractor()

        # A real file on disk whose contents are not a valid PDF.
        existing_file = tmp_path / "test.pdf"
        existing_file.write_bytes(b"not a real pdf")

        # Patch the renderer so no actual PDF processing happens; an empty
        # iterator stands for "file exists but has no pages".
        with patch(
            "shared.pdf.renderer.render_pdf_to_images", return_value=iter([])
        ) as render_stub:
            from unittest.mock import MagicMock
            from backend.table.structure_detector import TableDetector

            detector_stub = MagicMock(spec=TableDetector)
            detector_stub._ensure_initialized = MagicMock()
            detector_stub._pipeline = MagicMock()

            detected, parsed = line_items_extractor._detect_tables_with_parsing(
                detector_stub, str(existing_file)
            )

            # Reaching the renderer proves path validation passed.
            render_stub.assert_called_once()
            assert detected == []
            assert parsed == []
class TestLineItemsResult:
    """Checks for the ``LineItemsResult`` dataclass and its ``total_amount``."""

    def test_create_result(self):
        """Constructor arguments are readable back from the result."""
        first = LineItem(row_index=0, description="Item 1", amount="100")
        second = LineItem(row_index=1, description="Item 2", amount="200")

        outcome = LineItemsResult(
            items=[first, second],
            header_row=["Beskrivning", "Belopp"],
            raw_html="",
        )

        assert len(outcome.items) == 2
        assert outcome.header_row == ["Beskrivning", "Belopp"]
        assert outcome.raw_html == ""

    def test_total_amount_calculation(self):
        """Two comma-decimal amounts add up to the expected formatted total."""
        outcome = LineItemsResult(
            items=[
                LineItem(row_index=0, description="Item 1", amount="100,00"),
                LineItem(row_index=1, description="Item 2", amount="200,50"),
            ],
            header_row=[],
            raw_html="",
        )

        # 100,00 + 200,50
        assert outcome.total_amount == "300,50"

    def test_total_amount_with_deduction(self):
        """A negative deduction row is included in the total."""
        rent = LineItem(row_index=0, description="Rent", amount="8159", is_deduction=False)
        deduction = LineItem(
            row_index=1, description="Avdrag", amount="-2000", is_deduction=True
        )

        outcome = LineItemsResult(items=[rent, deduction], header_row=[], raw_html="")

        # 8159 + (-2000) = 6159
        assert outcome.total_amount == "6 159,00"

    def test_empty_result(self):
        """A result without items has no total."""
        outcome = LineItemsResult(items=[], header_row=[], raw_html="")

        assert outcome.items == []
        assert outcome.total_amount is None
class TestMergedCellExtraction:
    """Tests for merged cell extraction (rental invoices).

    A "merged" header here is a single cell that holds several header
    keywords plus data, e.g.
    ``"Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"``.
    """

    def test_has_merged_header_single_cell_with_keywords(self):
        """Test detection of merged header with multiple keywords."""
        extractor = LineItemsExtractor()

        # Single cell with multiple keywords - should be detected as merged
        merged_header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        assert extractor._has_merged_header(merged_header) is True

    def test_has_merged_header_normal_header(self):
        """Test normal header is not detected as merged."""
        extractor = LineItemsExtractor()

        # Normal separate headers
        normal_header = ["Beskrivning", "Antal", "Belopp"]
        assert extractor._has_merged_header(normal_header) is False

    def test_has_merged_header_empty(self):
        """Test empty header."""
        extractor = LineItemsExtractor()

        # Both an empty list and None must be handled without raising.
        assert extractor._has_merged_header([]) is False
        assert extractor._has_merged_header(None) is False

    def test_has_merged_header_with_empty_trailing_cells(self):
        """Test merged header detection with empty trailing cells."""
        extractor = LineItemsExtractor()

        # PP-StructureV3 may produce headers with empty trailing cells
        merged_header_with_empty = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag", "", "", ""]
        assert extractor._has_merged_header(merged_header_with_empty) is True

        # Should also work with leading empty cells
        merged_header_leading_empty = ["", "", "Specifikation 0218103-1201 2 rum och kök Hyra Avdrag", ""]
        assert extractor._has_merged_header(merged_header_leading_empty) is True

    def test_extract_from_merged_cells_rental_invoice(self):
        """Test extracting from merged cells like rental invoice.

        Each amount becomes a separate row. Negative amounts are marked as is_deduction=True.
        """
        extractor = LineItemsExtractor()

        header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        rows = [
            ["", "", "", "8159 -2000"],
            ["", "", "", ""],
        ]

        items = extractor._extract_from_merged_cells(header, rows)

        # Should have 2 items: one for amount, one for deduction
        assert len(items) == 2
        assert items[0].amount == "8159"
        assert items[0].is_deduction is False
        assert items[0].article_number == "0218103-1201"
        assert items[0].description == "2 rum och kök"

        assert items[1].amount == "-2000"
        assert items[1].is_deduction is True
        assert items[1].description == "Avdrag"

    def test_extract_from_merged_cells_separate_rows(self):
        """Test extracting when amount and deduction are in separate rows."""
        extractor = LineItemsExtractor()

        header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        rows = [
            ["", "", "", "8159"],  # Amount in row 1
            ["", "", "", "-2000"],  # Deduction in row 2
        ]

        items = extractor._extract_from_merged_cells(header, rows)

        # Should have 2 items: one for amount, one for deduction
        assert len(items) == 2
        assert items[0].amount == "8159"
        assert items[0].is_deduction is False
        assert items[0].article_number == "0218103-1201"
        assert items[0].description == "2 rum och kök"

        assert items[1].amount == "-2000"
        assert items[1].is_deduction is True

    def test_extract_from_merged_cells_swedish_format(self):
        """Test extracting Swedish formatted amounts with spaces."""
        extractor = LineItemsExtractor()

        header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        rows = [
            ["", "", "", "8 159"],  # Swedish format with space
            ["", "", "", "-2 000"],  # Swedish format with space
        ]

        items = extractor._extract_from_merged_cells(header, rows)

        # Should have 2 items
        assert len(items) == 2
        # Amounts are cleaned (spaces removed)
        assert items[0].amount == "8159"
        assert items[0].is_deduction is False
        assert items[1].amount == "-2000"
        assert items[1].is_deduction is True

    def test_extract_merged_cells_via_extract(self):
        """Test that extract() calls merged cell parsing when needed."""
        # NOTE(review): markup rebuilt from the cell values -- one merged
        # header cell, then a data row whose last cell holds both amounts.
        # Confirm against the HTML the table detector really produces.
        html = """
        <table>
            <thead>
                <tr><th>Specifikation 0218103-1201 2 rum och kök Hyra Avdrag</th></tr>
            </thead>
            <tbody>
                <tr><td></td><td></td><td>8159 -2000</td></tr>
            </tbody>
        </table>
        """
        extractor = LineItemsExtractor()
        result = extractor.extract(html)

        # Should have extracted 2 items via merged cell parsing
        assert len(result.items) == 2
        assert result.items[0].amount == "8159"
        assert result.items[0].is_deduction is False
        assert result.items[1].amount == "-2000"
        assert result.items[1].is_deduction is True
class TestTextFallbackExtraction:
    """Tests for text-based fallback extraction.

    The fallback is controlled by the ``enable_text_fallback`` constructor
    argument and exercised through ``_try_text_fallback`` and
    ``extract_from_pdf``. Collaborators are patched, so no PDF is read.
    """

    def test_text_fallback_can_be_disabled(self):
        """Test text fallback can be disabled."""
        extractor = LineItemsExtractor(enable_text_fallback=False)
        assert extractor.enable_text_fallback is False

    def test_text_fallback_enabled_by_default(self):
        """Test text fallback is enabled by default."""
        extractor = LineItemsExtractor()
        assert extractor.enable_text_fallback is True

    def test_try_text_fallback_with_valid_parsing_res(self):
        """Test text fallback with valid parsing results."""
        from unittest.mock import patch
        from backend.table.text_line_items_extractor import (
            TextLineItemsExtractor,
            TextLineItem,
            TextLineItemsResult,
        )

        extractor = LineItemsExtractor()

        # Mock parsing_res_list with text elements
        parsing_res = [
            {"label": "text", "bbox": [0, 100, 200, 120], "text": "Product A"},
            {"label": "text", "bbox": [250, 100, 350, 120], "text": "1 234,56"},
            {"label": "text", "bbox": [0, 150, 200, 170], "text": "Product B"},
            {"label": "text", "bbox": [250, 150, 350, 170], "text": "2 345,67"},
        ]

        # Create mock text extraction result
        mock_text_result = TextLineItemsResult(
            items=[
                TextLineItem(row_index=0, description="Product A", amount="1 234,56"),
                TextLineItem(row_index=1, description="Product B", amount="2 345,67"),
            ],
            header_row=[],
        )

        with patch.object(
            TextLineItemsExtractor,
            'extract_from_parsing_res',
            return_value=mock_text_result,
        ):
            result = extractor._try_text_fallback(parsing_res)

        assert result is not None
        assert len(result.items) == 2
        assert result.items[0].description == "Product A"
        assert result.items[1].description == "Product B"

    def test_try_text_fallback_returns_none_on_failure(self):
        """Test text fallback returns None when extraction fails."""
        from unittest.mock import patch

        extractor = LineItemsExtractor()

        # The text extractor reporting "nothing found" is modelled as None.
        with patch(
            'backend.table.text_line_items_extractor.TextLineItemsExtractor.extract_from_parsing_res',
            return_value=None,
        ):
            result = extractor._try_text_fallback([])

        assert result is None

    def test_extract_from_pdf_uses_text_fallback(self):
        """Test extract_from_pdf uses text fallback when no tables found."""
        from unittest.mock import patch, MagicMock

        extractor = LineItemsExtractor(enable_text_fallback=True)

        # No tables detected, but parsing results are available for the fallback.
        with patch.object(
            extractor, '_detect_tables_with_parsing'
        ) as mock_detect, patch.object(
            extractor, '_try_text_fallback', return_value=MagicMock(items=[MagicMock()])
        ) as mock_fallback:
            mock_detect.return_value = ([], [{"label": "text", "text": "test"}])
            extractor.extract_from_pdf("fake.pdf")

        # Text fallback should be called
        mock_fallback.assert_called_once()

    def test_extract_from_pdf_skips_fallback_when_disabled(self):
        """Test extract_from_pdf skips text fallback when disabled."""
        from unittest.mock import patch

        extractor = LineItemsExtractor(enable_text_fallback=False)

        with patch.object(extractor, '_detect_tables_with_parsing') as mock_detect:
            mock_detect.return_value = ([], [{"label": "text", "text": "test"}])
            result = extractor.extract_from_pdf("fake.pdf")

        # Should return None, not use text fallback
        assert result is None
class TestVerticallyMergedCellExtraction:
    """Checks for rows where several values share one cell (vertical merge)."""

    def test_detects_vertically_merged_cells(self):
        """A cell holding several product numbers is flagged as merged."""
        # One row, one cell, three product numbers in the same cell.
        merged_rows = [["Produktnr 1457280 1457281 1060381 merged text here"]]

        detected = LineItemsExtractor()._has_vertically_merged_cells(merged_rows)

        assert detected is True

    def test_splits_vertically_merged_rows(self):
        """Splitting merged rows returns a header list and a data list."""
        merged_rows = [["Produktnr 1234567 1234568", "Antal 2ST 3ST"]]

        split_header, split_data = LineItemsExtractor()._split_merged_rows(merged_rows)

        # Only the shape of the return value is pinned down here.
        assert isinstance(split_header, list)
        assert isinstance(split_data, list)
class TestDeductionDetection:
    """Tests for deduction/discount detection.

    Every test feeds ``LineItemsExtractor.extract`` a table with a
    ``Beskrivning``/``Belopp`` header and a single data row, then checks the
    ``is_deduction`` flag of the one extracted item.
    """

    @staticmethod
    def _extract_single_row(description, amount):
        """Extract from a two-column table containing exactly one data row.

        Args:
            description: Text placed in the ``Beskrivning`` column.
            amount: Text placed in the ``Belopp`` column.

        Returns:
            Whatever ``LineItemsExtractor().extract`` returns for that table.
        """
        # NOTE(review): markup rebuilt from the cell values -- confirm the
        # thead/tbody layout matches what the extractor is meant to receive.
        html = f"""
        <table>
            <thead>
                <tr><th>Beskrivning</th><th>Belopp</th></tr>
            </thead>
            <tbody>
                <tr><td>{description}</td><td>{amount}</td></tr>
            </tbody>
        </table>
        """
        return LineItemsExtractor().extract(html)

    def test_detects_deduction_by_keyword_avdrag(self):
        """Test detection of deduction by 'avdrag' keyword."""
        result = self._extract_single_row("Hyresavdrag januari", "-500,00")

        assert len(result.items) == 1
        assert result.items[0].is_deduction is True

    def test_detects_deduction_by_keyword_rabatt(self):
        """Test detection of deduction by 'rabatt' keyword."""
        result = self._extract_single_row("Rabatt 10%", "-100,00")

        assert len(result.items) == 1
        assert result.items[0].is_deduction is True

    def test_detects_deduction_by_negative_amount(self):
        """Test detection of deduction by negative amount."""
        # The description carries no deduction keyword; only the sign does.
        result = self._extract_single_row("Some credit", "-250,00")

        assert len(result.items) == 1
        assert result.items[0].is_deduction is True

    def test_normal_item_not_deduction(self):
        """Test normal item is not marked as deduction."""
        result = self._extract_single_row("Normal product", "500,00")

        assert len(result.items) == 1
        assert result.items[0].is_deduction is False
class TestHeaderDetection:
    """Checks where ``_detect_header_row`` finds the header within table rows."""

    def test_detect_header_at_bottom(self):
        """A header in the last row is reported with the at-end flag set."""
        table_rows = [
            ["100,00", "Product A", "1"],
            ["200,00", "Product B", "2"],
            ["Belopp", "Beskrivning", "Antal"],  # header sits in the last row
        ]

        position, header_cells, at_end = LineItemsExtractor()._detect_header_row(table_rows)

        assert position == 2
        assert at_end is True
        assert "Belopp" in header_cells

    def test_detect_header_at_top(self):
        """A header in the first row is reported at index 0, not at the end."""
        table_rows = [
            ["Belopp", "Beskrivning", "Antal"],  # header sits in the first row
            ["100,00", "Product A", "1"],
            ["200,00", "Product B", "2"],
        ]

        position, header_cells, at_end = LineItemsExtractor()._detect_header_row(table_rows)

        assert position == 0
        assert at_end is False
        assert "Belopp" in header_cells

    def test_no_header_detected(self):
        """Rows without any header give index -1, an empty header and no flag."""
        table_rows = [
            ["100,00", "Product A", "1"],
            ["200,00", "Product B", "2"],
        ]

        position, header_cells, at_end = LineItemsExtractor()._detect_header_row(table_rows)

        assert position == -1
        assert header_cells == []
        assert at_end is False