465 lines
16 KiB
Python
465 lines
16 KiB
Python
"""
|
|
Tests for Line Items Extractor

Tests extraction of structured line items from HTML tables.
|
|
"""
|
|
|
|
import pytest
|
|
from backend.table.line_items_extractor import (
|
|
LineItem,
|
|
LineItemsResult,
|
|
LineItemsExtractor,
|
|
ColumnMapper,
|
|
HTMLTableParser,
|
|
)
|
|
|
|
|
|
class TestLineItem:
    """Construction checks for the LineItem dataclass."""

    def test_create_line_item_with_all_fields(self):
        """A LineItem built with every field keeps the values it was given."""
        field_values = {
            "row_index": 0,
            "description": "Samfällighetsavgift",
            "quantity": "1",
            "unit": "st",
            "unit_price": "6888,00",
            "amount": "6888,00",
            "article_number": "3035",
            "vat_rate": "25",
            "confidence": 0.95,
        }

        line_item = LineItem(**field_values)

        # Only these four attributes are verified by this test.
        for attr in ("description", "quantity", "amount", "article_number"):
            assert getattr(line_item, attr) == field_values[attr]

    def test_create_line_item_with_minimal_fields(self):
        """A LineItem built from row index, description and amount only."""
        line_item = LineItem(row_index=0, description="Test item", amount="100,00")

        assert line_item.description == "Test item"
        assert line_item.amount == "100,00"
        # Fields left out of the constructor call are expected to be None.
        assert line_item.quantity is None
        assert line_item.unit_price is None
|
|
|
|
|
|
class TestHTMLTableParser:
    """Checks HTMLTableParser.parse against small hand-written tables."""

    def test_parse_simple_table(self):
        """Without a thead, the header is empty and every row is a data row."""
        markup = """
        <html><body><table>
            <tr><td>A</td><td>B</td></tr>
            <tr><td>1</td><td>2</td></tr>
        </table></body></html>
        """

        header, rows = HTMLTableParser().parse(markup)

        assert header == []  # No thead
        assert len(rows) == 2
        assert rows[0] == ["A", "B"]
        assert rows[1] == ["1", "2"]

    def test_parse_table_with_thead(self):
        """Cells inside thead are returned as the header, tbody rows as data."""
        markup = """
        <html><body><table>
            <thead><tr><th>Name</th><th>Price</th></tr></thead>
            <tbody><tr><td>Item 1</td><td>100</td></tr></tbody>
        </table></body></html>
        """

        header, rows = HTMLTableParser().parse(markup)

        assert header == ["Name", "Price"]
        assert len(rows) == 1
        assert rows[0] == ["Item 1", "100"]

    def test_parse_empty_table(self):
        """A table element with no rows gives an empty header and no rows."""
        table_parser = HTMLTableParser()

        header, rows = table_parser.parse("<html><body><table></table></body></html>")

        assert header == []
        assert rows == []

    def test_parse_table_with_empty_cells(self):
        """Empty td cells are kept as empty strings so columns stay aligned."""
        markup = """
        <html><body><table>
            <tr><td></td><td>Value</td><td></td></tr>
        </table></body></html>
        """

        # The header is not inspected by this test.
        _header, rows = HTMLTableParser().parse(markup)

        assert rows[0] == ["", "Value", ""]
|
|
|
|
|
|
class TestColumnMapper:
    """Checks how ColumnMapper.map assigns field names to header columns."""

    def test_map_swedish_headers(self):
        """Each Swedish header is mapped to its field name, keyed by column index."""
        swedish_headers = [
            "Art nummer",
            "Produktbeskrivning",
            "Antal",
            "Enhet",
            "A-pris",
            "Belopp",
        ]
        # Expected field name for each column, in column order.
        expected_fields = [
            "article_number",
            "description",
            "quantity",
            "unit",
            "unit_price",
            "amount",
        ]

        mapping = ColumnMapper().map(swedish_headers)

        for column_index, field_name in enumerate(expected_fields):
            assert mapping[column_index] == field_name

    def test_map_merged_headers(self):
        """Merged header cells (e.g. 'Moms A-pris') do not disturb the plain ones."""
        merged_headers = ["Belopp", "Moms A-pris", "Enhet Antal", "Vara/tjänst", "Art.nr"]

        mapping = ColumnMapper().map(merged_headers)

        # Columns 1 and 2 hold the merged cells and are deliberately not asserted.
        expected_by_column = {
            0: "amount",
            3: "description",  # Vara/tjänst -> description
            4: "article_number",  # Art.nr -> article_number
        }
        for column_index, field_name in expected_by_column.items():
            assert mapping.get(column_index) == field_name

    def test_map_empty_headers(self):
        """Blank header cells produce an empty mapping."""
        column_mapper = ColumnMapper()

        assert column_mapper.map(["", "", ""]) == {}

    def test_map_unknown_headers(self):
        """Headers that match no known field produce an empty mapping."""
        column_mapper = ColumnMapper()

        assert column_mapper.map(["Foo", "Bar", "Baz"]) == {}
|
|
|
|
|
|
class TestLineItemsExtractor:
    """End-to-end checks for LineItemsExtractor on HTML input."""

    def test_extract_from_simple_html(self):
        """Two body rows under a thead become two line items."""
        markup = """
        <html><body><table>
            <thead><tr><th>Beskrivning</th><th>Antal</th><th>Pris</th><th>Belopp</th></tr></thead>
            <tbody>
                <tr><td>Product A</td><td>2</td><td>50,00</td><td>100,00</td></tr>
                <tr><td>Product B</td><td>1</td><td>75,00</td><td>75,00</td></tr>
            </tbody>
        </table></body></html>
        """

        extraction = LineItemsExtractor().extract(markup)

        assert len(extraction.items) == 2
        first_item, second_item = extraction.items
        assert first_item.description == "Product A"
        assert first_item.quantity == "2"
        assert first_item.amount == "100,00"
        assert second_item.description == "Product B"

    def test_extract_from_reversed_table(self):
        """Header row at the bottom of the table (PP-StructureV3 quirk) still works."""
        markup = """
        <html><body><table>
            <tr><td>6 888,00</td><td>6 888,00</td><td>1</td><td>Samfällighetsavgift</td><td>3035</td></tr>
            <tr><td>4 811,44</td><td>4 811,44</td><td>1</td><td>GA:1 Avgift</td><td>303501</td></tr>
            <tr><td>Belopp</td><td>Moms A-pris</td><td>Enhet Antal</td><td>Vara/tjänst</td><td>Art.nr</td></tr>
        </table></body></html>
        """

        extraction = LineItemsExtractor().extract(markup)

        assert len(extraction.items) == 2
        first_item, second_item = extraction.items
        assert first_item.amount == "6 888,00"
        assert first_item.description == "Samfällighetsavgift"
        assert second_item.description == "GA:1 Avgift"

    def test_extract_from_empty_html(self):
        """A table with no rows yields no line items."""
        empty_table = "<html><body><table></table></body></html>"

        extraction = LineItemsExtractor().extract(empty_table)

        assert extraction.items == []

    def test_extract_returns_result_with_metadata(self):
        """The result is a LineItemsResult that carries the raw HTML and header row."""
        markup = """
        <html><body><table>
            <thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
            <tbody><tr><td>Test</td><td>100</td></tr></tbody>
        </table></body></html>
        """

        extraction = LineItemsExtractor().extract(markup)

        assert isinstance(extraction, LineItemsResult)
        # The input string is expected back unchanged.
        assert extraction.raw_html == markup
        assert extraction.header_row == ["Beskrivning", "Belopp"]

    def test_extract_skips_empty_rows(self):
        """Rows whose cells are all empty do not become line items."""
        markup = """
        <html><body><table>
            <thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
            <tbody>
                <tr><td></td><td></td></tr>
                <tr><td>Real item</td><td>100</td></tr>
                <tr><td></td><td></td></tr>
            </tbody>
        </table></body></html>
        """

        extraction = LineItemsExtractor().extract(markup)

        assert len(extraction.items) == 1
        assert extraction.items[0].description == "Real item"

    def test_is_line_items_table(self):
        """Line-item headers are accepted; summary and payment headers are rejected."""
        extractor = LineItemsExtractor()

        header_cases = [
            # Line items table
            (["Art nummer", "Produktbeskrivning", "Antal", "Belopp"], True),
            # Summary table
            (["Frakt", "Faktura.avg", "Exkl.moms", "Moms", "Belopp att betala"], False),
            # Payment table
            (["Bankgiro", "OCR", "Belopp"], False),
        ]

        for headers, expected_verdict in header_cases:
            assert extractor.is_line_items_table(headers) is expected_verdict
|
|
|
|
|
|
class TestLineItemsExtractorFromPdf:
    """Checks extract_from_pdf with the table detection step mocked out."""

    def test_extract_from_pdf_no_tables(self):
        """When detection finds no tables, extract_from_pdf returns None."""
        from unittest.mock import patch

        extractor = LineItemsExtractor()

        # Stub detection so it reports no tables and no parsing results;
        # "fake.pdf" is therefore never opened by the stubbed step.
        with patch.object(
            extractor, '_detect_tables_with_parsing', return_value=([], [])
        ):
            outcome = extractor.extract_from_pdf("fake.pdf")

        assert outcome is None

    def test_extract_from_pdf_with_tables(self):
        """When detection returns a table, at least one line item is extracted."""
        from unittest.mock import patch, MagicMock
        from backend.table.structure_detector import TableDetectionResult

        extractor = LineItemsExtractor()

        # Stand-in for a detection result; only its html attribute is set here.
        detected_table = MagicMock(spec=TableDetectionResult)
        detected_table.html = """
        <table>
            <tr><th>Beskrivning</th><th>Antal</th><th>Pris</th><th>Belopp</th></tr>
            <tr><td>Product A</td><td>2</td><td>100,00</td><td>200,00</td></tr>
        </table>
        """

        # Stub detection so it hands back the mocked table and no parsing results.
        with patch.object(
            extractor, '_detect_tables_with_parsing', return_value=([detected_table], [])
        ):
            outcome = extractor.extract_from_pdf("fake.pdf")

        assert outcome is not None
        assert len(outcome.items) >= 1
|
|
|
|
|
|
class TestLineItemsResult:
    """Checks the LineItemsResult dataclass and its total_amount value."""

    def test_create_result(self):
        """A result keeps the items, header row and raw HTML it was built with."""
        header_cells = ["Beskrivning", "Belopp"]
        table_html = "<table>...</table>"
        line_items = [
            LineItem(row_index=0, description="Item 1", amount="100"),
            LineItem(row_index=1, description="Item 2", amount="200"),
        ]

        outcome = LineItemsResult(
            items=line_items,
            header_row=["Beskrivning", "Belopp"],
            raw_html="<table>...</table>",
        )

        assert len(outcome.items) == 2
        assert outcome.header_row == header_cells
        assert outcome.raw_html == table_html

    def test_total_amount_calculation(self):
        """Comma-decimal amounts are summed and reported with a comma decimal."""
        line_items = [
            LineItem(row_index=0, description="Item 1", amount="100,00"),
            LineItem(row_index=1, description="Item 2", amount="200,50"),
        ]

        outcome = LineItemsResult(items=line_items, header_row=[], raw_html="")

        # 100,00 + 200,50
        assert outcome.total_amount == "300,50"

    def test_total_amount_with_deduction(self):
        """A negative deduction row lowers the total."""
        rent_row = LineItem(
            row_index=0, description="Rent", amount="8159", is_deduction=False
        )
        deduction_row = LineItem(
            row_index=1, description="Avdrag", amount="-2000", is_deduction=True
        )

        outcome = LineItemsResult(
            items=[rent_row, deduction_row], header_row=[], raw_html=""
        )

        # 8159 + (-2000) = 6159; the expected string uses a space as thousands separator.
        assert outcome.total_amount == "6 159,00"

    def test_empty_result(self):
        """With no items, the items list is empty and total_amount is None."""
        outcome = LineItemsResult(items=[], header_row=[], raw_html="")

        assert outcome.items == []
        assert outcome.total_amount is None
|
|
|
|
|
|
class TestMergedCellExtraction:
    """Checks merged-cell handling, as seen on rental invoices."""

    # Single header cell carrying several column keywords at once; shared by the
    # tests below.
    _RENTAL_HEADER_CELL = "Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"

    def test_has_merged_header_single_cell_with_keywords(self):
        """One cell containing several keywords counts as a merged header."""
        extractor = LineItemsExtractor()

        single_cell_header = [self._RENTAL_HEADER_CELL]

        assert extractor._has_merged_header(single_cell_header) is True

    def test_has_merged_header_normal_header(self):
        """Separate one-keyword cells are not reported as merged."""
        extractor = LineItemsExtractor()

        separate_cells = ["Beskrivning", "Antal", "Belopp"]

        assert extractor._has_merged_header(separate_cells) is False

    def test_has_merged_header_empty(self):
        """Neither an empty list nor None is reported as a merged header."""
        extractor = LineItemsExtractor()

        for absent_header in ([], None):
            assert extractor._has_merged_header(absent_header) is False

    def test_has_merged_header_with_empty_trailing_cells(self):
        """Empty cells around the merged cell do not hide it."""
        extractor = LineItemsExtractor()

        # PP-StructureV3 may produce headers with empty trailing cells
        padded_after = [self._RENTAL_HEADER_CELL, "", "", ""]
        assert extractor._has_merged_header(padded_after) is True

        # Leading empty cells must be tolerated as well
        padded_before = ["", "", self._RENTAL_HEADER_CELL, ""]
        assert extractor._has_merged_header(padded_before) is True

    def test_extract_from_merged_cells_rental_invoice(self):
        """Two amounts sharing one cell are split into two items.

        Every amount gets its own item, and a negative amount is flagged with
        is_deduction=True.
        """
        extractor = LineItemsExtractor()
        header = [self._RENTAL_HEADER_CELL]
        rows = [
            ["", "", "", "8159 -2000"],
            ["", "", "", ""],
        ]

        items = extractor._extract_from_merged_cells(header, rows)

        # One item for the amount, one for the deduction
        assert len(items) == 2
        amount_item, deduction_item = items

        assert amount_item.amount == "8159"
        assert amount_item.is_deduction is False
        assert amount_item.article_number == "0218103-1201"
        assert amount_item.description == "2 rum och kök"

        assert deduction_item.amount == "-2000"
        assert deduction_item.is_deduction is True
        assert deduction_item.description == "Avdrag"

    def test_extract_from_merged_cells_separate_rows(self):
        """Amount and deduction on separate rows still give two items."""
        extractor = LineItemsExtractor()
        header = [self._RENTAL_HEADER_CELL]
        rows = [
            ["", "", "", "8159"],  # Amount in row 1
            ["", "", "", "-2000"],  # Deduction in row 2
        ]

        items = extractor._extract_from_merged_cells(header, rows)

        # One item for the amount, one for the deduction
        assert len(items) == 2
        amount_item, deduction_item = items

        assert amount_item.amount == "8159"
        assert amount_item.is_deduction is False
        assert amount_item.article_number == "0218103-1201"
        assert amount_item.description == "2 rum och kök"

        assert deduction_item.amount == "-2000"
        assert deduction_item.is_deduction is True

    def test_extract_from_merged_cells_swedish_format(self):
        """Amounts written with a space as thousands separator are recognised."""
        extractor = LineItemsExtractor()
        header = [self._RENTAL_HEADER_CELL]
        rows = [
            ["", "", "", "8 159"],  # Swedish format with space
            ["", "", "", "-2 000"],  # Swedish format with space
        ]

        items = extractor._extract_from_merged_cells(header, rows)

        assert len(items) == 2
        amount_item, deduction_item = items

        # Amounts are cleaned (spaces removed)
        assert amount_item.amount == "8159"
        assert amount_item.is_deduction is False
        assert deduction_item.amount == "-2000"
        assert deduction_item.is_deduction is True

    def test_extract_merged_cells_via_extract(self):
        """The public extract() gives the merged-cell result for such a table."""
        markup = """
        <html><body><table>
            <tr><td colspan="4">Specifikation 0218103-1201 2 rum och kök Hyra Avdrag</td></tr>
            <tr><td></td><td></td><td></td><td>8159 -2000</td></tr>
        </table></body></html>
        """

        extraction = LineItemsExtractor().extract(markup)

        # Two items are expected, split out of the shared amount cell
        assert len(extraction.items) == 2
        amount_item, deduction_item = extraction.items

        assert amount_item.amount == "8159"
        assert amount_item.is_deduction is False
        assert deduction_item.amount == "-2000"
        assert deduction_item.is_deduction is True
|