Update paddle, and support invoice line item
This commit is contained in:
464
tests/table/test_line_items_extractor.py
Normal file
464
tests/table/test_line_items_extractor.py
Normal file
@@ -0,0 +1,464 @@
|
||||
"""
|
||||
Tests for Line Items Extractor
|
||||
|
||||
Tests extraction of structured line items from HTML tables.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from backend.table.line_items_extractor import (
|
||||
LineItem,
|
||||
LineItemsResult,
|
||||
LineItemsExtractor,
|
||||
ColumnMapper,
|
||||
HTMLTableParser,
|
||||
)
|
||||
|
||||
|
||||
class TestLineItem:
    """Construction checks for the LineItem dataclass."""

    def test_create_line_item_with_all_fields(self):
        """A LineItem built with every field exposes the values it was given."""
        field_values = {
            "row_index": 0,
            "description": "Samfällighetsavgift",
            "quantity": "1",
            "unit": "st",
            "unit_price": "6888,00",
            "amount": "6888,00",
            "article_number": "3035",
            "vat_rate": "25",
            "confidence": 0.95,
        }
        line_item = LineItem(**field_values)

        # Spot-check a subset of the fields against the constructor input.
        for attr in ("description", "quantity", "amount", "article_number"):
            assert getattr(line_item, attr) == field_values[attr]

    def test_create_line_item_with_minimal_fields(self):
        """Build a LineItem from row_index, description and amount only."""
        line_item = LineItem(row_index=0, description="Test item", amount="100,00")

        assert line_item.description == "Test item"
        assert line_item.amount == "100,00"
        # Fields that were not supplied are expected to be None.
        assert line_item.quantity is None
        assert line_item.unit_price is None
|
||||
|
||||
|
||||
class TestHTMLTableParser:
    """Checks for HTMLTableParser.parse()."""

    def test_parse_simple_table(self):
        """A table without <thead> gives an empty header and two data rows."""
        markup = """
        <html><body><table>
            <tr><td>A</td><td>B</td></tr>
            <tr><td>1</td><td>2</td></tr>
        </table></body></html>
        """
        header, rows = HTMLTableParser().parse(markup)

        assert header == []  # No thead
        assert len(rows) == 2
        first_row, second_row = rows[0], rows[1]
        assert first_row == ["A", "B"]
        assert second_row == ["1", "2"]

    def test_parse_table_with_thead(self):
        """An explicit <thead> is returned as the header, <tbody> as the rows."""
        markup = """
        <html><body><table>
            <thead><tr><th>Name</th><th>Price</th></tr></thead>
            <tbody><tr><td>Item 1</td><td>100</td></tr></tbody>
        </table></body></html>
        """
        header, rows = HTMLTableParser().parse(markup)

        assert header == ["Name", "Price"]
        assert len(rows) == 1
        assert rows[0] == ["Item 1", "100"]

    def test_parse_empty_table(self):
        """A <table> with no rows parses to an empty header and no rows."""
        markup = "<html><body><table></table></body></html>"
        header, rows = HTMLTableParser().parse(markup)

        assert header == []
        assert rows == []

    def test_parse_table_with_empty_cells(self):
        """Empty <td> cells are kept as empty strings in the parsed row."""
        markup = """
        <html><body><table>
            <tr><td></td><td>Value</td><td></td></tr>
        </table></body></html>
        """
        header, rows = HTMLTableParser().parse(markup)

        only_row = rows[0]
        assert only_row == ["", "Value", ""]
|
||||
|
||||
|
||||
class TestColumnMapper:
    """Checks for ColumnMapper.map()."""

    def test_map_swedish_headers(self):
        """Each Swedish header maps to the expected field at its column index."""
        headers = ["Art nummer", "Produktbeskrivning", "Antal", "Enhet", "A-pris", "Belopp"]
        expected_fields = [
            "article_number",
            "description",
            "quantity",
            "unit",
            "unit_price",
            "amount",
        ]

        mapping = ColumnMapper().map(headers)

        for column, field_name in enumerate(expected_fields):
            assert mapping[column] == field_name

    def test_map_merged_headers(self):
        """Merged header cells (e.g. 'Moms A-pris') leave the clear columns mapped."""
        headers = ["Belopp", "Moms A-pris", "Enhet Antal", "Vara/tjänst", "Art.nr"]
        expected_by_column = {
            0: "amount",
            3: "description",  # Vara/tjänst -> description
            4: "article_number",  # Art.nr -> article_number
        }

        mapping = ColumnMapper().map(headers)

        # Columns 1 and 2 hold two merged labels each and are not asserted here.
        for column, field_name in expected_by_column.items():
            assert mapping.get(column) == field_name

    def test_map_empty_headers(self):
        """Blank header cells produce an empty mapping."""
        mapping = ColumnMapper().map(["", "", ""])

        assert mapping == {}

    def test_map_unknown_headers(self):
        """Headers that match nothing produce an empty mapping."""
        mapping = ColumnMapper().map(["Foo", "Bar", "Baz"])

        assert mapping == {}
|
||||
|
||||
|
||||
class TestLineItemsExtractor:
    """Checks for LineItemsExtractor.extract() and is_line_items_table()."""

    def test_extract_from_simple_html(self):
        """Two body rows under a Swedish header become two line items."""
        markup = """
        <html><body><table>
            <thead><tr><th>Beskrivning</th><th>Antal</th><th>Pris</th><th>Belopp</th></tr></thead>
            <tbody>
                <tr><td>Product A</td><td>2</td><td>50,00</td><td>100,00</td></tr>
                <tr><td>Product B</td><td>1</td><td>75,00</td><td>75,00</td></tr>
            </tbody>
        </table></body></html>
        """
        result = LineItemsExtractor().extract(markup)

        assert len(result.items) == 2
        first_item = result.items[0]
        assert first_item.description == "Product A"
        assert first_item.quantity == "2"
        assert first_item.amount == "100,00"
        assert result.items[1].description == "Product B"

    def test_extract_from_reversed_table(self):
        """The header row may come last in the table (PP-StructureV3 quirk)."""
        markup = """
        <html><body><table>
            <tr><td>6 888,00</td><td>6 888,00</td><td>1</td><td>Samfällighetsavgift</td><td>3035</td></tr>
            <tr><td>4 811,44</td><td>4 811,44</td><td>1</td><td>GA:1 Avgift</td><td>303501</td></tr>
            <tr><td>Belopp</td><td>Moms A-pris</td><td>Enhet Antal</td><td>Vara/tjänst</td><td>Art.nr</td></tr>
        </table></body></html>
        """
        result = LineItemsExtractor().extract(markup)

        assert len(result.items) == 2
        first_item = result.items[0]
        assert first_item.amount == "6 888,00"
        assert first_item.description == "Samfällighetsavgift"
        assert result.items[1].description == "GA:1 Avgift"

    def test_extract_from_empty_html(self):
        """An empty <table> yields a result with no items."""
        markup = "<html><body><table></table></body></html>"
        result = LineItemsExtractor().extract(markup)

        assert result.items == []

    def test_extract_returns_result_with_metadata(self):
        """extract() returns a LineItemsResult carrying the raw HTML and header row."""
        html = """
        <html><body><table>
            <thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
            <tbody><tr><td>Test</td><td>100</td></tr></tbody>
        </table></body></html>
        """
        result = LineItemsExtractor().extract(html)

        assert isinstance(result, LineItemsResult)
        # The input markup is expected back unchanged on the result.
        assert result.raw_html == html
        assert result.header_row == ["Beskrivning", "Belopp"]

    def test_extract_skips_empty_rows(self):
        """Rows whose cells are all empty do not become line items."""
        markup = """
        <html><body><table>
            <thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
            <tbody>
                <tr><td></td><td></td></tr>
                <tr><td>Real item</td><td>100</td></tr>
                <tr><td></td><td></td></tr>
            </tbody>
        </table></body></html>
        """
        result = LineItemsExtractor().extract(markup)

        assert len(result.items) == 1
        assert result.items[0].description == "Real item"

    def test_is_line_items_table(self):
        """Line-item headers are accepted; summary and payment headers are rejected."""
        extractor = LineItemsExtractor()
        cases = [
            # Line items table
            (["Art nummer", "Produktbeskrivning", "Antal", "Belopp"], True),
            # Summary table
            (["Frakt", "Faktura.avg", "Exkl.moms", "Moms", "Belopp att betala"], False),
            # Payment table
            (["Bankgiro", "OCR", "Belopp"], False),
        ]

        for headers, expected in cases:
            assert extractor.is_line_items_table(headers) is expected
|
||||
|
||||
|
||||
class TestLineItemsExtractorFromPdf:
    """Checks for LineItemsExtractor.extract_from_pdf() with detection mocked out."""

    def test_extract_from_pdf_no_tables(self):
        """With no detected tables, extract_from_pdf() returns None."""
        from unittest.mock import patch

        extractor = LineItemsExtractor()

        # Stub _detect_tables_with_parsing to report no tables and no parsing_res.
        with patch.object(
            extractor, "_detect_tables_with_parsing", return_value=([], [])
        ):
            result = extractor.extract_from_pdf("fake.pdf")

        assert result is None

    def test_extract_from_pdf_with_tables(self):
        """With one detected table, extract_from_pdf() returns at least one item."""
        from unittest.mock import patch, MagicMock
        from backend.table.structure_detector import TableDetectionResult

        extractor = LineItemsExtractor()

        # Stand-in for a table detection result exposing only the HTML payload.
        detected_table = MagicMock(spec=TableDetectionResult)
        detected_table.html = """
        <table>
            <tr><th>Beskrivning</th><th>Antal</th><th>Pris</th><th>Belopp</th></tr>
            <tr><td>Product A</td><td>2</td><td>100,00</td><td>200,00</td></tr>
        </table>
        """

        # Stub _detect_tables_with_parsing to hand back the mocked table.
        with patch.object(
            extractor,
            "_detect_tables_with_parsing",
            return_value=([detected_table], []),
        ):
            result = extractor.extract_from_pdf("fake.pdf")

        assert result is not None
        assert len(result.items) >= 1
|
||||
|
||||
|
||||
class TestLineItemsResult:
    """Checks for the LineItemsResult dataclass and its total_amount."""

    def test_create_result(self):
        """A LineItemsResult keeps the items, header row and raw HTML it was given."""
        header = ["Beskrivning", "Belopp"]
        line_items = [
            LineItem(row_index=0, description="Item 1", amount="100"),
            LineItem(row_index=1, description="Item 2", amount="200"),
        ]

        result = LineItemsResult(
            items=line_items,
            header_row=["Beskrivning", "Belopp"],
            raw_html="<table>...</table>",
        )

        assert len(result.items) == 2
        assert result.header_row == header
        assert result.raw_html == "<table>...</table>"

    def test_total_amount_calculation(self):
        """total_amount sums comma-decimal amounts across the items."""
        line_items = [
            LineItem(row_index=0, description="Item 1", amount="100,00"),
            LineItem(row_index=1, description="Item 2", amount="200,50"),
        ]
        result = LineItemsResult(items=line_items, header_row=[], raw_html="")

        # Total should be calculated correctly
        assert result.total_amount == "300,50"

    def test_total_amount_with_deduction(self):
        """A negative deduction row is included in total_amount."""
        rent = LineItem(row_index=0, description="Rent", amount="8159", is_deduction=False)
        deduction = LineItem(
            row_index=1, description="Avdrag", amount="-2000", is_deduction=True
        )
        result = LineItemsResult(items=[rent, deduction], header_row=[], raw_html="")

        # Total should be 8159 + (-2000) = 6159
        assert result.total_amount == "6 159,00"

    def test_empty_result(self):
        """With no items, the item list is empty and total_amount is None."""
        result = LineItemsResult(items=[], header_row=[], raw_html="")

        assert result.items == []
        assert result.total_amount is None
|
||||
|
||||
|
||||
class TestMergedCellExtraction:
    """Checks for merged-cell handling (rental invoices)."""

    def test_has_merged_header_single_cell_with_keywords(self):
        """One header cell holding several keywords is reported as merged."""
        extractor = LineItemsExtractor()

        # Single cell with multiple keywords - should be detected as merged
        header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        assert extractor._has_merged_header(header) is True

    def test_has_merged_header_normal_header(self):
        """Separate per-column header cells are not reported as merged."""
        extractor = LineItemsExtractor()

        # Normal separate headers
        header = ["Beskrivning", "Antal", "Belopp"]
        assert extractor._has_merged_header(header) is False

    def test_has_merged_header_empty(self):
        """An empty list and None are both reported as not merged."""
        extractor = LineItemsExtractor()

        for missing_header in ([], None):
            assert extractor._has_merged_header(missing_header) is False

    def test_has_merged_header_with_empty_trailing_cells(self):
        """Empty cells around the merged cell do not prevent detection."""
        extractor = LineItemsExtractor()
        merged_cell = "Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"

        header_variants = (
            # PP-StructureV3 may produce headers with empty trailing cells
            [merged_cell, "", "", ""],
            # Should also work with leading empty cells
            ["", "", merged_cell, ""],
        )
        for header in header_variants:
            assert extractor._has_merged_header(header) is True

    def test_extract_from_merged_cells_rental_invoice(self):
        """Split one cell holding an amount and a deduction into two items.

        The positive amount and the negative amount each get their own item,
        and the negative one is flagged with is_deduction=True.
        """
        extractor = LineItemsExtractor()

        header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        rows = [
            ["", "", "", "8159 -2000"],
            ["", "", "", ""],
        ]

        items = extractor._extract_from_merged_cells(header, rows)

        # Should have 2 items: one for amount, one for deduction
        assert len(items) == 2
        rent_item, deduction_item = items[0], items[1]

        assert rent_item.amount == "8159"
        assert rent_item.is_deduction is False
        assert rent_item.article_number == "0218103-1201"
        assert rent_item.description == "2 rum och kök"

        assert deduction_item.amount == "-2000"
        assert deduction_item.is_deduction is True
        assert deduction_item.description == "Avdrag"

    def test_extract_from_merged_cells_separate_rows(self):
        """Amount and deduction on separate rows also give two items."""
        extractor = LineItemsExtractor()

        header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        rows = [
            ["", "", "", "8159"],  # Amount in row 1
            ["", "", "", "-2000"],  # Deduction in row 2
        ]

        items = extractor._extract_from_merged_cells(header, rows)

        # Should have 2 items: one for amount, one for deduction
        assert len(items) == 2
        rent_item, deduction_item = items[0], items[1]

        assert rent_item.amount == "8159"
        assert rent_item.is_deduction is False
        assert rent_item.article_number == "0218103-1201"
        assert rent_item.description == "2 rum och kök"

        assert deduction_item.amount == "-2000"
        assert deduction_item.is_deduction is True

    def test_extract_from_merged_cells_swedish_format(self):
        """Space-grouped Swedish amounts come back with the spaces removed."""
        extractor = LineItemsExtractor()

        header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        rows = [
            ["", "", "", "8 159"],  # Swedish format with space
            ["", "", "", "-2 000"],  # Swedish format with space
        ]

        items = extractor._extract_from_merged_cells(header, rows)

        # Should have 2 items
        assert len(items) == 2
        rent_item, deduction_item = items[0], items[1]

        # Amounts are cleaned (spaces removed)
        assert rent_item.amount == "8159"
        assert rent_item.is_deduction is False
        assert deduction_item.amount == "-2000"
        assert deduction_item.is_deduction is True

    def test_extract_merged_cells_via_extract(self):
        """extract() on a colspan header table yields the amount and the deduction."""
        markup = """
        <html><body><table>
            <tr><td colspan="4">Specifikation 0218103-1201 2 rum och kök Hyra Avdrag</td></tr>
            <tr><td></td><td></td><td></td><td>8159 -2000</td></tr>
        </table></body></html>
        """
        result = LineItemsExtractor().extract(markup)

        # Should have extracted 2 items via merged cell parsing
        assert len(result.items) == 2
        rent_item, deduction_item = result.items[0], result.items[1]

        assert rent_item.amount == "8159"
        assert rent_item.is_deduction is False
        assert deduction_item.amount == "-2000"
        assert deduction_item.is_deduction is True
|
||||
Reference in New Issue
Block a user