- Extract models.py (LineItem, LineItemsResult dataclasses)
- Extract html_table_parser.py (ColumnMapper, HtmlTableParser)
- Extract merged_cell_handler.py (MergedCellHandler for PP-StructureV3 merged cells)
- Reduce line_items_extractor.py from 971 to 396 lines
- Add constants for magic numbers (MIN_AMOUNT_THRESHOLD, ROW_GROUPING_THRESHOLD, etc.)
- Fix row grouping algorithm in text_line_items_extractor.py
- Demote INFO logs to DEBUG level in structure_detector.py
- Add 209 tests achieving 85%+ coverage on main modules

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
780 lines
28 KiB
Python
780 lines
28 KiB
Python
"""
|
|
Tests for Line Items Extractor
|
|
|
|
Tests extraction of structured line items from HTML tables.
|
|
"""
|
|
|
|
import pytest
|
|
from backend.table.line_items_extractor import (
|
|
LineItem,
|
|
LineItemsResult,
|
|
LineItemsExtractor,
|
|
ColumnMapper,
|
|
HTMLTableParser,
|
|
)
|
|
|
|
|
|
class TestLineItem:
    """Construction behaviour of the LineItem dataclass."""

    def test_create_line_item_with_all_fields(self):
        """Every value handed to the constructor is readable afterwards."""
        field_values = {
            "row_index": 0,
            "description": "Samfällighetsavgift",
            "quantity": "1",
            "unit": "st",
            "unit_price": "6888,00",
            "amount": "6888,00",
            "article_number": "3035",
            "vat_rate": "25",
            "confidence": 0.95,
        }
        line_item = LineItem(**field_values)

        assert line_item.description == "Samfällighetsavgift"
        assert line_item.quantity == "1"
        assert line_item.amount == "6888,00"
        assert line_item.article_number == "3035"

    def test_create_line_item_with_minimal_fields(self):
        """Omitted optional fields (quantity, unit_price) come back as None."""
        line_item = LineItem(row_index=0, description="Test item", amount="100,00")

        assert line_item.description == "Test item"
        assert line_item.amount == "100,00"
        assert line_item.quantity is None
        assert line_item.unit_price is None
|
|
|
|
|
|
class TestHTMLTableParser:
    """Checks HTMLTableParser.parse() on small hand-written tables."""

    def test_parse_simple_table(self):
        """A table without <thead> yields an empty header and all rows as data."""
        markup = """
        <html><body><table>
            <tr><td>A</td><td>B</td></tr>
            <tr><td>1</td><td>2</td></tr>
        </table></body></html>
        """
        head_cells, body_rows = HTMLTableParser().parse(markup)

        # No <thead> in the markup, so no header cells are reported.
        assert head_cells == []
        assert len(body_rows) == 2
        assert body_rows[0] == ["A", "B"]
        assert body_rows[1] == ["1", "2"]

    def test_parse_table_with_thead(self):
        """An explicit <thead> is returned separately from the <tbody> rows."""
        markup = """
        <html><body><table>
            <thead><tr><th>Name</th><th>Price</th></tr></thead>
            <tbody><tr><td>Item 1</td><td>100</td></tr></tbody>
        </table></body></html>
        """
        head_cells, body_rows = HTMLTableParser().parse(markup)

        assert head_cells == ["Name", "Price"]
        assert len(body_rows) == 1
        assert body_rows[0] == ["Item 1", "100"]

    def test_parse_empty_table(self):
        """A <table> with no rows produces an empty header and no rows."""
        markup = "<html><body><table></table></body></html>"
        head_cells, body_rows = HTMLTableParser().parse(markup)

        assert head_cells == []
        assert body_rows == []

    def test_parse_table_with_empty_cells(self):
        """Empty <td> elements are kept as empty strings, preserving position."""
        markup = """
        <html><body><table>
            <tr><td></td><td>Value</td><td></td></tr>
        </table></body></html>
        """
        _head_cells, body_rows = HTMLTableParser().parse(markup)

        assert body_rows[0] == ["", "Value", ""]
|
|
|
|
|
|
class TestColumnMapper:
    """Checks how ColumnMapper.map() assigns field names to header cells."""

    def test_map_swedish_headers(self):
        """Each Swedish header maps to its field name, keyed by column index."""
        column_titles = [
            "Art nummer",
            "Produktbeskrivning",
            "Antal",
            "Enhet",
            "A-pris",
            "Belopp",
        ]
        expected_fields = [
            "article_number",
            "description",
            "quantity",
            "unit",
            "unit_price",
            "amount",
        ]

        column_fields = ColumnMapper().map(column_titles)

        for column_index, field_name in enumerate(expected_fields):
            assert column_fields[column_index] == field_name

    def test_map_merged_headers(self):
        """Headers fused by OCR (e.g. 'Moms A-pris') still map where unambiguous."""
        column_titles = ["Belopp", "Moms A-pris", "Enhet Antal", "Vara/tjänst", "Art.nr"]

        column_fields = ColumnMapper().map(column_titles)

        assert column_fields.get(0) == "amount"
        # "Vara/tjänst" is treated as the description column.
        assert column_fields.get(3) == "description"
        # "Art.nr" is treated as the article number column.
        assert column_fields.get(4) == "article_number"

    def test_map_empty_headers(self):
        """Blank header cells produce an empty mapping."""
        column_fields = ColumnMapper().map([""] * 3)

        assert column_fields == {}

    def test_map_unknown_headers(self):
        """Unrecognised header text produces an empty mapping."""
        column_fields = ColumnMapper().map(["Foo", "Bar", "Baz"])

        assert column_fields == {}
|
|
|
|
|
|
class TestLineItemsExtractor:
    """Checks LineItemsExtractor.extract() and is_line_items_table()."""

    def test_extract_from_simple_html(self):
        """A thead/tbody table yields one item per body row."""
        markup = """
        <html><body><table>
            <thead><tr><th>Beskrivning</th><th>Antal</th><th>Pris</th><th>Belopp</th></tr></thead>
            <tbody>
                <tr><td>Product A</td><td>2</td><td>50,00</td><td>100,00</td></tr>
                <tr><td>Product B</td><td>1</td><td>75,00</td><td>75,00</td></tr>
            </tbody>
        </table></body></html>
        """
        extraction = LineItemsExtractor().extract(markup)

        assert len(extraction.items) == 2
        first_item = extraction.items[0]
        assert first_item.description == "Product A"
        assert first_item.quantity == "2"
        assert first_item.amount == "100,00"
        assert extraction.items[1].description == "Product B"

    def test_extract_from_reversed_table(self):
        """A header emitted as the last row (PP-StructureV3 quirk) is still used."""
        markup = """
        <html><body><table>
            <tr><td>6 888,00</td><td>6 888,00</td><td>1</td><td>Samfällighetsavgift</td><td>3035</td></tr>
            <tr><td>4 811,44</td><td>4 811,44</td><td>1</td><td>GA:1 Avgift</td><td>303501</td></tr>
            <tr><td>Belopp</td><td>Moms A-pris</td><td>Enhet Antal</td><td>Vara/tjänst</td><td>Art.nr</td></tr>
        </table></body></html>
        """
        extraction = LineItemsExtractor().extract(markup)

        assert len(extraction.items) == 2
        assert extraction.items[0].amount == "6 888,00"
        assert extraction.items[0].description == "Samfällighetsavgift"
        assert extraction.items[1].description == "GA:1 Avgift"

    def test_extract_from_empty_html(self):
        """An empty <table> yields a result with no items."""
        extraction = LineItemsExtractor().extract(
            "<html><body><table></table></body></html>"
        )

        assert extraction.items == []

    def test_extract_returns_result_with_metadata(self):
        """The result is a LineItemsResult carrying the raw HTML and header row."""
        markup = """
        <html><body><table>
            <thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
            <tbody><tr><td>Test</td><td>100</td></tr></tbody>
        </table></body></html>
        """
        extraction = LineItemsExtractor().extract(markup)

        assert isinstance(extraction, LineItemsResult)
        assert extraction.raw_html == markup
        assert extraction.header_row == ["Beskrivning", "Belopp"]

    def test_extract_skips_empty_rows(self):
        """Rows whose cells are all empty do not become line items."""
        markup = """
        <html><body><table>
            <thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
            <tbody>
                <tr><td></td><td></td></tr>
                <tr><td>Real item</td><td>100</td></tr>
                <tr><td></td><td></td></tr>
            </tbody>
        </table></body></html>
        """
        extraction = LineItemsExtractor().extract(markup)

        assert len(extraction.items) == 1
        assert extraction.items[0].description == "Real item"

    def test_is_line_items_table(self):
        """Line-item headers are accepted; summary and payment headers are not."""
        classifier = LineItemsExtractor()

        # Headers of a genuine line items table.
        item_columns = ["Art nummer", "Produktbeskrivning", "Antal", "Belopp"]
        assert classifier.is_line_items_table(item_columns) is True

        # Headers of an invoice summary table.
        summary_columns = ["Frakt", "Faktura.avg", "Exkl.moms", "Moms", "Belopp att betala"]
        assert classifier.is_line_items_table(summary_columns) is False

        # Headers of a payment details table.
        payment_columns = ["Bankgiro", "OCR", "Belopp"]
        assert classifier.is_line_items_table(payment_columns) is False
|
|
|
|
|
|
class TestLineItemsExtractorFromPdf:
    """Checks extract_from_pdf() with table detection replaced by a mock."""

    def test_extract_from_pdf_no_tables(self):
        """With no tables and no parsing results, extract_from_pdf returns None."""
        from unittest.mock import patch

        pdf_extractor = LineItemsExtractor()

        # Detection is stubbed to report neither tables nor parsing results.
        with patch.object(
            pdf_extractor, '_detect_tables_with_parsing', return_value=([], [])
        ):
            outcome = pdf_extractor.extract_from_pdf("fake.pdf")

        assert outcome is None

    def test_extract_from_pdf_with_tables(self):
        """A detected table with thead/tbody yields at least one line item."""
        from unittest.mock import patch, MagicMock
        from backend.table.structure_detector import TableDetectionResult

        pdf_extractor = LineItemsExtractor()

        # Stand-in detection result whose HTML has a proper thead/tbody layout.
        detected_table = MagicMock(spec=TableDetectionResult)
        detected_table.html = """
        <table>
            <thead><tr><th>Beskrivning</th><th>Antal</th><th>Pris</th><th>Belopp</th></tr></thead>
            <tbody><tr><td>Product A</td><td>2</td><td>100,00</td><td>200,00</td></tr></tbody>
        </table>
        """

        # Detection is stubbed to hand back that single table.
        with patch.object(
            pdf_extractor,
            '_detect_tables_with_parsing',
            return_value=([detected_table], []),
        ):
            outcome = pdf_extractor.extract_from_pdf("fake.pdf")

        assert outcome is not None
        assert len(outcome.items) >= 1
|
|
|
|
|
|
class TestPdfPathValidation:
    """Checks how _detect_tables_with_parsing() treats the PDF path it is given."""

    def test_detect_tables_with_nonexistent_path(self):
        """A path that does not exist yields empty tables and parsing results."""
        from unittest.mock import MagicMock
        from backend.table.structure_detector import TableDetector

        path_checker = LineItemsExtractor()
        detector_stub = MagicMock(spec=TableDetector)

        # Call the private helper directly with a path that is not on disk.
        found_tables, found_parsing = path_checker._detect_tables_with_parsing(
            detector_stub, "nonexistent.pdf"
        )

        assert found_tables == []
        assert found_parsing == []

    def test_detect_tables_with_directory_path(self, tmp_path):
        """A directory (rather than a file) yields empty results."""
        from unittest.mock import MagicMock
        from backend.table.structure_detector import TableDetector

        path_checker = LineItemsExtractor()
        detector_stub = MagicMock(spec=TableDetector)

        # pytest's tmp_path fixture is a directory, not a file.
        found_tables, found_parsing = path_checker._detect_tables_with_parsing(
            detector_stub, str(tmp_path)
        )

        assert found_tables == []
        assert found_parsing == []

    def test_detect_tables_validates_file_exists(self, tmp_path):
        """An existing file passes path validation and reaches the renderer.

        The PDF renderer is patched to yield no pages, so the call must get as
        far as rendering (validation passed) and then return empty results.
        """
        from unittest.mock import patch

        path_checker = LineItemsExtractor()

        # A real file on disk; its content is irrelevant since rendering is mocked.
        pdf_file = tmp_path / "test.pdf"
        pdf_file.write_bytes(b"not a real pdf")

        # Patch the renderer so no actual PDF processing takes place.
        with patch("shared.pdf.renderer.render_pdf_to_images") as render_stub:
            # An exhausted iterator models "file exists but has no pages".
            render_stub.return_value = iter([])

            from unittest.mock import MagicMock
            from backend.table.structure_detector import TableDetector

            detector_stub = MagicMock(spec=TableDetector)
            detector_stub._ensure_initialized = MagicMock()
            detector_stub._pipeline = MagicMock()

            found_tables, found_parsing = path_checker._detect_tables_with_parsing(
                detector_stub, str(pdf_file)
            )

            # Reaching the renderer proves the path validation let the file through.
            render_stub.assert_called_once()
            assert found_tables == []
            assert found_parsing == []
|
|
|
|
|
|
class TestLineItemsResult:
    """Checks the LineItemsResult dataclass and its total_amount value."""

    def test_create_result(self):
        """Items, header row and raw HTML are stored as given."""
        entries = [
            LineItem(row_index=0, description="Item 1", amount="100"),
            LineItem(row_index=1, description="Item 2", amount="200"),
        ]
        summary = LineItemsResult(
            items=entries,
            header_row=["Beskrivning", "Belopp"],
            raw_html="<table>...</table>",
        )

        assert len(summary.items) == 2
        assert summary.header_row == ["Beskrivning", "Belopp"]
        assert summary.raw_html == "<table>...</table>"

    def test_total_amount_calculation(self):
        """total_amount sums comma-decimal amounts and formats the sum likewise."""
        entries = [
            LineItem(row_index=0, description="Item 1", amount="100,00"),
            LineItem(row_index=1, description="Item 2", amount="200,50"),
        ]
        summary = LineItemsResult(items=entries, header_row=[], raw_html="")

        # 100,00 + 200,50
        assert summary.total_amount == "300,50"

    def test_total_amount_with_deduction(self):
        """Deduction rows (negative amounts) are included in total_amount."""
        entries = [
            LineItem(row_index=0, description="Rent", amount="8159", is_deduction=False),
            LineItem(row_index=1, description="Avdrag", amount="-2000", is_deduction=True),
        ]
        summary = LineItemsResult(items=entries, header_row=[], raw_html="")

        # 8159 + (-2000) = 6159, rendered with a space as thousands separator.
        assert summary.total_amount == "6 159,00"

    def test_empty_result(self):
        """A result without items has no total_amount."""
        summary = LineItemsResult(items=[], header_row=[], raw_html="")

        assert summary.items == []
        assert summary.total_amount is None
|
|
|
|
|
|
class TestMergedCellExtraction:
    """Checks merged-cell detection and extraction (rental invoice layout)."""

    def test_has_merged_header_single_cell_with_keywords(self):
        """One header cell holding several keywords counts as merged."""
        sut = LineItemsExtractor()

        # A single cell carrying multiple header keywords.
        fused_cells = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        assert sut._has_merged_header(fused_cells) is True

    def test_has_merged_header_normal_header(self):
        """Separate header cells are not reported as merged."""
        sut = LineItemsExtractor()

        # Ordinary header with one keyword per cell.
        separate_cells = ["Beskrivning", "Antal", "Belopp"]
        assert sut._has_merged_header(separate_cells) is False

    def test_has_merged_header_empty(self):
        """Both an empty list and None are reported as not merged."""
        sut = LineItemsExtractor()

        assert sut._has_merged_header([]) is False
        assert sut._has_merged_header(None) is False

    def test_has_merged_header_with_empty_trailing_cells(self):
        """Empty cells around the fused cell do not prevent detection."""
        sut = LineItemsExtractor()

        # PP-StructureV3 may pad the header with empty trailing cells.
        padded_after = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag", "", "", ""]
        assert sut._has_merged_header(padded_after) is True

        # Empty cells in front of the fused cell must be tolerated as well.
        padded_before = ["", "", "Specifikation 0218103-1201 2 rum och kök Hyra Avdrag", ""]
        assert sut._has_merged_header(padded_before) is True

    def test_extract_from_merged_cells_rental_invoice(self):
        """Amounts sharing one cell are split into separate items.

        Every amount becomes its own row; a negative amount is flagged with
        is_deduction=True.
        """
        sut = LineItemsExtractor()

        fused_header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        body = [
            ["", "", "", "8159 -2000"],
            ["", "", "", ""],
        ]

        extracted = sut._extract_from_merged_cells(fused_header, body)

        # Two items expected: the charge and the deduction.
        assert len(extracted) == 2
        charge, deduction = extracted[0], extracted[1]

        assert charge.amount == "8159"
        assert charge.is_deduction is False
        assert charge.article_number == "0218103-1201"
        assert charge.description == "2 rum och kök"

        assert deduction.amount == "-2000"
        assert deduction.is_deduction is True
        assert deduction.description == "Avdrag"

    def test_extract_from_merged_cells_separate_rows(self):
        """Amount and deduction placed on different rows yield two items."""
        sut = LineItemsExtractor()

        fused_header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        body = [
            ["", "", "", "8159"],  # first row: the amount
            ["", "", "", "-2000"],  # second row: the deduction
        ]

        extracted = sut._extract_from_merged_cells(fused_header, body)

        # Two items expected: the charge and the deduction.
        assert len(extracted) == 2
        charge, deduction = extracted[0], extracted[1]

        assert charge.amount == "8159"
        assert charge.is_deduction is False
        assert charge.article_number == "0218103-1201"
        assert charge.description == "2 rum och kök"

        assert deduction.amount == "-2000"
        assert deduction.is_deduction is True

    def test_extract_from_merged_cells_swedish_format(self):
        """Space-grouped Swedish amounts are returned with the spaces removed."""
        sut = LineItemsExtractor()

        fused_header = ["Specifikation 0218103-1201 2 rum och kök Hyra Avdrag"]
        body = [
            ["", "", "", "8 159"],  # thousands separated by a space
            ["", "", "", "-2 000"],  # negative, thousands separated by a space
        ]

        extracted = sut._extract_from_merged_cells(fused_header, body)

        assert len(extracted) == 2
        # The expected amounts have the grouping spaces stripped.
        assert extracted[0].amount == "8159"
        assert extracted[0].is_deduction is False
        assert extracted[1].amount == "-2000"
        assert extracted[1].is_deduction is True

    def test_extract_merged_cells_via_extract(self):
        """extract() on a colspan header table returns the split amount items."""
        markup = """
        <html><body><table>
            <tr><td colspan="4">Specifikation 0218103-1201 2 rum och kök Hyra Avdrag</td></tr>
            <tr><td></td><td></td><td></td><td>8159 -2000</td></tr>
        </table></body></html>
        """
        extraction = LineItemsExtractor().extract(markup)

        # Two items are expected, as produced by the merged-cell parsing.
        assert len(extraction.items) == 2
        assert extraction.items[0].amount == "8159"
        assert extraction.items[0].is_deduction is False
        assert extraction.items[1].amount == "-2000"
        assert extraction.items[1].is_deduction is True
|
|
|
|
|
|
class TestTextFallbackExtraction:
    """Tests for text-based fallback extraction."""

    def test_text_fallback_disabled_by_default(self):
        """Test text fallback can be switched off through the constructor.

        NOTE: despite the method name, this checks the explicit
        ``enable_text_fallback=False`` opt-out, not the default value. The
        name is kept so existing test IDs stay stable.
        """
        extractor = LineItemsExtractor(enable_text_fallback=False)
        assert extractor.enable_text_fallback is False

    def test_text_fallback_enabled_by_default(self):
        """Test text fallback is enabled when no argument is passed."""
        extractor = LineItemsExtractor()
        assert extractor.enable_text_fallback is True

    def test_try_text_fallback_with_valid_parsing_res(self):
        """Test text fallback returns the items produced by the text extractor."""
        from unittest.mock import patch
        from backend.table.text_line_items_extractor import (
            TextLineItemsExtractor,
            TextLineItem,
            TextLineItemsResult,
        )

        extractor = LineItemsExtractor()

        # Parsing results holding text elements only (two description/amount pairs).
        parsing_res = [
            {"label": "text", "bbox": [0, 100, 200, 120], "text": "Product A"},
            {"label": "text", "bbox": [250, 100, 350, 120], "text": "1 234,56"},
            {"label": "text", "bbox": [0, 150, 200, 170], "text": "Product B"},
            {"label": "text", "bbox": [250, 150, 350, 170], "text": "2 345,67"},
        ]

        # Canned result the patched text extractor hands back.
        mock_text_result = TextLineItemsResult(
            items=[
                TextLineItem(row_index=0, description="Product A", amount="1 234,56"),
                TextLineItem(row_index=1, description="Product B", amount="2 345,67"),
            ],
            header_row=[],
        )

        with patch.object(TextLineItemsExtractor, 'extract_from_parsing_res', return_value=mock_text_result):
            result = extractor._try_text_fallback(parsing_res)

        assert result is not None
        assert len(result.items) == 2
        assert result.items[0].description == "Product A"
        assert result.items[1].description == "Product B"

    def test_try_text_fallback_returns_none_on_failure(self):
        """Test text fallback returns None when the text extractor yields None."""
        from unittest.mock import patch

        extractor = LineItemsExtractor()

        with patch('backend.table.text_line_items_extractor.TextLineItemsExtractor.extract_from_parsing_res', return_value=None):
            result = extractor._try_text_fallback([])
            assert result is None

    def test_extract_from_pdf_uses_text_fallback(self):
        """Test extract_from_pdf calls the text fallback when no tables are found."""
        from unittest.mock import patch, MagicMock

        extractor = LineItemsExtractor(enable_text_fallback=True)

        # Detection reports no tables but does provide parsing results, and the
        # fallback itself is replaced so only the call is observed.
        with patch.object(extractor, '_detect_tables_with_parsing') as mock_detect:
            mock_detect.return_value = ([], [{"label": "text", "text": "test"}])

            with patch.object(extractor, '_try_text_fallback', return_value=MagicMock(items=[MagicMock()])) as mock_fallback:
                extractor.extract_from_pdf("fake.pdf")

                # Text fallback should be called
                mock_fallback.assert_called_once()

    def test_extract_from_pdf_skips_fallback_when_disabled(self):
        """Test extract_from_pdf returns None when the fallback is disabled."""
        from unittest.mock import patch

        extractor = LineItemsExtractor(enable_text_fallback=False)

        with patch.object(extractor, '_detect_tables_with_parsing') as mock_detect:
            mock_detect.return_value = ([], [{"label": "text", "text": "test"}])

            result = extractor.extract_from_pdf("fake.pdf")

        # Should return None, not use text fallback
        assert result is None
|
|
|
|
|
|
class TestVerticallyMergedCellExtraction:
    """Checks detection and splitting of vertically merged cells."""

    def test_detects_vertically_merged_cells(self):
        """A cell holding several product numbers is reported as merged."""
        sut = LineItemsExtractor()

        # One row, one cell, with multiple product numbers run together.
        stacked_rows = [["Produktnr 1457280 1457281 1060381 merged text here"]]
        assert sut._has_vertically_merged_cells(stacked_rows) is True

    def test_splits_vertically_merged_rows(self):
        """_split_merged_rows returns a (header, data) pair of lists."""
        sut = LineItemsExtractor()

        stacked_rows = [
            ["Produktnr 1234567 1234568", "Antal 2ST 3ST"],
        ]
        split_header, split_data = sut._split_merged_rows(stacked_rows)

        # Only the container types of the split result are checked here.
        assert isinstance(split_header, list)
        assert isinstance(split_data, list)
|
|
|
|
|
|
class TestDeductionDetection:
    """Checks when extracted items are flagged with is_deduction."""

    def test_detects_deduction_by_keyword_avdrag(self):
        """A negative-amount row described with 'avdrag' is flagged as a deduction."""
        markup = """
        <html><body><table>
            <thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
            <tbody>
                <tr><td>Hyresavdrag januari</td><td>-500,00</td></tr>
            </tbody>
        </table></body></html>
        """
        extraction = LineItemsExtractor().extract(markup)

        assert len(extraction.items) == 1
        assert extraction.items[0].is_deduction is True

    def test_detects_deduction_by_keyword_rabatt(self):
        """A negative-amount row described with 'Rabatt' is flagged as a deduction."""
        markup = """
        <html><body><table>
            <thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
            <tbody>
                <tr><td>Rabatt 10%</td><td>-100,00</td></tr>
            </tbody>
        </table></body></html>
        """
        extraction = LineItemsExtractor().extract(markup)

        assert len(extraction.items) == 1
        assert extraction.items[0].is_deduction is True

    def test_detects_deduction_by_negative_amount(self):
        """A negative amount without a deduction keyword is still flagged."""
        markup = """
        <html><body><table>
            <thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
            <tbody>
                <tr><td>Some credit</td><td>-250,00</td></tr>
            </tbody>
        </table></body></html>
        """
        extraction = LineItemsExtractor().extract(markup)

        assert len(extraction.items) == 1
        assert extraction.items[0].is_deduction is True

    def test_normal_item_not_deduction(self):
        """A positive amount with a neutral description is not a deduction."""
        markup = """
        <html><body><table>
            <thead><tr><th>Beskrivning</th><th>Belopp</th></tr></thead>
            <tbody>
                <tr><td>Normal product</td><td>500,00</td></tr>
            </tbody>
        </table></body></html>
        """
        extraction = LineItemsExtractor().extract(markup)

        assert len(extraction.items) == 1
        assert extraction.items[0].is_deduction is False
|
|
|
|
|
|
class TestHeaderDetection:
    """Checks _detect_header_row() for headers at the top, bottom, or absent."""

    def test_detect_header_at_bottom(self):
        """A header in the last row is found and reported as being at the end."""
        sut = LineItemsExtractor()

        table_rows = [
            ["100,00", "Product A", "1"],
            ["200,00", "Product B", "2"],
            ["Belopp", "Beskrivning", "Antal"],  # header is the final row
        ]

        position, header_cells, at_end = sut._detect_header_row(table_rows)

        assert position == 2
        assert at_end is True
        assert "Belopp" in header_cells

    def test_detect_header_at_top(self):
        """A header in the first row is found and not reported as being at the end."""
        sut = LineItemsExtractor()

        table_rows = [
            ["Belopp", "Beskrivning", "Antal"],  # header is the first row
            ["100,00", "Product A", "1"],
            ["200,00", "Product B", "2"],
        ]

        position, header_cells, at_end = sut._detect_header_row(table_rows)

        assert position == 0
        assert at_end is False
        assert "Belopp" in header_cells

    def test_no_header_detected(self):
        """With data rows only, the result is index -1, no cells, not at end."""
        sut = LineItemsExtractor()

        table_rows = [
            ["100,00", "Product A", "1"],
            ["200,00", "Product B", "2"],
        ]

        position, header_cells, at_end = sut._detect_header_row(table_rows)

        assert position == -1
        assert header_cells == []
        assert at_end is False
|