402 lines
14 KiB
Python
402 lines
14 KiB
Python
"""
Tests for Field Extractor

Tests FieldExtractor initialization and field normalization functions:
- Invoice number normalization
- Date normalization
- Amount normalization
- Bankgiro/Plusgiro normalization
- OCR number normalization
- Payment line normalization
- Customer number normalization
- Supplier organization number normalization
- Dispatch through _normalize_and_validate
"""
|
|
|
|
import pytest
|
|
from inference.pipeline.field_extractor import FieldExtractor
|
|
|
|
|
|
class TestFieldExtractorInit:
    """Checks the attributes FieldExtractor exposes after construction."""

    def test_default_init(self):
        """A FieldExtractor built without arguments carries the default settings."""
        default_extractor = FieldExtractor()

        assert default_extractor.ocr_lang == 'en'
        assert default_extractor.use_gpu is False
        assert default_extractor.bbox_padding == 0.1
        assert default_extractor.dpi == 300

    def test_custom_init(self):
        """Keyword arguments given to the constructor are stored unchanged."""
        settings = {
            'ocr_lang': 'sv',
            'use_gpu': True,
            'bbox_padding': 0.2,
            'dpi': 150,
        }
        configured = FieldExtractor(**settings)

        assert configured.ocr_lang == 'sv'
        assert configured.use_gpu is True
        assert configured.bbox_padding == 0.2
        assert configured.dpi == 150
|
|
|
|
|
|
class TestNormalizeInvoiceNumber:
    """Exercises FieldExtractor._normalize_invoice_number."""

    @pytest.fixture
    def extractor(self):
        """Provide a FieldExtractor built with default arguments."""
        return FieldExtractor()

    def test_alphanumeric_invoice_number(self, extractor):
        """A labelled alphanumeric number (A3861) comes back without its label."""
        raw_text = "Fakturanummer: A3861"
        number, ok, _error = extractor._normalize_invoice_number(raw_text)

        assert number == 'A3861'
        assert ok is True

    def test_prefix_invoice_number(self, extractor):
        """A number with a letter prefix (INV12345) yields a non-empty match."""
        raw_text = "Invoice INV12345"
        number, _ok, _error = extractor._normalize_invoice_number(raw_text)

        assert number is not None
        # Either the prefix or the digit part must survive normalization.
        assert any(fragment in number for fragment in ('INV', '12345'))

    def test_numeric_invoice_number(self, extractor):
        """A purely numeric invoice number is returned as digits only."""
        raw_text = "Invoice: 12345678"
        number, _ok, _error = extractor._normalize_invoice_number(raw_text)

        assert number is not None
        assert number.isdigit()

    def test_year_prefixed_invoice_number(self, extractor):
        """A year-prefixed number (2024-12345) keeps the year in the result."""
        raw_text = "Faktura 2024-12345"
        number, _ok, _error = extractor._normalize_invoice_number(raw_text)

        assert number is not None
        assert '2024' in number

    def test_avoid_long_ocr_sequence(self, extractor):
        """A short invoice number wins over a long OCR-style digit run."""
        # The input carries both a short alphanumeric invoice number and a
        # 24-digit sequence labelled as OCR.
        raw_text = "Fakturanummer: A3861 OCR: 310196187399952763290708"
        number, _ok, _error = extractor._normalize_invoice_number(raw_text)

        # The shorter alphanumeric candidate is the expected pick.
        assert number == 'A3861'

    def test_empty_string(self, extractor):
        """Empty input gives no value or is flagged as invalid."""
        number, ok, _error = extractor._normalize_invoice_number("")

        assert number is None or ok is False
|
|
|
|
|
|
class TestNormalizeBankgiro:
    """Tests for Bankgiro normalization (FieldExtractor._normalize_bankgiro)."""

    @pytest.fixture
    def extractor(self):
        """Provide a FieldExtractor built with default arguments."""
        return FieldExtractor()

    def test_standard_7_digit_format(self, extractor):
        """Test 7-digit Bankgiro XXX-XXXX."""
        result, is_valid, error = extractor._normalize_bankgiro("Bankgiro: 782-1713")

        assert result == '782-1713'
        assert is_valid is True

    def test_standard_8_digit_format(self, extractor):
        """Test 8-digit Bankgiro XXXX-XXXX."""
        result, is_valid, error = extractor._normalize_bankgiro("BG 5393-9484")

        assert result == '5393-9484'
        assert is_valid is True

    def test_without_dash(self, extractor):
        """Test that a Bankgiro written without a dash is returned with one."""
        result, is_valid, error = extractor._normalize_bankgiro("Bankgiro 7821713")

        assert result is not None
        # The expectation was previously only stated in a comment; assert it
        # so the test can actually fail when the dash is missing.
        assert '-' in result

    def test_with_spaces(self, extractor):
        """Test Bankgiro with spaces - may not parse if spaces break the pattern.

        Whether the spaced number parses is deliberately left open; the test
        only requires that the call completes and returns a well-formed
        (value, is_valid, error) triple.
        """
        result, is_valid, error = extractor._normalize_bankgiro("BG: 782 1713")

        # Spaces in the middle might cause parsing issues - that's acceptable,
        # but the return shape must still be sane.
        assert result is None or isinstance(result, str)
        assert isinstance(is_valid, bool)

    def test_invalid_bankgiro(self, extractor):
        """Test invalid Bankgiro (too short) is rejected."""
        result, is_valid, error = extractor._normalize_bankgiro("BG: 123")

        # Three digits cannot be a Bankgiro: either nothing is extracted or
        # the value is flagged as invalid.
        assert result is None or is_valid is False
|
|
|
|
|
|
class TestNormalizePlusgiro:
    """Tests for Plusgiro normalization (FieldExtractor._normalize_plusgiro)."""

    @pytest.fixture
    def extractor(self):
        """Provide a FieldExtractor built with default arguments."""
        return FieldExtractor()

    def test_standard_format(self, extractor):
        """Test standard Plusgiro format XXXXXXX-X."""
        result, is_valid, error = extractor._normalize_plusgiro("Plusgiro: 1234567-8")

        assert result is not None
        assert '-' in result

    def test_without_dash(self, extractor):
        """Test Plusgiro without dash."""
        result, is_valid, error = extractor._normalize_plusgiro("PG 12345678")

        assert result is not None

    def test_distinguish_from_bankgiro(self, extractor):
        """Test that Plusgiro is distinguished from Bankgiro by format."""
        # Plusgiro has 1 digit after dash, Bankgiro has 4
        pg_text = "4809603-6"  # Plusgiro format
        bg_text = "782-1713"  # Bankgiro format

        pg_result, _, _ = extractor._normalize_plusgiro(pg_text)
        bg_result, _, _ = extractor._normalize_bankgiro(bg_text)

        # Both should succeed in their respective normalizations. These
        # assertions were missing, so the test could never fail.
        assert pg_result is not None
        assert bg_result is not None
|
|
|
|
|
|
class TestNormalizeAmount:
    """Exercises FieldExtractor._normalize_amount."""

    @pytest.fixture
    def extractor(self):
        """Provide a FieldExtractor built with default arguments."""
        return FieldExtractor()

    def test_swedish_format_comma(self, extractor):
        """Space-grouped thousands with a decimal comma (11 699,00) are valid."""
        raw_text = "11 699,00 SEK"
        amount, ok, _error = extractor._normalize_amount(raw_text)

        assert amount is not None
        assert ok is True

    def test_integer_amount(self, extractor):
        """A labelled whole-number amount without decimals yields a value."""
        raw_text = "Amount: 11699"
        amount, _ok, _error = extractor._normalize_amount(raw_text)

        assert amount is not None

    def test_with_currency(self, extractor):
        """A leading currency code does not prevent extraction."""
        raw_text = "SEK 11 699,00"
        amount, _ok, _error = extractor._normalize_amount(raw_text)

        assert amount is not None

    def test_large_amount(self, extractor):
        """An amount with several thousand-separator groups yields a value."""
        raw_text = "1 234 567,89"
        amount, _ok, _error = extractor._normalize_amount(raw_text)

        assert amount is not None
|
|
|
|
|
|
class TestNormalizeOCR:
    """Exercises FieldExtractor._normalize_ocr_number."""

    @pytest.fixture
    def extractor(self):
        """Provide a FieldExtractor built with default arguments."""
        return FieldExtractor()

    def test_standard_ocr(self, extractor):
        """A labelled OCR number is returned as the bare digits and is valid."""
        raw_text = "OCR: 310196187399952"
        ocr_number, ok, _error = extractor._normalize_ocr_number(raw_text)

        assert ocr_number == '310196187399952'
        assert ok is True

    def test_ocr_with_spaces(self, extractor):
        """Spaces inside the digit groups are stripped from the result."""
        raw_text = "3101 9618 7399 952"
        ocr_number, _ok, _error = extractor._normalize_ocr_number(raw_text)

        assert ocr_number is not None
        # No whitespace may remain in the normalized number.
        assert ' ' not in ocr_number

    def test_short_ocr_invalid(self, extractor):
        """A three-digit input is flagged as invalid."""
        _ocr_number, ok, _error = extractor._normalize_ocr_number("123")

        assert ok is False
|
|
|
|
|
|
class TestNormalizeDate:
    """Exercises FieldExtractor._normalize_date."""

    @pytest.fixture
    def extractor(self):
        """Provide a FieldExtractor built with default arguments."""
        return FieldExtractor()

    def test_iso_format(self, extractor):
        """An ISO date (YYYY-MM-DD) is returned unchanged and valid."""
        iso_text = "2026-01-31"
        date_value, ok, _error = extractor._normalize_date(iso_text)

        assert date_value == '2026-01-31'
        assert ok is True

    def test_swedish_format(self, extractor):
        """A dotted day-first date (31.01.2026) is accepted as valid."""
        dotted_text = "31.01.2026"
        date_value, ok, _error = extractor._normalize_date(dotted_text)

        assert date_value is not None
        assert ok is True

    def test_slash_format(self, extractor):
        """A slash-separated date (31/01/2026) yields a value."""
        slashed_text = "31/01/2026"
        date_value, _ok, _error = extractor._normalize_date(slashed_text)

        assert date_value is not None

    def test_compact_format(self, extractor):
        """An eight-digit compact date (20260131) yields a value."""
        compact_text = "20260131"
        date_value, _ok, _error = extractor._normalize_date(compact_text)

        assert date_value is not None

    def test_invalid_date(self, extractor):
        """Text that contains no date is flagged as invalid."""
        _date_value, ok, _error = extractor._normalize_date("not a date")

        assert ok is False
|
|
|
|
|
|
class TestNormalizePaymentLine:
    """Exercises FieldExtractor._normalize_payment_line."""

    @pytest.fixture
    def extractor(self):
        """Provide a FieldExtractor built with default arguments."""
        return FieldExtractor()

    def test_standard_payment_line(self, extractor):
        """A well-formed payment line parses and keeps its OCR number."""
        raw_line = "# 310196187399952 # 11699 00 6 > 7821713#41#"
        parsed, ok, _error = extractor._normalize_payment_line(raw_line)

        assert parsed is not None
        assert ok is True
        # Accept either a labelled "OCR:" rendering or the raw OCR digits.
        assert any(marker in parsed for marker in ('OCR:', '310196187399952'))

    def test_payment_line_with_spaces_in_bg(self, extractor):
        """Stray spaces inside the Bankgiro part do not break parsing."""
        raw_line = "# 310196187399952 # 11699 00 6 > 78 2 1 713 #41#"
        parsed, ok, _error = extractor._normalize_payment_line(raw_line)

        # Only success is checked here; the exact Bankgiro rendering is not
        # asserted.
        assert parsed is not None
        assert ok is True

    def test_payment_line_with_spaces_in_check_digits(self, extractor):
        """Spaces around the trailing check digits ("#41 #") are tolerated."""
        raw_line = "# 6026726908 # 736 00 9 > 5692041 #41 #"
        parsed, ok, _error = extractor._normalize_payment_line(raw_line)

        assert parsed is not None
        assert ok is True
        assert "6026726908" in parsed
        assert "736 00" in parsed
        assert "5692041#41#" in parsed

    def test_payment_line_with_ocr_spaces_in_amount(self, extractor):
        """Spaces scattered through the amount ('12 0 0 00') collapse to '1200 00'."""
        raw_line = "# 11000770600242 # 12 0 0 00 5 3082963#41#"
        parsed, ok, _error = extractor._normalize_payment_line(raw_line)

        assert parsed is not None
        assert ok is True
        assert "11000770600242" in parsed
        assert "1200 00" in parsed
        assert "3082963#41#" in parsed

    def test_payment_line_without_greater_symbol(self, extractor):
        """A line lacking the '>' separator still parses."""
        raw_line = "# 11000770600242 # 1200 00 5 3082963#41#"
        parsed, ok, _error = extractor._normalize_payment_line(raw_line)

        assert parsed is not None
        assert ok is True
        assert "11000770600242" in parsed
        assert "1200 00" in parsed
|
|
|
|
|
|
class TestNormalizeCustomerNumber:
    """Exercises FieldExtractor._normalize_customer_number."""

    @pytest.fixture
    def extractor(self):
        """Provide a FieldExtractor built with default arguments."""
        return FieldExtractor()

    def test_with_separator(self, extractor):
        """A labelled customer number with a dash (JTY 576-3) yields a value."""
        raw_text = "Kundnr: JTY 576-3"
        customer, _ok, _error = extractor._normalize_customer_number(raw_text)

        assert customer is not None

    def test_compact_format(self, extractor):
        """A compact customer number (JTY5763) yields a value."""
        raw_text = "JTY5763"
        customer, _ok, _error = extractor._normalize_customer_number(raw_text)

        assert customer is not None

    def test_format_without_dash(self, extractor):
        """'Dwq 211X' is upper-cased and dashed into 'DWQ 211-X'."""
        raw_text = "Dwq 211X Billo SE 106 43 Stockholm"
        customer, ok, _error = extractor._normalize_customer_number(raw_text)

        assert customer is not None
        assert ok is True
        assert customer == "DWQ 211-X"

    def test_swedish_postal_code_exclusion(self, extractor):
        """The postal code 'SE 106 43' is not returned as a customer number."""
        raw_text = "SE 106 43 Stockholm"
        customer, _ok, _error = extractor._normalize_customer_number(raw_text)

        # Either nothing is extracted, or the match is not the postal code.
        assert customer is None or "SE 106" not in customer

    def test_customer_number_with_postal_code_in_text(self, extractor):
        """The customer number is found even when a postal code is also present."""
        raw_text = "Customer: ABC 123X, Address: SE 106 43 Stockholm"
        customer, _ok, _error = extractor._normalize_customer_number(raw_text)

        assert customer is not None
        assert "ABC" in customer
        # The postal code must not leak into the extracted value.
        assert "SE 106" not in customer
|
|
|
|
|
|
class TestNormalizeSupplierOrgNumber:
    """Exercises FieldExtractor._normalize_supplier_org_number."""

    @pytest.fixture
    def extractor(self):
        """Provide a FieldExtractor built with default arguments."""
        return FieldExtractor()

    def test_standard_format(self, extractor):
        """A labelled NNNNNN-NNNN number is returned as-is and valid."""
        raw_text = "Org.nr 516406-1102"
        org_number, ok, _error = extractor._normalize_supplier_org_number(raw_text)

        assert org_number == '516406-1102'
        assert ok is True

    def test_vat_number_format(self, extractor):
        """A VAT number (SE + 10 digits + 01) yields a dashed result."""
        raw_text = "Momsreg.nr SE556123456701"
        org_number, _ok, _error = extractor._normalize_supplier_org_number(raw_text)

        assert org_number is not None
        assert '-' in org_number
|
|
|
|
|
|
class TestNormalizeAndValidateDispatch:
    """Exercises FieldExtractor._normalize_and_validate for several field names."""

    @pytest.fixture
    def extractor(self):
        """Provide a FieldExtractor built with default arguments."""
        return FieldExtractor()

    def test_dispatch_invoice_number(self, extractor):
        """The 'InvoiceNumber' field name produces a value for 'A3861'."""
        field_name, raw_text = 'InvoiceNumber', 'A3861'
        value, _ok, _error = extractor._normalize_and_validate(field_name, raw_text)

        assert value is not None

    def test_dispatch_amount(self, extractor):
        """The 'Amount' field name produces a value for '11699,00'."""
        field_name, raw_text = 'Amount', '11699,00'
        value, _ok, _error = extractor._normalize_and_validate(field_name, raw_text)

        assert value is not None

    def test_dispatch_bankgiro(self, extractor):
        """The 'Bankgiro' field name produces a value for '782-1713'."""
        field_name, raw_text = 'Bankgiro', '782-1713'
        value, _ok, _error = extractor._normalize_and_validate(field_name, raw_text)

        assert value is not None

    def test_dispatch_ocr(self, extractor):
        """The 'OCR' field name produces a value for '310196187399952'."""
        field_name, raw_text = 'OCR', '310196187399952'
        value, _ok, _error = extractor._normalize_and_validate(field_name, raw_text)

        assert value is not None

    def test_dispatch_date(self, extractor):
        """The 'InvoiceDate' field name produces a value for '2026-01-31'."""
        field_name, raw_text = 'InvoiceDate', '2026-01-31'
        value, _ok, _error = extractor._normalize_and_validate(field_name, raw_text)

        assert value is not None
|
|
|
|
|
|
if __name__ == '__main__':
    # Run this module's tests verbosely when the file is executed directly.
    # pytest.main returns the session exit code; raise it via SystemExit so a
    # failing run yields a non-zero process status instead of always 0.
    raise SystemExit(pytest.main([__file__, '-v']))
|