WIP
This commit is contained in:
@@ -12,6 +12,15 @@ Tests field normalization functions:
|
||||
|
||||
import pytest
|
||||
from inference.pipeline.field_extractor import FieldExtractor
|
||||
from inference.pipeline.normalizers import (
|
||||
InvoiceNumberNormalizer,
|
||||
OcrNumberNormalizer,
|
||||
BankgiroNormalizer,
|
||||
PlusgiroNormalizer,
|
||||
AmountNormalizer,
|
||||
DateNormalizer,
|
||||
SupplierOrgNumberNormalizer,
|
||||
)
|
||||
|
||||
|
||||
class TestFieldExtractorInit:
|
||||
@@ -43,81 +52,81 @@ class TestNormalizeInvoiceNumber:
|
||||
"""Tests for invoice number normalization."""
|
||||
|
||||
@pytest.fixture
|
||||
def extractor(self):
|
||||
return FieldExtractor()
|
||||
def normalizer(self):
|
||||
return InvoiceNumberNormalizer()
|
||||
|
||||
def test_alphanumeric_invoice_number(self, extractor):
|
||||
def test_alphanumeric_invoice_number(self, normalizer):
|
||||
"""Test alphanumeric invoice number like A3861."""
|
||||
result, is_valid, error = extractor._normalize_invoice_number("Fakturanummer: A3861")
|
||||
assert result == 'A3861'
|
||||
assert is_valid is True
|
||||
result = normalizer.normalize("Fakturanummer: A3861")
|
||||
assert result.value == 'A3861'
|
||||
assert result.is_valid is True
|
||||
|
||||
def test_prefix_invoice_number(self, extractor):
|
||||
def test_prefix_invoice_number(self, normalizer):
|
||||
"""Test invoice number with prefix like INV12345."""
|
||||
result, is_valid, error = extractor._normalize_invoice_number("Invoice INV12345")
|
||||
assert result is not None
|
||||
assert 'INV' in result or '12345' in result
|
||||
result = normalizer.normalize("Invoice INV12345")
|
||||
assert result.value is not None
|
||||
assert 'INV' in result.value or '12345' in result.value
|
||||
|
||||
def test_numeric_invoice_number(self, extractor):
|
||||
def test_numeric_invoice_number(self, normalizer):
|
||||
"""Test pure numeric invoice number."""
|
||||
result, is_valid, error = extractor._normalize_invoice_number("Invoice: 12345678")
|
||||
assert result is not None
|
||||
assert result.isdigit()
|
||||
result = normalizer.normalize("Invoice: 12345678")
|
||||
assert result.value is not None
|
||||
assert result.value.isdigit()
|
||||
|
||||
def test_year_prefixed_invoice_number(self, extractor):
|
||||
def test_year_prefixed_invoice_number(self, normalizer):
|
||||
"""Test invoice number with year prefix like 2024-001."""
|
||||
result, is_valid, error = extractor._normalize_invoice_number("Faktura 2024-12345")
|
||||
assert result is not None
|
||||
assert '2024' in result
|
||||
result = normalizer.normalize("Faktura 2024-12345")
|
||||
assert result.value is not None
|
||||
assert '2024' in result.value
|
||||
|
||||
def test_avoid_long_ocr_sequence(self, extractor):
|
||||
def test_avoid_long_ocr_sequence(self, normalizer):
|
||||
"""Test that long OCR-like sequences are avoided."""
|
||||
# When text contains both short invoice number and long OCR sequence
|
||||
text = "Fakturanummer: A3861 OCR: 310196187399952763290708"
|
||||
result, is_valid, error = extractor._normalize_invoice_number(text)
|
||||
result = normalizer.normalize(text)
|
||||
# Should prefer the shorter alphanumeric pattern
|
||||
assert result == 'A3861'
|
||||
assert result.value == 'A3861'
|
||||
|
||||
def test_empty_string(self, extractor):
|
||||
def test_empty_string(self, normalizer):
|
||||
"""Test empty string input."""
|
||||
result, is_valid, error = extractor._normalize_invoice_number("")
|
||||
assert result is None or is_valid is False
|
||||
result = normalizer.normalize("")
|
||||
assert result.value is None or result.is_valid is False
|
||||
|
||||
|
||||
class TestNormalizeBankgiro:
|
||||
"""Tests for Bankgiro normalization."""
|
||||
|
||||
@pytest.fixture
|
||||
def extractor(self):
|
||||
return FieldExtractor()
|
||||
def normalizer(self):
|
||||
return BankgiroNormalizer()
|
||||
|
||||
def test_standard_7_digit_format(self, extractor):
|
||||
def test_standard_7_digit_format(self, normalizer):
|
||||
"""Test 7-digit Bankgiro XXX-XXXX."""
|
||||
result, is_valid, error = extractor._normalize_bankgiro("Bankgiro: 782-1713")
|
||||
assert result == '782-1713'
|
||||
assert is_valid is True
|
||||
result = normalizer.normalize("Bankgiro: 782-1713")
|
||||
assert result.value == '782-1713'
|
||||
assert result.is_valid is True
|
||||
|
||||
def test_standard_8_digit_format(self, extractor):
|
||||
def test_standard_8_digit_format(self, normalizer):
|
||||
"""Test 8-digit Bankgiro XXXX-XXXX."""
|
||||
result, is_valid, error = extractor._normalize_bankgiro("BG 5393-9484")
|
||||
assert result == '5393-9484'
|
||||
assert is_valid is True
|
||||
result = normalizer.normalize("BG 5393-9484")
|
||||
assert result.value == '5393-9484'
|
||||
assert result.is_valid is True
|
||||
|
||||
def test_without_dash(self, extractor):
|
||||
def test_without_dash(self, normalizer):
|
||||
"""Test Bankgiro without dash."""
|
||||
result, is_valid, error = extractor._normalize_bankgiro("Bankgiro 7821713")
|
||||
assert result is not None
|
||||
result = normalizer.normalize("Bankgiro 7821713")
|
||||
assert result.value is not None
|
||||
# Should be formatted with dash
|
||||
|
||||
def test_with_spaces(self, extractor):
|
||||
def test_with_spaces(self, normalizer):
|
||||
"""Test Bankgiro with spaces - may not parse if spaces break the pattern."""
|
||||
result, is_valid, error = extractor._normalize_bankgiro("BG: 782 1713")
|
||||
result = normalizer.normalize("BG: 782 1713")
|
||||
# Spaces in the middle might cause parsing issues - that's acceptable
|
||||
# The test passes if it doesn't crash
|
||||
|
||||
def test_invalid_bankgiro(self, extractor):
|
||||
def test_invalid_bankgiro(self, normalizer):
|
||||
"""Test invalid Bankgiro (too short)."""
|
||||
result, is_valid, error = extractor._normalize_bankgiro("BG: 123")
|
||||
result = normalizer.normalize("BG: 123")
|
||||
# Should fail or return None
|
||||
|
||||
|
||||
@@ -125,28 +134,32 @@ class TestNormalizePlusgiro:
|
||||
"""Tests for Plusgiro normalization."""
|
||||
|
||||
@pytest.fixture
|
||||
def extractor(self):
|
||||
return FieldExtractor()
|
||||
def normalizer(self):
|
||||
return PlusgiroNormalizer()
|
||||
|
||||
def test_standard_format(self, extractor):
|
||||
@pytest.fixture
|
||||
def bg_normalizer(self):
|
||||
return BankgiroNormalizer()
|
||||
|
||||
def test_standard_format(self, normalizer):
|
||||
"""Test standard Plusgiro format XXXXXXX-X."""
|
||||
result, is_valid, error = extractor._normalize_plusgiro("Plusgiro: 1234567-8")
|
||||
assert result is not None
|
||||
assert '-' in result
|
||||
result = normalizer.normalize("Plusgiro: 1234567-8")
|
||||
assert result.value is not None
|
||||
assert '-' in result.value
|
||||
|
||||
def test_without_dash(self, extractor):
|
||||
def test_without_dash(self, normalizer):
|
||||
"""Test Plusgiro without dash."""
|
||||
result, is_valid, error = extractor._normalize_plusgiro("PG 12345678")
|
||||
assert result is not None
|
||||
result = normalizer.normalize("PG 12345678")
|
||||
assert result.value is not None
|
||||
|
||||
def test_distinguish_from_bankgiro(self, extractor):
|
||||
def test_distinguish_from_bankgiro(self, normalizer, bg_normalizer):
|
||||
"""Test that Plusgiro is distinguished from Bankgiro by format."""
|
||||
# Plusgiro has 1 digit after dash, Bankgiro has 4
|
||||
pg_text = "4809603-6" # Plusgiro format
|
||||
bg_text = "782-1713" # Bankgiro format
|
||||
|
||||
pg_result, _, _ = extractor._normalize_plusgiro(pg_text)
|
||||
bg_result, _, _ = extractor._normalize_bankgiro(bg_text)
|
||||
pg_result = normalizer.normalize(pg_text)
|
||||
bg_result = bg_normalizer.normalize(bg_text)
|
||||
|
||||
# Both should succeed in their respective normalizations
|
||||
|
||||
@@ -155,89 +168,89 @@ class TestNormalizeAmount:
|
||||
"""Tests for Amount normalization."""
|
||||
|
||||
@pytest.fixture
|
||||
def extractor(self):
|
||||
return FieldExtractor()
|
||||
def normalizer(self):
|
||||
return AmountNormalizer()
|
||||
|
||||
def test_swedish_format_comma(self, extractor):
|
||||
def test_swedish_format_comma(self, normalizer):
|
||||
"""Test Swedish format with comma: 11 699,00."""
|
||||
result, is_valid, error = extractor._normalize_amount("11 699,00 SEK")
|
||||
assert result is not None
|
||||
assert is_valid is True
|
||||
result = normalizer.normalize("11 699,00 SEK")
|
||||
assert result.value is not None
|
||||
assert result.is_valid is True
|
||||
|
||||
def test_integer_amount(self, extractor):
|
||||
def test_integer_amount(self, normalizer):
|
||||
"""Test integer amount without decimals."""
|
||||
result, is_valid, error = extractor._normalize_amount("Amount: 11699")
|
||||
assert result is not None
|
||||
result = normalizer.normalize("Amount: 11699")
|
||||
assert result.value is not None
|
||||
|
||||
def test_with_currency(self, extractor):
|
||||
def test_with_currency(self, normalizer):
|
||||
"""Test amount with currency symbol."""
|
||||
result, is_valid, error = extractor._normalize_amount("SEK 11 699,00")
|
||||
assert result is not None
|
||||
result = normalizer.normalize("SEK 11 699,00")
|
||||
assert result.value is not None
|
||||
|
||||
def test_large_amount(self, extractor):
|
||||
def test_large_amount(self, normalizer):
|
||||
"""Test large amount with thousand separators."""
|
||||
result, is_valid, error = extractor._normalize_amount("1 234 567,89")
|
||||
assert result is not None
|
||||
result = normalizer.normalize("1 234 567,89")
|
||||
assert result.value is not None
|
||||
|
||||
|
||||
class TestNormalizeOCR:
|
||||
"""Tests for OCR number normalization."""
|
||||
|
||||
@pytest.fixture
|
||||
def extractor(self):
|
||||
return FieldExtractor()
|
||||
def normalizer(self):
|
||||
return OcrNumberNormalizer()
|
||||
|
||||
def test_standard_ocr(self, extractor):
|
||||
def test_standard_ocr(self, normalizer):
|
||||
"""Test standard OCR number."""
|
||||
result, is_valid, error = extractor._normalize_ocr_number("OCR: 310196187399952")
|
||||
assert result == '310196187399952'
|
||||
assert is_valid is True
|
||||
result = normalizer.normalize("OCR: 310196187399952")
|
||||
assert result.value == '310196187399952'
|
||||
assert result.is_valid is True
|
||||
|
||||
def test_ocr_with_spaces(self, extractor):
|
||||
def test_ocr_with_spaces(self, normalizer):
|
||||
"""Test OCR number with spaces."""
|
||||
result, is_valid, error = extractor._normalize_ocr_number("3101 9618 7399 952")
|
||||
assert result is not None
|
||||
assert ' ' not in result # Spaces should be removed
|
||||
result = normalizer.normalize("3101 9618 7399 952")
|
||||
assert result.value is not None
|
||||
assert ' ' not in result.value # Spaces should be removed
|
||||
|
||||
def test_short_ocr_invalid(self, extractor):
|
||||
def test_short_ocr_invalid(self, normalizer):
|
||||
"""Test that too short OCR is invalid."""
|
||||
result, is_valid, error = extractor._normalize_ocr_number("123")
|
||||
assert is_valid is False
|
||||
result = normalizer.normalize("123")
|
||||
assert result.is_valid is False
|
||||
|
||||
|
||||
class TestNormalizeDate:
|
||||
"""Tests for date normalization."""
|
||||
|
||||
@pytest.fixture
|
||||
def extractor(self):
|
||||
return FieldExtractor()
|
||||
def normalizer(self):
|
||||
return DateNormalizer()
|
||||
|
||||
def test_iso_format(self, extractor):
|
||||
def test_iso_format(self, normalizer):
|
||||
"""Test ISO date format YYYY-MM-DD."""
|
||||
result, is_valid, error = extractor._normalize_date("2026-01-31")
|
||||
assert result == '2026-01-31'
|
||||
assert is_valid is True
|
||||
result = normalizer.normalize("2026-01-31")
|
||||
assert result.value == '2026-01-31'
|
||||
assert result.is_valid is True
|
||||
|
||||
def test_swedish_format(self, extractor):
|
||||
def test_swedish_format(self, normalizer):
|
||||
"""Test Swedish format with dots: 31.01.2026."""
|
||||
result, is_valid, error = extractor._normalize_date("31.01.2026")
|
||||
assert result is not None
|
||||
assert is_valid is True
|
||||
result = normalizer.normalize("31.01.2026")
|
||||
assert result.value is not None
|
||||
assert result.is_valid is True
|
||||
|
||||
def test_slash_format(self, extractor):
|
||||
def test_slash_format(self, normalizer):
|
||||
"""Test slash format: 31/01/2026."""
|
||||
result, is_valid, error = extractor._normalize_date("31/01/2026")
|
||||
assert result is not None
|
||||
result = normalizer.normalize("31/01/2026")
|
||||
assert result.value is not None
|
||||
|
||||
def test_compact_format(self, extractor):
|
||||
def test_compact_format(self, normalizer):
|
||||
"""Test compact format: 20260131."""
|
||||
result, is_valid, error = extractor._normalize_date("20260131")
|
||||
assert result is not None
|
||||
result = normalizer.normalize("20260131")
|
||||
assert result.value is not None
|
||||
|
||||
def test_invalid_date(self, extractor):
|
||||
def test_invalid_date(self, normalizer):
|
||||
"""Test invalid date."""
|
||||
result, is_valid, error = extractor._normalize_date("not a date")
|
||||
assert is_valid is False
|
||||
result = normalizer.normalize("not a date")
|
||||
assert result.is_valid is False
|
||||
|
||||
|
||||
class TestNormalizePaymentLine:
|
||||
@@ -348,20 +361,20 @@ class TestNormalizeSupplierOrgNumber:
|
||||
"""Tests for supplier organization number normalization."""
|
||||
|
||||
@pytest.fixture
|
||||
def extractor(self):
|
||||
return FieldExtractor()
|
||||
def normalizer(self):
|
||||
return SupplierOrgNumberNormalizer()
|
||||
|
||||
def test_standard_format(self, extractor):
|
||||
def test_standard_format(self, normalizer):
|
||||
"""Test standard format NNNNNN-NNNN."""
|
||||
result, is_valid, error = extractor._normalize_supplier_org_number("Org.nr 516406-1102")
|
||||
assert result == '516406-1102'
|
||||
assert is_valid is True
|
||||
result = normalizer.normalize("Org.nr 516406-1102")
|
||||
assert result.value == '516406-1102'
|
||||
assert result.is_valid is True
|
||||
|
||||
def test_vat_number_format(self, extractor):
|
||||
def test_vat_number_format(self, normalizer):
|
||||
"""Test VAT number format SE + 10 digits + 01."""
|
||||
result, is_valid, error = extractor._normalize_supplier_org_number("Momsreg.nr SE556123456701")
|
||||
assert result is not None
|
||||
assert '-' in result
|
||||
result = normalizer.normalize("Momsreg.nr SE556123456701")
|
||||
assert result.value is not None
|
||||
assert '-' in result.value
|
||||
|
||||
|
||||
class TestNormalizeAndValidateDispatch:
|
||||
|
||||
Reference in New Issue
Block a user