This commit is contained in:
Yaojia Wang
2026-02-01 18:51:54 +01:00
parent 4126196dea
commit a564ac9d70
82 changed files with 13123 additions and 3282 deletions

View File

@@ -12,6 +12,15 @@ Tests field normalization functions:
import pytest
from inference.pipeline.field_extractor import FieldExtractor
from inference.pipeline.normalizers import (
InvoiceNumberNormalizer,
OcrNumberNormalizer,
BankgiroNormalizer,
PlusgiroNormalizer,
AmountNormalizer,
DateNormalizer,
SupplierOrgNumberNormalizer,
)
class TestFieldExtractorInit:
@@ -43,81 +52,81 @@ class TestNormalizeInvoiceNumber:
"""Tests for invoice number normalization."""
@pytest.fixture
def normalizer(self):
    """Provide a fresh InvoiceNumberNormalizer for each test."""
    return InvoiceNumberNormalizer()
def test_alphanumeric_invoice_number(self, normalizer) -> None:
    """Test alphanumeric invoice number like A3861."""
    result = normalizer.normalize("Fakturanummer: A3861")
    # The label prefix must be stripped, leaving only the number itself.
    assert result.value == 'A3861'
    assert result.is_valid is True
def test_prefix_invoice_number(self, normalizer) -> None:
    """Test invoice number with prefix like INV12345."""
    result = normalizer.normalize("Invoice INV12345")
    assert result.value is not None
    # Either the full prefixed number or its numeric part is accepted.
    assert 'INV' in result.value or '12345' in result.value
def test_numeric_invoice_number(self, normalizer) -> None:
    """Test pure numeric invoice number."""
    result = normalizer.normalize("Invoice: 12345678")
    assert result.value is not None
    assert result.value.isdigit()
def test_year_prefixed_invoice_number(self, normalizer) -> None:
    """Test invoice number with year prefix like 2024-001."""
    result = normalizer.normalize("Faktura 2024-12345")
    assert result.value is not None
    assert '2024' in result.value
def test_avoid_long_ocr_sequence(self, normalizer) -> None:
    """Test that long OCR-like sequences are avoided."""
    # When text contains both short invoice number and long OCR sequence
    text = "Fakturanummer: A3861 OCR: 310196187399952763290708"
    result = normalizer.normalize(text)
    # Should prefer the shorter alphanumeric pattern
    assert result.value == 'A3861'
def test_empty_string(self, normalizer) -> None:
    """Test empty string input."""
    result = normalizer.normalize("")
    # Either signal is accepted: no value, or an explicit invalid flag.
    assert result.value is None or result.is_valid is False
class TestNormalizeBankgiro:
    """Tests for Bankgiro normalization."""

    @pytest.fixture
    def normalizer(self):
        """Provide a fresh BankgiroNormalizer for each test."""
        return BankgiroNormalizer()

    def test_standard_7_digit_format(self, normalizer) -> None:
        """Test 7-digit Bankgiro XXX-XXXX."""
        result = normalizer.normalize("Bankgiro: 782-1713")
        assert result.value == '782-1713'
        assert result.is_valid is True

    def test_standard_8_digit_format(self, normalizer) -> None:
        """Test 8-digit Bankgiro XXXX-XXXX."""
        result = normalizer.normalize("BG 5393-9484")
        assert result.value == '5393-9484'
        assert result.is_valid is True

    def test_without_dash(self, normalizer) -> None:
        """Test Bankgiro without dash."""
        result = normalizer.normalize("Bankgiro 7821713")
        assert result.value is not None
        # Should be formatted with dash
        assert '-' in result.value

    def test_with_spaces(self, normalizer) -> None:
        """Test Bankgiro with spaces - may not parse if spaces break the pattern."""
        # Spaces in the middle might cause parsing issues - that's acceptable.
        # The test passes if it doesn't crash, so the result is not inspected.
        normalizer.normalize("BG: 782 1713")

    def test_invalid_bankgiro(self, normalizer) -> None:
        """Test invalid Bankgiro (too short)."""
        result = normalizer.normalize("BG: 123")
        # Should fail or return None
        assert result.value is None or result.is_valid is False
@@ -125,28 +134,32 @@ class TestNormalizePlusgiro:
"""Tests for Plusgiro normalization."""
@pytest.fixture
def normalizer(self):
    """Provide a fresh PlusgiroNormalizer for each test."""
    return PlusgiroNormalizer()
@pytest.fixture
def bg_normalizer(self):
    """Provide a BankgiroNormalizer for tests that compare both giro formats."""
    return BankgiroNormalizer()
def test_standard_format(self, normalizer) -> None:
    """Test standard Plusgiro format XXXXXXX-X."""
    result = normalizer.normalize("Plusgiro: 1234567-8")
    assert result.value is not None
    assert '-' in result.value
def test_without_dash(self, normalizer) -> None:
    """Test Plusgiro without dash."""
    result = normalizer.normalize("PG 12345678")
    assert result.value is not None
def test_distinguish_from_bankgiro(self, normalizer, bg_normalizer) -> None:
    """Test that Plusgiro is distinguished from Bankgiro by format."""
    # Plusgiro has 1 digit after dash, Bankgiro has 4
    pg_text = "4809603-6"  # Plusgiro format
    bg_text = "782-1713"  # Bankgiro format
    pg_result = normalizer.normalize(pg_text)
    bg_result = bg_normalizer.normalize(bg_text)
    # Both should succeed in their respective normalizations
    assert pg_result.value is not None
    assert bg_result.value is not None
@@ -155,89 +168,89 @@ class TestNormalizeAmount:
"""Tests for Amount normalization."""
@pytest.fixture
def normalizer(self):
    """Provide a fresh AmountNormalizer for each test."""
    return AmountNormalizer()
def test_swedish_format_comma(self, normalizer) -> None:
    """Test Swedish format with comma: 11 699,00."""
    result = normalizer.normalize("11 699,00 SEK")
    assert result.value is not None
    assert result.is_valid is True
def test_integer_amount(self, normalizer) -> None:
    """Test integer amount without decimals."""
    result = normalizer.normalize("Amount: 11699")
    assert result.value is not None
def test_with_currency(self, normalizer) -> None:
    """Test amount with currency symbol."""
    result = normalizer.normalize("SEK 11 699,00")
    assert result.value is not None
def test_large_amount(self, normalizer) -> None:
    """Test large amount with thousand separators."""
    result = normalizer.normalize("1 234 567,89")
    assert result.value is not None
class TestNormalizeOCR:
    """Tests for OCR number normalization."""

    @pytest.fixture
    def normalizer(self):
        """Provide a fresh OcrNumberNormalizer for each test."""
        return OcrNumberNormalizer()

    def test_standard_ocr(self, normalizer) -> None:
        """Test standard OCR number."""
        result = normalizer.normalize("OCR: 310196187399952")
        assert result.value == '310196187399952'
        assert result.is_valid is True

    def test_ocr_with_spaces(self, normalizer) -> None:
        """Test OCR number with spaces."""
        result = normalizer.normalize("3101 9618 7399 952")
        assert result.value is not None
        assert ' ' not in result.value  # Spaces should be removed

    def test_short_ocr_invalid(self, normalizer) -> None:
        """Test that too short OCR is invalid."""
        result = normalizer.normalize("123")
        assert result.is_valid is False
class TestNormalizeDate:
    """Tests for date normalization."""

    @pytest.fixture
    def normalizer(self):
        """Provide a fresh DateNormalizer for each test."""
        return DateNormalizer()

    def test_iso_format(self, normalizer) -> None:
        """Test ISO date format YYYY-MM-DD."""
        result = normalizer.normalize("2026-01-31")
        assert result.value == '2026-01-31'
        assert result.is_valid is True

    def test_swedish_format(self, normalizer) -> None:
        """Test Swedish format with dots: 31.01.2026."""
        result = normalizer.normalize("31.01.2026")
        assert result.value is not None
        assert result.is_valid is True

    def test_slash_format(self, normalizer) -> None:
        """Test slash format: 31/01/2026."""
        result = normalizer.normalize("31/01/2026")
        assert result.value is not None

    def test_compact_format(self, normalizer) -> None:
        """Test compact format: 20260131."""
        result = normalizer.normalize("20260131")
        assert result.value is not None

    def test_invalid_date(self, normalizer) -> None:
        """Test invalid date."""
        result = normalizer.normalize("not a date")
        assert result.is_valid is False
class TestNormalizePaymentLine:
@@ -348,20 +361,20 @@ class TestNormalizeSupplierOrgNumber:
"""Tests for supplier organization number normalization."""
@pytest.fixture
def normalizer(self):
    """Provide a fresh SupplierOrgNumberNormalizer for each test."""
    return SupplierOrgNumberNormalizer()
def test_standard_format(self, normalizer) -> None:
    """Test standard format NNNNNN-NNNN."""
    result = normalizer.normalize("Org.nr 516406-1102")
    assert result.value == '516406-1102'
    assert result.is_valid is True
def test_vat_number_format(self, normalizer) -> None:
    """Test VAT number format SE + 10 digits + 01."""
    result = normalizer.normalize("Momsreg.nr SE556123456701")
    assert result.value is not None
    # The presence of a dash shows the value was reformatted rather than
    # passed through as the raw VAT string.
    assert '-' in result.value
class TestNormalizeAndValidateDispatch: