WIP

2026-02-01 18:51:54 +01:00
parent 4126196dea
commit a564ac9d70
82 changed files with 13123 additions and 3282 deletions
--- a/tests/inference/test_field_extractor.py
+++ b/tests/inference/test_field_extractor.py
@@ -12,6 +12,15 @@ Tests field normalization functions:

 import pytest
 from inference.pipeline.field_extractor import FieldExtractor
+from inference.pipeline.normalizers import (
+    InvoiceNumberNormalizer,
+    OcrNumberNormalizer,
+    BankgiroNormalizer,
+    PlusgiroNormalizer,
+    AmountNormalizer,
+    DateNormalizer,
+    SupplierOrgNumberNormalizer,
+)


 class TestFieldExtractorInit:
@@ -43,81 +52,81 @@ class TestNormalizeInvoiceNumber:
    """Tests for invoice number normalization."""

    @pytest.fixture
-    def extractor(self):
-        return FieldExtractor()
+    def normalizer(self):
+        return InvoiceNumberNormalizer()

-    def test_alphanumeric_invoice_number(self, extractor):
+    def test_alphanumeric_invoice_number(self, normalizer):
        """Test alphanumeric invoice number like A3861."""
-        result, is_valid, error = extractor._normalize_invoice_number("Fakturanummer: A3861")
-        assert result == 'A3861'
-        assert is_valid is True
+        result = normalizer.normalize("Fakturanummer: A3861")
+        assert result.value == 'A3861'
+        assert result.is_valid is True

-    def test_prefix_invoice_number(self, extractor):
+    def test_prefix_invoice_number(self, normalizer):
        """Test invoice number with prefix like INV12345."""
-        result, is_valid, error = extractor._normalize_invoice_number("Invoice INV12345")
-        assert result is not None
-        assert 'INV' in result or '12345' in result
+        result = normalizer.normalize("Invoice INV12345")
+        assert result.value is not None
+        assert 'INV' in result.value or '12345' in result.value

-    def test_numeric_invoice_number(self, extractor):
+    def test_numeric_invoice_number(self, normalizer):
        """Test pure numeric invoice number."""
-        result, is_valid, error = extractor._normalize_invoice_number("Invoice: 12345678")
-        assert result is not None
-        assert result.isdigit()
+        result = normalizer.normalize("Invoice: 12345678")
+        assert result.value is not None
+        assert result.value.isdigit()

-    def test_year_prefixed_invoice_number(self, extractor):
+    def test_year_prefixed_invoice_number(self, normalizer):
-        """Test invoice number with year prefix like 2024-001."""
+        """Test invoice number with year prefix like 2024-12345."""
-        result, is_valid, error = extractor._normalize_invoice_number("Faktura 2024-12345")
-        assert result is not None
-        assert '2024' in result
+        result = normalizer.normalize("Faktura 2024-12345")
+        assert result.value is not None
+        assert '2024' in result.value

-    def test_avoid_long_ocr_sequence(self, extractor):
+    def test_avoid_long_ocr_sequence(self, normalizer):
        """Test that long OCR-like sequences are avoided."""
        # When text contains both short invoice number and long OCR sequence
        text = "Fakturanummer: A3861 OCR: 310196187399952763290708"
-        result, is_valid, error = extractor._normalize_invoice_number(text)
+        result = normalizer.normalize(text)
        # Should prefer the shorter alphanumeric pattern
-        assert result == 'A3861'
+        assert result.value == 'A3861'

-    def test_empty_string(self, extractor):
+    def test_empty_string(self, normalizer):
        """Test empty string input."""
-        result, is_valid, error = extractor._normalize_invoice_number("")
-        assert result is None or is_valid is False
+        result = normalizer.normalize("")
+        assert result.value is None or result.is_valid is False


 class TestNormalizeBankgiro:
    """Tests for Bankgiro normalization."""

    @pytest.fixture
-    def extractor(self):
-        return FieldExtractor()
+    def normalizer(self):
+        return BankgiroNormalizer()

-    def test_standard_7_digit_format(self, extractor):
+    def test_standard_7_digit_format(self, normalizer):
        """Test 7-digit Bankgiro XXX-XXXX."""
-        result, is_valid, error = extractor._normalize_bankgiro("Bankgiro: 782-1713")
-        assert result == '782-1713'
-        assert is_valid is True
+        result = normalizer.normalize("Bankgiro: 782-1713")
+        assert result.value == '782-1713'
+        assert result.is_valid is True

-    def test_standard_8_digit_format(self, extractor):
+    def test_standard_8_digit_format(self, normalizer):
        """Test 8-digit Bankgiro XXXX-XXXX."""
-        result, is_valid, error = extractor._normalize_bankgiro("BG 5393-9484")
-        assert result == '5393-9484'
-        assert is_valid is True
+        result = normalizer.normalize("BG 5393-9484")
+        assert result.value == '5393-9484'
+        assert result.is_valid is True

-    def test_without_dash(self, extractor):
+    def test_without_dash(self, normalizer):
        """Test Bankgiro without dash."""
-        result, is_valid, error = extractor._normalize_bankgiro("Bankgiro 7821713")
-        assert result is not None
+        result = normalizer.normalize("Bankgiro 7821713")
+        assert result.value is not None
-        # Should be formatted with dash
+        # TODO: assert the dash-formatted value (presumably '782-1713'); only non-None is checked

-    def test_with_spaces(self, extractor):
+    def test_with_spaces(self, normalizer):
        """Test Bankgiro with spaces - may not parse if spaces break the pattern."""
-        result, is_valid, error = extractor._normalize_bankgiro("BG: 782 1713")
+        result = normalizer.normalize("BG: 782 1713")
        # Spaces in the middle might cause parsing issues - that's acceptable
        # The test passes if it doesn't crash

-    def test_invalid_bankgiro(self, extractor):
+    def test_invalid_bankgiro(self, normalizer):
        """Test invalid Bankgiro (too short)."""
-        result, is_valid, error = extractor._normalize_bankgiro("BG: 123")
+        result = normalizer.normalize("BG: 123")
-        # Should fail or return None
+        # TODO: assert result.is_valid is False or result.value is None (nothing is asserted yet)


@@ -125,28 +134,32 @@ class TestNormalizePlusgiro:
    """Tests for Plusgiro normalization."""

    @pytest.fixture
-    def extractor(self):
-        return FieldExtractor()
+    def normalizer(self):
+        return PlusgiroNormalizer()

-    def test_standard_format(self, extractor):
+    @pytest.fixture
+    def bg_normalizer(self):
+        return BankgiroNormalizer()
+
+    def test_standard_format(self, normalizer):
        """Test standard Plusgiro format XXXXXXX-X."""
-        result, is_valid, error = extractor._normalize_plusgiro("Plusgiro: 1234567-8")
-        assert result is not None
-        assert '-' in result
+        result = normalizer.normalize("Plusgiro: 1234567-8")
+        assert result.value is not None
+        assert '-' in result.value

-    def test_without_dash(self, extractor):
+    def test_without_dash(self, normalizer):
        """Test Plusgiro without dash."""
-        result, is_valid, error = extractor._normalize_plusgiro("PG 12345678")
-        assert result is not None
+        result = normalizer.normalize("PG 12345678")
+        assert result.value is not None

-    def test_distinguish_from_bankgiro(self, extractor):
+    def test_distinguish_from_bankgiro(self, normalizer, bg_normalizer):
        """Test that Plusgiro is distinguished from Bankgiro by format."""
        # Plusgiro has 1 digit after dash, Bankgiro has 4
        pg_text = "4809603-6"  # Plusgiro format
        bg_text = "782-1713"  # Bankgiro format

-        pg_result, _, _ = extractor._normalize_plusgiro(pg_text)
-        bg_result, _, _ = extractor._normalize_bankgiro(bg_text)
+        pg_result = normalizer.normalize(pg_text)
+        bg_result = bg_normalizer.normalize(bg_text)

-        # Both should succeed in their respective normalizations
+        # TODO: assert on pg_result and bg_result; this only checks that neither call raises

@@ -155,89 +168,89 @@ class TestNormalizeAmount:
    """Tests for Amount normalization."""

    @pytest.fixture
-    def extractor(self):
-        return FieldExtractor()
+    def normalizer(self):
+        return AmountNormalizer()

-    def test_swedish_format_comma(self, extractor):
+    def test_swedish_format_comma(self, normalizer):
        """Test Swedish format with comma: 11 699,00."""
-        result, is_valid, error = extractor._normalize_amount("11 699,00 SEK")
-        assert result is not None
-        assert is_valid is True
+        result = normalizer.normalize("11 699,00 SEK")
+        assert result.value is not None
+        assert result.is_valid is True

-    def test_integer_amount(self, extractor):
+    def test_integer_amount(self, normalizer):
        """Test integer amount without decimals."""
-        result, is_valid, error = extractor._normalize_amount("Amount: 11699")
-        assert result is not None
+        result = normalizer.normalize("Amount: 11699")
+        assert result.value is not None

-    def test_with_currency(self, extractor):
+    def test_with_currency(self, normalizer):
        """Test amount with currency symbol."""
-        result, is_valid, error = extractor._normalize_amount("SEK 11 699,00")
-        assert result is not None
+        result = normalizer.normalize("SEK 11 699,00")
+        assert result.value is not None

-    def test_large_amount(self, extractor):
+    def test_large_amount(self, normalizer):
        """Test large amount with thousand separators."""
-        result, is_valid, error = extractor._normalize_amount("1 234 567,89")
-        assert result is not None
+        result = normalizer.normalize("1 234 567,89")
+        assert result.value is not None


 class TestNormalizeOCR:
    """Tests for OCR number normalization."""

    @pytest.fixture
-    def extractor(self):
-        return FieldExtractor()
+    def normalizer(self):
+        return OcrNumberNormalizer()

-    def test_standard_ocr(self, extractor):
+    def test_standard_ocr(self, normalizer):
        """Test standard OCR number."""
-        result, is_valid, error = extractor._normalize_ocr_number("OCR: 310196187399952")
-        assert result == '310196187399952'
-        assert is_valid is True
+        result = normalizer.normalize("OCR: 310196187399952")
+        assert result.value == '310196187399952'
+        assert result.is_valid is True

-    def test_ocr_with_spaces(self, extractor):
+    def test_ocr_with_spaces(self, normalizer):
        """Test OCR number with spaces."""
-        result, is_valid, error = extractor._normalize_ocr_number("3101 9618 7399 952")
-        assert result is not None
-        assert ' ' not in result  # Spaces should be removed
+        result = normalizer.normalize("3101 9618 7399 952")
+        assert result.value is not None
+        assert ' ' not in result.value  # Spaces should be removed

-    def test_short_ocr_invalid(self, extractor):
+    def test_short_ocr_invalid(self, normalizer):
        """Test that too short OCR is invalid."""
-        result, is_valid, error = extractor._normalize_ocr_number("123")
-        assert is_valid is False
+        result = normalizer.normalize("123")
+        assert result.is_valid is False


 class TestNormalizeDate:
    """Tests for date normalization."""

    @pytest.fixture
-    def extractor(self):
-        return FieldExtractor()
+    def normalizer(self):
+        return DateNormalizer()

-    def test_iso_format(self, extractor):
+    def test_iso_format(self, normalizer):
        """Test ISO date format YYYY-MM-DD."""
-        result, is_valid, error = extractor._normalize_date("2026-01-31")
-        assert result == '2026-01-31'
-        assert is_valid is True
+        result = normalizer.normalize("2026-01-31")
+        assert result.value == '2026-01-31'
+        assert result.is_valid is True

-    def test_swedish_format(self, extractor):
+    def test_swedish_format(self, normalizer):
        """Test Swedish format with dots: 31.01.2026."""
-        result, is_valid, error = extractor._normalize_date("31.01.2026")
-        assert result is not None
-        assert is_valid is True
+        result = normalizer.normalize("31.01.2026")
+        assert result.value is not None
+        assert result.is_valid is True

-    def test_slash_format(self, extractor):
+    def test_slash_format(self, normalizer):
        """Test slash format: 31/01/2026."""
-        result, is_valid, error = extractor._normalize_date("31/01/2026")
-        assert result is not None
+        result = normalizer.normalize("31/01/2026")
+        assert result.value is not None

-    def test_compact_format(self, extractor):
+    def test_compact_format(self, normalizer):
        """Test compact format: 20260131."""
-        result, is_valid, error = extractor._normalize_date("20260131")
-        assert result is not None
+        result = normalizer.normalize("20260131")
+        assert result.value is not None

-    def test_invalid_date(self, extractor):
+    def test_invalid_date(self, normalizer):
        """Test invalid date."""
-        result, is_valid, error = extractor._normalize_date("not a date")
-        assert is_valid is False
+        result = normalizer.normalize("not a date")
+        assert result.is_valid is False


 class TestNormalizePaymentLine:
@@ -348,20 +361,20 @@ class TestNormalizeSupplierOrgNumber:
    """Tests for supplier organization number normalization."""

    @pytest.fixture
-    def extractor(self):
-        return FieldExtractor()
+    def normalizer(self):
+        return SupplierOrgNumberNormalizer()

-    def test_standard_format(self, extractor):
+    def test_standard_format(self, normalizer):
        """Test standard format NNNNNN-NNNN."""
-        result, is_valid, error = extractor._normalize_supplier_org_number("Org.nr 516406-1102")
-        assert result == '516406-1102'
-        assert is_valid is True
+        result = normalizer.normalize("Org.nr 516406-1102")
+        assert result.value == '516406-1102'
+        assert result.is_valid is True

-    def test_vat_number_format(self, extractor):
+    def test_vat_number_format(self, normalizer):
        """Test VAT number format SE + 10 digits + 01."""
-        result, is_valid, error = extractor._normalize_supplier_org_number("Momsreg.nr SE556123456701")
-        assert result is not None
-        assert '-' in result
+        result = normalizer.normalize("Momsreg.nr SE556123456701")
+        assert result.value is not None
+        assert '-' in result.value


 class TestNormalizeAndValidateDispatch: