"""
Tests for Field Extractor

Tests field normalization functions:
- Invoice number normalization
- Date normalization
- Amount normalization
- Bankgiro/Plusgiro normalization
- OCR number normalization
- Payment line normalization
"""

import pytest

from inference.pipeline.field_extractor import FieldExtractor
from inference.pipeline.normalizers import (
    InvoiceNumberNormalizer,
    OcrNumberNormalizer,
    BankgiroNormalizer,
    PlusgiroNormalizer,
    AmountNormalizer,
    DateNormalizer,
    SupplierOrgNumberNormalizer,
)


class TestFieldExtractorInit:
    """Tests for FieldExtractor initialization."""

    def test_default_init(self):
        """Test default initialization."""
        extractor = FieldExtractor()
        assert extractor.ocr_lang == 'en'
        assert extractor.use_gpu is False
        assert extractor.bbox_padding == 0.1
        assert extractor.dpi == 300

    def test_custom_init(self):
        """Test custom initialization."""
        extractor = FieldExtractor(
            ocr_lang='sv',
            use_gpu=True,
            bbox_padding=0.2,
            dpi=150
        )
        assert extractor.ocr_lang == 'sv'
        assert extractor.use_gpu is True
        assert extractor.bbox_padding == 0.2
        assert extractor.dpi == 150


class TestNormalizeInvoiceNumber:
    """Tests for invoice number normalization."""

    @pytest.fixture
    def normalizer(self):
        """Provide a fresh InvoiceNumberNormalizer for each test."""
        return InvoiceNumberNormalizer()

    def test_alphanumeric_invoice_number(self, normalizer):
        """Test alphanumeric invoice number like A3861."""
        result = normalizer.normalize("Fakturanummer: A3861")
        assert result.value == 'A3861'
        assert result.is_valid is True

    def test_prefix_invoice_number(self, normalizer):
        """Test invoice number with prefix like INV12345."""
        result = normalizer.normalize("Invoice INV12345")
        assert result.value is not None
        assert 'INV' in result.value or '12345' in result.value

    def test_numeric_invoice_number(self, normalizer):
        """Test pure numeric invoice number."""
        result = normalizer.normalize("Invoice: 12345678")
        assert result.value is not None
        assert result.value.isdigit()

    def test_year_prefixed_invoice_number(self, normalizer):
        """Test invoice number with year prefix like 2024-001."""
        result = normalizer.normalize("Faktura 2024-12345")
        assert result.value is not None
        assert '2024' in result.value

    def test_avoid_long_ocr_sequence(self, normalizer):
        """Test that long OCR-like sequences are avoided."""
        # When text contains both short invoice number and long OCR sequence
        text = "Fakturanummer: A3861 OCR: 310196187399952763290708"
        result = normalizer.normalize(text)
        # Should prefer the shorter alphanumeric pattern
        assert result.value == 'A3861'

    def test_empty_string(self, normalizer):
        """Test empty string input."""
        result = normalizer.normalize("")
        assert result.value is None or result.is_valid is False


class TestNormalizeBankgiro:
    """Tests for Bankgiro normalization."""

    @pytest.fixture
    def normalizer(self):
        """Provide a fresh BankgiroNormalizer for each test."""
        return BankgiroNormalizer()

    def test_standard_7_digit_format(self, normalizer):
        """Test 7-digit Bankgiro XXX-XXXX."""
        result = normalizer.normalize("Bankgiro: 782-1713")
        assert result.value == '782-1713'
        assert result.is_valid is True

    def test_standard_8_digit_format(self, normalizer):
        """Test 8-digit Bankgiro XXXX-XXXX."""
        result = normalizer.normalize("BG 5393-9484")
        assert result.value == '5393-9484'
        assert result.is_valid is True

    def test_without_dash(self, normalizer):
        """Test Bankgiro without dash."""
        result = normalizer.normalize("Bankgiro 7821713")
        assert result.value is not None
        # Should be formatted with dash

    def test_with_spaces(self, normalizer):
        """Test Bankgiro with spaces - may not parse if spaces break the pattern."""
        result = normalizer.normalize("BG: 782 1713")
        # Spaces in the middle might cause parsing issues - that's acceptable.
        # The only requirement is that normalization does not crash and still
        # hands back a result object; its value/validity is deliberately
        # left unconstrained.
        assert result is not None

    def test_invalid_bankgiro(self, normalizer):
        """Test invalid Bankgiro (too short)."""
        result = normalizer.normalize("BG: 123")
        # Should fail or return None: either no value is extracted or the
        # result is flagged as invalid.
        assert result.value is None or result.is_valid is False


class TestNormalizePlusgiro:
    """Tests for Plusgiro normalization."""

    @pytest.fixture
    def normalizer(self):
        """Provide a fresh PlusgiroNormalizer for each test."""
        return PlusgiroNormalizer()

    @pytest.fixture
    def bg_normalizer(self):
        """Provide a BankgiroNormalizer for Plusgiro/Bankgiro comparisons."""
        return BankgiroNormalizer()

    def test_standard_format(self, normalizer):
        """Test standard Plusgiro format XXXXXXX-X."""
        result = normalizer.normalize("Plusgiro: 1234567-8")
        assert result.value is not None
        assert '-' in result.value

    def test_without_dash(self, normalizer):
        """Test Plusgiro without dash."""
        result = normalizer.normalize("PG 12345678")
        assert result.value is not None

    def test_distinguish_from_bankgiro(self, normalizer, bg_normalizer):
        """Test that Plusgiro is distinguished from Bankgiro by format."""
        # Plusgiro has 1 digit after dash, Bankgiro has 4
        pg_text = "4809603-6"  # Plusgiro format
        bg_text = "782-1713"  # Bankgiro format

        pg_result = normalizer.normalize(pg_text)
        bg_result = bg_normalizer.normalize(bg_text)

        # Both should succeed in their respective normalizations
        assert pg_result.value is not None
        assert bg_result.value is not None


class TestNormalizeAmount:
    """Tests for Amount normalization."""

    @pytest.fixture
    def normalizer(self):
        """Provide a fresh AmountNormalizer for each test."""
        return AmountNormalizer()

    def test_swedish_format_comma(self, normalizer):
        """Test Swedish format with comma: 11 699,00."""
        result = normalizer.normalize("11 699,00 SEK")
        assert result.value is not None
        assert result.is_valid is True

    def test_integer_amount(self, normalizer):
        """Test integer amount without decimals."""
        result = normalizer.normalize("Amount: 11699")
        assert result.value is not None

    def test_with_currency(self, normalizer):
        """Test amount with currency symbol."""
        result = normalizer.normalize("SEK 11 699,00")
        assert result.value is not None

    def test_large_amount(self, normalizer):
        """Test large amount with thousand separators."""
        result = normalizer.normalize("1 234 567,89")
        assert result.value is not None


class TestNormalizeOCR:
    """Tests for OCR number normalization."""

    @pytest.fixture
    def normalizer(self):
        """Provide a fresh OcrNumberNormalizer for each test."""
        return OcrNumberNormalizer()

    def test_standard_ocr(self, normalizer):
        """Test standard OCR number."""
        result = normalizer.normalize("OCR: 310196187399952")
        assert result.value == '310196187399952'
        assert result.is_valid is True

    def test_ocr_with_spaces(self, normalizer):
        """Test OCR number with spaces."""
        result = normalizer.normalize("3101 9618 7399 952")
        assert result.value is not None
        assert ' ' not in result.value  # Spaces should be removed

    def test_short_ocr_invalid(self, normalizer):
        """Test that too short OCR is invalid."""
        result = normalizer.normalize("123")
        assert result.is_valid is False


class TestNormalizeDate:
    """Tests for date normalization."""

    @pytest.fixture
    def normalizer(self):
        """Provide a fresh DateNormalizer for each test."""
        return DateNormalizer()

    def test_iso_format(self, normalizer):
        """Test ISO date format YYYY-MM-DD."""
        result = normalizer.normalize("2026-01-31")
        assert result.value == '2026-01-31'
        assert result.is_valid is True

    def test_swedish_format(self, normalizer):
        """Test Swedish format with dots: 31.01.2026."""
        result = normalizer.normalize("31.01.2026")
        assert result.value is not None
        assert result.is_valid is True

    def test_slash_format(self, normalizer):
        """Test slash format: 31/01/2026."""
        result = normalizer.normalize("31/01/2026")
        assert result.value is not None

    def test_compact_format(self, normalizer):
        """Test compact format: 20260131."""
        result = normalizer.normalize("20260131")
        assert result.value is not None

    def test_invalid_date(self, normalizer):
        """Test invalid date."""
        result = normalizer.normalize("not a date")
        assert result.is_valid is False


class TestNormalizePaymentLine:
    """Tests for payment line normalization."""

    @pytest.fixture
    def extractor(self):
        """Provide a default-configured FieldExtractor for each test."""
        return FieldExtractor()

    def test_standard_payment_line(self, extractor):
        """Test standard payment line parsing."""
        text = "# 310196187399952 # 11699 00 6 > 7821713#41#"
        result, is_valid, error = extractor._normalize_payment_line(text)

        assert result is not None
        assert is_valid is True
        # Should be formatted as: OCR:xxx Amount:xxx BG:xxx
        assert 'OCR:' in result or '310196187399952' in result

    def test_payment_line_with_spaces_in_bg(self, extractor):
        """Test payment line with spaces in Bankgiro."""
        text = "# 310196187399952 # 11699 00 6 > 78 2 1 713 #41#"
        result, is_valid, error = extractor._normalize_payment_line(text)

        assert result is not None
        assert is_valid is True
        # Bankgiro should be normalized despite spaces

    def test_payment_line_with_spaces_in_check_digits(self, extractor):
        """Test payment line with spaces around check digits: #41 # instead of #41#."""
        text = "# 6026726908 # 736 00 9 > 5692041 #41 #"
        result, is_valid, error = extractor._normalize_payment_line(text)

        assert result is not None
        assert is_valid is True
        assert "6026726908" in result
        assert "736 00" in result
        assert "5692041#41#" in result

    def test_payment_line_with_ocr_spaces_in_amount(self, extractor):
        """Test payment line with OCR-induced spaces in amount: '12 0 0 00' -> '1200 00'."""
        text = "# 11000770600242 # 12 0 0 00 5 3082963#41#"
        result, is_valid, error = extractor._normalize_payment_line(text)

        assert result is not None
        assert is_valid is True
        assert "11000770600242" in result
        assert "1200 00" in result
        assert "3082963#41#" in result

    def test_payment_line_without_greater_symbol(self, extractor):
        """Test payment line with missing > symbol (low-DPI OCR issue)."""
        text = "# 11000770600242 # 1200 00 5 3082963#41#"
        result, is_valid, error = extractor._normalize_payment_line(text)

        assert result is not None
        assert is_valid is True
        assert "11000770600242" in result
        assert "1200 00" in result


class TestNormalizeCustomerNumber:
    """Tests for customer number normalization."""

    @pytest.fixture
    def extractor(self):
        """Provide a default-configured FieldExtractor for each test."""
        return FieldExtractor()

    def test_with_separator(self, extractor):
        """Test customer number with separator: JTY 576-3."""
        result, is_valid, error = extractor._normalize_customer_number(
            "Kundnr: JTY 576-3"
        )
        assert result is not None

    def test_compact_format(self, extractor):
        """Test compact customer number: JTY5763."""
        result, is_valid, error = extractor._normalize_customer_number("JTY5763")
        assert result is not None

    def test_format_without_dash(self, extractor):
        """Test customer number format without dash: Dwq 211X -> DWQ 211-X."""
        text = "Dwq 211X Billo SE 106 43 Stockholm"
        result, is_valid, error = extractor._normalize_customer_number(text)

        assert result is not None
        assert is_valid is True
        assert result == "DWQ 211-X"

    def test_swedish_postal_code_exclusion(self, extractor):
        """Test that Swedish postal codes are excluded: SE 106 43 should not be extracted."""
        text = "SE 106 43 Stockholm"
        result, is_valid, error = extractor._normalize_customer_number(text)

        # Should not extract postal code
        assert result is None or "SE 106" not in result

    def test_customer_number_with_postal_code_in_text(self, extractor):
        """Test extracting customer number when postal code is also present."""
        text = "Customer: ABC 123X, Address: SE 106 43 Stockholm"
        result, is_valid, error = extractor._normalize_customer_number(text)

        assert result is not None
        assert "ABC" in result
        # Should not extract postal code. `result` is already known to be a
        # non-empty string here, so no None/empty guard is needed.
        assert "SE 106" not in result


class TestNormalizeSupplierOrgNumber:
    """Tests for supplier organization number normalization."""

    @pytest.fixture
    def normalizer(self):
        """Provide a fresh SupplierOrgNumberNormalizer for each test."""
        return SupplierOrgNumberNormalizer()

    def test_standard_format(self, normalizer):
        """Test standard format NNNNNN-NNNN."""
        result = normalizer.normalize("Org.nr 516406-1102")
        assert result.value == '516406-1102'
        assert result.is_valid is True

    def test_vat_number_format(self, normalizer):
        """Test VAT number format SE + 10 digits + 01."""
        result = normalizer.normalize("Momsreg.nr SE556123456701")
        assert result.value is not None
        assert '-' in result.value


class TestNormalizeAndValidateDispatch:
    """Tests for the _normalize_and_validate dispatch method."""

    @pytest.fixture
    def extractor(self):
        """Provide a default-configured FieldExtractor for each test."""
        return FieldExtractor()

    def test_dispatch_invoice_number(self, extractor):
        """Test dispatch to invoice number normalizer."""
        result, is_valid, error = extractor._normalize_and_validate(
            'InvoiceNumber', 'A3861'
        )
        assert result is not None

    def test_dispatch_amount(self, extractor):
        """Test dispatch to amount normalizer."""
        result, is_valid, error = extractor._normalize_and_validate(
            'Amount', '11699,00'
        )
        assert result is not None

    def test_dispatch_bankgiro(self, extractor):
        """Test dispatch to Bankgiro normalizer."""
        result, is_valid, error = extractor._normalize_and_validate(
            'Bankgiro', '782-1713'
        )
        assert result is not None

    def test_dispatch_ocr(self, extractor):
        """Test dispatch to OCR normalizer."""
        result, is_valid, error = extractor._normalize_and_validate(
            'OCR', '310196187399952'
        )
        assert result is not None

    def test_dispatch_date(self, extractor):
        """Test dispatch to date normalizer."""
        result, is_valid, error = extractor._normalize_and_validate(
            'InvoiceDate', '2026-01-31'
        )
        assert result is not None


if __name__ == '__main__':
    pytest.main([__file__, '-v'])