"""
Tests for the Field Normalization Module.

Tests cover all normalizer functions in src/normalize/normalizer.py

Usage:
    pytest src/normalize/test_normalizer.py -v
"""

import pytest

from src.normalize.normalizer import (
    FieldNormalizer,
    NormalizedValue,
    normalize_field,
    NORMALIZERS,
)


class TestCleanText:
    """Tests for FieldNormalizer.clean_text()"""

    def test_removes_zero_width_characters(self):
        """Should remove zero-width characters."""
        text = "hello\u200bworld\u200c\u200d\ufeff"
        assert FieldNormalizer.clean_text(text) == "helloworld"

    def test_normalizes_dashes(self):
        """Should normalize different dash types to standard hyphen."""
        # en-dash
        assert FieldNormalizer.clean_text("123\u2013456") == "123-456"
        # em-dash
        assert FieldNormalizer.clean_text("123\u2014456") == "123-456"
        # minus sign
        assert FieldNormalizer.clean_text("123\u2212456") == "123-456"
        # middle dot
        assert FieldNormalizer.clean_text("123\u00b7456") == "123-456"

    def test_normalizes_whitespace(self):
        """Should normalize multiple spaces to single space."""
        assert FieldNormalizer.clean_text("hello world") == "hello world"
        assert FieldNormalizer.clean_text(" hello world ") == "hello world"
        # Inputs with real runs of spaces, so the collapsing behaviour named
        # in the docstring is actually exercised.
        assert FieldNormalizer.clean_text("hello    world") == "hello world"
        assert FieldNormalizer.clean_text("  hello   world  ") == "hello world"

    def test_strips_leading_trailing_whitespace(self):
        """Should strip leading and trailing whitespace."""
        assert FieldNormalizer.clean_text(" hello ") == "hello"


class TestNormalizeInvoiceNumber:
    """Tests for FieldNormalizer.normalize_invoice_number()"""

    def test_pure_digits(self):
        """Should keep pure digit invoice numbers."""
        variants = FieldNormalizer.normalize_invoice_number("100017500321")
        assert "100017500321" in variants

    def test_with_prefix(self):
        """Should extract digits and keep original."""
        variants = FieldNormalizer.normalize_invoice_number("INV-100017500321")
        assert "INV-100017500321" in variants
        assert "100017500321" in variants

    def test_alphanumeric(self):
        """Should handle alphanumeric invoice numbers."""
        variants = FieldNormalizer.normalize_invoice_number("ABC123DEF456")
        assert "ABC123DEF456" in variants
        assert "123456" in variants

    def test_empty_string(self):
        """Should handle empty string gracefully."""
        variants = FieldNormalizer.normalize_invoice_number("")
        assert variants == []


class TestNormalizeOcrNumber:
    """Tests for FieldNormalizer.normalize_ocr_number()"""

    def test_delegates_to_invoice_number(self):
        """OCR normalization should behave like invoice number normalization."""
        value = "123456789"
        ocr_variants = FieldNormalizer.normalize_ocr_number(value)
        invoice_variants = FieldNormalizer.normalize_invoice_number(value)
        assert set(ocr_variants) == set(invoice_variants)


class TestNormalizeBankgiro:
    """Tests for FieldNormalizer.normalize_bankgiro()"""

    def test_with_dash_8_digits(self):
        """Should normalize 8-digit bankgiro with dash."""
        variants = FieldNormalizer.normalize_bankgiro("5393-9484")
        assert "5393-9484" in variants
        assert "53939484" in variants

    def test_without_dash_8_digits(self):
        """Should add dash format for 8-digit bankgiro."""
        variants = FieldNormalizer.normalize_bankgiro("53939484")
        assert "53939484" in variants
        assert "5393-9484" in variants

    def test_7_digits(self):
        """Should handle 7-digit bankgiro (XXX-XXXX format)."""
        variants = FieldNormalizer.normalize_bankgiro("1234567")
        assert "1234567" in variants
        assert "123-4567" in variants

    def test_with_dash_7_digits(self):
        """Should normalize 7-digit bankgiro with dash."""
        variants = FieldNormalizer.normalize_bankgiro("123-4567")
        assert "123-4567" in variants
        assert "1234567" in variants


class TestNormalizePlusgiro:
    """Tests for FieldNormalizer.normalize_plusgiro()"""

    def test_with_dash_8_digits(self):
        """Should normalize 8-digit plusgiro (XXXXXXX-X format)."""
        variants = FieldNormalizer.normalize_plusgiro("1234567-8")
        assert "1234567-8" in variants
        assert "12345678" in variants

    def test_without_dash_8_digits(self):
        """Should add dash format for 8-digit plusgiro."""
        variants = FieldNormalizer.normalize_plusgiro("12345678")
        assert "12345678" in variants
        assert "1234567-8" in variants

    def test_7_digits(self):
        """Should handle 7-digit plusgiro (XXXXXX-X format)."""
        variants = FieldNormalizer.normalize_plusgiro("1234567")
        assert "1234567" in variants
        assert "123456-7" in variants


class TestNormalizeOrganisationNumber:
    """Tests for FieldNormalizer.normalize_organisation_number()"""

    def test_with_dash(self):
        """Should normalize org number with dash."""
        variants = FieldNormalizer.normalize_organisation_number("556123-4567")
        assert "556123-4567" in variants
        assert "5561234567" in variants
        assert "SE556123456701" in variants

    def test_without_dash(self):
        """Should add dash format for org number."""
        variants = FieldNormalizer.normalize_organisation_number("5561234567")
        assert "5561234567" in variants
        assert "556123-4567" in variants
        assert "SE556123456701" in variants

    def test_from_vat_number(self):
        """Should extract org number from Swedish VAT number."""
        variants = FieldNormalizer.normalize_organisation_number("SE556123456701")
        assert "SE556123456701" in variants
        assert "5561234567" in variants
        assert "556123-4567" in variants

    def test_vat_variants(self):
        """Should generate various VAT number formats."""
        variants = FieldNormalizer.normalize_organisation_number("5561234567")
        assert "SE556123456701" in variants
        assert "se556123456701" in variants
        assert "SE 5561234567 01" in variants
        assert "SE5561234567" in variants

    def test_12_digit_with_century(self):
        """Should handle 12-digit org number with century prefix."""
        variants = FieldNormalizer.normalize_organisation_number("195561234567")
        assert "195561234567" in variants
        assert "5561234567" in variants
        assert "556123-4567" in variants


class TestNormalizeSupplierAccounts:
    """Tests for FieldNormalizer.normalize_supplier_accounts()"""

    def test_single_plusgiro(self):
        """Should normalize single plusgiro account."""
        variants = FieldNormalizer.normalize_supplier_accounts("PG:48676043")
        assert "PG:48676043" in variants
        assert "48676043" in variants
        assert "4867604-3" in variants

    def test_single_bankgiro(self):
        """Should normalize single bankgiro account."""
        variants = FieldNormalizer.normalize_supplier_accounts("BG:5393-9484")
        assert "BG:5393-9484" in variants
        assert "5393-9484" in variants
        assert "53939484" in variants

    def test_multiple_accounts(self):
        """Should handle multiple accounts separated by |."""
        variants = FieldNormalizer.normalize_supplier_accounts(
            "PG:48676043 | PG:49128028"
        )
        assert "PG:48676043" in variants
        assert "48676043" in variants
        assert "PG:49128028" in variants
        assert "49128028" in variants

    def test_prefix_normalization(self):
        """Should normalize prefix formats."""
        variants = FieldNormalizer.normalize_supplier_accounts("pg:12345678")
        assert "PG:12345678" in variants
        assert "PG: 12345678" in variants


class TestNormalizeCustomerNumber:
    """Tests for FieldNormalizer.normalize_customer_number()"""

    def test_alphanumeric_with_space_and_dash(self):
        """Should normalize customer number with space and dash."""
        variants = FieldNormalizer.normalize_customer_number("EMM 256-6")
        assert "EMM 256-6" in variants
        assert "EMM256-6" in variants
        assert "EMM2566" in variants

    def test_alphanumeric_with_space(self):
        """Should normalize customer number with space."""
        variants = FieldNormalizer.normalize_customer_number("ABC 123")
        assert "ABC 123" in variants
        assert "ABC123" in variants

    def test_case_variants(self):
        """Should generate uppercase and lowercase variants."""
        variants = FieldNormalizer.normalize_customer_number("Abc123")
        assert "Abc123" in variants
        assert "ABC123" in variants
        assert "abc123" in variants


class TestNormalizeAmount:
    """Tests for FieldNormalizer.normalize_amount()"""

    def test_integer_amount(self):
        """Should normalize integer amount."""
        variants = FieldNormalizer.normalize_amount("114")
        assert "114" in variants
        assert "114,00" in variants
        assert "114.00" in variants

    def test_with_comma_decimal(self):
        """Should normalize amount with comma as decimal separator."""
        variants = FieldNormalizer.normalize_amount("114,00")
        assert "114,00" in variants
        assert "114.00" in variants

    def test_with_dot_decimal(self):
        """Should normalize amount with dot as decimal separator."""
        variants = FieldNormalizer.normalize_amount("114.00")
        assert "114.00" in variants
        assert "114,00" in variants

    def test_with_space_thousand_separator(self):
        """Should handle space as thousand separator."""
        variants = FieldNormalizer.normalize_amount("1 234,56")
        assert "1234,56" in variants
        assert "1234.56" in variants

    def test_space_as_decimal_separator(self):
        """Should handle space as decimal separator (Swedish format)."""
        variants = FieldNormalizer.normalize_amount("3045 52")
        assert "3045.52" in variants
        assert "3045,52" in variants
        assert "304552" in variants

    def test_us_format(self):
        """Should handle US format (comma thousand, dot decimal)."""
        variants = FieldNormalizer.normalize_amount("1,390.00")
        assert "1390.00" in variants
        assert "1390,00" in variants
        assert "1.390,00" in variants  # European conversion

    def test_european_format(self):
        """Should handle European format (dot thousand, comma decimal)."""
        variants = FieldNormalizer.normalize_amount("1.390,00")
        assert "1390.00" in variants
        assert "1390,00" in variants
        assert "1,390.00" in variants  # US conversion

    def test_space_thousand_with_decimal(self):
        """Should handle space thousand separator with decimal."""
        variants = FieldNormalizer.normalize_amount("10 571,00")
        assert "10571,00" in variants
        assert "10571.00" in variants

    def test_removes_currency_symbols(self):
        """Should remove currency symbols."""
        variants = FieldNormalizer.normalize_amount("114 SEK")
        assert "114" in variants

    def test_large_amount_european_format(self):
        """Should generate European format for large amounts."""
        variants = FieldNormalizer.normalize_amount("20485")
        assert "20485" in variants
        assert "20.485" in variants
        assert "20.485,00" in variants


class TestNormalizeDate:
    """Tests for FieldNormalizer.normalize_date()"""

    def test_iso_format(self):
        """Should parse and generate variants from ISO format."""
        variants = FieldNormalizer.normalize_date("2025-12-13")
        assert "2025-12-13" in variants
        assert "13/12/2025" in variants
        assert "13.12.2025" in variants
        assert "20251213" in variants

    def test_european_slash_format(self):
        """Should parse European slash format DD/MM/YYYY."""
        variants = FieldNormalizer.normalize_date("13/12/2025")
        assert "2025-12-13" in variants
        assert "13/12/2025" in variants

    def test_european_dot_format(self):
        """Should parse European dot format DD.MM.YYYY."""
        variants = FieldNormalizer.normalize_date("13.12.2025")
        assert "2025-12-13" in variants
        assert "13.12.2025" in variants

    def test_compact_format_yyyymmdd(self):
        """Should parse compact format YYYYMMDD."""
        variants = FieldNormalizer.normalize_date("20251213")
        assert "2025-12-13" in variants
        assert "20251213" in variants

    def test_compact_format_yymmdd(self):
        """Should parse compact format YYMMDD."""
        variants = FieldNormalizer.normalize_date("251213")
        assert "2025-12-13" in variants
        assert "251213" in variants

    def test_short_year_dot_format(self):
        """Should parse DD.MM.YY format."""
        variants = FieldNormalizer.normalize_date("02.08.25")
        assert "2025-08-02" in variants
        assert "02.08.25" in variants

    def test_swedish_month_name(self):
        """Should parse Swedish month names."""
        variants = FieldNormalizer.normalize_date("13 december 2025")
        assert "2025-12-13" in variants

    def test_swedish_month_abbreviation(self):
        """Should parse Swedish month abbreviations."""
        variants = FieldNormalizer.normalize_date("13 dec 2025")
        assert "2025-12-13" in variants

    def test_generates_swedish_month_variants(self):
        """Should generate Swedish month name variants."""
        variants = FieldNormalizer.normalize_date("2025-01-09")
        assert "9 januari 2025" in variants
        assert "9 jan 2025" in variants

    def test_generates_hyphen_month_abbrev_format(self):
        """Should generate DD-MON-YY format."""
        variants = FieldNormalizer.normalize_date("2024-10-30")
        assert "30-OKT-24" in variants
        assert "30-okt-24" in variants

    def test_iso_with_time(self):
        """Should handle ISO format with time component."""
        variants = FieldNormalizer.normalize_date("2026-01-09 00:00:00")
        assert "2026-01-09" in variants
        assert "09/01/2026" in variants

    def test_ambiguous_date_generates_both(self):
        """Should generate both interpretations for ambiguous dates."""
        # 01/02/2025 could be Jan 2 (US) or Feb 1 (EU)
        variants = FieldNormalizer.normalize_date("01/02/2025")
        # Both interpretations should be present
        assert "2025-02-01" in variants  # European: DD/MM/YYYY
        assert "2025-01-02" in variants  # US: MM/DD/YYYY

    def test_middle_dot_separator(self):
        """Should generate middle dot separator variant."""
        variants = FieldNormalizer.normalize_date("2025-12-13")
        assert "2025·12·13" in variants

    def test_spaced_format(self):
        """Should generate spaced format variants."""
        variants = FieldNormalizer.normalize_date("2025-12-13")
        assert "2025 12 13" in variants
        assert "25 12 13" in variants


class TestNormalizeField:
    """Tests for the normalize_field() function."""

    def test_uses_correct_normalizer(self):
        """Should use the correct normalizer for each field type."""
        # Test InvoiceNumber
        result = normalize_field("InvoiceNumber", "INV-123")
        assert "123" in result
        assert "INV-123" in result

        # Test Amount
        result = normalize_field("Amount", "100")
        assert "100" in result
        assert "100,00" in result

        # Test Date
        result = normalize_field("InvoiceDate", "2025-01-01")
        assert "2025-01-01" in result
        assert "01/01/2025" in result

    def test_unknown_field_cleans_text(self):
        """Should clean text for unknown field types."""
        result = normalize_field("UnknownField", " hello world ")
        assert result == ["hello world"]

    def test_none_value(self):
        """Should return empty list for None value."""
        result = normalize_field("InvoiceNumber", None)
        assert result == []

    def test_empty_string(self):
        """Should return empty list for empty string."""
        result = normalize_field("InvoiceNumber", "")
        assert result == []

    def test_whitespace_only(self):
        """Should return empty list for whitespace-only string."""
        result = normalize_field("InvoiceNumber", " ")
        assert result == []

    def test_converts_non_string_to_string(self):
        """Should convert non-string values to string."""
        result = normalize_field("Amount", 100)
        assert "100" in result


class TestNormalizersMapping:
    """Tests for the NORMALIZERS mapping."""

    def test_all_expected_fields_mapped(self):
        """Should have normalizers for all expected field types."""
        expected_fields = [
            "InvoiceNumber",
            "OCR",
            "Bankgiro",
            "Plusgiro",
            "Amount",
            "InvoiceDate",
            "InvoiceDueDate",
            "supplier_organisation_number",
            "supplier_accounts",
            "customer_number",
        ]
        # Collect every gap so a single run reports all missing fields
        # instead of stopping at the first one.
        missing = [name for name in expected_fields if name not in NORMALIZERS]
        assert not missing, f"Missing normalizer for {', '.join(missing)}"

    def test_normalizers_are_callable(self):
        """All normalizers should be callable."""
        not_callable = [
            name
            for name, normalizer in NORMALIZERS.items()
            if not callable(normalizer)
        ]
        assert not not_callable, (
            f"Normalizer {', '.join(map(str, not_callable))} is not callable"
        )


class TestNormalizedValueDataclass:
    """Tests for the NormalizedValue dataclass."""

    def test_creation(self):
        """Should create NormalizedValue with all fields."""
        nv = NormalizedValue(
            original="100",
            variants=["100", "100.00", "100,00"],
            field_type="Amount",
        )
        assert nv.original == "100"
        assert nv.variants == ["100", "100.00", "100,00"]
        assert nv.field_type == "Amount"


class TestEdgeCases:
    """Tests for edge cases and special scenarios."""

    def test_unicode_normalization(self):
        """Should handle unicode characters properly."""
        # Non-breaking space
        variants = FieldNormalizer.normalize_amount("1\xa0234,56")
        assert "1234,56" in variants

    def test_special_dashes_in_bankgiro(self):
        """Should handle special dash characters in bankgiro."""
        # en-dash
        variants = FieldNormalizer.normalize_bankgiro("5393\u20139484")
        assert "53939484" in variants
        assert "5393-9484" in variants

    def test_very_long_invoice_number(self):
        """Should handle very long invoice numbers."""
        long_number = "1" * 50
        variants = FieldNormalizer.normalize_invoice_number(long_number)
        assert long_number in variants

    def test_mixed_case_vat_prefix(self):
        """Should handle mixed case VAT prefix."""
        variants = FieldNormalizer.normalize_organisation_number("Se556123456701")
        assert "5561234567" in variants
        assert "SE556123456701" in variants

    def test_date_with_leading_zeros(self):
        """Should handle dates with leading zeros."""
        variants = FieldNormalizer.normalize_date("01.01.2025")
        assert "2025-01-01" in variants

    def test_amount_with_kr_suffix(self):
        """Should handle amount with kr suffix."""
        variants = FieldNormalizer.normalize_amount("100 kr")
        assert "100" in variants

    def test_amount_with_colon_dash(self):
        """Should handle amount with :- suffix."""
        variants = FieldNormalizer.normalize_amount("100:-")
        assert "100" in variants


class TestOrganisationNumberEdgeCases:
    """Additional edge case tests for organisation number normalization."""

    def test_vat_with_10_digits_after_se(self):
        """Should handle VAT format SE + 10 digits (without trailing 01)."""
        # Covers the len(potential_org) == 10 case
        # (normalizer.py lines 158-159 when this test was written).
        variants = FieldNormalizer.normalize_organisation_number("SE5561234567")
        assert "5561234567" in variants
        assert "556123-4567" in variants

    def test_vat_with_spaces(self):
        """Should handle VAT with spaces."""
        variants = FieldNormalizer.normalize_organisation_number("SE 5561234567 01")
        assert "5561234567" in variants

    def test_short_vat_prefix(self):
        """Should handle SE prefix with less than 12 chars total."""
        # This tests the fallback to digit extraction
        variants = FieldNormalizer.normalize_organisation_number("SE12345")
        assert "12345" in variants


class TestSupplierAccountsEdgeCases:
    """Additional edge case tests for supplier accounts normalization."""

    def test_empty_account_in_list(self):
        """Should skip empty accounts in list."""
        # Covers the "empty account -> continue" path
        # (normalizer.py line 224 when this test was written).
        variants = FieldNormalizer.normalize_supplier_accounts(
            "PG:12345678 | | BG:53939484"
        )
        assert "12345678" in variants
        assert "53939484" in variants

    def test_account_without_prefix(self):
        """Should handle account number without prefix."""
        # Covers the "number = account" (no colon) path
        # (normalizer.py line 240 when this test was written).
        variants = FieldNormalizer.normalize_supplier_accounts("12345678")
        assert "12345678" in variants
        assert "1234567-8" in variants

    def test_7_digit_account(self):
        """Should handle 7-digit account number."""
        # Covers the 7-digit format path
        # (normalizer.py lines 254-256 when this test was written).
        variants = FieldNormalizer.normalize_supplier_accounts("1234567")
        assert "1234567" in variants
        assert "123456-7" in variants

    def test_10_digit_account(self):
        """Should handle 10-digit account number (org number format)."""
        # Covers the 10-digit format path
        # (normalizer.py lines 257-259 when this test was written).
        variants = FieldNormalizer.normalize_supplier_accounts("5561234567")
        assert "5561234567" in variants
        assert "556123-4567" in variants

    def test_mixed_format_accounts(self):
        """Should handle multiple accounts with different formats."""
        variants = FieldNormalizer.normalize_supplier_accounts("PG:1234567 | 53939484")
        assert "1234567" in variants
        assert "53939484" in variants


class TestDateEdgeCases:
    """Additional edge case tests for date normalization."""

    def test_invalid_iso_date(self):
        """Should handle invalid ISO date gracefully."""
        # Covers the ValueError path in date parsing
        # (normalizer.py lines 483-484 when this test was written).
        variants = FieldNormalizer.normalize_date("2025-13-45")  # Invalid month/day
        # Should still return original value
        assert "2025-13-45" in variants

    def test_invalid_european_date(self):
        """Should handle invalid European date gracefully."""
        # Covers the ValueError path in ambiguous date parsing
        # (normalizer.py lines 496-497 when this test was written).
        variants = FieldNormalizer.normalize_date("32/13/2025")  # Invalid day/month
        assert "32/13/2025" in variants

    def test_invalid_2digit_year_date(self):
        """Should handle invalid 2-digit year date gracefully."""
        # Covers the ValueError paths in 2-digit year parsing
        # (normalizer.py lines 521-522 and 528-529 when this test was written).
        variants = FieldNormalizer.normalize_date("99.99.25")  # Invalid day/month
        assert "99.99.25" in variants

    def test_swedish_month_with_short_year(self):
        """Should handle Swedish month with 2-digit year."""
        # Covers the short year conversion
        # (normalizer.py line 544 when this test was written).
        variants = FieldNormalizer.normalize_date("15 jan 25")
        assert "2025-01-15" in variants

    def test_swedish_month_with_old_year(self):
        """Should handle Swedish month with old 2-digit year (50-99 -> 1900s)."""
        variants = FieldNormalizer.normalize_date("15 jan 99")
        assert "1999-01-15" in variants

    def test_swedish_month_invalid_date(self):
        """Should handle Swedish month with invalid day gracefully."""
        # Covers the "ValueError -> continue" path
        # (normalizer.py lines 548-549 when this test was written).
        variants = FieldNormalizer.normalize_date("32 januari 2025")  # Invalid day
        # Should still return original
        assert "32 januari 2025" in variants

    def test_ambiguous_date_both_invalid(self):
        """Should handle ambiguous date where one interpretation is invalid."""
        # 15/06/2025 only parses as DD/MM/YYYY: the US reading would need
        # month=15, which is invalid and is expected to be skipped. The
        # European reading must still be produced.
        variants = FieldNormalizer.normalize_date("15/06/2025")
        assert "2025-06-15" in variants  # European interpretation

    def test_date_slash_format_2digit_year(self):
        """Should parse DD/MM/YY format."""
        variants = FieldNormalizer.normalize_date("15/06/25")
        assert "2025-06-15" in variants

    def test_date_dash_format_2digit_year(self):
        """Should parse DD-MM-YY format."""
        variants = FieldNormalizer.normalize_date("15-06-25")
        assert "2025-06-15" in variants


if __name__ == "__main__":
    # Propagate pytest's exit code so running this file directly fails
    # (non-zero exit status) when any test fails.
    raise SystemExit(pytest.main([__file__, "-v"]))