642 lines
24 KiB
Python
642 lines
24 KiB
Python
"""
|
|
Tests for the Field Normalization Module.
|
|
|
|
Tests cover all normalizer functions in src/normalize/normalizer.py
|
|
|
|
Usage:
|
|
pytest src/normalize/test_normalizer.py -v
|
|
"""
|
|
|
|
import pytest
|
|
from src.normalize.normalizer import (
|
|
FieldNormalizer,
|
|
NormalizedValue,
|
|
normalize_field,
|
|
NORMALIZERS,
|
|
)
|
|
|
|
|
|
class TestCleanText:
    """Tests for FieldNormalizer.clean_text()."""

    def test_removes_zero_width_characters(self):
        """Zero-width characters are expected to be dropped from the text."""
        # U+200B, U+200C, U+200D and U+FEFF, in that order.
        text = "hello\u200bworld\u200c\u200d\ufeff"
        assert FieldNormalizer.clean_text(text) == "helloworld"

    def test_normalizes_dashes(self):
        """Dash-like characters are expected to become a plain hyphen."""
        dash_like = (
            "\u2013",  # en-dash
            "\u2014",  # em-dash
            "\u2212",  # minus sign
            "\u00b7",  # middle dot
        )
        for separator in dash_like:
            assert FieldNormalizer.clean_text(f"123{separator}456") == "123-456"

    def test_normalizes_whitespace(self):
        """Runs of several spaces are expected to collapse to a single space."""
        # The inputs must contain runs of more than one space; with single
        # spaces only, this test would pass without any collapsing at all.
        assert FieldNormalizer.clean_text("hello    world") == "hello world"
        assert FieldNormalizer.clean_text("  hello    world  ") == "hello world"

    def test_strips_leading_trailing_whitespace(self):
        """Leading and trailing whitespace is expected to be stripped."""
        assert FieldNormalizer.clean_text(" hello ") == "hello"
|
|
|
|
|
|
class TestNormalizeInvoiceNumber:
    """Exercise FieldNormalizer.normalize_invoice_number()."""

    def test_pure_digits(self):
        """A digits-only invoice number must appear among its own variants."""
        result = FieldNormalizer.normalize_invoice_number("100017500321")
        assert "100017500321" in result

    def test_with_prefix(self):
        """A prefixed number must yield the original and the digits-only form."""
        result = FieldNormalizer.normalize_invoice_number("INV-100017500321")
        for expected in ("INV-100017500321", "100017500321"):
            assert expected in result

    def test_alphanumeric(self):
        """Mixed letters/digits must yield the original and the digits alone."""
        result = FieldNormalizer.normalize_invoice_number("ABC123DEF456")
        for expected in ("ABC123DEF456", "123456"):
            assert expected in result

    def test_empty_string(self):
        """An empty input must produce an empty variant list."""
        assert FieldNormalizer.normalize_invoice_number("") == []
|
|
|
|
|
|
class TestNormalizeOcrNumber:
    """Exercise FieldNormalizer.normalize_ocr_number()."""

    def test_delegates_to_invoice_number(self):
        """OCR and invoice-number normalization must give the same variant set."""
        raw = "123456789"
        from_ocr = set(FieldNormalizer.normalize_ocr_number(raw))
        from_invoice = set(FieldNormalizer.normalize_invoice_number(raw))
        # Compared as sets: ordering and duplicates are not part of the check.
        assert from_ocr == from_invoice
|
|
|
|
|
|
class TestNormalizeBankgiro:
    """Exercise FieldNormalizer.normalize_bankgiro()."""

    def test_with_dash_8_digits(self):
        """Dashed 8-digit input must yield dashed and undashed forms."""
        result = FieldNormalizer.normalize_bankgiro("5393-9484")
        for expected in ("5393-9484", "53939484"):
            assert expected in result

    def test_without_dash_8_digits(self):
        """Undashed 8-digit input must also yield the XXXX-XXXX form."""
        result = FieldNormalizer.normalize_bankgiro("53939484")
        for expected in ("53939484", "5393-9484"):
            assert expected in result

    def test_7_digits(self):
        """Undashed 7-digit input must also yield the XXX-XXXX form."""
        result = FieldNormalizer.normalize_bankgiro("1234567")
        for expected in ("1234567", "123-4567"):
            assert expected in result

    def test_with_dash_7_digits(self):
        """Dashed 7-digit input must yield dashed and undashed forms."""
        result = FieldNormalizer.normalize_bankgiro("123-4567")
        for expected in ("123-4567", "1234567"):
            assert expected in result
|
|
|
|
|
|
class TestNormalizePlusgiro:
    """Exercise FieldNormalizer.normalize_plusgiro()."""

    def test_with_dash_8_digits(self):
        """Dashed 8-digit input (XXXXXXX-X) must yield both forms."""
        result = FieldNormalizer.normalize_plusgiro("1234567-8")
        for expected in ("1234567-8", "12345678"):
            assert expected in result

    def test_without_dash_8_digits(self):
        """Undashed 8-digit input must also yield the XXXXXXX-X form."""
        result = FieldNormalizer.normalize_plusgiro("12345678")
        for expected in ("12345678", "1234567-8"):
            assert expected in result

    def test_7_digits(self):
        """Undashed 7-digit input must also yield the XXXXXX-X form."""
        result = FieldNormalizer.normalize_plusgiro("1234567")
        for expected in ("1234567", "123456-7"):
            assert expected in result
|
|
|
|
|
|
class TestNormalizeOrganisationNumber:
    """Exercise FieldNormalizer.normalize_organisation_number()."""

    def test_with_dash(self):
        """Dashed org number must yield dashed, undashed and VAT forms."""
        result = FieldNormalizer.normalize_organisation_number("556123-4567")
        for expected in ("556123-4567", "5561234567", "SE556123456701"):
            assert expected in result

    def test_without_dash(self):
        """Undashed org number must yield undashed, dashed and VAT forms."""
        result = FieldNormalizer.normalize_organisation_number("5561234567")
        for expected in ("5561234567", "556123-4567", "SE556123456701"):
            assert expected in result

    def test_from_vat_number(self):
        """A Swedish VAT number must yield itself plus both org-number forms."""
        result = FieldNormalizer.normalize_organisation_number("SE556123456701")
        for expected in ("SE556123456701", "5561234567", "556123-4567"):
            assert expected in result

    def test_vat_variants(self):
        """A bare org number must yield several VAT spellings."""
        result = FieldNormalizer.normalize_organisation_number("5561234567")
        vat_spellings = (
            "SE556123456701",
            "se556123456701",
            "SE 5561234567 01",
            "SE5561234567",
        )
        for expected in vat_spellings:
            assert expected in result

    def test_12_digit_with_century(self):
        """A 12-digit number with century prefix must also yield 10-digit forms."""
        result = FieldNormalizer.normalize_organisation_number("195561234567")
        for expected in ("195561234567", "5561234567", "556123-4567"):
            assert expected in result
|
|
|
|
|
|
class TestNormalizeSupplierAccounts:
    """Exercise FieldNormalizer.normalize_supplier_accounts()."""

    def test_single_plusgiro(self):
        """A single PG-prefixed account must yield prefixed, bare and dashed forms."""
        result = FieldNormalizer.normalize_supplier_accounts("PG:48676043")
        for expected in ("PG:48676043", "48676043", "4867604-3"):
            assert expected in result

    def test_single_bankgiro(self):
        """A single BG-prefixed account must yield prefixed, dashed and bare forms."""
        result = FieldNormalizer.normalize_supplier_accounts("BG:5393-9484")
        for expected in ("BG:5393-9484", "5393-9484", "53939484"):
            assert expected in result

    def test_multiple_accounts(self):
        """Accounts separated by | must each contribute their variants."""
        raw = "PG:48676043 | PG:49128028"
        result = FieldNormalizer.normalize_supplier_accounts(raw)
        for expected in ("PG:48676043", "48676043", "PG:49128028", "49128028"):
            assert expected in result

    def test_prefix_normalization(self):
        """A lowercase prefix must yield uppercase-prefixed spellings."""
        result = FieldNormalizer.normalize_supplier_accounts("pg:12345678")
        for expected in ("PG:12345678", "PG: 12345678"):
            assert expected in result
|
|
|
|
|
|
class TestNormalizeCustomerNumber:
    """Exercise FieldNormalizer.normalize_customer_number()."""

    def test_alphanumeric_with_space_and_dash(self):
        """Space and dash must be progressively removable in the variants."""
        result = FieldNormalizer.normalize_customer_number("EMM 256-6")
        for expected in ("EMM 256-6", "EMM256-6", "EMM2566"):
            assert expected in result

    def test_alphanumeric_with_space(self):
        """A spaced customer number must also yield the space-free form."""
        result = FieldNormalizer.normalize_customer_number("ABC 123")
        for expected in ("ABC 123", "ABC123"):
            assert expected in result

    def test_case_variants(self):
        """Original, upper-case and lower-case spellings must all be present."""
        result = FieldNormalizer.normalize_customer_number("Abc123")
        for expected in ("Abc123", "ABC123", "abc123"):
            assert expected in result
|
|
|
|
|
|
class TestNormalizeAmount:
    """Exercise FieldNormalizer.normalize_amount()."""

    def test_integer_amount(self):
        """An integer amount must yield itself plus two-decimal forms."""
        result = FieldNormalizer.normalize_amount("114")
        for expected in ("114", "114,00", "114.00"):
            assert expected in result

    def test_with_comma_decimal(self):
        """A comma-decimal amount must also yield the dot-decimal form."""
        result = FieldNormalizer.normalize_amount("114,00")
        for expected in ("114,00", "114.00"):
            assert expected in result

    def test_with_dot_decimal(self):
        """A dot-decimal amount must also yield the comma-decimal form."""
        result = FieldNormalizer.normalize_amount("114.00")
        for expected in ("114.00", "114,00"):
            assert expected in result

    def test_with_space_thousand_separator(self):
        """A space used as thousand separator must be removable."""
        result = FieldNormalizer.normalize_amount("1 234,56")
        for expected in ("1234,56", "1234.56"):
            assert expected in result

    def test_space_as_decimal_separator(self):
        """A space may stand for the decimal separator (Swedish format)."""
        result = FieldNormalizer.normalize_amount("3045 52")
        for expected in ("3045.52", "3045,52", "304552"):
            assert expected in result

    def test_us_format(self):
        """US format (comma thousand, dot decimal) must be understood."""
        result = FieldNormalizer.normalize_amount("1,390.00")
        # The last entry is the European re-rendering of the same amount.
        for expected in ("1390.00", "1390,00", "1.390,00"):
            assert expected in result

    def test_european_format(self):
        """European format (dot thousand, comma decimal) must be understood."""
        result = FieldNormalizer.normalize_amount("1.390,00")
        # The last entry is the US re-rendering of the same amount.
        for expected in ("1390.00", "1390,00", "1,390.00"):
            assert expected in result

    def test_space_thousand_with_decimal(self):
        """Space thousand separator combined with a decimal part."""
        result = FieldNormalizer.normalize_amount("10 571,00")
        for expected in ("10571,00", "10571.00"):
            assert expected in result

    def test_removes_currency_symbols(self):
        """A trailing currency code must not prevent the bare amount variant."""
        assert "114" in FieldNormalizer.normalize_amount("114 SEK")

    def test_large_amount_european_format(self):
        """A larger integer must also yield dot-grouped European forms."""
        result = FieldNormalizer.normalize_amount("20485")
        for expected in ("20485", "20.485", "20.485,00"):
            assert expected in result
|
|
|
|
|
|
class TestNormalizeDate:
    """Exercise FieldNormalizer.normalize_date()."""

    def test_iso_format(self):
        """ISO input must yield ISO, slash, dot and compact renderings."""
        result = FieldNormalizer.normalize_date("2025-12-13")
        for expected in ("2025-12-13", "13/12/2025", "13.12.2025", "20251213"):
            assert expected in result

    def test_european_slash_format(self):
        """DD/MM/YYYY input must yield the ISO form and itself."""
        result = FieldNormalizer.normalize_date("13/12/2025")
        for expected in ("2025-12-13", "13/12/2025"):
            assert expected in result

    def test_european_dot_format(self):
        """DD.MM.YYYY input must yield the ISO form and itself."""
        result = FieldNormalizer.normalize_date("13.12.2025")
        for expected in ("2025-12-13", "13.12.2025"):
            assert expected in result

    def test_compact_format_yyyymmdd(self):
        """Compact YYYYMMDD input must yield the ISO form and itself."""
        result = FieldNormalizer.normalize_date("20251213")
        for expected in ("2025-12-13", "20251213"):
            assert expected in result

    def test_compact_format_yymmdd(self):
        """Compact YYMMDD input must yield the ISO form and itself."""
        result = FieldNormalizer.normalize_date("251213")
        for expected in ("2025-12-13", "251213"):
            assert expected in result

    def test_short_year_dot_format(self):
        """DD.MM.YY input must yield the ISO form and itself."""
        result = FieldNormalizer.normalize_date("02.08.25")
        for expected in ("2025-08-02", "02.08.25"):
            assert expected in result

    def test_swedish_month_name(self):
        """A full Swedish month name must be parsed to the ISO form."""
        assert "2025-12-13" in FieldNormalizer.normalize_date("13 december 2025")

    def test_swedish_month_abbreviation(self):
        """An abbreviated Swedish month name must be parsed to the ISO form."""
        assert "2025-12-13" in FieldNormalizer.normalize_date("13 dec 2025")

    def test_generates_swedish_month_variants(self):
        """ISO input must yield full and abbreviated Swedish month renderings."""
        result = FieldNormalizer.normalize_date("2025-01-09")
        for expected in ("9 januari 2025", "9 jan 2025"):
            assert expected in result

    def test_generates_hyphen_month_abbrev_format(self):
        """ISO input must yield DD-MON-YY in upper and lower case."""
        result = FieldNormalizer.normalize_date("2024-10-30")
        for expected in ("30-OKT-24", "30-okt-24"):
            assert expected in result

    def test_iso_with_time(self):
        """A trailing time component must not prevent date parsing."""
        result = FieldNormalizer.normalize_date("2026-01-09 00:00:00")
        for expected in ("2026-01-09", "09/01/2026"):
            assert expected in result

    def test_ambiguous_date_generates_both(self):
        """An ambiguous slash date must yield both day/month readings."""
        # 01/02/2025 reads as 1 Feb (DD/MM/YYYY) or 2 Jan (MM/DD/YYYY).
        result = FieldNormalizer.normalize_date("01/02/2025")
        european_reading = "2025-02-01"
        us_reading = "2025-01-02"
        assert european_reading in result
        assert us_reading in result

    def test_middle_dot_separator(self):
        """ISO input must yield a middle-dot separated rendering."""
        assert "2025·12·13" in FieldNormalizer.normalize_date("2025-12-13")

    def test_spaced_format(self):
        """ISO input must yield space-separated long- and short-year forms."""
        result = FieldNormalizer.normalize_date("2025-12-13")
        for expected in ("2025 12 13", "25 12 13"):
            assert expected in result
|
|
|
|
|
|
class TestNormalizeField:
    """Exercise the module-level normalize_field() dispatcher."""

    def test_uses_correct_normalizer(self):
        """Each known field name must be routed to its matching normalizer."""
        # (field name, raw value, variants that must be present)
        cases = (
            ("InvoiceNumber", "INV-123", ("123", "INV-123")),
            ("Amount", "100", ("100", "100,00")),
            ("InvoiceDate", "2025-01-01", ("2025-01-01", "01/01/2025")),
        )
        for field_name, raw, must_contain in cases:
            produced = normalize_field(field_name, raw)
            for expected in must_contain:
                assert expected in produced

    def test_unknown_field_cleans_text(self):
        """An unknown field name must fall back to a single cleaned value."""
        assert normalize_field("UnknownField", " hello world ") == ["hello world"]

    def test_none_value(self):
        """None must produce an empty list."""
        assert normalize_field("InvoiceNumber", None) == []

    def test_empty_string(self):
        """An empty string must produce an empty list."""
        assert normalize_field("InvoiceNumber", "") == []

    def test_whitespace_only(self):
        """A whitespace-only string must produce an empty list."""
        assert normalize_field("InvoiceNumber", " ") == []

    def test_converts_non_string_to_string(self):
        """A non-string value (here an int) must be accepted and normalized."""
        assert "100" in normalize_field("Amount", 100)
|
|
|
|
|
|
class TestNormalizersMapping:
    """Checks on the NORMALIZERS field-name -> normalizer mapping."""

    def test_all_expected_fields_mapped(self):
        """Every field name listed here must be a key of NORMALIZERS."""
        expected_fields = (
            "InvoiceNumber",
            "OCR",
            "Bankgiro",
            "Plusgiro",
            "Amount",
            "InvoiceDate",
            "InvoiceDueDate",
            "supplier_organisation_number",
            "supplier_accounts",
            "customer_number",
        )
        for field in expected_fields:
            is_mapped = field in NORMALIZERS
            assert is_mapped, f"Missing normalizer for {field}"

    def test_normalizers_are_callable(self):
        """Every value stored in NORMALIZERS must be callable."""
        for name, candidate in NORMALIZERS.items():
            assert callable(candidate), f"Normalizer {name} is not callable"
|
|
|
|
|
|
class TestNormalizedValueDataclass:
    """Checks on the NormalizedValue container."""

    def test_creation(self):
        """Keyword construction must expose the same values as attributes."""
        expected_variants = ["100", "100.00", "100,00"]
        value = NormalizedValue(
            original="100",
            variants=["100", "100.00", "100,00"],
            field_type="Amount",
        )
        assert value.original == "100"
        assert value.variants == expected_variants
        assert value.field_type == "Amount"
|
|
|
|
|
|
class TestEdgeCases:
    """Edge cases and special scenarios spanning several normalizers."""

    def test_unicode_normalization(self):
        """A non-breaking space used as thousand separator must be handled."""
        # \xa0 is the non-breaking space.
        result = FieldNormalizer.normalize_amount("1\xa0234,56")
        assert "1234,56" in result

    def test_special_dashes_in_bankgiro(self):
        """An en-dash in a bankgiro number must behave like a hyphen."""
        result = FieldNormalizer.normalize_bankgiro("5393\u20139484")
        for expected in ("53939484", "5393-9484"):
            assert expected in result

    def test_very_long_invoice_number(self):
        """A 50-digit invoice number must still appear among its variants."""
        fifty_ones = "1" * 50
        assert fifty_ones in FieldNormalizer.normalize_invoice_number(fifty_ones)

    def test_mixed_case_vat_prefix(self):
        """A mixed-case 'Se' VAT prefix must be recognised."""
        result = FieldNormalizer.normalize_organisation_number("Se556123456701")
        for expected in ("5561234567", "SE556123456701"):
            assert expected in result

    def test_date_with_leading_zeros(self):
        """Leading zeros in day and month must be handled."""
        assert "2025-01-01" in FieldNormalizer.normalize_date("01.01.2025")

    def test_amount_with_kr_suffix(self):
        """A trailing 'kr' must not prevent the bare amount variant."""
        assert "100" in FieldNormalizer.normalize_amount("100 kr")

    def test_amount_with_colon_dash(self):
        """A trailing ':-' must not prevent the bare amount variant."""
        assert "100" in FieldNormalizer.normalize_amount("100:-")
|
|
|
|
|
|
class TestOrganisationNumberEdgeCases:
    """Further edge cases for organisation number normalization."""

    def test_vat_with_10_digits_after_se(self):
        """'SE' followed by exactly 10 digits (no trailing 01) must be handled."""
        # Targets the normalizer branch where the part after 'SE' has length 10.
        result = FieldNormalizer.normalize_organisation_number("SE5561234567")
        for expected in ("5561234567", "556123-4567"):
            assert expected in result

    def test_vat_with_spaces(self):
        """A VAT number written with spaces must still yield the org number."""
        result = FieldNormalizer.normalize_organisation_number("SE 5561234567 01")
        assert "5561234567" in result

    def test_short_vat_prefix(self):
        """An 'SE' prefix on fewer than 12 characters must yield the digits."""
        # Targets the fallback that extracts whatever digits are present.
        result = FieldNormalizer.normalize_organisation_number("SE12345")
        assert "12345" in result
|
|
|
|
|
|
class TestSupplierAccountsEdgeCases:
    """Further edge cases for supplier accounts normalization."""

    def test_empty_account_in_list(self):
        """An empty entry between separators must be skipped."""
        # Targets the normalizer's skip-on-empty-account path.
        raw = "PG:12345678 | | BG:53939484"
        result = FieldNormalizer.normalize_supplier_accounts(raw)
        for expected in ("12345678", "53939484"):
            assert expected in result

    def test_account_without_prefix(self):
        """An account number without any 'XX:' prefix must be handled."""
        # Targets the path where the whole entry is treated as the number.
        result = FieldNormalizer.normalize_supplier_accounts("12345678")
        for expected in ("12345678", "1234567-8"):
            assert expected in result

    def test_7_digit_account(self):
        """A 7-digit account must also yield the XXXXXX-X form."""
        result = FieldNormalizer.normalize_supplier_accounts("1234567")
        for expected in ("1234567", "123456-7"):
            assert expected in result

    def test_10_digit_account(self):
        """A 10-digit account must also yield the org-number style dashed form."""
        result = FieldNormalizer.normalize_supplier_accounts("5561234567")
        for expected in ("5561234567", "556123-4567"):
            assert expected in result

    def test_mixed_format_accounts(self):
        """Prefixed and unprefixed accounts may be mixed in one value."""
        result = FieldNormalizer.normalize_supplier_accounts("PG:1234567 | 53939484")
        for expected in ("1234567", "53939484"):
            assert expected in result
|
|
|
|
|
|
class TestDateEdgeCases:
    """Further edge cases for date normalization."""

    def test_invalid_iso_date(self):
        """An ISO-shaped string with impossible month/day must not be lost."""
        # Month 13 / day 45 cannot be parsed; the raw value must still be there.
        raw = "2025-13-45"
        assert raw in FieldNormalizer.normalize_date(raw)

    def test_invalid_european_date(self):
        """A slash date with impossible day/month must not be lost."""
        raw = "32/13/2025"
        assert raw in FieldNormalizer.normalize_date(raw)

    def test_invalid_2digit_year_date(self):
        """A short-year dot date with impossible day/month must not be lost."""
        raw = "99.99.25"
        assert raw in FieldNormalizer.normalize_date(raw)

    def test_swedish_month_with_short_year(self):
        """A Swedish month with a 2-digit year must expand to a 20xx year."""
        assert "2025-01-15" in FieldNormalizer.normalize_date("15 jan 25")

    def test_swedish_month_with_old_year(self):
        """A 2-digit year of 99 must expand to 1999 (50-99 -> 1900s)."""
        assert "1999-01-15" in FieldNormalizer.normalize_date("15 jan 99")

    def test_swedish_month_invalid_date(self):
        """A Swedish month date with an impossible day must not be lost."""
        raw = "32 januari 2025"
        assert raw in FieldNormalizer.normalize_date(raw)

    def test_ambiguous_date_both_invalid(self):
        """A slash date with only one valid reading must yield that reading."""
        # 15/06/2025 is valid only as DD/MM/YYYY; read as MM/DD/YYYY the month
        # would be 15. Only the European reading is asserted here.
        result = FieldNormalizer.normalize_date("15/06/2025")
        assert "2025-06-15" in result

    def test_date_slash_format_2digit_year(self):
        """DD/MM/YY input must be parsed to the ISO form."""
        assert "2025-06-15" in FieldNormalizer.normalize_date("15/06/25")

    def test_date_dash_format_2digit_year(self):
        """DD-MM-YY input must be parsed to the ISO form."""
        assert "2025-06-15" in FieldNormalizer.normalize_date("15-06-25")
|
|
|
|
|
|
if __name__ == "__main__":
    # pytest.main() returns the run's exit code rather than exiting itself.
    # Propagate it so that running this file directly reports failures to the
    # calling shell / CI instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|