""" Tests for customer number parser. """ import pytest import sys from pathlib import Path # Add project root to path project_root = Path(__file__).parent.parent sys.path.insert(0, str(project_root)) from inference.pipeline.customer_number_parser import ( CustomerNumberParser, DashFormatPattern, NoDashFormatPattern, CompactFormatPattern, LabeledPattern, ) class TestDashFormatPattern: """Test DashFormatPattern (ABC 123-X).""" def test_standard_dash_format(self): """Test standard format with dash.""" pattern = DashFormatPattern() match = pattern.match("Customer: JTY 576-3") assert match is not None assert match.value == "JTY 576-3" assert match.confidence == 0.95 assert match.pattern_name == "DashFormat" def test_multiple_letter_prefix(self): """Test with different prefix lengths.""" pattern = DashFormatPattern() # 2 letters match = pattern.match("EM 25-6") assert match is not None assert match.value == "EM 25-6" # 3 letters match = pattern.match("EMM 256-6") assert match is not None assert match.value == "EMM 256-6" # 4 letters match = pattern.match("ABCD 123-X") assert match is not None assert match.value == "ABCD 123-X" def test_case_insensitive(self): """Test case insensitivity.""" pattern = DashFormatPattern() match = pattern.match("jty 576-3") assert match is not None assert match.value == "JTY 576-3" # Uppercased def test_exclude_postal_code(self): """Test that Swedish postal codes are excluded.""" pattern = DashFormatPattern() # Should NOT match SE postal codes match = pattern.match("SE 106 43-Stockholm") assert match is None class TestNoDashFormatPattern: """Test NoDashFormatPattern (ABC 123X without dash).""" def test_no_dash_format(self): """Test format without dash (adds dash in output).""" pattern = NoDashFormatPattern() match = pattern.match("Dwq 211X") assert match is not None assert match.value == "DWQ 211-X" # Dash added assert match.confidence == 0.90 def test_uppercase_letter_suffix(self): """Test with uppercase letter suffix.""" pattern = NoDashFormatPattern() match = pattern.match("FFL 019N") assert match is not None assert match.value == "FFL 019-N" def test_exclude_postal_code(self): """Test that postal codes are excluded.""" pattern = NoDashFormatPattern() # Should NOT match SE postal codes match = pattern.match("SE 106 43") assert match is None match = pattern.match("SE10643") assert match is None class TestCompactFormatPattern: """Test CompactFormatPattern (ABC123X compact format).""" def test_compact_format_with_suffix(self): """Test compact format with letter suffix.""" pattern = CompactFormatPattern() text = "JTY5763" match = pattern.match(text) assert match is not None # Should add dash if there's a suffix assert "JTY" in match.value def test_compact_format_without_suffix(self): """Test compact format without letter suffix.""" pattern = CompactFormatPattern() match = pattern.match("FFL019") assert match is not None assert "FFL" in match.value def test_exclude_se_prefix(self): """Test that SE prefix is excluded (postal codes).""" pattern = CompactFormatPattern() match = pattern.match("SE10643") assert match is None # Should be filtered out class TestLabeledPattern: """Test LabeledPattern (with explicit label).""" def test_swedish_label_kundnummer(self): """Test Swedish label 'Kundnummer'.""" pattern = LabeledPattern() match = pattern.match("Kundnummer: JTY 576-3") assert match is not None assert "JTY 576-3" in match.value assert match.confidence == 0.98 # Very high confidence def test_swedish_label_kundnr(self): """Test Swedish abbreviated label.""" pattern = LabeledPattern() match = pattern.match("Kundnr: EMM 256-6") assert match is not None assert "EMM 256-6" in match.value def test_english_label_customer_no(self): """Test English label.""" pattern = LabeledPattern() match = pattern.match("Customer No: ABC 123-X") assert match is not None assert "ABC 123-X" in match.value def test_label_without_colon(self): """Test label without colon.""" pattern = LabeledPattern() match = pattern.match("Kundnummer JTY 576-3") assert match is not None assert "JTY 576-3" in match.value class TestCustomerNumberParser: """Test CustomerNumberParser main class.""" @pytest.fixture def parser(self): """Create parser instance.""" return CustomerNumberParser() def test_parse_with_dash(self, parser): """Test parsing standard format with dash.""" result, is_valid, error = parser.parse("Customer: JTY 576-3") assert is_valid assert result == "JTY 576-3" assert error is None def test_parse_without_dash(self, parser): """Test parsing format without dash.""" result, is_valid, error = parser.parse("Dwq 211X Billo") assert is_valid assert result == "DWQ 211-X" # Dash added assert error is None def test_parse_with_label(self, parser): """Test parsing with explicit label (highest priority).""" text = "Kundnummer: JTY 576-3, also EMM 256-6" result, is_valid, error = parser.parse(text) assert is_valid # Should extract the labeled one assert "JTY 576-3" in result or "EMM 256-6" in result def test_parse_exclude_postal_code(self, parser): """Test that Swedish postal codes are excluded.""" text = "SE 106 43 Stockholm" result, is_valid, error = parser.parse(text) # Should not extract postal code as customer number if result: assert "SE 106" not in result def test_parse_empty_text(self, parser): """Test parsing empty text.""" result, is_valid, error = parser.parse("") assert not is_valid assert result is None assert error == "Empty text" def test_parse_no_match(self, parser): """Test parsing text with no customer number.""" text = "This invoice contains only descriptive text about the product details and pricing" result, is_valid, error = parser.parse(text) assert not is_valid assert result is None assert "No customer number found" in error def test_parse_all_finds_multiple(self, parser): """Test parse_all finds multiple customer numbers.""" text = "Customer codes: JTY 576-3, EMM 256-6, FFL 019N" matches = parser.parse_all(text) # Should find multiple matches assert len(matches) >= 1 # Should be sorted by confidence if len(matches) > 1: for i in range(len(matches) - 1): assert matches[i].confidence >= matches[i + 1].confidence class TestRealWorldExamples: """Test with real-world examples from the codebase.""" @pytest.fixture def parser(self): """Create parser instance.""" return CustomerNumberParser() def test_billo363_customer_number(self, parser): """Test Billo363 PDF customer number.""" # From issue report: "Dwq 211X Billo SE 106 43 Stockholm" text = "Dwq 211X Billo SE 106 43 Stockholm" result, is_valid, error = parser.parse(text) assert is_valid assert result == "DWQ 211-X" def test_customer_number_with_company_name(self, parser): """Test customer number mixed with company name.""" text = "Billo AB, JTY 576-3" result, is_valid, error = parser.parse(text) assert is_valid assert result == "JTY 576-3" def test_customer_number_after_address(self, parser): """Test customer number appearing after address.""" text = "Stockholm 106 43, Customer: EMM 256-6" result, is_valid, error = parser.parse(text) assert is_valid # Should extract customer number, not postal code assert "EMM 256-6" in result assert "106 43" not in result def test_multiple_formats_in_text(self, parser): """Test text with multiple potential formats.""" text = "FFL 019N and JTY 576-3 are customer codes" result, is_valid, error = parser.parse(text) assert is_valid # Should extract one of them (highest confidence) assert result in ["FFL 019-N", "JTY 576-3"] class TestEdgeCases: """Test edge cases and boundary conditions.""" @pytest.fixture def parser(self): """Create parser instance.""" return CustomerNumberParser() def test_short_prefix(self, parser): """Test with 2-letter prefix.""" text = "AB 12-X" result, is_valid, error = parser.parse(text) assert is_valid assert "AB" in result def test_long_prefix(self, parser): """Test with 4-letter prefix.""" text = "ABCD 1234-Z" result, is_valid, error = parser.parse(text) assert is_valid assert "ABCD" in result def test_single_digit_number(self, parser): """Test with single digit number.""" text = "ABC 1-X" result, is_valid, error = parser.parse(text) assert is_valid assert "ABC 1-X" == result def test_four_digit_number(self, parser): """Test with four digit number.""" text = "ABC 1234-X" result, is_valid, error = parser.parse(text) assert is_valid assert "ABC 1234-X" == result def test_whitespace_handling(self, parser): """Test handling of extra whitespace.""" text = " JTY 576-3 " result, is_valid, error = parser.parse(text) assert is_valid assert result == "JTY 576-3" def test_case_normalization(self, parser): """Test that output is normalized to uppercase.""" text = "jty 576-3" result, is_valid, error = parser.parse(text) assert is_valid assert result == "JTY 576-3" # Uppercased def test_none_input(self, parser): """Test with None input.""" result, is_valid, error = parser.parse(None) assert not is_valid assert result is None