349 lines
10 KiB
Python
349 lines
10 KiB
Python
"""
|
|
Tests for customer number parser.
|
|
"""
|
|
|
|
import pytest
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Add project root to path
|
|
project_root = Path(__file__).parent.parent
|
|
sys.path.insert(0, str(project_root))
|
|
|
|
from inference.pipeline.customer_number_parser import (
|
|
CustomerNumberParser,
|
|
DashFormatPattern,
|
|
NoDashFormatPattern,
|
|
CompactFormatPattern,
|
|
LabeledPattern,
|
|
)
|
|
|
|
|
|
class TestDashFormatPattern:
    """Exercise DashFormatPattern on inputs shaped like ``ABC 123-X``."""

    def test_standard_dash_format(self):
        """A dash-formatted number following other text is matched and described."""
        found = DashFormatPattern().match("Customer: JTY 576-3")

        assert found is not None
        assert found.value == "JTY 576-3"
        assert found.confidence == 0.95
        assert found.pattern_name == "DashFormat"

    def test_multiple_letter_prefix(self):
        """Prefixes of two, three and four letters are all matched verbatim."""
        matcher = DashFormatPattern()

        # One sample per prefix length (2, 3 and 4 letters); each input is
        # already in normalized form, so the value must equal the input.
        for sample in ("EM 25-6", "EMM 256-6", "ABCD 123-X"):
            found = matcher.match(sample)
            assert found is not None
            assert found.value == sample

    def test_case_insensitive(self):
        """Lower-case input is matched and reported in upper case."""
        found = DashFormatPattern().match("jty 576-3")

        assert found is not None
        assert found.value == "JTY 576-3"  # Uppercased

    def test_exclude_postal_code(self):
        """Text that looks like a Swedish postal code yields no match."""
        matcher = DashFormatPattern()

        # An "SE" postal code must NOT be reported as a customer number.
        rejected = matcher.match("SE 106 43-Stockholm")
        assert rejected is None
|
|
|
|
|
|
class TestNoDashFormatPattern:
    """Exercise NoDashFormatPattern on inputs shaped like ``ABC 123X`` (no dash)."""

    def test_no_dash_format(self):
        """A dash-less number is matched and reported with the dash inserted."""
        found = NoDashFormatPattern().match("Dwq 211X")

        assert found is not None
        assert found.value == "DWQ 211-X"  # Dash added
        assert found.confidence == 0.90

    def test_uppercase_letter_suffix(self):
        """An upper-case letter suffix is split off behind a dash."""
        found = NoDashFormatPattern().match("FFL 019N")

        assert found is not None
        assert found.value == "FFL 019-N"

    def test_exclude_postal_code(self):
        """Postal-code-like text yields no match, spaced or compact."""
        matcher = NoDashFormatPattern()

        # Neither spelling of an "SE" postal code may be reported.
        for postal_text in ("SE 106 43", "SE10643"):
            rejected = matcher.match(postal_text)
            assert rejected is None
|
|
|
|
|
|
class TestCompactFormatPattern:
    """Exercise CompactFormatPattern on separator-free inputs (``ABC123X``)."""

    def test_compact_format_with_suffix(self):
        """A compact input carrying a trailing suffix character is matched."""
        compact_text = "JTY5763"
        found = CompactFormatPattern().match(compact_text)

        assert found is not None
        # The intent is that a dash is added when a suffix is present, but
        # only the letter prefix is pinned here.
        # NOTE(review): the exact normalized value is not asserted - confirm
        # the expected output and tighten this check.
        assert "JTY" in found.value

    def test_compact_format_without_suffix(self):
        """A compact input with no trailing suffix character is matched."""
        found = CompactFormatPattern().match("FFL019")

        assert found is not None
        assert "FFL" in found.value

    def test_exclude_se_prefix(self):
        """Compact text starting with ``SE`` (postal codes) yields no match."""
        rejected = CompactFormatPattern().match("SE10643")

        assert rejected is None  # Should be filtered out
|
|
|
|
|
|
class TestLabeledPattern:
    """Exercise LabeledPattern on numbers introduced by an explicit label."""

    def test_swedish_label_kundnummer(self):
        """The Swedish label 'Kundnummer' introduces a matched number."""
        found = LabeledPattern().match("Kundnummer: JTY 576-3")

        assert found is not None
        assert "JTY 576-3" in found.value
        assert found.confidence == 0.98  # Very high confidence

    def test_swedish_label_kundnr(self):
        """The abbreviated Swedish label 'Kundnr' introduces a matched number."""
        found = LabeledPattern().match("Kundnr: EMM 256-6")

        assert found is not None
        assert "EMM 256-6" in found.value

    def test_english_label_customer_no(self):
        """The English label 'Customer No' introduces a matched number."""
        found = LabeledPattern().match("Customer No: ABC 123-X")

        assert found is not None
        assert "ABC 123-X" in found.value

    def test_label_without_colon(self):
        """A label separated from the number by a space only still matches."""
        found = LabeledPattern().match("Kundnummer JTY 576-3")

        assert found is not None
        assert "JTY 576-3" in found.value
|
|
|
|
|
|
class TestCustomerNumberParser:
    """Exercise the CustomerNumberParser entry points ``parse`` and ``parse_all``."""

    @pytest.fixture
    def parser(self):
        """Provide a fresh CustomerNumberParser for each test."""
        return CustomerNumberParser()

    def test_parse_with_dash(self, parser):
        """A dash-formatted number is returned as valid with no error."""
        value, ok, err = parser.parse("Customer: JTY 576-3")

        assert ok
        assert value == "JTY 576-3"
        assert err is None

    def test_parse_without_dash(self, parser):
        """A dash-less number is returned as valid with the dash inserted."""
        value, ok, err = parser.parse("Dwq 211X Billo")

        assert ok
        assert value == "DWQ 211-X"  # Dash added
        assert err is None

    def test_parse_with_label(self, parser):
        """Text holding a labeled and an unlabeled number parses as valid."""
        value, ok, _ = parser.parse("Kundnummer: JTY 576-3, also EMM 256-6")

        assert ok
        # The labeled number is the intended winner, but either candidate is
        # accepted by this check.
        # NOTE(review): consider asserting on "JTY 576-3" alone once the
        # priority of labeled matches is confirmed.
        candidates = ("JTY 576-3", "EMM 256-6")
        assert any(candidate in value for candidate in candidates)

    def test_parse_exclude_postal_code(self, parser):
        """A Swedish postal code is never returned as the customer number."""
        value, _, _ = parser.parse("SE 106 43 Stockholm")

        # A falsy result is acceptable; a truthy one must not contain the
        # postal code fragment.
        assert not value or "SE 106" not in value

    def test_parse_empty_text(self, parser):
        """Empty input is invalid and reports the 'Empty text' error."""
        value, ok, err = parser.parse("")

        assert not ok
        assert value is None
        assert err == "Empty text"

    def test_parse_no_match(self, parser):
        """Text without any customer number is invalid and says so."""
        prose = "This invoice contains only descriptive text about the product details and pricing"
        value, ok, err = parser.parse(prose)

        assert not ok
        assert value is None
        assert "No customer number found" in err

    def test_parse_all_finds_multiple(self, parser):
        """``parse_all`` returns at least one match, ordered by confidence."""
        found = parser.parse_all("Customer codes: JTY 576-3, EMM 256-6, FFL 019N")

        # At least one of the listed codes must be found.
        assert len(found) >= 1

        # Each match must be at least as confident as the one after it;
        # with a single match there are no pairs to compare.
        for earlier, later in zip(found, found[1:]):
            assert earlier.confidence >= later.confidence
|
|
|
|
|
|
class TestRealWorldExamples:
    """Run the parser against realistic invoice text snippets."""

    @pytest.fixture
    def parser(self):
        """Provide a fresh CustomerNumberParser for each test."""
        return CustomerNumberParser()

    def test_billo363_customer_number(self, parser):
        """The Billo363 PDF text yields its customer number, not the postal code."""
        # From issue report: "Dwq 211X Billo SE 106 43 Stockholm"
        value, ok, _ = parser.parse("Dwq 211X Billo SE 106 43 Stockholm")

        assert ok
        assert value == "DWQ 211-X"

    def test_customer_number_with_company_name(self, parser):
        """A number following a company name is extracted on its own."""
        value, ok, _ = parser.parse("Billo AB, JTY 576-3")

        assert ok
        assert value == "JTY 576-3"

    def test_customer_number_after_address(self, parser):
        """A number following an address is extracted instead of the postal code."""
        value, ok, _ = parser.parse("Stockholm 106 43, Customer: EMM 256-6")

        assert ok
        # The customer number must be present and the postal code absent.
        assert "EMM 256-6" in value
        assert "106 43" not in value

    def test_multiple_formats_in_text(self, parser):
        """With two candidate formats present, one of them is returned."""
        value, ok, _ = parser.parse("FFL 019N and JTY 576-3 are customer codes")

        assert ok
        # Whichever candidate wins (highest confidence), it must be one of these.
        assert value in ("FFL 019-N", "JTY 576-3")
|
|
|
|
|
|
class TestEdgeCases:
    """Probe boundary inputs: prefix and number lengths, spacing, case, None."""

    @pytest.fixture
    def parser(self):
        """Provide a fresh CustomerNumberParser for each test."""
        return CustomerNumberParser()

    def test_short_prefix(self, parser):
        """A two-letter prefix is accepted."""
        value, ok, _ = parser.parse("AB 12-X")

        assert ok
        assert "AB" in value

    def test_long_prefix(self, parser):
        """A four-letter prefix is accepted."""
        value, ok, _ = parser.parse("ABCD 1234-Z")

        assert ok
        assert "ABCD" in value

    def test_single_digit_number(self, parser):
        """A one-digit number part is accepted and returned unchanged."""
        value, ok, _ = parser.parse("ABC 1-X")

        assert ok
        assert value == "ABC 1-X"

    def test_four_digit_number(self, parser):
        """A four-digit number part is accepted and returned unchanged."""
        value, ok, _ = parser.parse("ABC 1234-X")

        assert ok
        assert value == "ABC 1234-X"

    def test_whitespace_handling(self, parser):
        """Surrounding whitespace does not end up in the result."""
        value, ok, _ = parser.parse(" JTY 576-3 ")

        assert ok
        assert value == "JTY 576-3"

    def test_case_normalization(self, parser):
        """Lower-case input is returned in upper case."""
        value, ok, _ = parser.parse("jty 576-3")

        assert ok
        assert value == "JTY 576-3"  # Uppercased

    def test_none_input(self, parser):
        """``None`` input is reported as invalid with no result."""
        value, ok, _ = parser.parse(None)

        assert not ok
        assert value is None
|