Files
invoice-master-poc-v2/tests/test_customer_number_parser.py
2026-01-25 15:21:11 +01:00

349 lines
10 KiB
Python

"""
Tests for customer number parser.
"""
import pytest
import sys
from pathlib import Path
# Put the directory two levels above this file at the front of sys.path so
# that the ``src`` package import that follows can be resolved. Inserting at
# index 0 gives this location precedence over the other sys.path entries.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
from src.inference.customer_number_parser import (
CustomerNumberParser,
DashFormatPattern,
NoDashFormatPattern,
CompactFormatPattern,
LabeledPattern,
)
class TestDashFormatPattern:
    """Exercise ``DashFormatPattern`` on inputs shaped like ``ABC 123-X``."""

    def test_standard_dash_format(self):
        """A dashed number embedded in surrounding text is extracted."""
        found = DashFormatPattern().match("Customer: JTY 576-3")

        assert found is not None
        assert found.value == "JTY 576-3"
        assert found.confidence == 0.95
        assert found.pattern_name == "DashFormat"

    def test_multiple_letter_prefix(self):
        """Prefixes of two, three and four letters are all accepted."""
        dash_pattern = DashFormatPattern()

        # Every sample is already in its expected output form, so the
        # extracted value must equal the input text.
        for sample in ("EM 25-6", "EMM 256-6", "ABCD 123-X"):
            found = dash_pattern.match(sample)
            assert found is not None
            assert found.value == sample

    def test_case_insensitive(self):
        """Lower-case input matches and the value comes back upper-cased."""
        found = DashFormatPattern().match("jty 576-3")

        assert found is not None
        assert found.value == "JTY 576-3"

    def test_exclude_postal_code(self):
        """Text that looks like a Swedish postal code yields no match."""
        found = DashFormatPattern().match("SE 106 43-Stockholm")

        assert found is None
class TestNoDashFormatPattern:
    """Exercise ``NoDashFormatPattern`` on inputs shaped like ``ABC 123X``."""

    def test_no_dash_format(self):
        """A dash-less number matches and the output value gains a dash."""
        found = NoDashFormatPattern().match("Dwq 211X")

        assert found is not None
        assert found.value == "DWQ 211-X"
        assert found.confidence == 0.90

    def test_uppercase_letter_suffix(self):
        """An upper-case letter suffix is split off behind a dash."""
        found = NoDashFormatPattern().match("FFL 019N")

        assert found is not None
        assert found.value == "FFL 019-N"

    def test_exclude_postal_code(self):
        """Spaced and compact SE postal codes must both be rejected."""
        no_dash_pattern = NoDashFormatPattern()

        for postal_text in ("SE 106 43", "SE10643"):
            assert no_dash_pattern.match(postal_text) is None
class TestCompactFormatPattern:
    """Exercise ``CompactFormatPattern`` on space-less inputs (``ABC123X``)."""

    def test_compact_format_with_suffix(self):
        """A compact number with a trailing suffix character is matched."""
        found = CompactFormatPattern().match("JTY5763")

        assert found is not None
        # Only the letter prefix is pinned here.
        # NOTE(review): the normalized value (e.g. whether a dash is
        # inserted before the suffix) is not asserted - confirm and tighten.
        assert "JTY" in found.value

    def test_compact_format_without_suffix(self):
        """A compact number without a letter suffix is matched."""
        found = CompactFormatPattern().match("FFL019")

        assert found is not None
        assert "FFL" in found.value

    def test_exclude_se_prefix(self):
        """A compact string starting with ``SE`` (postal code) is rejected."""
        found = CompactFormatPattern().match("SE10643")

        assert found is None
class TestLabeledPattern:
    """Exercise ``LabeledPattern`` on numbers preceded by an explicit label."""

    def test_swedish_label_kundnummer(self):
        """The Swedish label 'Kundnummer' is recognised with 0.98 confidence."""
        labeled = LabeledPattern().match("Kundnummer: JTY 576-3")

        assert labeled is not None
        assert "JTY 576-3" in labeled.value
        assert labeled.confidence == 0.98

    def test_swedish_label_kundnr(self):
        """The abbreviated Swedish label 'Kundnr' is recognised."""
        labeled = LabeledPattern().match("Kundnr: EMM 256-6")

        assert labeled is not None
        assert "EMM 256-6" in labeled.value

    def test_english_label_customer_no(self):
        """The English label 'Customer No' is recognised."""
        labeled = LabeledPattern().match("Customer No: ABC 123-X")

        assert labeled is not None
        assert "ABC 123-X" in labeled.value

    def test_label_without_colon(self):
        """A label followed by a space instead of a colon still matches."""
        labeled = LabeledPattern().match("Kundnummer JTY 576-3")

        assert labeled is not None
        assert "JTY 576-3" in labeled.value
class TestCustomerNumberParser:
    """Tests for the ``CustomerNumberParser`` entry points.

    ``parse`` is unpacked throughout as a ``(result, is_valid, error)``
    triple; ``parse_all`` is treated as returning a sized collection of
    match objects that expose a ``confidence`` attribute.
    """

    @pytest.fixture
    def parser(self):
        """Provide a fresh ``CustomerNumberParser`` for each test."""
        return CustomerNumberParser()

    def test_parse_with_dash(self, parser):
        """A dashed number inside surrounding text is returned unchanged."""
        result, is_valid, error = parser.parse("Customer: JTY 576-3")

        assert is_valid
        assert result == "JTY 576-3"
        assert error is None

    def test_parse_without_dash(self, parser):
        """A dash-less number is returned upper-cased with a dash added."""
        result, is_valid, error = parser.parse("Dwq 211X Billo")

        assert is_valid
        assert result == "DWQ 211-X"
        assert error is None

    def test_parse_with_label(self, parser):
        """Text with a labeled and an unlabeled number parses as valid."""
        text = "Kundnummer: JTY 576-3, also EMM 256-6"
        result, is_valid, _ = parser.parse(text)

        assert is_valid
        # NOTE(review): either candidate is accepted here, so this does not
        # actually prove that the labeled number wins - confirm the intended
        # priority and tighten the assertion if appropriate.
        assert "JTY 576-3" in result or "EMM 256-6" in result

    def test_parse_exclude_postal_code(self, parser):
        """A Swedish postal code is never reported as the customer number."""
        result, _, _ = parser.parse("SE 106 43 Stockholm")

        # A falsy result (nothing extracted) is acceptable; anything that
        # was extracted must not contain the postal-code text.
        assert not result or "SE 106" not in result

    def test_parse_empty_text(self, parser):
        """An empty string is invalid and reports the 'Empty text' error."""
        result, is_valid, error = parser.parse("")

        assert not is_valid
        assert result is None
        assert error == "Empty text"

    def test_parse_no_match(self, parser):
        """Plain prose without any customer number is reported as invalid."""
        text = "This invoice contains only descriptive text about the product details and pricing"
        result, is_valid, error = parser.parse(text)

        assert not is_valid
        assert result is None
        assert "No customer number found" in error

    def test_parse_all_finds_multiple(self, parser):
        """``parse_all`` returns matches ordered by descending confidence."""
        text = "Customer codes: JTY 576-3, EMM 256-6, FFL 019N"
        matches = parser.parse_all(text)

        assert len(matches) >= 1

        # Compare each confidence with its successor; with fewer than two
        # matches there are no pairs and the check passes trivially.
        confidences = [candidate.confidence for candidate in matches]
        assert all(
            earlier >= later
            for earlier, later in zip(confidences, confidences[1:])
        )
class TestRealWorldExamples:
    """End-to-end ``parse`` checks on realistic invoice text snippets."""

    @pytest.fixture
    def parser(self):
        """Provide a fresh ``CustomerNumberParser`` for each test."""
        return CustomerNumberParser()

    def test_billo363_customer_number(self, parser):
        """The Billo363 snippet yields the customer number, not the postal code."""
        # Text taken from the issue report for the Billo363 PDF.
        value, ok, _ = parser.parse("Dwq 211X Billo SE 106 43 Stockholm")

        assert ok
        assert value == "DWQ 211-X"

    def test_customer_number_with_company_name(self, parser):
        """A number that follows a company name is still extracted."""
        value, ok, _ = parser.parse("Billo AB, JTY 576-3")

        assert ok
        assert value == "JTY 576-3"

    def test_customer_number_after_address(self, parser):
        """A number that follows an address wins over the postal code."""
        value, ok, _ = parser.parse("Stockholm 106 43, Customer: EMM 256-6")

        assert ok
        assert "EMM 256-6" in value
        assert "106 43" not in value

    def test_multiple_formats_in_text(self, parser):
        """With two candidate formats present, one of them is returned."""
        value, ok, _ = parser.parse("FFL 019N and JTY 576-3 are customer codes")

        assert ok
        acceptable = ("FFL 019-N", "JTY 576-3")
        assert value in acceptable
class TestEdgeCases:
    """Boundary-condition checks for ``CustomerNumberParser.parse``."""

    @pytest.fixture
    def parser(self):
        """Provide a fresh ``CustomerNumberParser`` for each test."""
        return CustomerNumberParser()

    def test_short_prefix(self, parser):
        """A two-letter prefix is accepted."""
        value, ok, _ = parser.parse("AB 12-X")

        assert ok
        assert "AB" in value

    def test_long_prefix(self, parser):
        """A four-letter prefix is accepted."""
        value, ok, _ = parser.parse("ABCD 1234-Z")

        assert ok
        assert "ABCD" in value

    def test_single_digit_number(self, parser):
        """A one-digit numeric part is accepted and returned unchanged."""
        value, ok, _ = parser.parse("ABC 1-X")

        assert ok
        assert value == "ABC 1-X"

    def test_four_digit_number(self, parser):
        """A four-digit numeric part is accepted and returned unchanged."""
        value, ok, _ = parser.parse("ABC 1234-X")

        assert ok
        assert value == "ABC 1234-X"

    def test_whitespace_handling(self, parser):
        """Surrounding whitespace does not end up in the result."""
        value, ok, _ = parser.parse(" JTY 576-3 ")

        assert ok
        assert value == "JTY 576-3"

    def test_case_normalization(self, parser):
        """Lower-case input is returned upper-cased."""
        value, ok, _ = parser.parse("jty 576-3")

        assert ok
        assert value == "JTY 576-3"

    def test_none_input(self, parser):
        """``None`` input is reported as invalid with no result."""
        value, ok, _ = parser.parse(None)

        assert not ok
        assert value is None