177 lines
5.5 KiB
Python
177 lines
5.5 KiB
Python
"""
Tests for DocumentClassifier - TDD RED phase.

Test document type classification based on extracted fields.
"""
|
|
import pytest
|
|
|
|
from backend.domain.document_classifier import DocumentClassifier, ClassificationResult
|
|
|
|
|
|
class TestDocumentClassifier:
    """Expectations for DocumentClassifier.classify() over extracted fields."""

    @pytest.fixture
    def classifier(self) -> DocumentClassifier:
        """Provide a newly constructed DocumentClassifier."""
        return DocumentClassifier()

    # ==================== Invoice Detection Tests ====================

    def test_classify_with_payment_line_returns_invoice(
        self, classifier: DocumentClassifier
    ) -> None:
        """A populated payment_line gives an invoice with confidence >= 0.9."""
        verdict = classifier.classify(
            {"payment_line": "# 123456 # 100 00 5 > 308-2963#"}
        )

        assert verdict.document_type == "invoice"
        assert verdict.confidence >= 0.9
        # The reason text is expected to name the field that decided it.
        assert "payment_line" in verdict.reason

    def test_classify_with_multiple_indicators_returns_invoice(
        self, classifier: DocumentClassifier
    ) -> None:
        """Amount and Bankgiro together give an invoice with confidence >= 0.7."""
        extracted = {
            "Amount": "1200.00",
            "Bankgiro": "123-4567",
            "payment_line": None,
        }

        verdict = classifier.classify(extracted)

        assert verdict.document_type == "invoice"
        assert verdict.confidence >= 0.7

    def test_classify_with_ocr_and_amount_returns_invoice(
        self, classifier: DocumentClassifier
    ) -> None:
        """OCR and Amount together give an invoice with confidence >= 0.7."""
        extracted = {
            "OCR": "123456789012",
            "Amount": "500.00",
        }

        verdict = classifier.classify(extracted)

        assert verdict.document_type == "invoice"
        assert verdict.confidence >= 0.7

    def test_classify_with_single_indicator_returns_invoice_lower_confidence(
        self, classifier: DocumentClassifier
    ) -> None:
        """Amount alone gives an invoice with confidence in [0.5, 0.8)."""
        verdict = classifier.classify({"Amount": "100.00"})

        assert verdict.document_type == "invoice"
        assert 0.5 <= verdict.confidence < 0.8

    def test_classify_with_invoice_number_only(
        self, classifier: DocumentClassifier
    ) -> None:
        """InvoiceNumber alone is expected to classify as an invoice."""
        verdict = classifier.classify({"InvoiceNumber": "INV-2024-001"})

        assert verdict.document_type == "invoice"

    # ==================== Letter Detection Tests ====================

    def test_classify_with_no_indicators_returns_letter(
        self, classifier: DocumentClassifier
    ) -> None:
        """An empty mapping gives a letter with confidence >= 0.5."""
        extracted: dict[str, str | None] = {}

        verdict = classifier.classify(extracted)

        assert verdict.document_type == "letter"
        assert verdict.confidence >= 0.5

    def test_classify_with_empty_fields_returns_letter(
        self, classifier: DocumentClassifier
    ) -> None:
        """Fields that are present but all None give a letter."""
        extracted = {
            "payment_line": None,
            "OCR": None,
            "Amount": None,
            "Bankgiro": None,
        }

        verdict = classifier.classify(extracted)

        assert verdict.document_type == "letter"

    def test_classify_with_only_non_indicator_fields_returns_letter(
        self, classifier: DocumentClassifier
    ) -> None:
        """CustomerNumber and SupplierOrgNumber alone give a letter."""
        extracted = {
            "CustomerNumber": "C12345",
            "SupplierOrgNumber": "556677-8899",
        }

        verdict = classifier.classify(extracted)

        assert verdict.document_type == "letter"

    # ==================== Edge Cases ====================

    def test_classify_with_empty_string_fields_returns_letter(
        self, classifier: DocumentClassifier
    ) -> None:
        """Empty-string values are expected to count as missing."""
        extracted = {
            "payment_line": "",
            "Amount": "",
        }

        verdict = classifier.classify(extracted)

        assert verdict.document_type == "letter"

    def test_classify_with_whitespace_only_fields_returns_letter(
        self, classifier: DocumentClassifier
    ) -> None:
        """Values made only of whitespace are expected to count as missing."""
        extracted = {
            "payment_line": " ",
            "Amount": "\t\n",
        }

        verdict = classifier.classify(extracted)

        assert verdict.document_type == "letter"

    # ==================== ClassificationResult Immutability ====================

    def test_classification_result_is_immutable(
        self, classifier: DocumentClassifier
    ) -> None:
        """Assigning to an attribute of the result must raise."""
        verdict = classifier.classify({"payment_line": "test"})

        # Either exception type is accepted for the rejected assignment.
        with pytest.raises((AttributeError, TypeError)):
            verdict.document_type = "modified"  # type: ignore

    def test_classification_result_has_required_fields(
        self, classifier: DocumentClassifier
    ) -> None:
        """The result exposes document_type, confidence and reason, correctly typed."""
        verdict = classifier.classify({"Amount": "100.00"})

        # Presence of all three attributes is checked before their types.
        for attribute in ("document_type", "confidence", "reason"):
            assert hasattr(verdict, attribute)

        assert isinstance(verdict.document_type, str)
        assert isinstance(verdict.confidence, float)
        assert isinstance(verdict.reason, str)