319 lines
9.7 KiB
Python
319 lines
9.7 KiB
Python
"""
|
|
Tests for ValueSelector -- field-aware OCR token selection.
|
|
|
|
Verifies that ValueSelector picks the most likely value token(s)
|
|
from OCR output, filtering out label text before sending to normalizer.
|
|
"""
|
|
|
|
import pytest
|
|
|
|
from shared.ocr.paddle_ocr import OCRToken
|
|
from backend.pipeline.value_selector import ValueSelector
|
|
|
|
|
|
def _token(text: str) -> OCRToken:
    """Build an OCRToken carrying ``text`` plus a fixed placeholder bbox and confidence.

    The geometry and confidence are dummy values; the tests in this module
    only inspect the ``text`` of the tokens they get back.
    """
    placeholder_bbox = (0, 0, 100, 20)
    placeholder_confidence = 0.95
    return OCRToken(
        text=text,
        bbox=placeholder_bbox,
        confidence=placeholder_confidence,
    )
|
|
|
|
|
|
def _tokens(*texts: str) -> list[OCRToken]:
    """Turn each given string into an OCRToken (via ``_token``), preserving order."""
    return list(map(_token, texts))
|
|
|
|
|
|
class TestValueSelectorDateFields:
    """Value selection for the date fields (InvoiceDate, InvoiceDueDate)."""

    def test_selects_iso_date_from_label_and_value(self):
        """A hyphen-separated date is returned alone; its label is dropped."""
        ocr_tokens = _tokens("Fakturadatum", "2024-01-15")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "InvoiceDate")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["2024-01-15"]

    def test_selects_dot_separated_date(self):
        """A dot-separated date is returned alone; its label is dropped."""
        ocr_tokens = _tokens("Datum", "2024.03.20")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "InvoiceDate")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["2024.03.20"]

    def test_selects_slash_separated_date(self):
        """A slash-separated due date is returned alone; its label is dropped."""
        ocr_tokens = _tokens("Forfallodag", "15/01/2024")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "InvoiceDueDate")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["15/01/2024"]

    def test_selects_compact_date(self):
        """An eight-digit date without separators is returned alone."""
        ocr_tokens = _tokens("Datum", "20240115")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "InvoiceDate")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["20240115"]

    def test_fallback_when_no_date_pattern(self):
        """With no date-like token present, both input tokens come back."""
        ocr_tokens = _tokens("Fakturadatum", "pending")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "InvoiceDate")

        assert len(selected) == 2
|
|
|
|
|
|
class TestValueSelectorAmountField:
    """Value selection for the Amount field."""

    def test_selects_amount_with_comma_decimal(self):
        """Space-grouped amount with comma decimal wins over label and 'kr' token."""
        ocr_tokens = _tokens("Belopp", "1 234,56", "kr")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "Amount")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["1 234,56"]

    def test_selects_amount_with_dot_decimal(self):
        """Plain amount with a dot decimal is returned alone."""
        ocr_tokens = _tokens("Summa", "1234.56")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "Amount")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["1234.56"]

    def test_selects_simple_amount(self):
        """The amount is picked out from behind a two-word label."""
        ocr_tokens = _tokens("Att", "betala", "500,00")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "Amount")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["500,00"]

    def test_selects_european_amount_with_dot_thousand(self):
        """European format: dot as thousand separator, comma as decimal, SEK suffix."""
        ocr_tokens = _tokens("Fakturabelopp:", "2.254,50 SEK")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "Amount")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["2.254,50 SEK"]

    def test_selects_european_amount_without_currency(self):
        """European format with no currency suffix."""
        ocr_tokens = _tokens("Belopp", "1.234,56")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "Amount")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["1.234,56"]

    def test_selects_amount_with_kr_suffix(self):
        """Amount whose token text ends in the 'kr' currency suffix."""
        ocr_tokens = _tokens("Summa", "20.485,00 kr")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "Amount")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["20.485,00 kr"]

    def test_selects_anglo_amount_with_sek(self):
        """Anglo format (comma thousands, dot decimal) with SEK suffix."""
        ocr_tokens = _tokens("Amount", "1,234.56 SEK")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "Amount")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["1,234.56 SEK"]

    def test_fallback_when_no_amount_pattern(self):
        """With no amount-like token present, both input tokens come back."""
        ocr_tokens = _tokens("Belopp", "TBD")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "Amount")

        assert len(selected) == 2
|
|
|
|
|
|
class TestValueSelectorBankgiroField:
    """Value selection for the Bankgiro field."""

    def test_selects_hyphenated_bankgiro(self):
        """A hyphenated number is returned alone; the 'BG:' label is dropped."""
        ocr_tokens = _tokens("BG:", "123-4567")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "Bankgiro")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["123-4567"]

    def test_selects_bankgiro_digits(self):
        """A seven-digit number without a hyphen is returned alone."""
        ocr_tokens = _tokens("Bankgiro", "1234567")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "Bankgiro")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["1234567"]

    def test_selects_eight_digit_bankgiro(self):
        """An eight-digit number without a hyphen is returned alone."""
        ocr_tokens = _tokens("Bankgiro:", "12345678")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "Bankgiro")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["12345678"]
|
|
|
|
|
|
class TestValueSelectorPlusgiroField:
    """Value selection for the Plusgiro field."""

    def test_selects_hyphenated_plusgiro(self):
        """A hyphenated number is returned alone; the 'PG:' label is dropped."""
        ocr_tokens = _tokens("PG:", "12345-6")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "Plusgiro")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["12345-6"]

    def test_selects_plusgiro_digits(self):
        """A seven-digit number without a hyphen is returned alone."""
        ocr_tokens = _tokens("Plusgiro", "1234567")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "Plusgiro")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["1234567"]
|
|
|
|
|
|
class TestValueSelectorOcrField:
    """Value selection for the OCR reference number field."""

    def test_selects_longest_digit_sequence(self):
        """A ten-digit token is returned alone; the 'OCR' label is dropped."""
        ocr_tokens = _tokens("OCR", "1234567890")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "OCR")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["1234567890"]

    def test_selects_token_with_most_digits(self):
        """The long numeric token is returned alone from behind a two-word label."""
        ocr_tokens = _tokens("Ref", "nr", "94228110015950070")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "OCR")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["94228110015950070"]

    def test_ignores_short_digit_tokens(self):
        """Tokens with fewer than 5 digits are not OCR references."""
        ocr_tokens = _tokens("OCR", "123")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "OCR")

        # No valid OCR reference was found, so the fallback hands back every token.
        assert len(selected) == 2
|
|
|
|
|
|
class TestValueSelectorInvoiceNumberField:
    """Value selection for the InvoiceNumber field."""

    def test_removes_swedish_label_keywords(self):
        """The Swedish label 'Fakturanummer' is removed, leaving only the value."""
        ocr_tokens = _tokens("Fakturanummer", "INV-2024-001")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "InvoiceNumber")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["INV-2024-001"]

    def test_keeps_non_label_tokens(self):
        """The short label 'Nr' is removed, leaving only the numeric value."""
        ocr_tokens = _tokens("Nr", "12345")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "InvoiceNumber")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["12345"]

    def test_multiple_value_tokens_kept(self):
        """Multiple non-label tokens are all kept."""
        ocr_tokens = _tokens("Fakturanr", "INV", "2024", "001")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "InvoiceNumber")

        # "Fakturanr" is a label keyword, the rest are values.
        selected_texts = [tok.text for tok in selected]
        assert "Fakturanr" not in selected_texts
        # Check every value token, not just the first, so the test really
        # verifies the "all kept" contract its name and docstring promise.
        assert "INV" in selected_texts
        assert "2024" in selected_texts
        assert "001" in selected_texts
|
|
|
|
|
|
class TestValueSelectorOrgNumberField:
    """Value selection for the supplier_org_number field."""

    def test_selects_org_number_with_hyphen(self):
        """A hyphenated organisation number is returned alone; the label is dropped."""
        ocr_tokens = _tokens("Org.nr", "556123-4567")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "supplier_org_number")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["556123-4567"]

    def test_selects_org_number_without_hyphen(self):
        """A ten-digit organisation number without a hyphen is returned alone."""
        ocr_tokens = _tokens("Organisationsnummer", "5561234567")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "supplier_org_number")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["5561234567"]
|
|
|
|
|
|
class TestValueSelectorCustomerNumberField:
    """Value selection for the customer_number field."""

    def test_removes_label_keeps_value(self):
        """The 'Kundnummer' label is removed and the value token is kept."""
        ocr_tokens = _tokens("Kundnummer", "ABC-123")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "customer_number")

        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["ABC-123"]
|
|
|
|
|
|
class TestValueSelectorPaymentLineField:
    """Value selection for the payment_line field, which keeps every token."""

    def test_keeps_all_tokens(self):
        """The number of returned tokens equals the number of input tokens."""
        ocr_tokens = _tokens(
            "#",
            "94228110015950070",
            "#",
            "15658",
            "00",
            "8",
            ">",
            "48666036#14#",
        )

        selected = ValueSelector.select_value_tokens(ocr_tokens, "payment_line")

        assert len(selected) == len(ocr_tokens)
|
|
|
|
|
|
class TestValueSelectorFallback:
    """Fallback behaviour: unknown fields, empty input, and unmatched patterns."""

    def test_unknown_field_returns_all_tokens(self):
        """A field name with no dedicated rule returns as many tokens as it was given."""
        ocr_tokens = _tokens("some", "unknown", "text")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "unknown_field")

        assert len(selected) == 3

    def test_empty_tokens_returns_empty(self):
        """An empty token list yields an empty list."""
        selected = ValueSelector.select_value_tokens([], "InvoiceDate")

        assert selected == []

    def test_single_token_returns_it(self):
        """A lone token is handed back as the single result."""
        ocr_tokens = _tokens("2024-01-15")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "InvoiceDate")

        # Compare the text as well as the count, so the test confirms the
        # returned token is the one supplied rather than just "one token".
        selected_texts = [tok.text for tok in selected]
        assert selected_texts == ["2024-01-15"]

    def test_never_returns_empty_when_tokens_exist(self):
        """Fallback ensures we never lose data -- always return something."""
        ocr_tokens = _tokens("Fakturadatum", "unknown_format")

        selected = ValueSelector.select_value_tokens(ocr_tokens, "InvoiceDate")

        assert len(selected) > 0
|