WIP
This commit is contained in:
0
tests/pipeline/__init__.py
Normal file
0
tests/pipeline/__init__.py
Normal file
318
tests/pipeline/test_value_selector.py
Normal file
318
tests/pipeline/test_value_selector.py
Normal file
@@ -0,0 +1,318 @@
|
||||
"""
|
||||
Tests for ValueSelector -- field-aware OCR token selection.
|
||||
|
||||
Verifies that ValueSelector picks the most likely value token(s)
|
||||
from OCR output, filtering out label text before sending to normalizer.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from shared.ocr.paddle_ocr import OCRToken
|
||||
from backend.pipeline.value_selector import ValueSelector
|
||||
|
||||
|
||||
def _token(text: str) -> OCRToken:
    """Build an OCRToken for *text* using a fixed placeholder bbox and confidence."""
    # Geometry and confidence are fixed placeholders; only the text varies per test.
    placeholder_bbox = (0, 0, 100, 20)
    placeholder_confidence = 0.95
    return OCRToken(
        text=text,
        bbox=placeholder_bbox,
        confidence=placeholder_confidence,
    )
|
||||
|
||||
|
||||
def _tokens(*texts: str) -> list[OCRToken]:
    """Build one OCRToken per given text, preserving argument order."""
    return list(map(_token, texts))
|
||||
|
||||
|
||||
class TestValueSelectorDateFields:
    """Date fields (InvoiceDate, InvoiceDueDate): the date token wins over its label."""

    def test_selects_iso_date_from_label_and_value(self):
        """A hyphen-separated date is the only token returned."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Fakturadatum", "2024-01-15"), "InvoiceDate"
        )

        assert [tok.text for tok in selected] == ["2024-01-15"]

    def test_selects_dot_separated_date(self):
        """A dot-separated date is the only token returned."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Datum", "2024.03.20"), "InvoiceDate"
        )

        assert [tok.text for tok in selected] == ["2024.03.20"]

    def test_selects_slash_separated_date(self):
        """A slash-separated date is the only token returned for a due date."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Forfallodag", "15/01/2024"), "InvoiceDueDate"
        )

        assert [tok.text for tok in selected] == ["15/01/2024"]

    def test_selects_compact_date(self):
        """An eight-digit date without separators is the only token returned."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Datum", "20240115"), "InvoiceDate"
        )

        assert [tok.text for tok in selected] == ["20240115"]

    def test_fallback_when_no_date_pattern(self):
        """No date pattern found -> return all tokens."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Fakturadatum", "pending"), "InvoiceDate"
        )

        assert len(selected) == 2
|
||||
|
||||
|
||||
class TestValueSelectorAmountField:
    """Amount field: the numeric amount token is chosen over labels and stray units."""

    def test_selects_amount_with_comma_decimal(self):
        """Space-grouped amount with comma decimal; label and separate 'kr' are dropped."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Belopp", "1 234,56", "kr"), "Amount"
        )

        assert [tok.text for tok in selected] == ["1 234,56"]

    def test_selects_amount_with_dot_decimal(self):
        """Plain amount with a dot decimal separator."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Summa", "1234.56"), "Amount"
        )

        assert [tok.text for tok in selected] == ["1234.56"]

    def test_selects_simple_amount(self):
        """A two-word label precedes the amount; only the amount remains."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Att", "betala", "500,00"), "Amount"
        )

        assert [tok.text for tok in selected] == ["500,00"]

    def test_selects_european_amount_with_dot_thousand(self):
        """European format: dot as thousand separator, comma as decimal."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Fakturabelopp:", "2.254,50 SEK"), "Amount"
        )

        assert [tok.text for tok in selected] == ["2.254,50 SEK"]

    def test_selects_european_amount_without_currency(self):
        """European format without currency suffix."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Belopp", "1.234,56"), "Amount"
        )

        assert [tok.text for tok in selected] == ["1.234,56"]

    def test_selects_amount_with_kr_suffix(self):
        """Amount with 'kr' currency suffix."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Summa", "20.485,00 kr"), "Amount"
        )

        assert [tok.text for tok in selected] == ["20.485,00 kr"]

    def test_selects_anglo_amount_with_sek(self):
        """Anglo format with SEK suffix."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Amount", "1,234.56 SEK"), "Amount"
        )

        assert [tok.text for tok in selected] == ["1,234.56 SEK"]

    def test_fallback_when_no_amount_pattern(self):
        """Without any amount-like token, both input tokens come back."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Belopp", "TBD"), "Amount"
        )

        assert len(selected) == 2
|
||||
|
||||
|
||||
class TestValueSelectorBankgiroField:
    """Bankgiro field: the account-number token is chosen over its label."""

    def test_selects_hyphenated_bankgiro(self):
        """Hyphenated number following a short 'BG:' label."""
        selected = ValueSelector.select_value_tokens(
            _tokens("BG:", "123-4567"), "Bankgiro"
        )

        assert [tok.text for tok in selected] == ["123-4567"]

    def test_selects_bankgiro_digits(self):
        """Seven-digit number written without a hyphen."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Bankgiro", "1234567"), "Bankgiro"
        )

        assert [tok.text for tok in selected] == ["1234567"]

    def test_selects_eight_digit_bankgiro(self):
        """Eight-digit number written without a hyphen."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Bankgiro:", "12345678"), "Bankgiro"
        )

        assert [tok.text for tok in selected] == ["12345678"]
|
||||
|
||||
|
||||
class TestValueSelectorPlusgiroField:
    """Plusgiro field: the account-number token is chosen over its label."""

    def test_selects_hyphenated_plusgiro(self):
        """Hyphenated number following a short 'PG:' label."""
        selected = ValueSelector.select_value_tokens(
            _tokens("PG:", "12345-6"), "Plusgiro"
        )

        assert [tok.text for tok in selected] == ["12345-6"]

    def test_selects_plusgiro_digits(self):
        """Digits-only number following the full label."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Plusgiro", "1234567"), "Plusgiro"
        )

        assert [tok.text for tok in selected] == ["1234567"]
|
||||
|
||||
|
||||
class TestValueSelectorOcrField:
    """OCR reference number field: the long digit token is chosen over its label."""

    def test_selects_longest_digit_sequence(self):
        """A ten-digit reference following the 'OCR' label is the only token kept."""
        selected = ValueSelector.select_value_tokens(
            _tokens("OCR", "1234567890"), "OCR"
        )

        assert [tok.text for tok in selected] == ["1234567890"]

    def test_selects_token_with_most_digits(self):
        """A long reference following a two-word label is the only token kept."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Ref", "nr", "94228110015950070"), "OCR"
        )

        assert [tok.text for tok in selected] == ["94228110015950070"]

    def test_ignores_short_digit_tokens(self):
        """Tokens with fewer than 5 digits are not OCR references."""
        selected = ValueSelector.select_value_tokens(_tokens("OCR", "123"), "OCR")

        # Nothing qualifies as a reference, so the selector falls back to
        # returning every input token.
        assert len(selected) == 2
|
||||
|
||||
|
||||
class TestValueSelectorInvoiceNumberField:
    """InvoiceNumber field: label tokens are removed and value tokens are kept."""

    def test_removes_swedish_label_keywords(self):
        """The 'Fakturanummer' label is dropped, leaving the invoice id."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Fakturanummer", "INV-2024-001"), "InvoiceNumber"
        )

        assert [tok.text for tok in selected] == ["INV-2024-001"]

    def test_keeps_non_label_tokens(self):
        """The short 'Nr' label is dropped, leaving the number."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Nr", "12345"), "InvoiceNumber"
        )

        assert [tok.text for tok in selected] == ["12345"]

    def test_multiple_value_tokens_kept(self):
        """Multiple non-label tokens are all kept."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Fakturanr", "INV", "2024", "001"), "InvoiceNumber"
        )

        # Only the label's absence and the first value's presence are checked here.
        surviving_texts = [tok.text for tok in selected]
        assert "Fakturanr" not in surviving_texts
        assert "INV" in surviving_texts
|
||||
|
||||
|
||||
class TestValueSelectorOrgNumberField:
    """supplier_org_number field: the organisation number is chosen over its label."""

    def test_selects_org_number_with_hyphen(self):
        """Hyphenated organisation number following an abbreviated label."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Org.nr", "556123-4567"), "supplier_org_number"
        )

        assert [tok.text for tok in selected] == ["556123-4567"]

    def test_selects_org_number_without_hyphen(self):
        """Ten-digit organisation number following the full label."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Organisationsnummer", "5561234567"), "supplier_org_number"
        )

        assert [tok.text for tok in selected] == ["5561234567"]
|
||||
|
||||
|
||||
class TestValueSelectorCustomerNumberField:
    """customer_number field: the label is removed and the value is kept."""

    def test_removes_label_keeps_value(self):
        """The 'Kundnummer' label is dropped, leaving the customer id."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Kundnummer", "ABC-123"), "customer_number"
        )

        assert [tok.text for tok in selected] == ["ABC-123"]
|
||||
|
||||
|
||||
class TestValueSelectorPaymentLineField:
    """payment_line field: no tokens are filtered out."""

    def test_keeps_all_tokens(self):
        """The returned list is as long as the input token list."""
        payment_line_parts = _tokens(
            "#",
            "94228110015950070",
            "#",
            "15658",
            "00",
            "8",
            ">",
            "48666036#14#",
        )

        selected = ValueSelector.select_value_tokens(
            payment_line_parts, "payment_line"
        )

        assert len(selected) == len(payment_line_parts)
|
||||
|
||||
|
||||
class TestValueSelectorFallback:
    """Fallback behaviour: unknown fields, empty input and unmatched patterns."""

    def test_unknown_field_returns_all_tokens(self):
        """A field name with no dedicated handling returns every input token."""
        selected = ValueSelector.select_value_tokens(
            _tokens("some", "unknown", "text"), "unknown_field"
        )

        assert len(selected) == 3

    def test_empty_tokens_returns_empty(self):
        """An empty token list yields an empty list."""
        selected = ValueSelector.select_value_tokens([], "InvoiceDate")

        assert selected == []

    def test_single_token_returns_it(self):
        """A lone date token yields exactly one token."""
        selected = ValueSelector.select_value_tokens(
            _tokens("2024-01-15"), "InvoiceDate"
        )

        assert len(selected) == 1

    def test_never_returns_empty_when_tokens_exist(self):
        """Fallback ensures we never lose data -- always return something."""
        selected = ValueSelector.select_value_tokens(
            _tokens("Fakturadatum", "unknown_format"), "InvoiceDate"
        )

        assert len(selected) >= 1
|
||||
Reference in New Issue
Block a user