Files
invoice-master-poc-v2/tests/pipeline/test_value_selector.py
Yaojia Wang 58d36c8927 WIP
2026-02-12 23:06:00 +01:00

337 lines
10 KiB
Python

"""
Tests for ValueSelector -- field-aware OCR token selection.
Verifies that ValueSelector picks the most likely value token(s)
from OCR output, filtering out label text before sending to normalizer.
"""
import pytest
from shared.ocr.paddle_ocr import OCRToken
from backend.pipeline.value_selector import ValueSelector
def _token(text: str) -> OCRToken:
    """Build an OCRToken for ``text`` with a placeholder bbox and confidence."""
    placeholder_bbox = (0, 0, 100, 20)
    placeholder_confidence = 0.95
    return OCRToken(
        text=text,
        bbox=placeholder_bbox,
        confidence=placeholder_confidence,
    )
def _tokens(*texts: str) -> list[OCRToken]:
    """Wrap every given text in an OCRToken, keeping the argument order."""
    return list(map(_token, texts))
class TestValueSelectorDateFields:
    """Date-field selection checks (InvoiceDate, InvoiceDueDate)."""

    def test_selects_iso_date_from_label_and_value(self):
        """Only the hyphenated date token is expected back, not the label."""
        ocr_tokens = _tokens("Fakturadatum", "2024-01-15")
        selected = ValueSelector.select_value_tokens(ocr_tokens, "InvoiceDate")
        assert [tok.text for tok in selected] == ["2024-01-15"]

    def test_selects_dot_separated_date(self):
        """Only the dot-separated date token is expected back."""
        ocr_tokens = _tokens("Datum", "2024.03.20")
        selected = ValueSelector.select_value_tokens(ocr_tokens, "InvoiceDate")
        assert [tok.text for tok in selected] == ["2024.03.20"]

    def test_selects_slash_separated_date(self):
        """Only the slash-separated date token is expected back."""
        ocr_tokens = _tokens("Forfallodag", "15/01/2024")
        selected = ValueSelector.select_value_tokens(ocr_tokens, "InvoiceDueDate")
        assert [tok.text for tok in selected] == ["15/01/2024"]

    def test_selects_compact_date(self):
        """Only the separator-free eight-digit date token is expected back."""
        ocr_tokens = _tokens("Datum", "20240115")
        selected = ValueSelector.select_value_tokens(ocr_tokens, "InvoiceDate")
        assert [tok.text for tok in selected] == ["20240115"]

    def test_fallback_when_no_date_pattern(self):
        """No date pattern found -> return all tokens."""
        ocr_tokens = _tokens("Fakturadatum", "pending")
        selected = ValueSelector.select_value_tokens(ocr_tokens, "InvoiceDate")
        assert len(selected) == 2
class TestValueSelectorAmountField:
    """Amount-field selection checks."""

    @staticmethod
    def _selected_texts(*texts):
        """Run Amount selection over ``texts`` and return the chosen token texts."""
        picked = ValueSelector.select_value_tokens(_tokens(*texts), "Amount")
        return [tok.text for tok in picked]

    def test_selects_amount_with_comma_decimal(self):
        """Space-grouped amount with comma decimal is the single token returned."""
        assert self._selected_texts("Belopp", "1 234,56", "kr") == ["1 234,56"]

    def test_selects_amount_with_dot_decimal(self):
        """Plain amount with dot decimal is the single token returned."""
        assert self._selected_texts("Summa", "1234.56") == ["1234.56"]

    def test_selects_simple_amount(self):
        """The amount is returned alone when preceded by two label tokens."""
        assert self._selected_texts("Att", "betala", "500,00") == ["500,00"]

    def test_selects_european_amount_with_dot_thousand(self):
        """European format: dot as thousand separator, comma as decimal."""
        chosen = self._selected_texts("Fakturabelopp:", "2.254,50 SEK")
        assert chosen == ["2.254,50 SEK"]

    def test_selects_european_amount_without_currency(self):
        """European format without currency suffix."""
        assert self._selected_texts("Belopp", "1.234,56") == ["1.234,56"]

    def test_selects_amount_with_kr_suffix(self):
        """Amount with 'kr' currency suffix."""
        assert self._selected_texts("Summa", "20.485,00 kr") == ["20.485,00 kr"]

    def test_selects_anglo_amount_with_sek(self):
        """Anglo format with SEK suffix."""
        assert self._selected_texts("Amount", "1,234.56 SEK") == ["1,234.56 SEK"]

    def test_fallback_when_no_amount_pattern(self):
        """Both tokens are expected back when none looks like an amount."""
        candidates = _tokens("Belopp", "TBD")
        picked = ValueSelector.select_value_tokens(candidates, "Amount")
        assert len(picked) == 2
class TestValueSelectorBankgiroField:
    """Bankgiro-field selection checks."""

    FIELD = "Bankgiro"

    def test_selects_hyphenated_bankgiro(self):
        """The hyphenated number is the single token returned."""
        candidates = _tokens("BG:", "123-4567")
        chosen = ValueSelector.select_value_tokens(candidates, self.FIELD)
        assert [tok.text for tok in chosen] == ["123-4567"]

    def test_selects_bankgiro_digits(self):
        """A seven-digit number without hyphen is the single token returned."""
        candidates = _tokens("Bankgiro", "1234567")
        chosen = ValueSelector.select_value_tokens(candidates, self.FIELD)
        assert [tok.text for tok in chosen] == ["1234567"]

    def test_selects_eight_digit_bankgiro(self):
        """An eight-digit number without hyphen is the single token returned."""
        candidates = _tokens("Bankgiro:", "12345678")
        chosen = ValueSelector.select_value_tokens(candidates, self.FIELD)
        assert [tok.text for tok in chosen] == ["12345678"]
class TestValueSelectorPlusgiroField:
    """Plusgiro-field selection checks."""

    def test_selects_hyphenated_plusgiro(self):
        """The hyphenated number is the single token returned."""
        picked = ValueSelector.select_value_tokens(
            _tokens("PG:", "12345-6"), "Plusgiro"
        )
        picked_texts = [tok.text for tok in picked]
        assert picked_texts == ["12345-6"]

    def test_selects_plusgiro_digits(self):
        """A digits-only number is the single token returned."""
        picked = ValueSelector.select_value_tokens(
            _tokens("Plusgiro", "1234567"), "Plusgiro"
        )
        picked_texts = [tok.text for tok in picked]
        assert picked_texts == ["1234567"]
class TestValueSelectorOcrField:
    """OCR reference number selection checks."""

    @staticmethod
    def _select(*texts):
        """Run OCR-field selection over tokens built from ``texts``."""
        return ValueSelector.select_value_tokens(_tokens(*texts), "OCR")

    def test_selects_longest_digit_sequence(self):
        """The ten-digit token is returned alone, without the 'OCR' label."""
        chosen = self._select("OCR", "1234567890")
        assert [tok.text for tok in chosen] == ["1234567890"]

    def test_selects_token_with_most_digits(self):
        """The long digit token is returned alone after two label tokens."""
        chosen = self._select("Ref", "nr", "94228110015950070")
        assert [tok.text for tok in chosen] == ["94228110015950070"]

    def test_ignores_single_digit_tokens(self):
        """Tokens with fewer than 2 digits are not OCR references."""
        chosen = self._select("OCR", "5")
        # Fallback: both input tokens come back since no valid OCR was found
        assert len(chosen) == 2

    def test_ocr_4_digit_token_selected(self):
        """4-digit OCR token should be selected."""
        chosen = self._select("OCR", "3046")
        assert [tok.text for tok in chosen] == ["3046"]

    def test_ocr_2_digit_token_selected(self):
        """2-digit OCR token should be selected."""
        chosen = self._select("OCR", "42")
        assert [tok.text for tok in chosen] == ["42"]
class TestValueSelectorInvoiceNumberField:
    """Tests for InvoiceNumber field value selection."""

    def test_removes_swedish_label_keywords(self):
        """The 'Fakturanummer' label is dropped; only the value token remains."""
        tokens = _tokens("Fakturanummer", "INV-2024-001")
        result = ValueSelector.select_value_tokens(tokens, "InvoiceNumber")
        assert len(result) == 1
        assert result[0].text == "INV-2024-001"

    def test_keeps_non_label_tokens(self):
        """From ('Nr', '12345') only the '12345' token is expected back."""
        tokens = _tokens("Nr", "12345")
        result = ValueSelector.select_value_tokens(tokens, "InvoiceNumber")
        assert len(result) == 1
        assert result[0].text == "12345"

    def test_multiple_value_tokens_kept(self):
        """Multiple non-label tokens are all kept."""
        tokens = _tokens("Fakturanr", "INV", "2024", "001")
        result = ValueSelector.select_value_tokens(tokens, "InvoiceNumber")
        # "Fakturanr" is a label keyword, the rest are values
        result_texts = [t.text for t in result]
        assert "Fakturanr" not in result_texts
        # Check every value token, not just the first one, so the test
        # actually verifies the "all kept" claim in its docstring.
        for expected in ("INV", "2024", "001"):
            assert expected in result_texts
class TestValueSelectorOrgNumberField:
    """supplier_org_number selection checks."""

    FIELD = "supplier_org_number"

    def test_selects_org_number_with_hyphen(self):
        """The hyphenated number is the single token returned."""
        candidates = _tokens("Org.nr", "556123-4567")
        chosen = ValueSelector.select_value_tokens(candidates, self.FIELD)
        assert [tok.text for tok in chosen] == ["556123-4567"]

    def test_selects_org_number_without_hyphen(self):
        """The ten-digit number without hyphen is the single token returned."""
        candidates = _tokens("Organisationsnummer", "5561234567")
        chosen = ValueSelector.select_value_tokens(candidates, self.FIELD)
        assert [tok.text for tok in chosen] == ["5561234567"]
class TestValueSelectorCustomerNumberField:
    """customer_number selection checks."""

    def test_removes_label_keeps_value(self):
        """The 'Kundnummer' label is dropped; only 'ABC-123' is returned."""
        label_and_value = _tokens("Kundnummer", "ABC-123")
        kept = ValueSelector.select_value_tokens(label_and_value, "customer_number")
        kept_texts = [tok.text for tok in kept]
        assert kept_texts == ["ABC-123"]
class TestValueSelectorPaymentLineField:
    """payment_line selection checks -- keeps all tokens."""

    def test_keeps_all_tokens(self):
        """The selector returns as many tokens as it was given."""
        raw_texts = (
            "#",
            "94228110015950070",
            "#",
            "15658",
            "00",
            "8",
            ">",
            "48666036#14#",
        )
        line_tokens = _tokens(*raw_texts)
        kept = ValueSelector.select_value_tokens(line_tokens, "payment_line")
        expected_count = len(line_tokens)
        assert len(kept) == expected_count
class TestValueSelectorFallback:
    """Tests for fallback behavior."""

    def test_unknown_field_returns_all_tokens(self):
        """A field name with no dedicated handling yields all three tokens."""
        tokens = _tokens("some", "unknown", "text")
        result = ValueSelector.select_value_tokens(tokens, "unknown_field")
        assert len(result) == 3

    def test_empty_tokens_returns_empty(self):
        """An empty token list yields an empty list."""
        result = ValueSelector.select_value_tokens([], "InvoiceDate")
        assert result == []

    def test_single_token_returns_it(self):
        """A lone token is returned as the single result."""
        tokens = _tokens("2024-01-15")
        result = ValueSelector.select_value_tokens(tokens, "InvoiceDate")
        assert len(result) == 1
        # Check the content too: a count alone would also pass if some
        # other token were returned in place of the input.
        assert result[0].text == "2024-01-15"

    def test_never_returns_empty_when_tokens_exist(self):
        """Fallback ensures we never lose data -- always return something."""
        tokens = _tokens("Fakturadatum", "unknown_format")
        result = ValueSelector.select_value_tokens(tokens, "InvoiceDate")
        assert len(result) > 0