""" Tests for ValueSelector -- field-aware OCR token selection. Verifies that ValueSelector picks the most likely value token(s) from OCR output, filtering out label text before sending to normalizer. """ import pytest from shared.ocr.paddle_ocr import OCRToken from backend.pipeline.value_selector import ValueSelector def _token(text: str) -> OCRToken: """Helper to create OCRToken with dummy bbox and confidence.""" return OCRToken(text=text, bbox=(0, 0, 100, 20), confidence=0.95) def _tokens(*texts: str) -> list[OCRToken]: """Helper to create multiple OCRTokens.""" return [_token(t) for t in texts] class TestValueSelectorDateFields: """Tests for date field value selection (InvoiceDate, InvoiceDueDate).""" def test_selects_iso_date_from_label_and_value(self): tokens = _tokens("Fakturadatum", "2024-01-15") result = ValueSelector.select_value_tokens(tokens, "InvoiceDate") assert len(result) == 1 assert result[0].text == "2024-01-15" def test_selects_dot_separated_date(self): tokens = _tokens("Datum", "2024.03.20") result = ValueSelector.select_value_tokens(tokens, "InvoiceDate") assert len(result) == 1 assert result[0].text == "2024.03.20" def test_selects_slash_separated_date(self): tokens = _tokens("Forfallodag", "15/01/2024") result = ValueSelector.select_value_tokens(tokens, "InvoiceDueDate") assert len(result) == 1 assert result[0].text == "15/01/2024" def test_selects_compact_date(self): tokens = _tokens("Datum", "20240115") result = ValueSelector.select_value_tokens(tokens, "InvoiceDate") assert len(result) == 1 assert result[0].text == "20240115" def test_fallback_when_no_date_pattern(self): """No date pattern found -> return all tokens.""" tokens = _tokens("Fakturadatum", "pending") result = ValueSelector.select_value_tokens(tokens, "InvoiceDate") assert len(result) == 2 class TestValueSelectorAmountField: """Tests for amount field value selection.""" def test_selects_amount_with_comma_decimal(self): tokens = _tokens("Belopp", "1 234,56", "kr") result = ValueSelector.select_value_tokens(tokens, "Amount") assert len(result) == 1 assert result[0].text == "1 234,56" def test_selects_amount_with_dot_decimal(self): tokens = _tokens("Summa", "1234.56") result = ValueSelector.select_value_tokens(tokens, "Amount") assert len(result) == 1 assert result[0].text == "1234.56" def test_selects_simple_amount(self): tokens = _tokens("Att", "betala", "500,00") result = ValueSelector.select_value_tokens(tokens, "Amount") assert len(result) == 1 assert result[0].text == "500,00" def test_selects_european_amount_with_dot_thousand(self): """European format: dot as thousand separator, comma as decimal.""" tokens = _tokens("Fakturabelopp:", "2.254,50 SEK") result = ValueSelector.select_value_tokens(tokens, "Amount") assert len(result) == 1 assert result[0].text == "2.254,50 SEK" def test_selects_european_amount_without_currency(self): """European format without currency suffix.""" tokens = _tokens("Belopp", "1.234,56") result = ValueSelector.select_value_tokens(tokens, "Amount") assert len(result) == 1 assert result[0].text == "1.234,56" def test_selects_amount_with_kr_suffix(self): """Amount with 'kr' currency suffix.""" tokens = _tokens("Summa", "20.485,00 kr") result = ValueSelector.select_value_tokens(tokens, "Amount") assert len(result) == 1 assert result[0].text == "20.485,00 kr" def test_selects_anglo_amount_with_sek(self): """Anglo format with SEK suffix.""" tokens = _tokens("Amount", "1,234.56 SEK") result = ValueSelector.select_value_tokens(tokens, "Amount") assert len(result) == 1 assert result[0].text == "1,234.56 SEK" def test_fallback_when_no_amount_pattern(self): tokens = _tokens("Belopp", "TBD") result = ValueSelector.select_value_tokens(tokens, "Amount") assert len(result) == 2 class TestValueSelectorBankgiroField: """Tests for Bankgiro field value selection.""" def test_selects_hyphenated_bankgiro(self): tokens = _tokens("BG:", "123-4567") result = ValueSelector.select_value_tokens(tokens, "Bankgiro") assert len(result) == 1 assert result[0].text == "123-4567" def test_selects_bankgiro_digits(self): tokens = _tokens("Bankgiro", "1234567") result = ValueSelector.select_value_tokens(tokens, "Bankgiro") assert len(result) == 1 assert result[0].text == "1234567" def test_selects_eight_digit_bankgiro(self): tokens = _tokens("Bankgiro:", "12345678") result = ValueSelector.select_value_tokens(tokens, "Bankgiro") assert len(result) == 1 assert result[0].text == "12345678" class TestValueSelectorPlusgiroField: """Tests for Plusgiro field value selection.""" def test_selects_hyphenated_plusgiro(self): tokens = _tokens("PG:", "12345-6") result = ValueSelector.select_value_tokens(tokens, "Plusgiro") assert len(result) == 1 assert result[0].text == "12345-6" def test_selects_plusgiro_digits(self): tokens = _tokens("Plusgiro", "1234567") result = ValueSelector.select_value_tokens(tokens, "Plusgiro") assert len(result) == 1 assert result[0].text == "1234567" class TestValueSelectorOcrField: """Tests for OCR reference number field value selection.""" def test_selects_longest_digit_sequence(self): tokens = _tokens("OCR", "1234567890") result = ValueSelector.select_value_tokens(tokens, "OCR") assert len(result) == 1 assert result[0].text == "1234567890" def test_selects_token_with_most_digits(self): tokens = _tokens("Ref", "nr", "94228110015950070") result = ValueSelector.select_value_tokens(tokens, "OCR") assert len(result) == 1 assert result[0].text == "94228110015950070" def test_ignores_single_digit_tokens(self): """Tokens with fewer than 2 digits are not OCR references.""" tokens = _tokens("OCR", "5") result = ValueSelector.select_value_tokens(tokens, "OCR") # Fallback: return all tokens since no valid OCR found assert len(result) == 2 def test_ocr_4_digit_token_selected(self): """4-digit OCR token should be selected.""" tokens = _tokens("OCR", "3046") result = ValueSelector.select_value_tokens(tokens, "OCR") assert len(result) == 1 assert result[0].text == "3046" def test_ocr_2_digit_token_selected(self): """2-digit OCR token should be selected.""" tokens = _tokens("OCR", "42") result = ValueSelector.select_value_tokens(tokens, "OCR") assert len(result) == 1 assert result[0].text == "42" class TestValueSelectorInvoiceNumberField: """Tests for InvoiceNumber field value selection.""" def test_removes_swedish_label_keywords(self): tokens = _tokens("Fakturanummer", "INV-2024-001") result = ValueSelector.select_value_tokens(tokens, "InvoiceNumber") assert len(result) == 1 assert result[0].text == "INV-2024-001" def test_keeps_non_label_tokens(self): tokens = _tokens("Nr", "12345") result = ValueSelector.select_value_tokens(tokens, "InvoiceNumber") assert len(result) == 1 assert result[0].text == "12345" def test_multiple_value_tokens_kept(self): """Multiple non-label tokens are all kept.""" tokens = _tokens("Fakturanr", "INV", "2024", "001") result = ValueSelector.select_value_tokens(tokens, "InvoiceNumber") # "Fakturanr" is a label keyword, the rest are values result_texts = [t.text for t in result] assert "Fakturanr" not in result_texts assert "INV" in result_texts class TestValueSelectorOrgNumberField: """Tests for supplier_org_number field value selection.""" def test_selects_org_number_with_hyphen(self): tokens = _tokens("Org.nr", "556123-4567") result = ValueSelector.select_value_tokens(tokens, "supplier_org_number") assert len(result) == 1 assert result[0].text == "556123-4567" def test_selects_org_number_without_hyphen(self): tokens = _tokens("Organisationsnummer", "5561234567") result = ValueSelector.select_value_tokens(tokens, "supplier_org_number") assert len(result) == 1 assert result[0].text == "5561234567" class TestValueSelectorCustomerNumberField: """Tests for customer_number field value selection.""" def test_removes_label_keeps_value(self): tokens = _tokens("Kundnummer", "ABC-123") result = ValueSelector.select_value_tokens(tokens, "customer_number") assert len(result) == 1 assert result[0].text == "ABC-123" class TestValueSelectorPaymentLineField: """Tests for payment_line field -- keeps all tokens.""" def test_keeps_all_tokens(self): tokens = _tokens("#", "94228110015950070", "#", "15658", "00", "8", ">", "48666036#14#") result = ValueSelector.select_value_tokens(tokens, "payment_line") assert len(result) == len(tokens) class TestValueSelectorFallback: """Tests for fallback behavior.""" def test_unknown_field_returns_all_tokens(self): tokens = _tokens("some", "unknown", "text") result = ValueSelector.select_value_tokens(tokens, "unknown_field") assert len(result) == 3 def test_empty_tokens_returns_empty(self): result = ValueSelector.select_value_tokens([], "InvoiceDate") assert result == [] def test_single_token_returns_it(self): tokens = _tokens("2024-01-15") result = ValueSelector.select_value_tokens(tokens, "InvoiceDate") assert len(result) == 1 def test_never_returns_empty_when_tokens_exist(self): """Fallback ensures we never lose data -- always return something.""" tokens = _tokens("Fakturadatum", "unknown_format") result = ValueSelector.select_value_tokens(tokens, "InvoiceDate") assert len(result) > 0