WIP
This commit is contained in:
@@ -100,6 +100,22 @@ class TestInvoiceNumberNormalizer:
|
||||
result = normalizer.normalize("Invoice 54321 OCR 12345678901234")
|
||||
assert result.value == "54321"
|
||||
|
||||
def test_year_not_extracted_when_real_number_exists(self, normalizer):
    """A trailing 4-digit year must not displace the actual invoice number."""
    text = "Faktura 12345 Datum 2025"
    outcome = normalizer.normalize(text)
    # "2025" is the year; "12345" is the number we expect back.
    assert outcome.value == "12345"
|
||||
|
||||
def test_year_2026_not_extracted(self, normalizer):
    """The year '2026' must lose to the invoice number '54321' in the same text."""
    expected = "54321"
    normalized = normalizer.normalize("Invoice 54321 Date 2026")
    assert normalized.value == expected
|
||||
|
||||
def test_non_year_4_digit_still_matches(self, normalizer):
    """A 4-digit number that is not a year ('3456') is still extracted and valid."""
    outcome = normalizer.normalize("Invoice 3456")
    # Check the extracted value first, then the validity flag.
    assert outcome.value == "3456"
    assert outcome.is_valid is True
|
||||
|
||||
def test_fallback_extraction(self, normalizer):
|
||||
"""Test fallback to digit extraction."""
|
||||
# This matches Pattern 3 (short digit sequence 3-10 digits)
|
||||
@@ -107,6 +123,16 @@ class TestInvoiceNumberNormalizer:
|
||||
assert result.value == "123"
|
||||
assert result.is_valid is True
|
||||
|
||||
def test_amount_fragment_not_selected(self, normalizer):
    """The '775' fragment of the amount '9 775,96' must not beat the real number."""
    ocr_text = "9 775,96 Belopp Kontoutdragsnr 04862823"
    extracted = normalizer.normalize(ocr_text).value
    assert extracted == "04862823"
|
||||
|
||||
def test_prefer_medium_length_over_shortest(self, normalizer):
    """A 5-digit candidate ('12345') should be chosen over a 3-digit one ('999')."""
    text = "Ref 999 Fakturanr 12345"
    outcome = normalizer.normalize(text)
    assert outcome.value == "12345"
|
||||
|
||||
def test_no_valid_sequence(self, normalizer):
|
||||
"""Test failure when no valid sequence found."""
|
||||
result = normalizer.normalize("no numbers here")
|
||||
@@ -134,8 +160,21 @@ class TestOcrNumberNormalizer:
|
||||
assert result.value == "310196187399952"
|
||||
assert " " not in result.value
|
||||
|
||||
def test_4_digit_ocr_valid(self, normalizer):
    """A 4-digit OCR number such as '3046' is accepted and returned unchanged."""
    ocr_number = "3046"
    outcome = normalizer.normalize(ocr_number)
    assert outcome.is_valid is True
    # The normalized value is expected to equal the input digits.
    assert outcome.value == ocr_number
|
||||
|
||||
def test_2_digit_ocr_valid(self, normalizer):
    """A 2-digit OCR number ('42') is accepted and returned unchanged."""
    ocr_number = "42"
    outcome = normalizer.normalize(ocr_number)
    assert outcome.is_valid is True
    # The normalized value is expected to equal the input digits.
    assert outcome.value == ocr_number
|
||||
|
||||
def test_too_short(self, normalizer):
    """Single-digit OCR should be rejected."""
    # Two- and four-digit inputs are expected to be valid (see
    # test_2_digit_ocr_valid / test_4_digit_ocr_valid), so a single digit
    # is the too-short case exercised here.
    result = normalizer.normalize("5")
    assert result.is_valid is False
|
||||
|
||||
def test_empty_string(self, normalizer):
|
||||
@@ -477,6 +516,38 @@ class TestAmountNormalizer:
|
||||
assert result.value == "100.00"
|
||||
assert result.is_valid is True
|
||||
|
||||
def test_astronomical_amount_rejected(self, normalizer):
    """An IBAN string must not be turned into an astronomical amount (>10M).

    The result may be invalid; if it is valid, its value has to stay
    below 10,000,000.
    """
    # The IBAN contains long digit runs; per the original note, the
    # standalone fallback pattern should not extract these as amounts.
    iban = "SE14120000001201138650"
    outcome = normalizer.normalize(iban)
    # Either the input is rejected outright, or the parsed amount is small.
    assert not outcome.is_valid or float(outcome.value) < 10_000_000
|
||||
|
||||
def test_large_valid_amount_accepted(self, normalizer):
    """A large but legitimate amount ('108000,00') is accepted as '108000.00'."""
    raw_amount = "108000,00"
    outcome = normalizer.normalize(raw_amount)
    # Value is checked before the validity flag.
    assert outcome.value == "108000.00"
    assert outcome.is_valid is True
|
||||
|
||||
def test_standalone_iban_digits_rejected(self, normalizer):
    """A very long bare digit run (IBAN fragment) must not yield an amount >10M.

    The result may be invalid; if it is valid, its value has to stay
    below 10,000,000.
    """
    digits = "1036149234823114"
    outcome = normalizer.normalize(digits)
    # Either the input is rejected outright, or the parsed amount is small.
    assert not outcome.is_valid or float(outcome.value) < 10_000_000
|
||||
|
||||
def test_main_pattern_rejects_over_10m(self, normalizer):
    """A labelled amount above 10M (e.g. IBAN-like digits) must not be accepted.

    Passes when the result is invalid, or when a valid result carries a
    value below 10,000,000.
    """
    outcome = normalizer.normalize("Belopp 81648164,00 kr")
    # 81648164.00 is above 10M, so the normalizer is expected to reject it.
    rejected = not outcome.is_valid
    assert rejected or float(outcome.value) < 10_000_000
|
||||
|
||||
def test_main_pattern_accepts_under_10m(self, normalizer):
    """A labelled amount below 10M ('999999,99') is parsed and accepted."""
    text = "Summa 999999,99 kr"
    outcome = normalizer.normalize(text)
    # Value is checked before the validity flag.
    assert outcome.value == "999999.99"
    assert outcome.is_valid is True
|
||||
|
||||
|
||||
class TestEnhancedAmountNormalizer:
|
||||
"""Tests for EnhancedAmountNormalizer."""
|
||||
|
||||
Reference in New Issue
Block a user