This commit is contained in:
Yaojia Wang
2026-02-12 23:06:00 +01:00
parent ad5ed46b4c
commit 58d36c8927
26 changed files with 3903 additions and 2551 deletions

View File

@@ -100,6 +100,22 @@ class TestInvoiceNumberNormalizer:
result = normalizer.normalize("Invoice 54321 OCR 12345678901234")
assert result.value == "54321"
def test_year_not_extracted_when_real_number_exists(self, normalizer):
    """A trailing 4-digit year must not displace the real invoice number.

    The input contains both "12345" and the year "2025"; the normalized
    value is expected to be "12345".
    """
    text = "Faktura 12345 Datum 2025"
    outcome = normalizer.normalize(text)
    assert outcome.value == "12345"
def test_year_2026_not_extracted(self, normalizer):
    """The year "2026" must lose to the invoice number "54321"."""
    text = "Invoice 54321 Date 2026"
    outcome = normalizer.normalize(text)
    assert outcome.value == "54321"
def test_non_year_4_digit_still_matches(self, normalizer):
    """A 4-digit number that is not a year is still picked up and valid."""
    text = "Invoice 3456"
    outcome = normalizer.normalize(text)
    # Check the extracted value first, then the validity flag.
    assert outcome.value == "3456"
    assert outcome.is_valid is True
def test_fallback_extraction(self, normalizer):
"""Test fallback to digit extraction."""
# This matches Pattern 3 (short digit sequence 3-10 digits)
@@ -107,6 +123,16 @@ class TestInvoiceNumberNormalizer:
assert result.value == "123"
assert result.is_valid is True
def test_amount_fragment_not_selected(self, normalizer):
    """The "775" fragment of the amount "9 775,96" must not be chosen.

    The expected value is the longer number "04862823" that follows the
    amount in the same input string.
    """
    text = "9 775,96 Belopp Kontoutdragsnr 04862823"
    outcome = normalizer.normalize(text)
    assert outcome.value == "04862823"
def test_prefer_medium_length_over_shortest(self, normalizer):
    """A 5-digit sequence is preferred over a 3-digit one in the same text."""
    text = "Ref 999 Fakturanr 12345"
    outcome = normalizer.normalize(text)
    assert outcome.value == "12345"
def test_no_valid_sequence(self, normalizer):
"""Test failure when no valid sequence found."""
result = normalizer.normalize("no numbers here")
@@ -134,8 +160,21 @@ class TestOcrNumberNormalizer:
assert result.value == "310196187399952"
assert " " not in result.value
def test_4_digit_ocr_valid(self, normalizer):
    """A 4-digit OCR number such as "3046" is accepted unchanged."""
    ocr_text = "3046"
    outcome = normalizer.normalize(ocr_text)
    # Validity is checked before the value, as in the original ordering.
    assert outcome.is_valid is True
    assert outcome.value == "3046"
def test_2_digit_ocr_valid(self, normalizer):
    """A 2-digit OCR number ("42") is accepted unchanged."""
    ocr_text = "42"
    outcome = normalizer.normalize(ocr_text)
    # Validity is checked before the value, as in the original ordering.
    assert outcome.is_valid is True
    assert outcome.value == "42"
def test_too_short(self, normalizer):
    """Single-digit OCR should be rejected."""
    # Only the single-digit input is exercised here. A leftover
    # normalize("1234") call used to precede the docstring; its result was
    # discarded, and 4-digit input is expected to be valid elsewhere in
    # this suite, so it did not belong in a "too short" test.
    result = normalizer.normalize("5")
    assert result.is_valid is False
def test_empty_string(self, normalizer):
@@ -477,6 +516,38 @@ class TestAmountNormalizer:
assert result.value == "100.00"
assert result.is_valid is True
def test_astronomical_amount_rejected(self, normalizer):
    """IBAN digits must not turn into an astronomical amount (>10M).

    The test passes either when the result is invalid or when the
    extracted value stays below ten million.
    """
    # The IBAN "SE14120000001201138650" contains long digit sequences;
    # the standalone fallback pattern should not extract them as amounts.
    iban = "SE14120000001201138650"
    outcome = normalizer.normalize(iban)
    assert not outcome.is_valid or float(outcome.value) < 10_000_000
def test_large_valid_amount_accepted(self, normalizer):
    """A legitimately large amount ("108000,00") is accepted as "108000.00"."""
    raw_amount = "108000,00"
    outcome = normalizer.normalize(raw_amount)
    # Check the normalized value first, then the validity flag.
    assert outcome.value == "108000.00"
    assert outcome.is_valid is True
def test_standalone_iban_digits_rejected(self, normalizer):
    """A very long digit run (IBAN fragment) must not yield more than 10M.

    The test passes either when the result is invalid or when the
    extracted value stays below ten million.
    """
    digits = "1036149234823114"
    outcome = normalizer.normalize(digits)
    assert not outcome.is_valid or float(outcome.value) < 10_000_000
def test_main_pattern_rejects_over_10m(self, normalizer):
    """Amounts above 10M on the main regex path should be rejected.

    The input resembles IBAN-like digits written as an amount.
    """
    text = "Belopp 81648164,00 kr"
    outcome = normalizer.normalize(text)
    # 81648164.00 exceeds 10M: the result must be invalid, or at least
    # carry a value below that ceiling.
    ceiling = 10_000_000
    assert not outcome.is_valid or float(outcome.value) < ceiling
def test_main_pattern_accepts_under_10m(self, normalizer):
    """Amounts below 10M on the main regex path should be accepted."""
    text = "Summa 999999,99 kr"
    outcome = normalizer.normalize(text)
    # Check the normalized value first, then the validity flag.
    assert outcome.value == "999999.99"
    assert outcome.is_valid is True
class TestEnhancedAmountNormalizer:
"""Tests for EnhancedAmountNormalizer."""