Re-structure the project.

This commit is contained in:
Yaojia Wang
2026-01-25 15:21:11 +01:00
parent 8fd61ea928
commit e599424a92
80 changed files with 10672 additions and 1584 deletions

View File

@@ -0,0 +1,282 @@
"""
Tests for payment line parser.
"""
import pytest
import sys
from pathlib import Path
# Prepend the parent of this file's directory to sys.path so it is searched
# first when resolving imports.
# NOTE(review): presumably that directory is the repository root containing
# the `src` package imported below -- confirm against the project layout.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
from src.inference.payment_line_parser import PaymentLineParser, PaymentLineData
class TestPaymentLineParser:
    """Parsing and formatting checks for ``PaymentLineParser``."""

    @pytest.fixture
    def parser(self):
        """Provide a newly constructed ``PaymentLineParser``."""
        return PaymentLineParser()

    def test_parse_full_format_with_amount(self, parser):
        """A complete line exposes every field and reports the "full" method."""
        parsed = parser.parse("# 94228110015950070 # 15658 00 8 > 48666036#14#")

        assert parsed.is_valid
        assert parsed.ocr_number == "94228110015950070"
        assert parsed.amount == "15658.00"
        assert parsed.account_number == "48666036"
        assert parsed.record_type == "8"
        assert parsed.check_digits == "14"
        assert parsed.parse_method == "full"

    def test_parse_with_spaces_in_amount(self, parser):
        """Spaces scattered through the amount digits are collapsed."""
        parsed = parser.parse("# 11000770600242 # 12 0 0 00 5 > 3082963#41#")

        assert parsed.is_valid
        assert parsed.ocr_number == "11000770600242"
        # "12 0 0 00" must come back as a single decimal amount.
        assert parsed.amount == "1200.00"
        assert parsed.account_number == "3082963"
        assert parsed.record_type == "5"
        assert parsed.check_digits == "41"

    def test_parse_with_spaces_in_check_digits(self, parser):
        """A trailing ``#41 #`` is accepted in place of ``#41#``."""
        parsed = parser.parse("# 6026726908 # 736 00 9 > 5692041 #41 #")

        assert parsed.is_valid
        assert parsed.ocr_number == "6026726908"
        assert parsed.amount == "736.00"
        assert parsed.account_number == "5692041"
        assert parsed.check_digits == "41"

    def test_parse_without_greater_than_symbol(self, parser):
        """The line still parses when the ``>`` separator is absent."""
        parsed = parser.parse("# 11000770600242 # 1200 00 5 3082963#41#")

        assert parsed.is_valid
        assert parsed.ocr_number == "11000770600242"
        assert parsed.amount == "1200.00"
        assert parsed.account_number == "3082963"

    def test_parse_format_without_amount(self, parser):
        """A line lacking the amount section gives ``amount is None``."""
        parsed = parser.parse("# 11000770600242 # > 3082963#41#")

        assert parsed.is_valid
        assert parsed.ocr_number == "11000770600242"
        assert parsed.amount is None
        assert parsed.account_number == "3082963"
        assert parsed.check_digits == "41"
        assert parsed.parse_method == "no_amount"

    def test_parse_account_only_format(self, parser):
        """An account-only line is valid but flagged as partial in ``error``."""
        parsed = parser.parse("> 3082963#41#")

        assert parsed.is_valid
        assert parsed.ocr_number == ""
        assert parsed.amount is None
        assert parsed.account_number == "3082963"
        assert parsed.check_digits == "41"
        assert parsed.parse_method == "account_only"
        assert "Partial" in parsed.error

    def test_parse_invalid_format(self, parser):
        """Free text that is not a payment line is rejected with an error."""
        parsed = parser.parse("This is not a payment line")

        assert not parsed.is_valid
        assert parsed.error is not None
        assert "No valid payment line format" in parsed.error

    def test_parse_empty_text(self, parser):
        """An empty string produces the dedicated empty-input error."""
        parsed = parser.parse("")

        assert not parsed.is_valid
        assert parsed.error == "Empty payment line text"

    def test_format_machine_readable_full(self, parser):
        """Fully populated data renders OCR, amount/record type and account parts."""
        payload = PaymentLineData(
            ocr_number="94228110015950070",
            amount="15658.00",
            account_number="48666036",
            record_type="8",
            check_digits="14",
            raw_text="original",
            is_valid=True,
        )

        rendered = parser.format_machine_readable(payload)

        assert "# 94228110015950070 #" in rendered
        assert "15658 00 8" in rendered
        assert "48666036#14#" in rendered

    def test_format_machine_readable_no_amount(self, parser):
        """Data without an amount still renders the OCR and account parts."""
        payload = PaymentLineData(
            ocr_number="11000770600242",
            amount=None,
            account_number="3082963",
            record_type=None,
            check_digits="41",
            raw_text="original",
            is_valid=True,
        )

        rendered = parser.format_machine_readable(payload)

        assert "# 11000770600242 #" in rendered
        assert "3082963#41#" in rendered

    def test_format_machine_readable_account_only(self, parser):
        """Data with only an account renders the ``> account#check#`` tail."""
        payload = PaymentLineData(
            ocr_number="",
            amount=None,
            account_number="3082963",
            record_type=None,
            check_digits="41",
            raw_text="original",
            is_valid=True,
        )

        rendered = parser.format_machine_readable(payload)

        assert "> 3082963#41#" in rendered

    def test_format_for_field_extractor_valid(self, parser):
        """Valid parsed data yields a formatted string and a true validity flag."""
        parsed = parser.parse("# 6026726908 # 736 00 9 > 5692041#41#")

        # The third element (error) is not inspected by this test.
        rendered, ok, _error = parser.format_for_field_extractor(parsed)

        assert ok
        assert rendered is not None
        assert "# 6026726908 #" in rendered
        assert "736 00" in rendered

    def test_format_for_field_extractor_invalid(self, parser):
        """Invalid parsed data yields no formatted string and an error."""
        parsed = parser.parse("invalid payment line")

        rendered, ok, failure = parser.format_for_field_extractor(parsed)

        assert not ok
        assert rendered is None
        assert failure is not None
class TestRealWorldExamples:
    """Regression checks built from payment lines seen in real documents."""

    @pytest.fixture
    def parser(self):
        """Provide a newly constructed ``PaymentLineParser``."""
        return PaymentLineParser()

    def test_billo310_payment_line(self, parser):
        """Billo310 line from the issue report: amount must be 736.00."""
        # This line previously triggered the Amount extraction issue.
        parsed = parser.parse("# 6026726908 # 736 00 9 > 5692041 #41 #")

        assert parsed.is_valid
        assert parsed.amount == "736.00"
        assert parsed.account_number == "5692041"

    def test_billo363_payment_line(self, parser):
        """Billo363 line: spaced amount digits and no ``>`` separator."""
        parsed = parser.parse("# 11000770600242 # 12 0 0 00 5 3082963#41#")

        assert parsed.is_valid
        assert parsed.amount == "1200.00"
        assert parsed.ocr_number == "11000770600242"

    def test_payment_line_with_spaces_in_account(self, parser):
        """Spaces between account digits are stripped from the result."""
        parsed = parser.parse(
            "# 94228110015950070 # 15658 00 8 > 4 8 6 6 6 0 3 6#14#"
        )

        assert parsed.is_valid
        assert parsed.account_number == "48666036"

    def test_multiple_spaces_in_amounts(self, parser):
        """An amount with a space between every digit is still reassembled."""
        parsed = parser.parse("# 11000770600242 # 1 2 0 0 00 5 > 3082963#41#")

        assert parsed.is_valid
        assert parsed.amount == "1200.00"
class TestEdgeCases:
    """Boundary inputs and error conditions for ``PaymentLineParser.parse``."""

    @pytest.fixture
    def parser(self):
        """Provide a newly constructed ``PaymentLineParser``."""
        return PaymentLineParser()

    def test_very_long_ocr_number(self, parser):
        """A 30-digit OCR number is returned unchanged."""
        parsed = parser.parse(
            "# 123456789012345678901234567890 # 1000 00 5 > 3082963#41#"
        )

        assert parsed.is_valid
        assert parsed.ocr_number == "123456789012345678901234567890"

    def test_zero_amount(self, parser):
        """A zero amount is reported as "0.00"."""
        parsed = parser.parse("# 11000770600242 # 0 00 5 > 3082963#41#")

        assert parsed.is_valid
        assert parsed.amount == "0.00"

    def test_large_amount(self, parser):
        """A six-digit amount with 99 in the decimal part is reported exactly."""
        parsed = parser.parse("# 11000770600242 # 999999 99 5 > 3082963#41#")

        assert parsed.is_valid
        assert parsed.amount == "999999.99"

    def test_text_with_extra_characters(self, parser):
        """The payment line is found even when surrounded by other text."""
        parsed = parser.parse(
            "Some text before # 6026726908 # 736 00 9 > 5692041#41# and after"
        )

        assert parsed.is_valid
        assert parsed.amount == "736.00"

    def test_none_input(self, parser):
        """``None`` is reported as invalid with an error set."""
        parsed = parser.parse(None)

        assert not parsed.is_valid
        assert parsed.error is not None

    def test_whitespace_only(self, parser):
        """Whitespace-only input is reported with an "Empty" error."""
        parsed = parser.parse(" \t\n ")

        assert not parsed.is_valid
        assert "Empty" in parsed.error