"""
Field Validators Module

Provides validation functions for Swedish invoice fields.
Used by both inference (to validate extracted values) and matching (to filter candidates).
"""

import math
import re
from datetime import datetime
from typing import Optional

from .text_cleaner import TextCleaner


class FieldValidators:
    """
    Validators for Swedish invoice field values.

    Includes:
    - Luhn (Mod10) checksum validation
    - Format validation for specific field types
    - Range validation for dates and amounts

    Every public method is a classmethod; the class keeps no state.
    Digit extraction and text normalisation are delegated to ``TextCleaner``.
    """

    # =========================================================================
    # Luhn (Mod10) Checksum
    # =========================================================================

    @staticmethod
    def _luhn_sum(digits: str, double_rightmost: bool) -> int:
        """
        Return the Luhn digit sum of a digit string.

        Walking right to left, every second digit is doubled and 9 is
        subtracted from any doubled value above 9.

        Args:
            digits: String consisting only of decimal digits.
            double_rightmost: If True, doubling starts at the rightmost digit
                (used when a check digit is still to be appended). If False,
                doubling starts at the second digit from the right (used when
                the check digit is already the last digit).

        Returns:
            The accumulated sum (0 for an empty string).

        Raises:
            ValueError: If a character cannot be converted with ``int()``.
        """
        total = 0
        double = double_rightmost
        for char in reversed(digits):
            digit = int(char)
            if double:
                digit *= 2
                if digit > 9:
                    digit -= 9
            total += digit
            double = not double
        return total

    @classmethod
    def luhn_checksum(cls, digits: str) -> bool:
        """
        Validate using Luhn (Mod10) algorithm.

        Used for:
        - Bankgiro numbers
        - Plusgiro numbers
        - OCR reference numbers
        - Swedish organization numbers

        The input is first reduced to its digits (without OCR correction).
        The checksum is valid if the total modulo 10 equals 0.

        Returns:
            True if the digit string passes the check; False if it fails or
            if no digits remain after extraction.
        """
        # Keep digits only.
        digits = TextCleaner.extract_digits(digits, apply_ocr_correction=False)

        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as superscripts that int() cannot convert.
        if not digits or not digits.isdecimal():
            return False

        # The last digit is the check digit, so doubling starts one position
        # to its left.
        return cls._luhn_sum(digits, double_rightmost=False) % 10 == 0

    @classmethod
    def calculate_luhn_check_digit(cls, digits: str) -> int:
        """
        Calculate the Luhn check digit for a number.

        Given a number without check digit, returns the digit that would make
        it valid. An input without any digits yields 0.
        """
        digits = TextCleaner.extract_digits(digits, apply_ocr_correction=False)

        # A check digit will be appended on the right, so the current
        # rightmost digit is the first one to be doubled.
        total = cls._luhn_sum(digits, double_rightmost=True)

        # Smallest digit that brings the total up to a multiple of 10.
        return (10 - (total % 10)) % 10

    # =========================================================================
    # Organisation Number Validation
    # =========================================================================

    @classmethod
    def is_valid_organisation_number(cls, value: str) -> bool:
        """
        Validate Swedish organisation number.

        Format: NNNNNN-NNNN (10 digits)
        - First digit: 1-9
        - Last digit: Luhn check digit

        VAT-style inputs are reduced to the 10-digit core before checking:
        - 12 digits ending in "01": the first 10 digits are used
        - 14 digits starting with "46" and ending in "01": digits 3-12 are used

        The "third digit >= 2" rule (which distinguishes organisation numbers
        from personal numbers) is deliberately NOT enforced, because some
        special organisations do not follow it.
        """
        digits = TextCleaner.extract_digits(value, apply_ocr_correction=True)

        # Strip VAT decorations down to the 10-digit organisation number.
        if len(digits) == 12 and digits.endswith('01'):
            digits = digits[:10]
        elif len(digits) == 14 and digits.startswith('46') and digits.endswith('01'):
            digits = digits[2:12]

        if len(digits) != 10:
            return False

        # First digit must be 1-9.
        if digits[0] == '0':
            return False

        return cls.luhn_checksum(digits)

    # =========================================================================
    # Bankgiro Validation
    # =========================================================================

    @classmethod
    def is_valid_bankgiro(cls, value: str) -> bool:
        """
        Validate Swedish Bankgiro number.

        Format: 7 or 8 digits with Luhn checksum
        """
        digits = TextCleaner.extract_digits(value, apply_ocr_correction=True)

        if not 7 <= len(digits) <= 8:
            return False

        return cls.luhn_checksum(digits)

    @classmethod
    def format_bankgiro(cls, value: str) -> Optional[str]:
        """
        Format Bankgiro number to standard format.

        Only the digit count is checked here; the Luhn checksum is not
        (use ``is_valid_bankgiro`` for that).

        Returns: XXX-XXXX (7 digits) or XXXX-XXXX (8 digits), or None if the
        value does not contain exactly 7 or 8 digits
        """
        digits = TextCleaner.extract_digits(value, apply_ocr_correction=True)

        if len(digits) == 7:
            return f"{digits[:3]}-{digits[3:]}"
        if len(digits) == 8:
            return f"{digits[:4]}-{digits[4:]}"
        return None

    # =========================================================================
    # Plusgiro Validation
    # =========================================================================

    @classmethod
    def is_valid_plusgiro(cls, value: str) -> bool:
        """
        Validate Swedish Plusgiro number.

        Format: 2-8 digits with Luhn checksum
        """
        digits = TextCleaner.extract_digits(value, apply_ocr_correction=True)

        if not 2 <= len(digits) <= 8:
            return False

        return cls.luhn_checksum(digits)

    @classmethod
    def format_plusgiro(cls, value: str) -> Optional[str]:
        """
        Format Plusgiro number to standard format.

        Only the digit count is checked here; the Luhn checksum is not
        (use ``is_valid_plusgiro`` for that).

        Returns: XXXXXXX-X format (a dash before the last digit), or None if
        the value does not contain 2-8 digits
        """
        digits = TextCleaner.extract_digits(value, apply_ocr_correction=True)

        if not 2 <= len(digits) <= 8:
            return None

        return f"{digits[:-1]}-{digits[-1]}"

    # =========================================================================
    # OCR Number Validation
    # =========================================================================

    @classmethod
    def is_valid_ocr_number(cls, value: str, validate_checksum: bool = True) -> bool:
        """
        Validate Swedish OCR reference number.

        - Accepted length: 5-25 digits
        - Usually has Luhn checksum (but not always enforced); pass
          ``validate_checksum=False`` to check the length only
        """
        digits = TextCleaner.extract_digits(value, apply_ocr_correction=True)

        if not 5 <= len(digits) <= 25:
            return False

        if validate_checksum:
            return cls.luhn_checksum(digits)

        return True

    # =========================================================================
    # Amount Validation
    # =========================================================================

    @classmethod
    def is_valid_amount(cls, value: str, min_amount: float = 0.0, max_amount: float = 10_000_000.0) -> bool:
        """
        Validate monetary amount.

        The value is parsed with ``parse_amount`` so that validation and
        parsing always agree on how separators are interpreted.

        - Must parse to a finite number
        - Must lie within [min_amount, max_amount] (both bounds inclusive)
        """
        amount = cls.parse_amount(value)
        if amount is None:
            return False
        return min_amount <= amount <= max_amount

    @classmethod
    def parse_amount(cls, value: str) -> Optional[float]:
        """
        Parse amount from string, handling various formats.

        Recognised formats (after normalisation and removal of spaces):
        - Swedish/German: comma is the decimal separator, dots group thousands
        - US: dot is the decimal separator, commas group thousands
        - Anything else: commas are treated as dots and only the last dot is
          kept as the decimal separator

        Returns float, or None if parsing fails or the result is not a finite
        number (NaN / infinity).
        """
        try:
            text = TextCleaner.normalize_amount_text(value)
            text = text.replace(' ', '')

            if re.match(r'^[\d.]+,\d{1,2}$', text):
                # Swedish/German format: comma is the decimal separator.
                text = text.replace('.', '').replace(',', '.')
            elif re.match(r'^[\d,]+\.\d{1,2}$', text):
                # US format: dot is the decimal separator.
                text = text.replace(',', '')
            else:
                # Simple/ambiguous format: unify on dots, keep only the last.
                text = text.replace(',', '.')
                if text.count('.') > 1:
                    whole, fraction = text.rsplit('.', 1)
                    text = whole.replace('.', '') + '.' + fraction

            amount = float(text)
        except (ValueError, TypeError):
            return None

        # float() accepts "nan" and "inf"; neither is a meaningful amount.
        return amount if math.isfinite(amount) else None

    # =========================================================================
    # Date Validation
    # =========================================================================

    @classmethod
    def is_valid_date(cls, value: str, min_year: int = 2000, max_year: int = 2100) -> bool:
        """
        Validate date string.

        - Year must be within [min_year, max_year]
        - Month 1-12
        - Day 1-31, and the day must exist in that month and year
        """
        parsed = cls.parse_date(value)
        if parsed is None:
            return False

        year, month, day = parsed
        # Cheap range checks first.
        if not (min_year <= year <= max_year and 1 <= month <= 12 and 1 <= day <= 31):
            return False

        # Exact calendar check (rejects e.g. 31 April or 29 Feb in a non-leap year).
        try:
            datetime(year, month, day)
        except ValueError:
            return False
        return True

    @classmethod
    def parse_date(cls, value: str) -> Optional[tuple[int, int, int]]:
        """
        Parse date from string.

        Delegates to ``FormatVariants._parse_date``.

        Returns (year, month, day) tuple or None.
        """
        # Imported at call time rather than at module level.
        from .format_variants import FormatVariants

        return FormatVariants._parse_date(value)

    @classmethod
    def format_date_iso(cls, value: str) -> Optional[str]:
        """
        Format date to ISO format (YYYY-MM-DD).

        The year is zero-padded to four digits and month/day to two digits.
        No calendar validation is done here; use ``is_valid_date`` for that.

        Returns formatted string or None if parsing fails.
        """
        parsed = cls.parse_date(value)
        if parsed is None:
            return None

        year, month, day = parsed
        return f"{year:04d}-{month:02d}-{day:02d}"

    # =========================================================================
    # Invoice Number Validation
    # =========================================================================

    @classmethod
    def is_valid_invoice_number(cls, value: str, min_length: int = 1, max_length: int = 30) -> bool:
        """
        Validate invoice number.

        Basic validation - just length check since invoice numbers are highly
        variable. Only ASCII letters and digits count towards the length.
        """
        clean = TextCleaner.clean_text(value)
        if not clean:
            return False

        # Keep only the meaningful characters (letters and digits).
        meaningful = re.sub(r'[^a-zA-Z0-9]', '', clean)
        return min_length <= len(meaningful) <= max_length

    # =========================================================================
    # Generic Validation
    # =========================================================================

    @classmethod
    def validate_field(cls, field_name: str, value: str) -> tuple[bool, Optional[str]]:
        """
        Validate a field by name.

        The validator is chosen by case-insensitive substring match on
        ``field_name``, tested in this order: "org", "bankgiro", "plusgiro",
        "ocr", "amount", "date", then "invoice" together with "number". Any
        other field only has to be non-empty after cleaning. OCR numbers are
        checked for length only (no checksum).

        Returns (is_valid, error_message); error_message is None when valid.
        """
        if not value:
            return False, "Empty value"

        def verdict(ok: bool, message: str) -> tuple[bool, Optional[str]]:
            return (True, None) if ok else (False, message)

        name = field_name.lower()

        # "org" also covers "organisation", which contains it.
        if 'org' in name:
            return verdict(
                cls.is_valid_organisation_number(value),
                "Invalid organisation number format or checksum",
            )
        if 'bankgiro' in name:
            return verdict(cls.is_valid_bankgiro(value), "Invalid Bankgiro format or checksum")
        if 'plusgiro' in name:
            return verdict(cls.is_valid_plusgiro(value), "Invalid Plusgiro format or checksum")
        if 'ocr' in name:
            return verdict(
                cls.is_valid_ocr_number(value, validate_checksum=False),
                "Invalid OCR number length",
            )
        if 'amount' in name:
            return verdict(cls.is_valid_amount(value), "Invalid amount format")
        if 'date' in name:
            return verdict(cls.is_valid_date(value), "Invalid date format")
        if 'invoice' in name and 'number' in name:
            return verdict(cls.is_valid_invoice_number(value), "Invalid invoice number")

        # Default: only require a non-empty value after cleaning.
        return verdict(bool(TextCleaner.clean_text(value)), "Empty value after cleaning")