Re-structure the project.
This commit is contained in:
31
src/normalize/normalizers/ocr_normalizer.py
Normal file
31
src/normalize/normalizers/ocr_normalizer.py
Normal file
@@ -0,0 +1,31 @@
|
||||
"""
|
||||
OCR Number Normalizer
|
||||
|
||||
Normalizes OCR reference numbers (Swedish payment system).
|
||||
"""
|
||||
|
||||
import re
|
||||
from .base import BaseNormalizer
|
||||
|
||||
|
||||
class OCRNormalizer(BaseNormalizer):
    """
    Normalizes OCR reference numbers.

    Similar to invoice number - primarily digits.

    Variants are returned in a stable order: the cleaned input first,
    followed by its digits-only form when that differs from the cleaned
    input.

    Examples:
        '94228110015950070' -> ['94228110015950070']
        'OCR: 94228110015950070' -> ['OCR: 94228110015950070', '94228110015950070']
    """

    def normalize(self, value: str) -> list[str]:
        """Generate variants of OCR number.

        Args:
            value: Raw OCR reference text. It is passed through
                ``self.clean_text`` before any variant is derived.

        Returns:
            The unique, non-empty variants: the cleaned value, then the
            value with every non-digit character removed (only when that
            differs from the cleaned value and is not empty).
        """
        value = self.clean_text(value)
        digits_only = re.sub(r'\D', '', value)

        variants = [value]
        if digits_only and digits_only != value:
            variants.append(digits_only)

        # dict.fromkeys removes duplicates while keeping insertion order.
        # The earlier list(set(...)) produced an order that changed from
        # run to run because str hashes are randomized per process.
        return list(dict.fromkeys(v for v in variants if v))
|
||||
Reference in New Issue
Block a user