WOP
This commit is contained in:
@@ -72,7 +72,7 @@ class FieldExtractor:
|
||||
"""Lazy-load OCR engine only when needed."""
|
||||
if self._ocr_engine is None:
|
||||
from ..ocr import OCREngine
|
||||
self._ocr_engine = OCREngine(lang=self.ocr_lang, use_gpu=self.use_gpu)
|
||||
self._ocr_engine = OCREngine(lang=self.ocr_lang)
|
||||
return self._ocr_engine
|
||||
|
||||
def extract_from_detection_with_pdf(
|
||||
@@ -290,31 +290,65 @@ class FieldExtractor:
|
||||
|
||||
def _normalize_amount(self, text: str) -> tuple[str | None, bool, str | None]:
|
||||
"""Normalize monetary amount."""
|
||||
# Remove currency and common suffixes
|
||||
text = re.sub(r'[SEK|kr|:-]+', '', text, flags=re.IGNORECASE)
|
||||
text = text.replace(' ', '').replace('\xa0', '')
|
||||
# Try to extract amount using regex patterns
|
||||
# Pattern 1: Number with comma as decimal (Swedish format: 1 234,56)
|
||||
# Pattern 2: Number with dot as decimal (1234.56)
|
||||
# Pattern 3: Number followed by currency (275,60 kr or 275.60 SEK)
|
||||
|
||||
# Handle comma as decimal separator
|
||||
if ',' in text and '.' not in text:
|
||||
text = text.replace(',', '.')
|
||||
patterns = [
|
||||
# Swedish format with space thousand separator: 1 234,56 or 1234,56
|
||||
r'(\d[\d\s]*[,\.]\d{2})\s*(?:kr|SEK)?',
|
||||
# Simple decimal: 350.00 or 350,00
|
||||
r'(\d+[,\.]\d{2})',
|
||||
# Integer amount
|
||||
r'(\d{2,})',
|
||||
]
|
||||
|
||||
# Try to parse as float
|
||||
try:
|
||||
amount = float(text)
|
||||
return f"{amount:.2f}", True, None
|
||||
except ValueError:
|
||||
return None, False, f"Cannot parse amount: {text}"
|
||||
for pattern in patterns:
|
||||
matches = re.findall(pattern, text, re.IGNORECASE)
|
||||
if matches:
|
||||
# Take the last match (usually the total amount)
|
||||
amount_str = matches[-1]
|
||||
# Clean up
|
||||
amount_str = amount_str.replace(' ', '').replace('\xa0', '')
|
||||
# Handle comma as decimal separator
|
||||
if ',' in amount_str:
|
||||
amount_str = amount_str.replace(',', '.')
|
||||
|
||||
try:
|
||||
amount = float(amount_str)
|
||||
if amount > 0:
|
||||
return f"{amount:.2f}", True, None
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
return None, False, f"Cannot parse amount: {text}"
|
||||
|
||||
def _normalize_date(self, text: str) -> tuple[str | None, bool, str | None]:
|
||||
"""Normalize date."""
|
||||
"""
|
||||
Normalize date from text that may contain surrounding text.
|
||||
|
||||
Handles various date formats found in Swedish invoices:
|
||||
- 2025-08-29 (ISO format)
|
||||
- 2025.08.29 (dot separator)
|
||||
- 29/08/2025 (European format)
|
||||
- 29.08.2025 (European with dots)
|
||||
- 20250829 (compact format)
|
||||
"""
|
||||
from datetime import datetime
|
||||
|
||||
# Common date patterns
|
||||
# Common date patterns - order matters, most specific first
|
||||
patterns = [
|
||||
(r'(\d{4})-(\d{1,2})-(\d{1,2})', lambda m: f"{m[1]}-{int(m[2]):02d}-{int(m[3]):02d}"),
|
||||
(r'(\d{1,2})/(\d{1,2})/(\d{4})', lambda m: f"{m[3]}-{int(m[2]):02d}-{int(m[1]):02d}"),
|
||||
(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', lambda m: f"{m[3]}-{int(m[2]):02d}-{int(m[1]):02d}"),
|
||||
(r'(\d{4})(\d{2})(\d{2})', lambda m: f"{m[1]}-{m[2]}-{m[3]}"),
|
||||
# ISO format: 2025-08-29
|
||||
(r'(\d{4})-(\d{1,2})-(\d{1,2})', lambda m: f"{m.group(1)}-{int(m.group(2)):02d}-{int(m.group(3)):02d}"),
|
||||
# Dot format: 2025.08.29 (common in Swedish)
|
||||
(r'(\d{4})\.(\d{1,2})\.(\d{1,2})', lambda m: f"{m.group(1)}-{int(m.group(2)):02d}-{int(m.group(3)):02d}"),
|
||||
# European slash format: 29/08/2025
|
||||
(r'(\d{1,2})/(\d{1,2})/(\d{4})', lambda m: f"{m.group(3)}-{int(m.group(2)):02d}-{int(m.group(1)):02d}"),
|
||||
# European dot format: 29.08.2025
|
||||
(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', lambda m: f"{m.group(3)}-{int(m.group(2)):02d}-{int(m.group(1)):02d}"),
|
||||
# Compact format: 20250829
|
||||
(r'(?<!\d)(\d{4})(\d{2})(\d{2})(?!\d)', lambda m: f"{m.group(1)}-{m.group(2)}-{m.group(3)}"),
|
||||
]
|
||||
|
||||
for pattern, formatter in patterns:
|
||||
@@ -323,8 +357,10 @@ class FieldExtractor:
|
||||
try:
|
||||
date_str = formatter(match)
|
||||
# Validate date
|
||||
datetime.strptime(date_str, '%Y-%m-%d')
|
||||
return date_str, True, None
|
||||
parsed_date = datetime.strptime(date_str, '%Y-%m-%d')
|
||||
# Sanity check: year should be reasonable (2000-2100)
|
||||
if 2000 <= parsed_date.year <= 2100:
|
||||
return date_str, True, None
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
|
||||
Reference in New Issue
Block a user