This commit is contained in:
Yaojia Wang
2026-01-13 00:10:27 +01:00
parent 1b7c61cdd8
commit b26fd61852
43 changed files with 7751 additions and 578 deletions

View File

@@ -72,7 +72,7 @@ class FieldExtractor:
"""Lazy-load OCR engine only when needed."""
if self._ocr_engine is None:
from ..ocr import OCREngine
self._ocr_engine = OCREngine(lang=self.ocr_lang, use_gpu=self.use_gpu)
self._ocr_engine = OCREngine(lang=self.ocr_lang)
return self._ocr_engine
def extract_from_detection_with_pdf(
@@ -290,31 +290,65 @@ class FieldExtractor:
def _normalize_amount(self, text: str) -> tuple[str | None, bool, str | None]:
"""Normalize monetary amount."""
# Remove currency and common suffixes
text = re.sub(r'[SEK|kr|:-]+', '', text, flags=re.IGNORECASE)
text = text.replace(' ', '').replace('\xa0', '')
# Try to extract amount using regex patterns
# Pattern 1: Number with comma as decimal (Swedish format: 1 234,56)
# Pattern 2: Number with dot as decimal (1234.56)
# Pattern 3: Number followed by currency (275,60 kr or 275.60 SEK)
# Handle comma as decimal separator
if ',' in text and '.' not in text:
text = text.replace(',', '.')
patterns = [
# Swedish format with space thousand separator: 1 234,56 or 1234,56
r'(\d[\d\s]*[,\.]\d{2})\s*(?:kr|SEK)?',
# Simple decimal: 350.00 or 350,00
r'(\d+[,\.]\d{2})',
# Integer amount
r'(\d{2,})',
]
# Try to parse as float
try:
amount = float(text)
return f"{amount:.2f}", True, None
except ValueError:
return None, False, f"Cannot parse amount: {text}"
for pattern in patterns:
matches = re.findall(pattern, text, re.IGNORECASE)
if matches:
# Take the last match (usually the total amount)
amount_str = matches[-1]
# Clean up
amount_str = amount_str.replace(' ', '').replace('\xa0', '')
# Handle comma as decimal separator
if ',' in amount_str:
amount_str = amount_str.replace(',', '.')
try:
amount = float(amount_str)
if amount > 0:
return f"{amount:.2f}", True, None
except ValueError:
continue
return None, False, f"Cannot parse amount: {text}"
def _normalize_date(self, text: str) -> tuple[str | None, bool, str | None]:
"""Normalize date."""
"""
Normalize date from text that may contain surrounding text.
Handles various date formats found in Swedish invoices:
- 2025-08-29 (ISO format)
- 2025.08.29 (dot separator)
- 29/08/2025 (European format)
- 29.08.2025 (European with dots)
- 20250829 (compact format)
"""
from datetime import datetime
# Common date patterns
# Common date patterns - order matters, most specific first
patterns = [
(r'(\d{4})-(\d{1,2})-(\d{1,2})', lambda m: f"{m[1]}-{int(m[2]):02d}-{int(m[3]):02d}"),
(r'(\d{1,2})/(\d{1,2})/(\d{4})', lambda m: f"{m[3]}-{int(m[2]):02d}-{int(m[1]):02d}"),
(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', lambda m: f"{m[3]}-{int(m[2]):02d}-{int(m[1]):02d}"),
(r'(\d{4})(\d{2})(\d{2})', lambda m: f"{m[1]}-{m[2]}-{m[3]}"),
# ISO format: 2025-08-29
(r'(\d{4})-(\d{1,2})-(\d{1,2})', lambda m: f"{m.group(1)}-{int(m.group(2)):02d}-{int(m.group(3)):02d}"),
# Dot format: 2025.08.29 (common in Swedish)
(r'(\d{4})\.(\d{1,2})\.(\d{1,2})', lambda m: f"{m.group(1)}-{int(m.group(2)):02d}-{int(m.group(3)):02d}"),
# European slash format: 29/08/2025
(r'(\d{1,2})/(\d{1,2})/(\d{4})', lambda m: f"{m.group(3)}-{int(m.group(2)):02d}-{int(m.group(1)):02d}"),
# European dot format: 29.08.2025
(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', lambda m: f"{m.group(3)}-{int(m.group(2)):02d}-{int(m.group(1)):02d}"),
# Compact format: 20250829
(r'(?<!\d)(\d{4})(\d{2})(\d{2})(?!\d)', lambda m: f"{m.group(1)}-{m.group(2)}-{m.group(3)}"),
]
for pattern, formatter in patterns:
@@ -323,8 +357,10 @@ class FieldExtractor:
try:
date_str = formatter(match)
# Validate date
datetime.strptime(date_str, '%Y-%m-%d')
return date_str, True, None
parsed_date = datetime.strptime(date_str, '%Y-%m-%d')
# Sanity check: year should be reasonable (2000-2100)
if 2000 <= parsed_date.year <= 2100:
return date_str, True, None
except ValueError:
continue