WOP

2026-01-13 00:10:27 +01:00
parent 1b7c61cdd8
commit b26fd61852
43 changed files with 7751 additions and 578 deletions
--- a/src/inference/field_extractor.py
+++ b/src/inference/field_extractor.py
@@ -72,7 +72,7 @@ class FieldExtractor:
        """Lazy-load OCR engine only when needed."""
        if self._ocr_engine is None:
            from ..ocr import OCREngine
-            self._ocr_engine = OCREngine(lang=self.ocr_lang, use_gpu=self.use_gpu)
+            self._ocr_engine = OCREngine(lang=self.ocr_lang)
        return self._ocr_engine

    def extract_from_detection_with_pdf(
@@ -290,31 +290,65 @@ class FieldExtractor:

    def _normalize_amount(self, text: str) -> tuple[str | None, bool, str | None]:
        """Normalize monetary amount."""
-        # Remove currency and common suffixes
-        text = re.sub(r'[SEK|kr|:-]+', '', text, flags=re.IGNORECASE)
-        text = text.replace(' ', '').replace('\xa0', '')
+        # Try the regexes below in order; the first that yields a positive number wins.
+        # Pattern 1: decimal amount whose digits may contain whitespace (1 234,56), optional kr/SEK suffix
+        # Pattern 2: plain decimal amount with comma or dot (350,00 or 350.00)
+        # Pattern 3: integer amount of at least two digits

-        # Handle comma as decimal separator
-        if ',' in text and '.' not in text:
-            text = text.replace(',', '.')
+        patterns = [
+            # Digits (optionally whitespace-separated), comma or dot, exactly two decimals: 1 234,56 or 1234,56
+            r'(\d[\d\s]*[,\.]\d{2})\s*(?:kr|SEK)?',
+            # Simple decimal: 350.00 or 350,00
+            r'(\d+[,\.]\d{2})',
+            # Integer amount (two or more digits)
+            r'(\d{2,})',
+        ]

-        # Try to parse as float
-        try:
-            amount = float(text)
-            return f"{amount:.2f}", True, None
-        except ValueError:
-            return None, False, f"Cannot parse amount: {text}"
+        for pattern in patterns:
+            matches = re.findall(pattern, text, re.IGNORECASE)
+            if matches:
+                # Take the last match (assumed to be the total amount)
+                amount_str = matches[-1]
+                # Strip regular and non-breaking spaces; other whitespace matched by \s is left in place
+                amount_str = amount_str.replace(' ', '').replace('\xa0', '')
+                # Treat every comma as a decimal point so float() can parse the value
+                if ',' in amount_str:
+                    amount_str = amount_str.replace(',', '.')
+
+                try:
+                    amount = float(amount_str)
+                    if amount > 0:  # non-positive values fall through to the next pattern
+                        return f"{amount:.2f}", True, None
+                except ValueError:
+                    continue  # unparseable match: try the next pattern
+
+        return None, False, f"Cannot parse amount: {text}"

    def _normalize_date(self, text: str) -> tuple[str | None, bool, str | None]:
-        """Normalize date."""
+        """
+        Normalize date from text that may contain surrounding text.
+
+        Handles various date formats found in Swedish invoices:
+        - 2025-08-29 (ISO format)
+        - 2025.08.29 (dot separator)
+        - 29/08/2025 (European format)
+        - 29.08.2025 (European with dots)
+        - 20250829 (compact format)
+        """
        from datetime import datetime

-        # Common date patterns
+        # Common date patterns - order matters, most specific first
        patterns = [
-            (r'(\d{4})-(\d{1,2})-(\d{1,2})', lambda m: f"{m[1]}-{int(m[2]):02d}-{int(m[3]):02d}"),
-            (r'(\d{1,2})/(\d{1,2})/(\d{4})', lambda m: f"{m[3]}-{int(m[2]):02d}-{int(m[1]):02d}"),
-            (r'(\d{1,2})\.(\d{1,2})\.(\d{4})', lambda m: f"{m[3]}-{int(m[2]):02d}-{int(m[1]):02d}"),
-            (r'(\d{4})(\d{2})(\d{2})', lambda m: f"{m[1]}-{m[2]}-{m[3]}"),
+            # ISO format: 2025-08-29
+            (r'(\d{4})-(\d{1,2})-(\d{1,2})', lambda m: f"{m.group(1)}-{int(m.group(2)):02d}-{int(m.group(3)):02d}"),
+            # Dot format: 2025.08.29 (common in Swedish)
+            (r'(\d{4})\.(\d{1,2})\.(\d{1,2})', lambda m: f"{m.group(1)}-{int(m.group(2)):02d}-{int(m.group(3)):02d}"),
+            # European slash format: 29/08/2025
+            (r'(\d{1,2})/(\d{1,2})/(\d{4})', lambda m: f"{m.group(3)}-{int(m.group(2)):02d}-{int(m.group(1)):02d}"),
+            # European dot format: 29.08.2025
+            (r'(\d{1,2})\.(\d{1,2})\.(\d{4})', lambda m: f"{m.group(3)}-{int(m.group(2)):02d}-{int(m.group(1)):02d}"),
+            # Compact format: 20250829
+            (r'(?<!\d)(\d{4})(\d{2})(\d{2})(?!\d)', lambda m: f"{m.group(1)}-{m.group(2)}-{m.group(3)}"),
        ]

        for pattern, formatter in patterns:
@@ -323,8 +357,10 @@ class FieldExtractor:
                try:
                    date_str = formatter(match)
                    # Validate date
-                    datetime.strptime(date_str, '%Y-%m-%d')
-                    return date_str, True, None
+                    parsed_date = datetime.strptime(date_str, '%Y-%m-%d')
+                    # Sanity check: year should be reasonable (2000-2100)
+                    if 2000 <= parsed_date.year <= 2100:
+                        return date_str, True, None
                except ValueError:
                    continue