WIP

2026-02-11 23:40:38 +01:00
parent f1a7bfe6b7
commit ad5ed46b4c
117 changed files with 5741 additions and 7669 deletions
--- a/packages/backend/backend/pipeline/field_extractor.py
+++ b/packages/backend/backend/pipeline/field_extractor.py
@@ -40,6 +40,7 @@ from .normalizers import (
    EnhancedAmountNormalizer,
    EnhancedDateNormalizer,
 )
+from .value_selector import ValueSelector


@dataclass
@@ -169,13 +170,21 @@ class FieldExtractor:
                    overlap_ratio = overlap_area / token_area if token_area > 0 else 0
                    matching_tokens.append((token, overlap_ratio))

-        # Sort by overlap ratio and combine text
+        # Sort by overlap ratio
        matching_tokens.sort(key=lambda x: -x[1])
-        raw_text = ' '.join(t[0].text for t in matching_tokens)

        # Get field name
        field_name = CLASS_TO_FIELD.get(detection.class_name, detection.class_name)

+        # Wrap matched PDF tokens as OCRTokens (confidence fixed at 1.0) so ValueSelector can pick the value tokens
+        from shared.ocr.paddle_ocr import OCRToken
+        pdf_ocr_tokens = [
+            OCRToken(text=t[0].text, bbox=t[0].bbox, confidence=1.0)
+            for t in matching_tokens
+        ]
+        value_tokens = ValueSelector.select_value_tokens(pdf_ocr_tokens, field_name)
+        raw_text = ' '.join(t.text for t in value_tokens)
+
        # Normalize and validate
        normalized_value, is_valid, validation_error = self._normalize_and_validate(
            field_name, raw_text
@@ -223,13 +232,14 @@ class FieldExtractor:
        # Run OCR on region
        ocr_tokens = self.ocr_engine.extract_from_image(region)

-        # Combine all OCR text
-        raw_text = ' '.join(t.text for t in ocr_tokens)
-        ocr_confidence = sum(t.confidence for t in ocr_tokens) / len(ocr_tokens) if ocr_tokens else 0.0
-
        # Get field name
        field_name = CLASS_TO_FIELD.get(detection.class_name, detection.class_name)

+        # Select value tokens (filter out label text); ocr_confidence below is still averaged over all OCR tokens
+        value_tokens = ValueSelector.select_value_tokens(ocr_tokens, field_name)
+        raw_text = ' '.join(t.text for t in value_tokens)
+        ocr_confidence = sum(t.confidence for t in ocr_tokens) / len(ocr_tokens) if ocr_tokens else 0.0
+
        # Normalize and validate
        normalized_value, is_valid, validation_error = self._normalize_and_validate(
            field_name, raw_text