WIP
This commit is contained in:
@@ -40,6 +40,7 @@ from .normalizers import (
|
||||
EnhancedAmountNormalizer,
|
||||
EnhancedDateNormalizer,
|
||||
)
|
||||
from .value_selector import ValueSelector
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -169,13 +170,21 @@ class FieldExtractor:
|
||||
overlap_ratio = overlap_area / token_area if token_area > 0 else 0
|
||||
matching_tokens.append((token, overlap_ratio))
|
||||
|
||||
# Sort by overlap ratio and combine text
|
||||
# Sort by overlap ratio
|
||||
matching_tokens.sort(key=lambda x: -x[1])
|
||||
raw_text = ' '.join(t[0].text for t in matching_tokens)
|
||||
|
||||
# Get field name
|
||||
field_name = CLASS_TO_FIELD.get(detection.class_name, detection.class_name)
|
||||
|
||||
# Convert to OCRTokens for value selection, then filter
|
||||
from shared.ocr.paddle_ocr import OCRToken
|
||||
pdf_ocr_tokens = [
|
||||
OCRToken(text=t[0].text, bbox=t[0].bbox, confidence=1.0)
|
||||
for t in matching_tokens
|
||||
]
|
||||
value_tokens = ValueSelector.select_value_tokens(pdf_ocr_tokens, field_name)
|
||||
raw_text = ' '.join(t.text for t in value_tokens)
|
||||
|
||||
# Normalize and validate
|
||||
normalized_value, is_valid, validation_error = self._normalize_and_validate(
|
||||
field_name, raw_text
|
||||
@@ -223,13 +232,14 @@ class FieldExtractor:
|
||||
# Run OCR on region
|
||||
ocr_tokens = self.ocr_engine.extract_from_image(region)
|
||||
|
||||
# Combine all OCR text
|
||||
raw_text = ' '.join(t.text for t in ocr_tokens)
|
||||
ocr_confidence = sum(t.confidence for t in ocr_tokens) / len(ocr_tokens) if ocr_tokens else 0.0
|
||||
|
||||
# Get field name
|
||||
field_name = CLASS_TO_FIELD.get(detection.class_name, detection.class_name)
|
||||
|
||||
# Select value tokens (filter out label text)
|
||||
value_tokens = ValueSelector.select_value_tokens(ocr_tokens, field_name)
|
||||
raw_text = ' '.join(t.text for t in value_tokens)
|
||||
ocr_confidence = sum(t.confidence for t in ocr_tokens) / len(ocr_tokens) if ocr_tokens else 0.0
|
||||
|
||||
# Normalize and validate
|
||||
normalized_value, is_valid, validation_error = self._normalize_and_validate(
|
||||
field_name, raw_text
|
||||
|
||||
Reference in New Issue
Block a user