feat: add field-specific bbox expansion strategies for YOLO training

Implement center-point-based bbox scaling with directional compensation
to capture field labels that typically appear above or to the left of
field values. This improves YOLO training data quality by including
contextual information around field values.

Key changes:
- Add shared.bbox module with ScaleStrategy dataclass and expand_bbox function
- Define field-specific strategies (ocr_number, bankgiro, invoice_date, etc.)
- Support manual_mode for minimal padding (no scaling)
- Integrate expand_bbox into AnnotationGenerator, replacing the fixed
  bbox_padding_px padding (the constructor parameter is removed)
- Add FIELD_TO_CLASS mapping for field_name to class_name lookup
- Add comprehensive tests with 100% coverage (45 tests)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Yaojia Wang
2026-02-04 22:56:52 +01:00
parent 8723ef4653
commit 0990239e9c
13 changed files with 1424 additions and 18 deletions

View File

@@ -2,6 +2,7 @@
YOLO Annotation Generator
Generates YOLO format annotations from matched fields.
Uses field-specific bbox expansion strategies for optimal training data.
"""
from dataclasses import dataclass
@@ -14,7 +15,9 @@ from shared.fields import (
TRAINING_FIELD_CLASSES as FIELD_CLASSES,
CLASS_NAMES,
ACCOUNT_FIELD_MAPPING,
FIELD_TO_CLASS,
)
from shared.bbox import expand_bbox
@dataclass
@@ -38,19 +41,16 @@ class AnnotationGenerator:
def __init__(
self,
min_confidence: float = 0.7,
bbox_padding_px: int = 20, # Absolute padding in pixels
min_bbox_height_px: int = 30 # Minimum bbox height
min_bbox_height_px: int = 30, # Minimum bbox height
):
"""
Initialize annotation generator.
Args:
min_confidence: Minimum match score to include in training
bbox_padding_px: Absolute padding in pixels to add around bboxes
min_bbox_height_px: Minimum bbox height in pixels
"""
self.min_confidence = min_confidence
self.bbox_padding_px = bbox_padding_px
self.min_bbox_height_px = min_bbox_height_px
def generate_from_matches(
@@ -63,6 +63,10 @@ class AnnotationGenerator:
"""
Generate YOLO annotations from field matches.
Uses field-specific bbox expansion strategies for optimal training data.
Each field type has customized scale factors and directional compensation
to capture field labels and context.
Args:
matches: Dict of field_name -> list of Match objects
image_width: Width of the rendered image in pixels
@@ -82,6 +86,8 @@ class AnnotationGenerator:
continue
class_id = FIELD_CLASSES[field_name]
# Get class_name for bbox expansion strategy
class_name = FIELD_TO_CLASS.get(field_name, field_name)
# Take only the best match per field
if field_matches:
@@ -94,19 +100,20 @@ class AnnotationGenerator:
x0, y0, x1, y1 = best_match.bbox
x0, y0, x1, y1 = x0 * scale, y0 * scale, x1 * scale, y1 * scale
# Add absolute padding
pad = self.bbox_padding_px
x0 = max(0, x0 - pad)
y0 = max(0, y0 - pad)
x1 = min(image_width, x1 + pad)
y1 = min(image_height, y1 + pad)
# Apply field-specific bbox expansion strategy
x0, y0, x1, y1 = expand_bbox(
bbox=(x0, y0, x1, y1),
image_width=image_width,
image_height=image_height,
field_type=class_name,
)
# Ensure minimum height
current_height = y1 - y0
if current_height < self.min_bbox_height_px:
extra = (self.min_bbox_height_px - current_height) / 2
y0 = max(0, y0 - extra)
y1 = min(image_height, y1 + extra)
y0 = max(0, int(y0 - extra))
y1 = min(int(image_height), int(y1 + extra))
# Convert to YOLO format (normalized center + size)
x_center = (x0 + x1) / 2 / image_width
@@ -143,6 +150,9 @@ class AnnotationGenerator:
"""
Add payment_line annotation from machine code parser result.
Uses "payment_line" scale strategy for minimal expansion
(machine-readable code needs less context).
Args:
annotations: Existing list of annotations to append to
payment_line_bbox: Bounding box (x0, y0, x1, y1) in PDF coordinates
@@ -163,12 +173,13 @@ class AnnotationGenerator:
x0, y0, x1, y1 = payment_line_bbox
x0, y0, x1, y1 = x0 * scale, y0 * scale, x1 * scale, y1 * scale
# Add absolute padding
pad = self.bbox_padding_px
x0 = max(0, x0 - pad)
y0 = max(0, y0 - pad)
x1 = min(image_width, x1 + pad)
y1 = min(image_height, y1 + pad)
# Apply field-specific bbox expansion strategy for payment_line
x0, y0, x1, y1 = expand_bbox(
bbox=(x0, y0, x1, y1),
image_width=image_width,
image_height=image_height,
field_type="payment_line",
)
# Convert to YOLO format (normalized center + size)
x_center = (x0 + x1) / 2 / image_width