feat: add field-specific bbox expansion strategies for YOLO training

Implement center-point based bbox scaling with directional compensation
to capture field labels that typically appear above or to the left of
field values. This improves YOLO training data quality by including
contextual information around field values.

Key changes:
- Add shared.bbox module with ScaleStrategy dataclass and expand_bbox function
- Define field-specific strategies (ocr_number, bankgiro, invoice_date, etc.)
- Support manual_mode for minimal padding (no scaling)
- Integrate expand_bbox into AnnotationGenerator
- Add FIELD_TO_CLASS mapping for field_name to class_name lookup
- Comprehensive tests with 100% coverage (45 tests)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Yaojia Wang
2026-02-04 22:56:52 +01:00
parent 8723ef4653
commit 0990239e9c
13 changed files with 1424 additions and 18 deletions

View File

@@ -0,0 +1,37 @@
"""
BBox Scale Strategy Module.
Provides field-specific bounding box expansion strategies for YOLO training data.
Expands bboxes using center-point scaling with directional compensation to capture
field labels that typically appear above or to the left of field values.
Two modes are supported:
- Auto-label: Field-specific scale strategies with directional compensation
- Manual-label: Minimal padding only to prevent edge clipping
Usage:
from shared.bbox import expand_bbox, ScaleStrategy, FIELD_SCALE_STRATEGIES
Available exports:
- ScaleStrategy: Dataclass for scale strategy configuration
- DEFAULT_STRATEGY: Default strategy for unknown fields (auto-label)
- MANUAL_LABEL_STRATEGY: Minimal padding strategy for manual labels
- FIELD_SCALE_STRATEGIES: dict[str, ScaleStrategy] - field-specific strategies
- expand_bbox: Function to expand bbox using field-specific strategy
"""
from .scale_strategy import (
ScaleStrategy,
DEFAULT_STRATEGY,
MANUAL_LABEL_STRATEGY,
FIELD_SCALE_STRATEGIES,
)
from .expander import expand_bbox
__all__ = [
"ScaleStrategy",
"DEFAULT_STRATEGY",
"MANUAL_LABEL_STRATEGY",
"FIELD_SCALE_STRATEGIES",
"expand_bbox",
]

View File

@@ -0,0 +1,101 @@
"""
BBox Expander Module.
Provides functions to expand bounding boxes using field-specific strategies.
Expansion is center-point based with directional compensation.
Two modes:
- Auto-label (default): Field-specific scale strategies
- Manual-label: Minimal padding only to prevent edge clipping
"""
from .scale_strategy import (
ScaleStrategy,
DEFAULT_STRATEGY,
MANUAL_LABEL_STRATEGY,
FIELD_SCALE_STRATEGIES,
)
def expand_bbox(
bbox: tuple[float, float, float, float],
image_width: float,
image_height: float,
field_type: str,
strategies: dict[str, ScaleStrategy] | None = None,
manual_mode: bool = False,
) -> tuple[int, int, int, int]:
"""
Expand bbox using field-specific scale strategy.
The expansion follows these steps:
1. Scale bbox around center point (scale_x, scale_y)
2. Apply directional compensation (extra_*_ratio)
3. Clamp expansion to max_pad limits
4. Clamp to image boundaries
Args:
bbox: (x0, y0, x1, y1) in pixels
image_width: Image width for boundary clamping
image_height: Image height for boundary clamping
field_type: Field class_name (e.g., "ocr_number")
strategies: Custom strategies dict, defaults to FIELD_SCALE_STRATEGIES
manual_mode: If True, use MANUAL_LABEL_STRATEGY (minimal padding only)
Returns:
Expanded bbox (x0, y0, x1, y1) as integers, clamped to image bounds
"""
x0, y0, x1, y1 = bbox
w = x1 - x0
h = y1 - y0
# Get strategy based on mode
if manual_mode:
strategy = MANUAL_LABEL_STRATEGY
elif strategies is None:
strategy = FIELD_SCALE_STRATEGIES.get(field_type, DEFAULT_STRATEGY)
else:
strategy = strategies.get(field_type, DEFAULT_STRATEGY)
# Step 1: Scale around center point
cx = (x0 + x1) / 2
cy = (y0 + y1) / 2
new_w = w * strategy.scale_x
new_h = h * strategy.scale_y
nx0 = cx - new_w / 2
nx1 = cx + new_w / 2
ny0 = cy - new_h / 2
ny1 = cy + new_h / 2
# Step 2: Apply directional compensation
nx0 -= w * strategy.extra_left_ratio
nx1 += w * strategy.extra_right_ratio
ny0 -= h * strategy.extra_top_ratio
ny1 += h * strategy.extra_bottom_ratio
# Step 3: Clamp expansion to max_pad limits (preserve asymmetry)
left_pad = min(x0 - nx0, strategy.max_pad_x)
right_pad = min(nx1 - x1, strategy.max_pad_x)
top_pad = min(y0 - ny0, strategy.max_pad_y)
bottom_pad = min(ny1 - y1, strategy.max_pad_y)
# Ensure pads are non-negative (in case of contraction)
left_pad = max(0, left_pad)
right_pad = max(0, right_pad)
top_pad = max(0, top_pad)
bottom_pad = max(0, bottom_pad)
nx0 = x0 - left_pad
nx1 = x1 + right_pad
ny0 = y0 - top_pad
ny1 = y1 + bottom_pad
# Step 4: Clamp to image boundaries
nx0 = max(0, int(nx0))
ny0 = max(0, int(ny0))
nx1 = min(int(image_width), int(nx1))
ny1 = min(int(image_height), int(ny1))
return (nx0, ny0, nx1, ny1)

View File

@@ -0,0 +1,140 @@
"""
Scale Strategy Configuration.
Defines field-specific bbox expansion strategies for YOLO training data.
Each strategy controls how bboxes are expanded around field values to
capture contextual information like labels.
"""
from dataclasses import dataclass
from typing import Final
@dataclass(frozen=True)
class ScaleStrategy:
"""Immutable scale strategy for bbox expansion.
Attributes:
scale_x: Horizontal scale factor (1.0 = no scaling)
scale_y: Vertical scale factor (1.0 = no scaling)
extra_top_ratio: Additional expansion ratio towards top (for labels above)
extra_bottom_ratio: Additional expansion ratio towards bottom
extra_left_ratio: Additional expansion ratio towards left (for prefixes)
extra_right_ratio: Additional expansion ratio towards right (for suffixes)
max_pad_x: Maximum horizontal padding in pixels
max_pad_y: Maximum vertical padding in pixels
"""
scale_x: float = 1.15
scale_y: float = 1.15
extra_top_ratio: float = 0.0
extra_bottom_ratio: float = 0.0
extra_left_ratio: float = 0.0
extra_right_ratio: float = 0.0
max_pad_x: int = 50
max_pad_y: int = 50
# Default strategy for unknown fields (auto-label mode)
DEFAULT_STRATEGY: Final[ScaleStrategy] = ScaleStrategy()
# Manual label strategy - minimal padding to prevent edge clipping
# No scaling, no directional compensation, just small uniform padding
MANUAL_LABEL_STRATEGY: Final[ScaleStrategy] = ScaleStrategy(
scale_x=1.0,
scale_y=1.0,
extra_top_ratio=0.0,
extra_bottom_ratio=0.0,
extra_left_ratio=0.0,
extra_right_ratio=0.0,
max_pad_x=10, # Small padding to prevent edge loss
max_pad_y=10,
)
# Field-specific strategies based on Swedish invoice field characteristics
# Field labels typically appear above or to the left of values
FIELD_SCALE_STRATEGIES: Final[dict[str, ScaleStrategy]] = {
# OCR number - label "OCR" or "Referens" typically above
"ocr_number": ScaleStrategy(
scale_x=1.15,
scale_y=1.80,
extra_top_ratio=0.60,
max_pad_x=50,
max_pad_y=140,
),
# Bankgiro - prefix "Bankgiro:" or "BG:" typically to the left
"bankgiro": ScaleStrategy(
scale_x=1.45,
scale_y=1.35,
extra_left_ratio=0.80,
max_pad_x=160,
max_pad_y=90,
),
# Plusgiro - prefix "Plusgiro:" or "PG:" typically to the left
"plusgiro": ScaleStrategy(
scale_x=1.45,
scale_y=1.35,
extra_left_ratio=0.80,
max_pad_x=160,
max_pad_y=90,
),
# Invoice date - label "Fakturadatum" typically above
"invoice_date": ScaleStrategy(
scale_x=1.25,
scale_y=1.55,
extra_top_ratio=0.40,
max_pad_x=80,
max_pad_y=110,
),
# Due date - label "Forfalldatum" typically above, sometimes left
"invoice_due_date": ScaleStrategy(
scale_x=1.30,
scale_y=1.65,
extra_top_ratio=0.45,
extra_left_ratio=0.35,
max_pad_x=100,
max_pad_y=120,
),
# Amount - currency symbol "SEK" or "kr" may be to the right
"amount": ScaleStrategy(
scale_x=1.20,
scale_y=1.35,
extra_right_ratio=0.30,
max_pad_x=70,
max_pad_y=80,
),
# Invoice number - label "Fakturanummer" typically above
"invoice_number": ScaleStrategy(
scale_x=1.20,
scale_y=1.50,
extra_top_ratio=0.40,
max_pad_x=80,
max_pad_y=100,
),
# Supplier org number - label "Org.nr" typically above or left
"supplier_org_number": ScaleStrategy(
scale_x=1.25,
scale_y=1.40,
extra_top_ratio=0.30,
extra_left_ratio=0.20,
max_pad_x=90,
max_pad_y=90,
),
# Customer number - label "Kundnummer" typically above or left
"customer_number": ScaleStrategy(
scale_x=1.25,
scale_y=1.45,
extra_top_ratio=0.35,
extra_left_ratio=0.25,
max_pad_x=90,
max_pad_y=100,
),
# Payment line - machine-readable code, minimal expansion needed
"payment_line": ScaleStrategy(
scale_x=1.10,
scale_y=1.20,
max_pad_x=40,
max_pad_y=40,
),
}

View File

@@ -16,6 +16,7 @@ Available exports:
- FIELD_CLASSES: dict[int, str] - class_id to class_name
- FIELD_CLASS_IDS: dict[str, int] - class_name to class_id
- CLASS_TO_FIELD: dict[str, str] - class_name to field_name
- FIELD_TO_CLASS: dict[str, str] - field_name to class_name
- CSV_TO_CLASS_MAPPING: dict[str, int] - field_name to class_id (excludes derived)
- TRAINING_FIELD_CLASSES: dict[str, int] - field_name to class_id (all fields)
- ACCOUNT_FIELD_MAPPING: Mapping for supplier_accounts handling
@@ -27,6 +28,7 @@ from .mappings import (
FIELD_CLASSES,
FIELD_CLASS_IDS,
CLASS_TO_FIELD,
FIELD_TO_CLASS,
CSV_TO_CLASS_MAPPING,
TRAINING_FIELD_CLASSES,
ACCOUNT_FIELD_MAPPING,
@@ -40,6 +42,7 @@ __all__ = [
"FIELD_CLASSES",
"FIELD_CLASS_IDS",
"CLASS_TO_FIELD",
"FIELD_TO_CLASS",
"CSV_TO_CLASS_MAPPING",
"TRAINING_FIELD_CLASSES",
"ACCOUNT_FIELD_MAPPING",

View File

@@ -47,6 +47,12 @@ TRAINING_FIELD_CLASSES: Final[dict[str, int]] = {
fd.field_name: fd.class_id for fd in FIELD_DEFINITIONS
}
# field_name -> class_name mapping (reverse of CLASS_TO_FIELD)
# Example: {"InvoiceNumber": "invoice_number", "OCR": "ocr_number", ...}
FIELD_TO_CLASS: Final[dict[str, str]] = {
fd.field_name: fd.class_name for fd in FIELD_DEFINITIONS
}
# Account field mapping for supplier_accounts special handling
# BG:xxx -> Bankgiro, PG:xxx -> Plusgiro
ACCOUNT_FIELD_MAPPING: Final[dict[str, dict[str, str]]] = {

View File

@@ -2,6 +2,7 @@
YOLO Annotation Generator
Generates YOLO format annotations from matched fields.
Uses field-specific bbox expansion strategies for optimal training data.
"""
from dataclasses import dataclass
@@ -14,7 +15,9 @@ from shared.fields import (
TRAINING_FIELD_CLASSES as FIELD_CLASSES,
CLASS_NAMES,
ACCOUNT_FIELD_MAPPING,
FIELD_TO_CLASS,
)
from shared.bbox import expand_bbox
@dataclass
@@ -38,19 +41,16 @@ class AnnotationGenerator:
def __init__(
self,
min_confidence: float = 0.7,
bbox_padding_px: int = 20, # Absolute padding in pixels
min_bbox_height_px: int = 30 # Minimum bbox height
min_bbox_height_px: int = 30, # Minimum bbox height
):
"""
Initialize annotation generator.
Args:
min_confidence: Minimum match score to include in training
bbox_padding_px: Absolute padding in pixels to add around bboxes
min_bbox_height_px: Minimum bbox height in pixels
"""
self.min_confidence = min_confidence
self.bbox_padding_px = bbox_padding_px
self.min_bbox_height_px = min_bbox_height_px
def generate_from_matches(
@@ -63,6 +63,10 @@ class AnnotationGenerator:
"""
Generate YOLO annotations from field matches.
Uses field-specific bbox expansion strategies for optimal training data.
Each field type has customized scale factors and directional compensation
to capture field labels and context.
Args:
matches: Dict of field_name -> list of Match objects
image_width: Width of the rendered image in pixels
@@ -82,6 +86,8 @@ class AnnotationGenerator:
continue
class_id = FIELD_CLASSES[field_name]
# Get class_name for bbox expansion strategy
class_name = FIELD_TO_CLASS.get(field_name, field_name)
# Take only the best match per field
if field_matches:
@@ -94,19 +100,20 @@ class AnnotationGenerator:
x0, y0, x1, y1 = best_match.bbox
x0, y0, x1, y1 = x0 * scale, y0 * scale, x1 * scale, y1 * scale
# Add absolute padding
pad = self.bbox_padding_px
x0 = max(0, x0 - pad)
y0 = max(0, y0 - pad)
x1 = min(image_width, x1 + pad)
y1 = min(image_height, y1 + pad)
# Apply field-specific bbox expansion strategy
x0, y0, x1, y1 = expand_bbox(
bbox=(x0, y0, x1, y1),
image_width=image_width,
image_height=image_height,
field_type=class_name,
)
# Ensure minimum height
current_height = y1 - y0
if current_height < self.min_bbox_height_px:
extra = (self.min_bbox_height_px - current_height) / 2
y0 = max(0, y0 - extra)
y1 = min(image_height, y1 + extra)
y0 = max(0, int(y0 - extra))
y1 = min(int(image_height), int(y1 + extra))
# Convert to YOLO format (normalized center + size)
x_center = (x0 + x1) / 2 / image_width
@@ -143,6 +150,9 @@ class AnnotationGenerator:
"""
Add payment_line annotation from machine code parser result.
Uses "payment_line" scale strategy for minimal expansion
(machine-readable code needs less context).
Args:
annotations: Existing list of annotations to append to
payment_line_bbox: Bounding box (x0, y0, x1, y1) in PDF coordinates
@@ -163,12 +173,13 @@ class AnnotationGenerator:
x0, y0, x1, y1 = payment_line_bbox
x0, y0, x1, y1 = x0 * scale, y0 * scale, x1 * scale, y1 * scale
# Add absolute padding
pad = self.bbox_padding_px
x0 = max(0, x0 - pad)
y0 = max(0, y0 - pad)
x1 = min(image_width, x1 + pad)
y1 = min(image_height, y1 + pad)
# Apply field-specific bbox expansion strategy for payment_line
x0, y0, x1, y1 = expand_bbox(
bbox=(x0, y0, x1, y1),
image_width=image_width,
image_height=image_height,
field_type="payment_line",
)
# Convert to YOLO format (normalized center + size)
x_center = (x0 + x1) / 2 / image_width