feat: add field-specific bbox expansion strategies for YOLO training
Implement center-point based bbox scaling with directional compensation to capture field labels that typically appear above or to the left of field values. This improves YOLO training data quality by including contextual information around field values. Key changes: - Add shared.bbox module with ScaleStrategy dataclass and expand_bbox function - Define field-specific strategies (ocr_number, bankgiro, invoice_date, etc.) - Support manual_mode for minimal padding (no scaling) - Integrate expand_bbox into AnnotationGenerator - Add FIELD_TO_CLASS mapping for field_name to class_name lookup - Comprehensive tests with 100% coverage (45 tests) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
37
packages/shared/shared/bbox/__init__.py
Normal file
37
packages/shared/shared/bbox/__init__.py
Normal file
@@ -0,0 +1,37 @@
|
||||
"""
|
||||
BBox Scale Strategy Module.
|
||||
|
||||
Provides field-specific bounding box expansion strategies for YOLO training data.
|
||||
Expands bboxes using center-point scaling with directional compensation to capture
|
||||
field labels that typically appear above or to the left of field values.
|
||||
|
||||
Two modes are supported:
|
||||
- Auto-label: Field-specific scale strategies with directional compensation
|
||||
- Manual-label: Minimal padding only to prevent edge clipping
|
||||
|
||||
Usage:
|
||||
from shared.bbox import expand_bbox, ScaleStrategy, FIELD_SCALE_STRATEGIES
|
||||
|
||||
Available exports:
|
||||
- ScaleStrategy: Dataclass for scale strategy configuration
|
||||
- DEFAULT_STRATEGY: Default strategy for unknown fields (auto-label)
|
||||
- MANUAL_LABEL_STRATEGY: Minimal padding strategy for manual labels
|
||||
- FIELD_SCALE_STRATEGIES: dict[str, ScaleStrategy] - field-specific strategies
|
||||
- expand_bbox: Function to expand bbox using field-specific strategy
|
||||
"""
|
||||
|
||||
from .scale_strategy import (
|
||||
ScaleStrategy,
|
||||
DEFAULT_STRATEGY,
|
||||
MANUAL_LABEL_STRATEGY,
|
||||
FIELD_SCALE_STRATEGIES,
|
||||
)
|
||||
from .expander import expand_bbox
|
||||
|
||||
__all__ = [
|
||||
"ScaleStrategy",
|
||||
"DEFAULT_STRATEGY",
|
||||
"MANUAL_LABEL_STRATEGY",
|
||||
"FIELD_SCALE_STRATEGIES",
|
||||
"expand_bbox",
|
||||
]
|
||||
101
packages/shared/shared/bbox/expander.py
Normal file
101
packages/shared/shared/bbox/expander.py
Normal file
@@ -0,0 +1,101 @@
|
||||
"""
|
||||
BBox Expander Module.
|
||||
|
||||
Provides functions to expand bounding boxes using field-specific strategies.
|
||||
Expansion is center-point based with directional compensation.
|
||||
|
||||
Two modes:
|
||||
- Auto-label (default): Field-specific scale strategies
|
||||
- Manual-label: Minimal padding only to prevent edge clipping
|
||||
"""
|
||||
|
||||
from .scale_strategy import (
|
||||
ScaleStrategy,
|
||||
DEFAULT_STRATEGY,
|
||||
MANUAL_LABEL_STRATEGY,
|
||||
FIELD_SCALE_STRATEGIES,
|
||||
)
|
||||
|
||||
|
||||
def expand_bbox(
|
||||
bbox: tuple[float, float, float, float],
|
||||
image_width: float,
|
||||
image_height: float,
|
||||
field_type: str,
|
||||
strategies: dict[str, ScaleStrategy] | None = None,
|
||||
manual_mode: bool = False,
|
||||
) -> tuple[int, int, int, int]:
|
||||
"""
|
||||
Expand bbox using field-specific scale strategy.
|
||||
|
||||
The expansion follows these steps:
|
||||
1. Scale bbox around center point (scale_x, scale_y)
|
||||
2. Apply directional compensation (extra_*_ratio)
|
||||
3. Clamp expansion to max_pad limits
|
||||
4. Clamp to image boundaries
|
||||
|
||||
Args:
|
||||
bbox: (x0, y0, x1, y1) in pixels
|
||||
image_width: Image width for boundary clamping
|
||||
image_height: Image height for boundary clamping
|
||||
field_type: Field class_name (e.g., "ocr_number")
|
||||
strategies: Custom strategies dict, defaults to FIELD_SCALE_STRATEGIES
|
||||
manual_mode: If True, use MANUAL_LABEL_STRATEGY (minimal padding only)
|
||||
|
||||
Returns:
|
||||
Expanded bbox (x0, y0, x1, y1) as integers, clamped to image bounds
|
||||
"""
|
||||
x0, y0, x1, y1 = bbox
|
||||
w = x1 - x0
|
||||
h = y1 - y0
|
||||
|
||||
# Get strategy based on mode
|
||||
if manual_mode:
|
||||
strategy = MANUAL_LABEL_STRATEGY
|
||||
elif strategies is None:
|
||||
strategy = FIELD_SCALE_STRATEGIES.get(field_type, DEFAULT_STRATEGY)
|
||||
else:
|
||||
strategy = strategies.get(field_type, DEFAULT_STRATEGY)
|
||||
|
||||
# Step 1: Scale around center point
|
||||
cx = (x0 + x1) / 2
|
||||
cy = (y0 + y1) / 2
|
||||
|
||||
new_w = w * strategy.scale_x
|
||||
new_h = h * strategy.scale_y
|
||||
|
||||
nx0 = cx - new_w / 2
|
||||
nx1 = cx + new_w / 2
|
||||
ny0 = cy - new_h / 2
|
||||
ny1 = cy + new_h / 2
|
||||
|
||||
# Step 2: Apply directional compensation
|
||||
nx0 -= w * strategy.extra_left_ratio
|
||||
nx1 += w * strategy.extra_right_ratio
|
||||
ny0 -= h * strategy.extra_top_ratio
|
||||
ny1 += h * strategy.extra_bottom_ratio
|
||||
|
||||
# Step 3: Clamp expansion to max_pad limits (preserve asymmetry)
|
||||
left_pad = min(x0 - nx0, strategy.max_pad_x)
|
||||
right_pad = min(nx1 - x1, strategy.max_pad_x)
|
||||
top_pad = min(y0 - ny0, strategy.max_pad_y)
|
||||
bottom_pad = min(ny1 - y1, strategy.max_pad_y)
|
||||
|
||||
# Ensure pads are non-negative (in case of contraction)
|
||||
left_pad = max(0, left_pad)
|
||||
right_pad = max(0, right_pad)
|
||||
top_pad = max(0, top_pad)
|
||||
bottom_pad = max(0, bottom_pad)
|
||||
|
||||
nx0 = x0 - left_pad
|
||||
nx1 = x1 + right_pad
|
||||
ny0 = y0 - top_pad
|
||||
ny1 = y1 + bottom_pad
|
||||
|
||||
# Step 4: Clamp to image boundaries
|
||||
nx0 = max(0, int(nx0))
|
||||
ny0 = max(0, int(ny0))
|
||||
nx1 = min(int(image_width), int(nx1))
|
||||
ny1 = min(int(image_height), int(ny1))
|
||||
|
||||
return (nx0, ny0, nx1, ny1)
|
||||
140
packages/shared/shared/bbox/scale_strategy.py
Normal file
140
packages/shared/shared/bbox/scale_strategy.py
Normal file
@@ -0,0 +1,140 @@
|
||||
"""
|
||||
Scale Strategy Configuration.
|
||||
|
||||
Defines field-specific bbox expansion strategies for YOLO training data.
|
||||
Each strategy controls how bboxes are expanded around field values to
|
||||
capture contextual information like labels.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Final
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ScaleStrategy:
|
||||
"""Immutable scale strategy for bbox expansion.
|
||||
|
||||
Attributes:
|
||||
scale_x: Horizontal scale factor (1.0 = no scaling)
|
||||
scale_y: Vertical scale factor (1.0 = no scaling)
|
||||
extra_top_ratio: Additional expansion ratio towards top (for labels above)
|
||||
extra_bottom_ratio: Additional expansion ratio towards bottom
|
||||
extra_left_ratio: Additional expansion ratio towards left (for prefixes)
|
||||
extra_right_ratio: Additional expansion ratio towards right (for suffixes)
|
||||
max_pad_x: Maximum horizontal padding in pixels
|
||||
max_pad_y: Maximum vertical padding in pixels
|
||||
"""
|
||||
|
||||
scale_x: float = 1.15
|
||||
scale_y: float = 1.15
|
||||
extra_top_ratio: float = 0.0
|
||||
extra_bottom_ratio: float = 0.0
|
||||
extra_left_ratio: float = 0.0
|
||||
extra_right_ratio: float = 0.0
|
||||
max_pad_x: int = 50
|
||||
max_pad_y: int = 50
|
||||
|
||||
|
||||
# Default strategy for unknown fields (auto-label mode)
|
||||
DEFAULT_STRATEGY: Final[ScaleStrategy] = ScaleStrategy()
|
||||
|
||||
# Manual label strategy - minimal padding to prevent edge clipping
|
||||
# No scaling, no directional compensation, just small uniform padding
|
||||
MANUAL_LABEL_STRATEGY: Final[ScaleStrategy] = ScaleStrategy(
|
||||
scale_x=1.0,
|
||||
scale_y=1.0,
|
||||
extra_top_ratio=0.0,
|
||||
extra_bottom_ratio=0.0,
|
||||
extra_left_ratio=0.0,
|
||||
extra_right_ratio=0.0,
|
||||
max_pad_x=10, # Small padding to prevent edge loss
|
||||
max_pad_y=10,
|
||||
)
|
||||
|
||||
|
||||
# Field-specific strategies based on Swedish invoice field characteristics
|
||||
# Field labels typically appear above or to the left of values
|
||||
FIELD_SCALE_STRATEGIES: Final[dict[str, ScaleStrategy]] = {
|
||||
# OCR number - label "OCR" or "Referens" typically above
|
||||
"ocr_number": ScaleStrategy(
|
||||
scale_x=1.15,
|
||||
scale_y=1.80,
|
||||
extra_top_ratio=0.60,
|
||||
max_pad_x=50,
|
||||
max_pad_y=140,
|
||||
),
|
||||
# Bankgiro - prefix "Bankgiro:" or "BG:" typically to the left
|
||||
"bankgiro": ScaleStrategy(
|
||||
scale_x=1.45,
|
||||
scale_y=1.35,
|
||||
extra_left_ratio=0.80,
|
||||
max_pad_x=160,
|
||||
max_pad_y=90,
|
||||
),
|
||||
# Plusgiro - prefix "Plusgiro:" or "PG:" typically to the left
|
||||
"plusgiro": ScaleStrategy(
|
||||
scale_x=1.45,
|
||||
scale_y=1.35,
|
||||
extra_left_ratio=0.80,
|
||||
max_pad_x=160,
|
||||
max_pad_y=90,
|
||||
),
|
||||
# Invoice date - label "Fakturadatum" typically above
|
||||
"invoice_date": ScaleStrategy(
|
||||
scale_x=1.25,
|
||||
scale_y=1.55,
|
||||
extra_top_ratio=0.40,
|
||||
max_pad_x=80,
|
||||
max_pad_y=110,
|
||||
),
|
||||
# Due date - label "Forfalldatum" typically above, sometimes left
|
||||
"invoice_due_date": ScaleStrategy(
|
||||
scale_x=1.30,
|
||||
scale_y=1.65,
|
||||
extra_top_ratio=0.45,
|
||||
extra_left_ratio=0.35,
|
||||
max_pad_x=100,
|
||||
max_pad_y=120,
|
||||
),
|
||||
# Amount - currency symbol "SEK" or "kr" may be to the right
|
||||
"amount": ScaleStrategy(
|
||||
scale_x=1.20,
|
||||
scale_y=1.35,
|
||||
extra_right_ratio=0.30,
|
||||
max_pad_x=70,
|
||||
max_pad_y=80,
|
||||
),
|
||||
# Invoice number - label "Fakturanummer" typically above
|
||||
"invoice_number": ScaleStrategy(
|
||||
scale_x=1.20,
|
||||
scale_y=1.50,
|
||||
extra_top_ratio=0.40,
|
||||
max_pad_x=80,
|
||||
max_pad_y=100,
|
||||
),
|
||||
# Supplier org number - label "Org.nr" typically above or left
|
||||
"supplier_org_number": ScaleStrategy(
|
||||
scale_x=1.25,
|
||||
scale_y=1.40,
|
||||
extra_top_ratio=0.30,
|
||||
extra_left_ratio=0.20,
|
||||
max_pad_x=90,
|
||||
max_pad_y=90,
|
||||
),
|
||||
# Customer number - label "Kundnummer" typically above or left
|
||||
"customer_number": ScaleStrategy(
|
||||
scale_x=1.25,
|
||||
scale_y=1.45,
|
||||
extra_top_ratio=0.35,
|
||||
extra_left_ratio=0.25,
|
||||
max_pad_x=90,
|
||||
max_pad_y=100,
|
||||
),
|
||||
# Payment line - machine-readable code, minimal expansion needed
|
||||
"payment_line": ScaleStrategy(
|
||||
scale_x=1.10,
|
||||
scale_y=1.20,
|
||||
max_pad_x=40,
|
||||
max_pad_y=40,
|
||||
),
|
||||
}
|
||||
@@ -16,6 +16,7 @@ Available exports:
|
||||
- FIELD_CLASSES: dict[int, str] - class_id to class_name
|
||||
- FIELD_CLASS_IDS: dict[str, int] - class_name to class_id
|
||||
- CLASS_TO_FIELD: dict[str, str] - class_name to field_name
|
||||
- FIELD_TO_CLASS: dict[str, str] - field_name to class_name
|
||||
- CSV_TO_CLASS_MAPPING: dict[str, int] - field_name to class_id (excludes derived)
|
||||
- TRAINING_FIELD_CLASSES: dict[str, int] - field_name to class_id (all fields)
|
||||
- ACCOUNT_FIELD_MAPPING: Mapping for supplier_accounts handling
|
||||
@@ -27,6 +28,7 @@ from .mappings import (
|
||||
FIELD_CLASSES,
|
||||
FIELD_CLASS_IDS,
|
||||
CLASS_TO_FIELD,
|
||||
FIELD_TO_CLASS,
|
||||
CSV_TO_CLASS_MAPPING,
|
||||
TRAINING_FIELD_CLASSES,
|
||||
ACCOUNT_FIELD_MAPPING,
|
||||
@@ -40,6 +42,7 @@ __all__ = [
|
||||
"FIELD_CLASSES",
|
||||
"FIELD_CLASS_IDS",
|
||||
"CLASS_TO_FIELD",
|
||||
"FIELD_TO_CLASS",
|
||||
"CSV_TO_CLASS_MAPPING",
|
||||
"TRAINING_FIELD_CLASSES",
|
||||
"ACCOUNT_FIELD_MAPPING",
|
||||
|
||||
@@ -47,6 +47,12 @@ TRAINING_FIELD_CLASSES: Final[dict[str, int]] = {
|
||||
fd.field_name: fd.class_id for fd in FIELD_DEFINITIONS
|
||||
}
|
||||
|
||||
# field_name -> class_name mapping (reverse of CLASS_TO_FIELD)
|
||||
# Example: {"InvoiceNumber": "invoice_number", "OCR": "ocr_number", ...}
|
||||
FIELD_TO_CLASS: Final[dict[str, str]] = {
|
||||
fd.field_name: fd.class_name for fd in FIELD_DEFINITIONS
|
||||
}
|
||||
|
||||
# Account field mapping for supplier_accounts special handling
|
||||
# BG:xxx -> Bankgiro, PG:xxx -> Plusgiro
|
||||
ACCOUNT_FIELD_MAPPING: Final[dict[str, dict[str, str]]] = {
|
||||
|
||||
Reference in New Issue
Block a user