This commit is contained in:
Yaojia Wang
2026-02-11 23:40:38 +01:00
parent f1a7bfe6b7
commit ad5ed46b4c
117 changed files with 5741 additions and 7669 deletions

View File

@@ -1,37 +1,20 @@
"""
BBox Scale Strategy Module.
BBox Expansion Module.
Provides field-specific bounding box expansion strategies for YOLO training data.
Expands bboxes using center-point scaling with directional compensation to capture
field labels that typically appear above or to the left of field values.
Two modes are supported:
- Auto-label: Field-specific scale strategies with directional compensation
- Manual-label: Minimal padding only to prevent edge clipping
Provides uniform bounding box expansion for YOLO training data.
Usage:
from shared.bbox import expand_bbox, ScaleStrategy, FIELD_SCALE_STRATEGIES
from shared.bbox import expand_bbox, UNIFORM_PAD
Available exports:
- ScaleStrategy: Dataclass for scale strategy configuration
- DEFAULT_STRATEGY: Default strategy for unknown fields (auto-label)
- MANUAL_LABEL_STRATEGY: Minimal padding strategy for manual labels
- FIELD_SCALE_STRATEGIES: dict[str, ScaleStrategy] - field-specific strategies
- expand_bbox: Function to expand bbox using field-specific strategy
- UNIFORM_PAD: Default uniform pixel padding (15px at 150 DPI)
- expand_bbox: Function to expand bbox with uniform padding
"""
from .scale_strategy import (
ScaleStrategy,
DEFAULT_STRATEGY,
MANUAL_LABEL_STRATEGY,
FIELD_SCALE_STRATEGIES,
)
from .scale_strategy import UNIFORM_PAD
from .expander import expand_bbox
__all__ = [
"ScaleStrategy",
"DEFAULT_STRATEGY",
"MANUAL_LABEL_STRATEGY",
"FIELD_SCALE_STRATEGIES",
"UNIFORM_PAD",
"expand_bbox",
]

View File

@@ -1,101 +1,35 @@
"""
BBox Expander Module.
Provides functions to expand bounding boxes using field-specific strategies.
Expansion is center-point based with directional compensation.
Two modes:
- Auto-label (default): Field-specific scale strategies
- Manual-label: Minimal padding only to prevent edge clipping
Expands bounding boxes by a uniform pixel padding on all sides,
clamped to image boundaries. No field-specific or directional logic.
"""
from .scale_strategy import (
ScaleStrategy,
DEFAULT_STRATEGY,
MANUAL_LABEL_STRATEGY,
FIELD_SCALE_STRATEGIES,
)
from .scale_strategy import UNIFORM_PAD
def expand_bbox(
bbox: tuple[float, float, float, float],
image_width: float,
image_height: float,
field_type: str,
strategies: dict[str, ScaleStrategy] | None = None,
manual_mode: bool = False,
pad: int = UNIFORM_PAD,
) -> tuple[int, int, int, int]:
"""
Expand bbox using field-specific scale strategy.
The expansion follows these steps:
1. Scale bbox around center point (scale_x, scale_y)
2. Apply directional compensation (extra_*_ratio)
3. Clamp expansion to max_pad limits
4. Clamp to image boundaries
"""Expand bbox by uniform pixel padding, clamped to image bounds.
Args:
bbox: (x0, y0, x1, y1) in pixels
image_width: Image width for boundary clamping
image_height: Image height for boundary clamping
field_type: Field class_name (e.g., "ocr_number")
strategies: Custom strategies dict, defaults to FIELD_SCALE_STRATEGIES
manual_mode: If True, use MANUAL_LABEL_STRATEGY (minimal padding only)
bbox: (x0, y0, x1, y1) in pixels.
image_width: Image width for boundary clamping.
image_height: Image height for boundary clamping.
pad: Uniform pixel padding on all sides (default: UNIFORM_PAD).
Returns:
Expanded bbox (x0, y0, x1, y1) as integers, clamped to image bounds
Expanded bbox (x0, y0, x1, y1) as integers, clamped to image bounds.
"""
x0, y0, x1, y1 = bbox
w = x1 - x0
h = y1 - y0
# Get strategy based on mode
if manual_mode:
strategy = MANUAL_LABEL_STRATEGY
elif strategies is None:
strategy = FIELD_SCALE_STRATEGIES.get(field_type, DEFAULT_STRATEGY)
else:
strategy = strategies.get(field_type, DEFAULT_STRATEGY)
# Step 1: Scale around center point
cx = (x0 + x1) / 2
cy = (y0 + y1) / 2
new_w = w * strategy.scale_x
new_h = h * strategy.scale_y
nx0 = cx - new_w / 2
nx1 = cx + new_w / 2
ny0 = cy - new_h / 2
ny1 = cy + new_h / 2
# Step 2: Apply directional compensation
nx0 -= w * strategy.extra_left_ratio
nx1 += w * strategy.extra_right_ratio
ny0 -= h * strategy.extra_top_ratio
ny1 += h * strategy.extra_bottom_ratio
# Step 3: Clamp expansion to max_pad limits (preserve asymmetry)
left_pad = min(x0 - nx0, strategy.max_pad_x)
right_pad = min(nx1 - x1, strategy.max_pad_x)
top_pad = min(y0 - ny0, strategy.max_pad_y)
bottom_pad = min(ny1 - y1, strategy.max_pad_y)
# Ensure pads are non-negative (in case of contraction)
left_pad = max(0, left_pad)
right_pad = max(0, right_pad)
top_pad = max(0, top_pad)
bottom_pad = max(0, bottom_pad)
nx0 = x0 - left_pad
nx1 = x1 + right_pad
ny0 = y0 - top_pad
ny1 = y1 + bottom_pad
# Step 4: Clamp to image boundaries
nx0 = max(0, int(nx0))
ny0 = max(0, int(ny0))
nx1 = min(int(image_width), int(nx1))
ny1 = min(int(image_height), int(ny1))
nx0 = max(0, int(x0 - pad))
ny0 = max(0, int(y0 - pad))
nx1 = min(int(image_width), int(x1 + pad))
ny1 = min(int(image_height), int(y1 + pad))
return (nx0, ny0, nx1, ny1)

View File

@@ -1,140 +1,12 @@
"""
Scale Strategy Configuration.
Defines field-specific bbox expansion strategies for YOLO training data.
Each strategy controls how bboxes are expanded around field values to
capture contextual information like labels.
Defines uniform bbox expansion padding for YOLO training data.
All fields use the same fixed-pixel padding -- no layout assumptions.
"""
from dataclasses import dataclass
from typing import Final
@dataclass(frozen=True)
class ScaleStrategy:
"""Immutable scale strategy for bbox expansion.
Attributes:
scale_x: Horizontal scale factor (1.0 = no scaling)
scale_y: Vertical scale factor (1.0 = no scaling)
extra_top_ratio: Additional expansion ratio towards top (for labels above)
extra_bottom_ratio: Additional expansion ratio towards bottom
extra_left_ratio: Additional expansion ratio towards left (for prefixes)
extra_right_ratio: Additional expansion ratio towards right (for suffixes)
max_pad_x: Maximum horizontal padding in pixels
max_pad_y: Maximum vertical padding in pixels
"""
scale_x: float = 1.15
scale_y: float = 1.15
extra_top_ratio: float = 0.0
extra_bottom_ratio: float = 0.0
extra_left_ratio: float = 0.0
extra_right_ratio: float = 0.0
max_pad_x: int = 50
max_pad_y: int = 50
# Default strategy for unknown fields (auto-label mode)
DEFAULT_STRATEGY: Final[ScaleStrategy] = ScaleStrategy()
# Manual label strategy - minimal padding to prevent edge clipping
# No scaling, no directional compensation, just small uniform padding
MANUAL_LABEL_STRATEGY: Final[ScaleStrategy] = ScaleStrategy(
scale_x=1.0,
scale_y=1.0,
extra_top_ratio=0.0,
extra_bottom_ratio=0.0,
extra_left_ratio=0.0,
extra_right_ratio=0.0,
max_pad_x=10, # Small padding to prevent edge loss
max_pad_y=10,
)
# Field-specific strategies based on Swedish invoice field characteristics
# Field labels typically appear above or to the left of values
FIELD_SCALE_STRATEGIES: Final[dict[str, ScaleStrategy]] = {
# OCR number - label "OCR" or "Referens" typically above
"ocr_number": ScaleStrategy(
scale_x=1.15,
scale_y=1.80,
extra_top_ratio=0.60,
max_pad_x=50,
max_pad_y=140,
),
# Bankgiro - prefix "Bankgiro:" or "BG:" typically to the left
"bankgiro": ScaleStrategy(
scale_x=1.45,
scale_y=1.35,
extra_left_ratio=0.80,
max_pad_x=160,
max_pad_y=90,
),
# Plusgiro - prefix "Plusgiro:" or "PG:" typically to the left
"plusgiro": ScaleStrategy(
scale_x=1.45,
scale_y=1.35,
extra_left_ratio=0.80,
max_pad_x=160,
max_pad_y=90,
),
# Invoice date - label "Fakturadatum" typically above
"invoice_date": ScaleStrategy(
scale_x=1.25,
scale_y=1.55,
extra_top_ratio=0.40,
max_pad_x=80,
max_pad_y=110,
),
# Due date - label "Förfallodatum" typically above, sometimes left
"invoice_due_date": ScaleStrategy(
scale_x=1.30,
scale_y=1.65,
extra_top_ratio=0.45,
extra_left_ratio=0.35,
max_pad_x=100,
max_pad_y=120,
),
# Amount - currency symbol "SEK" or "kr" may be to the right
"amount": ScaleStrategy(
scale_x=1.20,
scale_y=1.35,
extra_right_ratio=0.30,
max_pad_x=70,
max_pad_y=80,
),
# Invoice number - label "Fakturanummer" typically above
"invoice_number": ScaleStrategy(
scale_x=1.20,
scale_y=1.50,
extra_top_ratio=0.40,
max_pad_x=80,
max_pad_y=100,
),
# Supplier org number - label "Org.nr" typically above or left
"supplier_org_number": ScaleStrategy(
scale_x=1.25,
scale_y=1.40,
extra_top_ratio=0.30,
extra_left_ratio=0.20,
max_pad_x=90,
max_pad_y=90,
),
# Customer number - label "Kundnummer" typically above or left
"customer_number": ScaleStrategy(
scale_x=1.25,
scale_y=1.45,
extra_top_ratio=0.35,
extra_left_ratio=0.25,
max_pad_x=90,
max_pad_y=100,
),
# Payment line - machine-readable code, minimal expansion needed
"payment_line": ScaleStrategy(
scale_x=1.10,
scale_y=1.20,
max_pad_x=40,
max_pad_y=40,
),
}
# 15px at 150 DPI = ~2.5mm real-world padding around text.
# Enough for OCR safety margin without capturing neighboring label text.
UNIFORM_PAD: Final[int] = 15

View File

@@ -17,7 +17,7 @@ class TrainingConfig:
"""Training configuration."""
# Model settings
model_path: str = "yolo11n.pt" # Base model or path to trained model
model_path: str = "yolo26s.pt" # Base model or path to trained model
data_yaml: str = "" # Path to data.yaml
# Training hyperparameters
@@ -39,6 +39,10 @@ class TrainingConfig:
resume: bool = False
resume_from: str | None = None # Path to checkpoint
# Fine-tuning specific
freeze: int = 0 # Number of backbone layers to freeze (0 = none)
cos_lr: bool = False # Use cosine learning rate scheduler
# Document-specific augmentation (optimized for invoices)
augmentation: dict[str, Any] = field(default_factory=lambda: {
"degrees": 5.0,
@@ -106,7 +110,7 @@ class YOLOTrainer:
# Check model path
model_path = Path(self.config.model_path)
if not model_path.suffix == ".pt":
# Could be a bare model name like "yolo11n" (no ".pt" suffix) which is downloaded
# Could be a bare model name like "yolo26s" (no ".pt" suffix) which is downloaded
if not model_path.name.startswith("yolo"):
return False, f"Invalid model: {self.config.model_path}"
elif not model_path.exists():
@@ -147,6 +151,10 @@ class YOLOTrainer:
self._log("INFO", f" Epochs: {self.config.epochs}")
self._log("INFO", f" Batch size: {self.config.batch_size}")
self._log("INFO", f" Image size: {self.config.image_size}")
if self.config.freeze > 0:
self._log("INFO", f" Freeze layers: {self.config.freeze}")
if self.config.cos_lr:
self._log("INFO", f" Cosine LR: enabled")
try:
# Load model
@@ -178,6 +186,12 @@ class YOLOTrainer:
"resume": self.config.resume and self.config.resume_from is not None,
}
# Add fine-tuning settings
if self.config.freeze > 0:
train_args["freeze"] = self.config.freeze
if self.config.cos_lr:
train_args["cos_lr"] = True
# Add augmentation settings
train_args.update(self.config.augmentation)