WIP
This commit is contained in:
@@ -1,37 +1,20 @@
|
||||
"""
|
||||
BBox Scale Strategy Module.
|
||||
BBox Expansion Module.
|
||||
|
||||
Provides field-specific bounding box expansion strategies for YOLO training data.
|
||||
Expands bboxes using center-point scaling with directional compensation to capture
|
||||
field labels that typically appear above or to the left of field values.
|
||||
|
||||
Two modes are supported:
|
||||
- Auto-label: Field-specific scale strategies with directional compensation
|
||||
- Manual-label: Minimal padding only to prevent edge clipping
|
||||
Provides uniform bounding box expansion for YOLO training data.
|
||||
|
||||
Usage:
|
||||
from shared.bbox import expand_bbox, ScaleStrategy, FIELD_SCALE_STRATEGIES
|
||||
from shared.bbox import expand_bbox, UNIFORM_PAD
|
||||
|
||||
Available exports:
|
||||
- ScaleStrategy: Dataclass for scale strategy configuration
|
||||
- DEFAULT_STRATEGY: Default strategy for unknown fields (auto-label)
|
||||
- MANUAL_LABEL_STRATEGY: Minimal padding strategy for manual labels
|
||||
- FIELD_SCALE_STRATEGIES: dict[str, ScaleStrategy] - field-specific strategies
|
||||
- expand_bbox: Function to expand bbox using field-specific strategy
|
||||
- UNIFORM_PAD: Default uniform pixel padding (15px at 150 DPI)
|
||||
- expand_bbox: Function to expand bbox with uniform padding
|
||||
"""
|
||||
|
||||
from .scale_strategy import (
|
||||
ScaleStrategy,
|
||||
DEFAULT_STRATEGY,
|
||||
MANUAL_LABEL_STRATEGY,
|
||||
FIELD_SCALE_STRATEGIES,
|
||||
)
|
||||
from .scale_strategy import UNIFORM_PAD
|
||||
from .expander import expand_bbox
|
||||
|
||||
__all__ = [
|
||||
"ScaleStrategy",
|
||||
"DEFAULT_STRATEGY",
|
||||
"MANUAL_LABEL_STRATEGY",
|
||||
"FIELD_SCALE_STRATEGIES",
|
||||
"UNIFORM_PAD",
|
||||
"expand_bbox",
|
||||
]
|
||||
|
||||
@@ -1,101 +1,35 @@
|
||||
"""
|
||||
BBox Expander Module.
|
||||
|
||||
Provides functions to expand bounding boxes using field-specific strategies.
|
||||
Expansion is center-point based with directional compensation.
|
||||
|
||||
Two modes:
|
||||
- Auto-label (default): Field-specific scale strategies
|
||||
- Manual-label: Minimal padding only to prevent edge clipping
|
||||
Expands bounding boxes by a uniform pixel padding on all sides,
|
||||
clamped to image boundaries. No field-specific or directional logic.
|
||||
"""
|
||||
|
||||
from .scale_strategy import (
|
||||
ScaleStrategy,
|
||||
DEFAULT_STRATEGY,
|
||||
MANUAL_LABEL_STRATEGY,
|
||||
FIELD_SCALE_STRATEGIES,
|
||||
)
|
||||
from .scale_strategy import UNIFORM_PAD
|
||||
|
||||
|
||||
def expand_bbox(
|
||||
bbox: tuple[float, float, float, float],
|
||||
image_width: float,
|
||||
image_height: float,
|
||||
field_type: str,
|
||||
strategies: dict[str, ScaleStrategy] | None = None,
|
||||
manual_mode: bool = False,
|
||||
pad: int = UNIFORM_PAD,
|
||||
) -> tuple[int, int, int, int]:
|
||||
"""
|
||||
Expand bbox using field-specific scale strategy.
|
||||
|
||||
The expansion follows these steps:
|
||||
1. Scale bbox around center point (scale_x, scale_y)
|
||||
2. Apply directional compensation (extra_*_ratio)
|
||||
3. Clamp expansion to max_pad limits
|
||||
4. Clamp to image boundaries
|
||||
"""Expand bbox by uniform pixel padding, clamped to image bounds.
|
||||
|
||||
Args:
|
||||
bbox: (x0, y0, x1, y1) in pixels
|
||||
image_width: Image width for boundary clamping
|
||||
image_height: Image height for boundary clamping
|
||||
field_type: Field class_name (e.g., "ocr_number")
|
||||
strategies: Custom strategies dict, defaults to FIELD_SCALE_STRATEGIES
|
||||
manual_mode: If True, use MANUAL_LABEL_STRATEGY (minimal padding only)
|
||||
bbox: (x0, y0, x1, y1) in pixels.
|
||||
image_width: Image width for boundary clamping.
|
||||
image_height: Image height for boundary clamping.
|
||||
pad: Uniform pixel padding on all sides (default: UNIFORM_PAD).
|
||||
|
||||
Returns:
|
||||
Expanded bbox (x0, y0, x1, y1) as integers, clamped to image bounds
|
||||
Expanded bbox (x0, y0, x1, y1) as integers, clamped to image bounds.
|
||||
"""
|
||||
x0, y0, x1, y1 = bbox
|
||||
w = x1 - x0
|
||||
h = y1 - y0
|
||||
|
||||
# Get strategy based on mode
|
||||
if manual_mode:
|
||||
strategy = MANUAL_LABEL_STRATEGY
|
||||
elif strategies is None:
|
||||
strategy = FIELD_SCALE_STRATEGIES.get(field_type, DEFAULT_STRATEGY)
|
||||
else:
|
||||
strategy = strategies.get(field_type, DEFAULT_STRATEGY)
|
||||
|
||||
# Step 1: Scale around center point
|
||||
cx = (x0 + x1) / 2
|
||||
cy = (y0 + y1) / 2
|
||||
|
||||
new_w = w * strategy.scale_x
|
||||
new_h = h * strategy.scale_y
|
||||
|
||||
nx0 = cx - new_w / 2
|
||||
nx1 = cx + new_w / 2
|
||||
ny0 = cy - new_h / 2
|
||||
ny1 = cy + new_h / 2
|
||||
|
||||
# Step 2: Apply directional compensation
|
||||
nx0 -= w * strategy.extra_left_ratio
|
||||
nx1 += w * strategy.extra_right_ratio
|
||||
ny0 -= h * strategy.extra_top_ratio
|
||||
ny1 += h * strategy.extra_bottom_ratio
|
||||
|
||||
# Step 3: Clamp expansion to max_pad limits (preserve asymmetry)
|
||||
left_pad = min(x0 - nx0, strategy.max_pad_x)
|
||||
right_pad = min(nx1 - x1, strategy.max_pad_x)
|
||||
top_pad = min(y0 - ny0, strategy.max_pad_y)
|
||||
bottom_pad = min(ny1 - y1, strategy.max_pad_y)
|
||||
|
||||
# Ensure pads are non-negative (in case of contraction)
|
||||
left_pad = max(0, left_pad)
|
||||
right_pad = max(0, right_pad)
|
||||
top_pad = max(0, top_pad)
|
||||
bottom_pad = max(0, bottom_pad)
|
||||
|
||||
nx0 = x0 - left_pad
|
||||
nx1 = x1 + right_pad
|
||||
ny0 = y0 - top_pad
|
||||
ny1 = y1 + bottom_pad
|
||||
|
||||
# Step 4: Clamp to image boundaries
|
||||
nx0 = max(0, int(nx0))
|
||||
ny0 = max(0, int(ny0))
|
||||
nx1 = min(int(image_width), int(nx1))
|
||||
ny1 = min(int(image_height), int(ny1))
|
||||
nx0 = max(0, int(x0 - pad))
|
||||
ny0 = max(0, int(y0 - pad))
|
||||
nx1 = min(int(image_width), int(x1 + pad))
|
||||
ny1 = min(int(image_height), int(y1 + pad))
|
||||
|
||||
return (nx0, ny0, nx1, ny1)
|
||||
|
||||
@@ -1,140 +1,12 @@
|
||||
"""
|
||||
Scale Strategy Configuration.
|
||||
|
||||
Defines field-specific bbox expansion strategies for YOLO training data.
|
||||
Each strategy controls how bboxes are expanded around field values to
|
||||
capture contextual information like labels.
|
||||
Defines uniform bbox expansion padding for YOLO training data.
|
||||
All fields use the same fixed-pixel padding -- no layout assumptions.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Final
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ScaleStrategy:
|
||||
"""Immutable scale strategy for bbox expansion.
|
||||
|
||||
Attributes:
|
||||
scale_x: Horizontal scale factor (1.0 = no scaling)
|
||||
scale_y: Vertical scale factor (1.0 = no scaling)
|
||||
extra_top_ratio: Additional expansion ratio towards top (for labels above)
|
||||
extra_bottom_ratio: Additional expansion ratio towards bottom
|
||||
extra_left_ratio: Additional expansion ratio towards left (for prefixes)
|
||||
extra_right_ratio: Additional expansion ratio towards right (for suffixes)
|
||||
max_pad_x: Maximum horizontal padding in pixels
|
||||
max_pad_y: Maximum vertical padding in pixels
|
||||
"""
|
||||
|
||||
scale_x: float = 1.15
|
||||
scale_y: float = 1.15
|
||||
extra_top_ratio: float = 0.0
|
||||
extra_bottom_ratio: float = 0.0
|
||||
extra_left_ratio: float = 0.0
|
||||
extra_right_ratio: float = 0.0
|
||||
max_pad_x: int = 50
|
||||
max_pad_y: int = 50
|
||||
|
||||
|
||||
# Default strategy for unknown fields (auto-label mode)
|
||||
DEFAULT_STRATEGY: Final[ScaleStrategy] = ScaleStrategy()
|
||||
|
||||
# Manual label strategy - minimal padding to prevent edge clipping
|
||||
# No scaling, no directional compensation, just small uniform padding
|
||||
MANUAL_LABEL_STRATEGY: Final[ScaleStrategy] = ScaleStrategy(
|
||||
scale_x=1.0,
|
||||
scale_y=1.0,
|
||||
extra_top_ratio=0.0,
|
||||
extra_bottom_ratio=0.0,
|
||||
extra_left_ratio=0.0,
|
||||
extra_right_ratio=0.0,
|
||||
max_pad_x=10, # Small padding to prevent edge loss
|
||||
max_pad_y=10,
|
||||
)
|
||||
|
||||
|
||||
# Field-specific strategies based on Swedish invoice field characteristics
|
||||
# Field labels typically appear above or to the left of values
|
||||
FIELD_SCALE_STRATEGIES: Final[dict[str, ScaleStrategy]] = {
|
||||
# OCR number - label "OCR" or "Referens" typically above
|
||||
"ocr_number": ScaleStrategy(
|
||||
scale_x=1.15,
|
||||
scale_y=1.80,
|
||||
extra_top_ratio=0.60,
|
||||
max_pad_x=50,
|
||||
max_pad_y=140,
|
||||
),
|
||||
# Bankgiro - prefix "Bankgiro:" or "BG:" typically to the left
|
||||
"bankgiro": ScaleStrategy(
|
||||
scale_x=1.45,
|
||||
scale_y=1.35,
|
||||
extra_left_ratio=0.80,
|
||||
max_pad_x=160,
|
||||
max_pad_y=90,
|
||||
),
|
||||
# Plusgiro - prefix "Plusgiro:" or "PG:" typically to the left
|
||||
"plusgiro": ScaleStrategy(
|
||||
scale_x=1.45,
|
||||
scale_y=1.35,
|
||||
extra_left_ratio=0.80,
|
||||
max_pad_x=160,
|
||||
max_pad_y=90,
|
||||
),
|
||||
# Invoice date - label "Fakturadatum" typically above
|
||||
"invoice_date": ScaleStrategy(
|
||||
scale_x=1.25,
|
||||
scale_y=1.55,
|
||||
extra_top_ratio=0.40,
|
||||
max_pad_x=80,
|
||||
max_pad_y=110,
|
||||
),
|
||||
# Due date - label "Förfallodatum" typically above, sometimes left
|
||||
"invoice_due_date": ScaleStrategy(
|
||||
scale_x=1.30,
|
||||
scale_y=1.65,
|
||||
extra_top_ratio=0.45,
|
||||
extra_left_ratio=0.35,
|
||||
max_pad_x=100,
|
||||
max_pad_y=120,
|
||||
),
|
||||
# Amount - currency symbol "SEK" or "kr" may be to the right
|
||||
"amount": ScaleStrategy(
|
||||
scale_x=1.20,
|
||||
scale_y=1.35,
|
||||
extra_right_ratio=0.30,
|
||||
max_pad_x=70,
|
||||
max_pad_y=80,
|
||||
),
|
||||
# Invoice number - label "Fakturanummer" typically above
|
||||
"invoice_number": ScaleStrategy(
|
||||
scale_x=1.20,
|
||||
scale_y=1.50,
|
||||
extra_top_ratio=0.40,
|
||||
max_pad_x=80,
|
||||
max_pad_y=100,
|
||||
),
|
||||
# Supplier org number - label "Org.nr" typically above or left
|
||||
"supplier_org_number": ScaleStrategy(
|
||||
scale_x=1.25,
|
||||
scale_y=1.40,
|
||||
extra_top_ratio=0.30,
|
||||
extra_left_ratio=0.20,
|
||||
max_pad_x=90,
|
||||
max_pad_y=90,
|
||||
),
|
||||
# Customer number - label "Kundnummer" typically above or left
|
||||
"customer_number": ScaleStrategy(
|
||||
scale_x=1.25,
|
||||
scale_y=1.45,
|
||||
extra_top_ratio=0.35,
|
||||
extra_left_ratio=0.25,
|
||||
max_pad_x=90,
|
||||
max_pad_y=100,
|
||||
),
|
||||
# Payment line - machine-readable code, minimal expansion needed
|
||||
"payment_line": ScaleStrategy(
|
||||
scale_x=1.10,
|
||||
scale_y=1.20,
|
||||
max_pad_x=40,
|
||||
max_pad_y=40,
|
||||
),
|
||||
}
|
||||
# 15px at 150 DPI = ~2.5mm real-world padding around text.
|
||||
# Enough for OCR safety margin without capturing neighboring label text.
|
||||
UNIFORM_PAD: Final[int] = 15
|
||||
|
||||
@@ -17,7 +17,7 @@ class TrainingConfig:
|
||||
"""Training configuration."""
|
||||
|
||||
# Model settings
|
||||
model_path: str = "yolo11n.pt" # Base model or path to trained model
|
||||
model_path: str = "yolo26s.pt" # Base model or path to trained model
|
||||
data_yaml: str = "" # Path to data.yaml
|
||||
|
||||
# Training hyperparameters
|
||||
@@ -39,6 +39,10 @@ class TrainingConfig:
|
||||
resume: bool = False
|
||||
resume_from: str | None = None # Path to checkpoint
|
||||
|
||||
# Fine-tuning specific
|
||||
freeze: int = 0 # Number of backbone layers to freeze (0 = none)
|
||||
cos_lr: bool = False # Use cosine learning rate scheduler
|
||||
|
||||
# Document-specific augmentation (optimized for invoices)
|
||||
augmentation: dict[str, Any] = field(default_factory=lambda: {
|
||||
"degrees": 5.0,
|
||||
@@ -106,7 +110,7 @@ class YOLOTrainer:
|
||||
# Check model path
|
||||
model_path = Path(self.config.model_path)
|
||||
if not model_path.suffix == ".pt":
|
||||
# Could be a model name like "yolo11n.pt" which is downloaded
|
||||
# No ".pt" suffix: could still be a bare model name like "yolo26s" which is downloaded
|
||||
if not model_path.name.startswith("yolo"):
|
||||
return False, f"Invalid model: {self.config.model_path}"
|
||||
elif not model_path.exists():
|
||||
@@ -147,6 +151,10 @@ class YOLOTrainer:
|
||||
self._log("INFO", f" Epochs: {self.config.epochs}")
|
||||
self._log("INFO", f" Batch size: {self.config.batch_size}")
|
||||
self._log("INFO", f" Image size: {self.config.image_size}")
|
||||
if self.config.freeze > 0:
|
||||
self._log("INFO", f" Freeze layers: {self.config.freeze}")
|
||||
if self.config.cos_lr:
|
||||
self._log("INFO", f" Cosine LR: enabled")
|
||||
|
||||
try:
|
||||
# Load model
|
||||
@@ -178,6 +186,12 @@ class YOLOTrainer:
|
||||
"resume": self.config.resume and self.config.resume_from is not None,
|
||||
}
|
||||
|
||||
# Add fine-tuning settings
|
||||
if self.config.freeze > 0:
|
||||
train_args["freeze"] = self.config.freeze
|
||||
if self.config.cos_lr:
|
||||
train_args["cos_lr"] = True
|
||||
|
||||
# Add augmentation settings
|
||||
train_args.update(self.config.augmentation)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user