WIP

2026-02-11 23:40:38 +01:00
parent f1a7bfe6b7
commit ad5ed46b4c
117 changed files with 5741 additions and 7669 deletions
--- a/packages/shared/shared/bbox/init.py
+++ b/packages/shared/shared/bbox/init.py
@@ -1,37 +1,20 @@
 """
-BBox Scale Strategy Module.
+BBox Expansion Module.

-Provides field-specific bounding box expansion strategies for YOLO training data.
-Expands bboxes using center-point scaling with directional compensation to capture
-field labels that typically appear above or to the left of field values.
-
-Two modes are supported:
- Auto-label: Field-specific scale strategies with directional compensation
- Manual-label: Minimal padding only to prevent edge clipping
+Provides uniform bounding box expansion for YOLO training data.

 Usage:
-    from shared.bbox import expand_bbox, ScaleStrategy, FIELD_SCALE_STRATEGIES
+    from shared.bbox import expand_bbox, UNIFORM_PAD

 Available exports:
-    - ScaleStrategy: Dataclass for scale strategy configuration
-    - DEFAULT_STRATEGY: Default strategy for unknown fields (auto-label)
-    - MANUAL_LABEL_STRATEGY: Minimal padding strategy for manual labels
-    - FIELD_SCALE_STRATEGIES: dict[str, ScaleStrategy] - field-specific strategies
-    - expand_bbox: Function to expand bbox using field-specific strategy
+    - UNIFORM_PAD: Default uniform pixel padding (15px at 150 DPI)
+    - expand_bbox: Function to expand bbox with uniform padding
 """

-from .scale_strategy import (
-    ScaleStrategy,
-    DEFAULT_STRATEGY,
-    MANUAL_LABEL_STRATEGY,
-    FIELD_SCALE_STRATEGIES,
-)
+from .scale_strategy import UNIFORM_PAD
 from .expander import expand_bbox

 __all__ = [
-    "ScaleStrategy",
-    "DEFAULT_STRATEGY",
-    "MANUAL_LABEL_STRATEGY",
-    "FIELD_SCALE_STRATEGIES",
+    "UNIFORM_PAD",
    "expand_bbox",
 ]
--- a/packages/shared/shared/bbox/expander.py
+++ b/packages/shared/shared/bbox/expander.py
@@ -1,101 +1,35 @@
 """
 BBox Expander Module.

-Provides functions to expand bounding boxes using field-specific strategies.
-Expansion is center-point based with directional compensation.
-
-Two modes:
- Auto-label (default): Field-specific scale strategies
- Manual-label: Minimal padding only to prevent edge clipping
+Expands bounding boxes by a uniform pixel padding on all sides,
+clamped to image boundaries. No field-specific or directional logic.
 """

-from .scale_strategy import (
-    ScaleStrategy,
-    DEFAULT_STRATEGY,
-    MANUAL_LABEL_STRATEGY,
-    FIELD_SCALE_STRATEGIES,
-)
+from .scale_strategy import UNIFORM_PAD


 def expand_bbox(
    bbox: tuple[float, float, float, float],
    image_width: float,
    image_height: float,
-    field_type: str,
-    strategies: dict[str, ScaleStrategy] | None = None,
-    manual_mode: bool = False,
+    pad: int = UNIFORM_PAD,
 ) -> tuple[int, int, int, int]:
-    """
-    Expand bbox using field-specific scale strategy.
-
-    The expansion follows these steps:
-    1. Scale bbox around center point (scale_x, scale_y)
-    2. Apply directional compensation (extra_*_ratio)
-    3. Clamp expansion to max_pad limits
-    4. Clamp to image boundaries
+    """Expand bbox by uniform pixel padding, clamped to image bounds.

    Args:
-        bbox: (x0, y0, x1, y1) in pixels
-        image_width: Image width for boundary clamping
-        image_height: Image height for boundary clamping
-        field_type: Field class_name (e.g., "ocr_number")
-        strategies: Custom strategies dict, defaults to FIELD_SCALE_STRATEGIES
-        manual_mode: If True, use MANUAL_LABEL_STRATEGY (minimal padding only)
+        bbox: (x0, y0, x1, y1) in pixels.
+        image_width: Image width for boundary clamping.
+        image_height: Image height for boundary clamping.
+        pad: Uniform pixel padding on all sides (default: UNIFORM_PAD).

    Returns:
-        Expanded bbox (x0, y0, x1, y1) as integers, clamped to image bounds
+        Expanded bbox (x0, y0, x1, y1) as integers, clamped to image bounds.
    """
    x0, y0, x1, y1 = bbox
-    w = x1 - x0
-    h = y1 - y0

-    # Get strategy based on mode
-    if manual_mode:
-        strategy = MANUAL_LABEL_STRATEGY
-    elif strategies is None:
-        strategy = FIELD_SCALE_STRATEGIES.get(field_type, DEFAULT_STRATEGY)
-    else:
-        strategy = strategies.get(field_type, DEFAULT_STRATEGY)
-
-    # Step 1: Scale around center point
-    cx = (x0 + x1) / 2
-    cy = (y0 + y1) / 2
-
-    new_w = w * strategy.scale_x
-    new_h = h * strategy.scale_y
-
-    nx0 = cx - new_w / 2
-    nx1 = cx + new_w / 2
-    ny0 = cy - new_h / 2
-    ny1 = cy + new_h / 2
-
-    # Step 2: Apply directional compensation
-    nx0 -= w * strategy.extra_left_ratio
-    nx1 += w * strategy.extra_right_ratio
-    ny0 -= h * strategy.extra_top_ratio
-    ny1 += h * strategy.extra_bottom_ratio
-
-    # Step 3: Clamp expansion to max_pad limits (preserve asymmetry)
-    left_pad = min(x0 - nx0, strategy.max_pad_x)
-    right_pad = min(nx1 - x1, strategy.max_pad_x)
-    top_pad = min(y0 - ny0, strategy.max_pad_y)
-    bottom_pad = min(ny1 - y1, strategy.max_pad_y)
-
-    # Ensure pads are non-negative (in case of contraction)
-    left_pad = max(0, left_pad)
-    right_pad = max(0, right_pad)
-    top_pad = max(0, top_pad)
-    bottom_pad = max(0, bottom_pad)
-
-    nx0 = x0 - left_pad
-    nx1 = x1 + right_pad
-    ny0 = y0 - top_pad
-    ny1 = y1 + bottom_pad
-
-    # Step 4: Clamp to image boundaries
-    nx0 = max(0, int(nx0))
-    ny0 = max(0, int(ny0))
-    nx1 = min(int(image_width), int(nx1))
-    ny1 = min(int(image_height), int(ny1))
+    nx0 = max(0, int(x0 - pad))
+    ny0 = max(0, int(y0 - pad))
+    nx1 = min(int(image_width), int(x1 + pad))
+    ny1 = min(int(image_height), int(y1 + pad))

    return (nx0, ny0, nx1, ny1)
--- a/packages/shared/shared/bbox/scale_strategy.py
+++ b/packages/shared/shared/bbox/scale_strategy.py
@@ -1,140 +1,12 @@
 """
 Scale Strategy Configuration.

-Defines field-specific bbox expansion strategies for YOLO training data.
-Each strategy controls how bboxes are expanded around field values to
-capture contextual information like labels.
+Defines uniform bbox expansion padding for YOLO training data.
+All fields use the same fixed-pixel padding -- no layout assumptions.
 """

-from dataclasses import dataclass
 from typing import Final

-
-@dataclass(frozen=True)
-class ScaleStrategy:
-    """Immutable scale strategy for bbox expansion.
-
-    Attributes:
-        scale_x: Horizontal scale factor (1.0 = no scaling)
-        scale_y: Vertical scale factor (1.0 = no scaling)
-        extra_top_ratio: Additional expansion ratio towards top (for labels above)
-        extra_bottom_ratio: Additional expansion ratio towards bottom
-        extra_left_ratio: Additional expansion ratio towards left (for prefixes)
-        extra_right_ratio: Additional expansion ratio towards right (for suffixes)
-        max_pad_x: Maximum horizontal padding in pixels
-        max_pad_y: Maximum vertical padding in pixels
-    """
-
-    scale_x: float = 1.15
-    scale_y: float = 1.15
-    extra_top_ratio: float = 0.0
-    extra_bottom_ratio: float = 0.0
-    extra_left_ratio: float = 0.0
-    extra_right_ratio: float = 0.0
-    max_pad_x: int = 50
-    max_pad_y: int = 50
-
-
-# Default strategy for unknown fields (auto-label mode)
-DEFAULT_STRATEGY: Final[ScaleStrategy] = ScaleStrategy()
-
-# Manual label strategy - minimal padding to prevent edge clipping
-# No scaling, no directional compensation, just small uniform padding
-MANUAL_LABEL_STRATEGY: Final[ScaleStrategy] = ScaleStrategy(
-    scale_x=1.0,
-    scale_y=1.0,
-    extra_top_ratio=0.0,
-    extra_bottom_ratio=0.0,
-    extra_left_ratio=0.0,
-    extra_right_ratio=0.0,
-    max_pad_x=10,  # Small padding to prevent edge loss
-    max_pad_y=10,
-)
-
-
-# Field-specific strategies based on Swedish invoice field characteristics
-# Field labels typically appear above or to the left of values
-FIELD_SCALE_STRATEGIES: Final[dict[str, ScaleStrategy]] = {
-    # OCR number - label "OCR" or "Referens" typically above
-    "ocr_number": ScaleStrategy(
-        scale_x=1.15,
-        scale_y=1.80,
-        extra_top_ratio=0.60,
-        max_pad_x=50,
-        max_pad_y=140,
-    ),
-    # Bankgiro - prefix "Bankgiro:" or "BG:" typically to the left
-    "bankgiro": ScaleStrategy(
-        scale_x=1.45,
-        scale_y=1.35,
-        extra_left_ratio=0.80,
-        max_pad_x=160,
-        max_pad_y=90,
-    ),
-    # Plusgiro - prefix "Plusgiro:" or "PG:" typically to the left
-    "plusgiro": ScaleStrategy(
-        scale_x=1.45,
-        scale_y=1.35,
-        extra_left_ratio=0.80,
-        max_pad_x=160,
-        max_pad_y=90,
-    ),
-    # Invoice date - label "Fakturadatum" typically above
-    "invoice_date": ScaleStrategy(
-        scale_x=1.25,
-        scale_y=1.55,
-        extra_top_ratio=0.40,
-        max_pad_x=80,
-        max_pad_y=110,
-    ),
-    # Due date - label "Forfalldatum" typically above, sometimes left
-    "invoice_due_date": ScaleStrategy(
-        scale_x=1.30,
-        scale_y=1.65,
-        extra_top_ratio=0.45,
-        extra_left_ratio=0.35,
-        max_pad_x=100,
-        max_pad_y=120,
-    ),
-    # Amount - currency symbol "SEK" or "kr" may be to the right
-    "amount": ScaleStrategy(
-        scale_x=1.20,
-        scale_y=1.35,
-        extra_right_ratio=0.30,
-        max_pad_x=70,
-        max_pad_y=80,
-    ),
-    # Invoice number - label "Fakturanummer" typically above
-    "invoice_number": ScaleStrategy(
-        scale_x=1.20,
-        scale_y=1.50,
-        extra_top_ratio=0.40,
-        max_pad_x=80,
-        max_pad_y=100,
-    ),
-    # Supplier org number - label "Org.nr" typically above or left
-    "supplier_org_number": ScaleStrategy(
-        scale_x=1.25,
-        scale_y=1.40,
-        extra_top_ratio=0.30,
-        extra_left_ratio=0.20,
-        max_pad_x=90,
-        max_pad_y=90,
-    ),
-    # Customer number - label "Kundnummer" typically above or left
-    "customer_number": ScaleStrategy(
-        scale_x=1.25,
-        scale_y=1.45,
-        extra_top_ratio=0.35,
-        extra_left_ratio=0.25,
-        max_pad_x=90,
-        max_pad_y=100,
-    ),
-    # Payment line - machine-readable code, minimal expansion needed
-    "payment_line": ScaleStrategy(
-        scale_x=1.10,
-        scale_y=1.20,
-        max_pad_x=40,
-        max_pad_y=40,
-    ),
-}
+# 15px at 150 DPI = ~2.5mm real-world padding around text.
+# Enough for OCR safety margin without capturing neighboring label text.
+UNIFORM_PAD: Final[int] = 15
--- a/packages/shared/shared/training/yolo_trainer.py
+++ b/packages/shared/shared/training/yolo_trainer.py
@@ -17,7 +17,7 @@ class TrainingConfig:
    """Training configuration."""

    # Model settings
-    model_path: str = "yolo11n.pt"  # Base model or path to trained model
+    model_path: str = "yolo26s.pt"  # Base model or path to trained model
    data_yaml: str = ""  # Path to data.yaml

    # Training hyperparameters
@@ -39,6 +39,10 @@ class TrainingConfig:
    resume: bool = False
    resume_from: str | None = None  # Path to checkpoint

+    # Fine-tuning specific
+    freeze: int = 0  # Number of backbone layers to freeze (0 = none)
+    cos_lr: bool = False  # Use cosine learning rate scheduler
+
    # Document-specific augmentation (optimized for invoices)
    augmentation: dict[str, Any] = field(default_factory=lambda: {
        "degrees": 5.0,
@@ -106,7 +110,7 @@ class YOLOTrainer:
        # Check model path
        model_path = Path(self.config.model_path)
        if not model_path.suffix == ".pt":
-            # Could be a model name like "yolo11n.pt" which is downloaded
+            # No ".pt" suffix: may be a bare model name like "yolo26s" (presumably downloaded on load)
            if not model_path.name.startswith("yolo"):
                return False, f"Invalid model: {self.config.model_path}"
        elif not model_path.exists():
@@ -147,6 +151,10 @@ class YOLOTrainer:
        self._log("INFO", f"  Epochs: {self.config.epochs}")
        self._log("INFO", f"  Batch size: {self.config.batch_size}")
        self._log("INFO", f"  Image size: {self.config.image_size}")
+        if self.config.freeze > 0:
+            self._log("INFO", f"  Freeze layers: {self.config.freeze}")
+        if self.config.cos_lr:
+            self._log("INFO", f"  Cosine LR: enabled")

        try:
            # Load model
@@ -178,6 +186,12 @@ class YOLOTrainer:
                "resume": self.config.resume and self.config.resume_from is not None,
            }

+            # Add fine-tuning settings
+            if self.config.freeze > 0:
+                train_args["freeze"] = self.config.freeze
+            if self.config.cos_lr:
+                train_args["cos_lr"] = True
+
            # Add augmentation settings
            train_args.update(self.config.augmentation)