Files
smart-support/backend/app/safety.py
Yaojia Wang 036e12349d refactor: formalize safety rules, extract shared styles, reconcile docs (P2)
- Add backend/app/safety.py with explicit confirmation policy, multi-intent
  semantics, and MCP error taxonomy with retry classification
- Add 26 unit tests for safety module (confirmation rules, error taxonomy)
- Extract repeated inline styles into shared CSS classes in index.css
  (section-card, stat-label, status-badge, data-table, empty/error-state,
  pagination-bar)
- Refactor DashboardPage, ReplayListPage, ReplayPage to use shared classes
- Update README: add missing API endpoints, document safety/confirmation rules
- Use proper HTML entities for arrow/dash characters to fix encoding glitches
2026-04-05 23:10:50 +02:00

132 lines
4.2 KiB
Python

"""Safety policy for destructive-action confirmation rules.
This module makes the confirmation rules explicit and auditable. Every tool
call passes through ``requires_confirmation`` before execution to decide
whether human-in-the-loop approval is needed.
Policy summary
--------------
- ``read`` actions: execute immediately, no confirmation required.
- ``write`` actions: require human approval via interrupt gate.
- OpenAPI-imported endpoints: use ``needs_interrupt`` from classification.
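- If both the agent permission AND the endpoint classification agree that
the action is read-only, it executes without confirmation. When a
classification is present it takes precedence: ``requires_confirmation``
consults the agent permission only if ``needs_interrupt`` is ``None``.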
Multi-intent semantics
----------------------
When a user message contains multiple intents (e.g. "cancel my order and
apply a refund"), the supervisor routes them sequentially. Each action is
evaluated independently:
- If a write action is blocked by an interrupt, subsequent actions in the
same message are paused until the interrupt is resolved.
- Read actions that follow a blocked write are also paused (sequential,
not best-effort) to preserve causal ordering.
- If an interrupt is rejected, the remaining actions are skipped and the
agent informs the user.
MCP error taxonomy
------------------
Tool execution errors are classified into categories for retry decisions:
- ``transient``: network timeouts, rate limits, HTTP 408/429/500/502/503/504
-- retryable up to 3 times.
- ``validation``: bad parameters, any other 4xx (excluding 401/403) -- not
retryable, report to user.
- ``auth``: 401/403 -- not retryable, escalate.
- ``unknown``: unclassified -- not retryable, log and escalate.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Literal
@dataclass(frozen=True)
class ConfirmationPolicy:
    """Immutable verdict on whether an action must wait for confirmation.

    Pairs the yes/no decision with a human-readable justification so the
    outcome of a confirmation check can be reported alongside its cause.
    """

    # True when the action has to be approved by a human before it runs.
    requires_confirmation: bool

    # Short explanation of why the decision above was reached.
    reason: str
def requires_confirmation(
    *,
    agent_permission: Literal["read", "write"],
    needs_interrupt: bool | None = None,
) -> ConfirmationPolicy:
    """Determine whether an action requires human confirmation.

    Decision order:

    1. If ``needs_interrupt`` is not ``None`` it decides the outcome on its
       own; ``agent_permission`` is not consulted. In particular,
       ``needs_interrupt=False`` skips confirmation even for a ``"write"``
       agent.
    2. Otherwise ``"read"`` executes immediately and ``"write"`` requires
       confirmation.
    3. Any other permission value requires confirmation (fail closed), so
       a typo or an unexpected value can never silently bypass the gate.

    Parameters
    ----------
    agent_permission:
        The permission level of the agent executing the action.
    needs_interrupt:
        Override from OpenAPI classification. When ``None``, the decision
        is based solely on ``agent_permission``.

    Returns
    -------
    ConfirmationPolicy
        The decision together with a human-readable reason.
    """
    # An explicit endpoint classification wins over the agent permission.
    if needs_interrupt is not None:
        if needs_interrupt:
            return ConfirmationPolicy(
                requires_confirmation=True,
                reason="Endpoint classified as requiring human approval",
            )
        return ConfirmationPolicy(
            requires_confirmation=False,
            reason="Endpoint classified as safe (no interrupt needed)",
        )

    if agent_permission == "write":
        return ConfirmationPolicy(
            requires_confirmation=True,
            reason="Write-permission agent actions require confirmation",
        )
    if agent_permission == "read":
        return ConfirmationPolicy(
            requires_confirmation=False,
            reason="Read-only agent actions execute immediately",
        )

    # Fail closed: the type hint is not enforced at runtime, and treating an
    # unknown permission as read-only would let it skip human approval.
    return ConfirmationPolicy(
        requires_confirmation=True,
        reason=(
            f"Unrecognized agent permission {agent_permission!r}; "
            "confirmation required by default"
        ),
    )
# --- MCP Error Taxonomy ---

# Closed set of labels a tool-execution failure can be sorted into.
MCP_ERROR_CATEGORY = Literal["transient", "validation", "auth", "unknown"]

# HTTP statuses classified as "transient": request timeout (408), rate
# limiting (429) and the listed server/gateway errors.
_TRANSIENT_STATUS_CODES = frozenset((408, 429, 500, 502, 503, 504))

# HTTP statuses classified as "auth".
_AUTH_STATUS_CODES = frozenset((401, 403))

# Retry ceiling reported by ``max_retries``.
_MAX_RETRIES = 3
def classify_mcp_error(
    *,
    status_code: int | None = None,
    error_message: str = "",
) -> MCP_ERROR_CATEGORY:
    """Classify an MCP tool error for retry decisions.

    The status code is examined first. If it is absent, or matches none of
    the status rules (e.g. a 5xx that is not in the transient set), the
    error message is searched case-insensitively for known keywords.

    Parameters
    ----------
    status_code:
        HTTP status of the failed call, if one is available.
    error_message:
        Free-form error text. ``None`` is tolerated and treated as empty.

    Returns
    -------
    MCP_ERROR_CATEGORY
        ``"transient"``, ``"auth"``, ``"validation"`` or ``"unknown"``.
    """
    if status_code is not None:
        if status_code in _TRANSIENT_STATUS_CODES:
            return "transient"
        if status_code in _AUTH_STATUS_CODES:
            return "auth"
        # Checked after the sets above, so 401/403/408/429 never land here.
        if 400 <= status_code < 500:
            return "validation"

    # Tolerate a missing message (None) instead of crashing on ``.lower()``.
    lower_msg = (error_message or "").lower()
    if any(kw in lower_msg for kw in ("timeout", "timed out", "rate limit")):
        return "transient"
    if any(kw in lower_msg for kw in ("unauthorized", "forbidden")):
        return "auth"
    if any(kw in lower_msg for kw in ("invalid", "missing", "bad request")):
        return "validation"
    return "unknown"
def is_retryable(category: MCP_ERROR_CATEGORY) -> bool:
    """Tell whether errors of ``category`` may be retried.

    Only the ``"transient"`` category qualifies; every other category is
    reported as not retryable.
    """
    may_retry = category == "transient"
    return may_retry
def max_retries() -> int:
    """Report the retry ceiling that applies to transient errors."""
    retry_limit: int = _MAX_RETRIES
    return retry_limit