feat: complete phase 5 -- error hardening, frontend, Docker, demo, docs
Backend:
- ConversationTracker: Protocol + PostgresConversationTracker for lifecycle tracking
- Error handler: ErrorCategory enum, classify_error(), with_retry() exponential backoff
- Wire PostgresAnalyticsRecorder + ConversationTracker into ws_handler
- Rate limiting (10 msg/10s per thread), edge case hardening
- Health endpoint GET /api/health, version 0.5.0
- Demo seed data script + sample OpenAPI spec

Frontend (all new):
- React Router with NavBar (Chat / Replay / Dashboard / Review)
- ReplayListPage + ReplayPage with ReplayTimeline component
- DashboardPage with MetricCard, range selector, zero-state
- ReviewPage for OpenAPI classification review
- ErrorBanner for WebSocket disconnect handling
- API client (api.ts) with typed fetch wrappers

Infrastructure:
- Frontend Dockerfile (multi-stage node -> nginx)
- nginx.conf with SPA routing + API/WS proxy
- docker-compose.yml with frontend service + healthchecks
- .env.example files (root + backend)

Documentation:
- README.md with quick start and architecture
- Agent configuration guide
- OpenAPI import guide
- Deployment guide
- Demo script

48 new tests, 449 total passing, 92.87% coverage
This commit is contained in:
72
backend/app/tools/error_handler.py
Normal file
72
backend/app/tools/error_handler.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""Error classification and retry logic for tool calls."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from enum import Enum
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
class ErrorCategory(Enum):
    """Buckets a tool-call failure can be sorted into.

    The bucket drives the retry decision: ``with_retry`` in this module
    re-attempts a call only when the failure is RETRYABLE.
    """

    # HTTP status errors 429/500/502/503 (see classify_error).
    RETRYABLE = "retryable"
    # Fallback for every exception no other rule matches.
    NON_RETRYABLE = "non_retryable"
    # HTTP status errors 401/403.
    AUTH_FAILURE = "auth_failure"
    # httpx.TimeoutException.
    TIMEOUT = "timeout"
    # httpx.ConnectError.
    NETWORK = "network"
|
||||
|
||||
|
||||
def classify_error(exc: Exception) -> ErrorCategory:
    """Map an exception onto the ErrorCategory that describes it.

    Checks are applied in this order and the first match wins:

    1. ``httpx.TimeoutException``                 -> TIMEOUT
    2. ``httpx.ConnectError``                     -> NETWORK
    3. ``httpx.HTTPStatusError`` with 401 or 403  -> AUTH_FAILURE
    4. ``httpx.HTTPStatusError`` with 429, 500,
       502 or 503                                 -> RETRYABLE
    5. everything else, including any other
       HTTP status                                -> NON_RETRYABLE

    Args:
        exc: The exception raised by a tool call.

    Returns:
        The category matching ``exc`` according to the table above.
    """
    if isinstance(exc, httpx.TimeoutException):
        return ErrorCategory.TIMEOUT
    if isinstance(exc, httpx.ConnectError):
        return ErrorCategory.NETWORK
    if not isinstance(exc, httpx.HTTPStatusError):
        # Not an httpx error this module knows about.
        return ErrorCategory.NON_RETRYABLE

    # Only status codes listed here get special treatment; any other
    # status falls through to NON_RETRYABLE.
    category_by_status = {
        401: ErrorCategory.AUTH_FAILURE,
        403: ErrorCategory.AUTH_FAILURE,
        429: ErrorCategory.RETRYABLE,
        500: ErrorCategory.RETRYABLE,
        502: ErrorCategory.RETRYABLE,
        503: ErrorCategory.RETRYABLE,
    }
    status = exc.response.status_code
    return category_by_status.get(status, ErrorCategory.NON_RETRYABLE)
|
||||
|
||||
|
||||
async def with_retry(
    fn: Callable[..., Any],
    max_retries: int = 3,
    base_delay: float = 1.0,
) -> Any:
    """Execute an async callable with exponential backoff for RETRYABLE errors.

    Only ErrorCategory.RETRYABLE errors trigger retries. All other error
    categories raise immediately after the first attempt.

    Args:
        fn: Callable invoked with no arguments on every attempt; its result
            is awaited.
        max_retries: Total number of attempts, counting the first call (so
            the default of 3 means at most two re-attempts). Must be >= 1.
        base_delay: Seconds to sleep after the first failed attempt. The
            wait doubles after each further failure
            (base_delay, 2 * base_delay, 4 * base_delay, ...).

    Returns:
        Whatever ``await fn()`` produces on the first successful attempt.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1. Without this check
            no attempt would be made and there would be no exception to
            re-raise.
        Exception: The error raised by ``fn`` -- immediately when it is not
            classified as RETRYABLE, or the error from the final attempt
            once all attempts are used up.
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries!r}")

    for attempt in range(1, max_retries + 1):
        try:
            return await fn()
        except Exception as exc:
            # Propagate unchanged when the failure is not retryable or when
            # this was the last permitted attempt.
            if classify_error(exc) != ErrorCategory.RETRYABLE or attempt == max_retries:
                raise
        # Sleep outside the except block so the handled exception is not
        # kept alive while waiting. Delay doubles per attempt: 1x, 2x, 4x...
        await asyncio.sleep(base_delay * (2 ** (attempt - 1)))

    # The loop runs at least once and every iteration returns, raises, or
    # continues to a later iteration; the last one cannot continue.
    raise AssertionError("unreachable: retry loop exited without a result")
|
||||
Reference in New Issue
Block a user