feat: complete phase 5 -- error hardening, frontend, Docker, demo, docs

Backend:
- ConversationTracker: Protocol + PostgresConversationTracker for lifecycle tracking
- Error handler: ErrorCategory enum, classify_error(), with_retry() exponential backoff
- Wire PostgresAnalyticsRecorder + ConversationTracker into ws_handler
- Rate limiting (10 msg/10s per thread), edge case hardening
- Health endpoint GET /api/health, version 0.5.0
- Demo seed data script + sample OpenAPI spec

Frontend (all new):
- React Router with NavBar (Chat / Replay / Dashboard / Review)
- ReplayListPage + ReplayPage with ReplayTimeline component
- DashboardPage with MetricCard, range selector, zero-state
- ReviewPage for OpenAPI classification review
- ErrorBanner for WebSocket disconnect handling
- API client (api.ts) with typed fetch wrappers

Infrastructure:
- Frontend Dockerfile (multi-stage node -> nginx)
- nginx.conf with SPA routing + API/WS proxy
- docker-compose.yml with frontend service + healthchecks
- .env.example files (root + backend)

Documentation:
- README.md with quick start and architecture
- Agent configuration guide
- OpenAPI import guide
- Deployment guide
- Demo script

48 new tests, 449 total passing, 92.87% coverage
2026-03-31 21:20:06 +02:00
parent 38644594d2
commit 0e78e5b06b
44 changed files with 3397 additions and 169 deletions
--- a/backend/app/ws_handler.py
+++ b/backend/app/ws_handler.py
@@ -5,6 +5,8 @@ from __future__ import annotations
 import json
 import logging
 import re
+import time
+from collections import defaultdict
 from typing import TYPE_CHECKING, Any

 from langchain_core.messages import HumanMessage
@@ -16,16 +18,23 @@ if TYPE_CHECKING:
    from fastapi import WebSocket
    from langgraph.graph.state import CompiledStateGraph

+    from app.analytics.event_recorder import AnalyticsRecorder
    from app.callbacks import TokenUsageCallbackHandler
+    from app.conversation_tracker import ConversationTrackerProtocol
    from app.interrupt_manager import InterruptManager
    from app.session_manager import SessionManager

 logger = logging.getLogger(__name__)

 MAX_MESSAGE_SIZE = 32_768  # 32 KB
-MAX_CONTENT_LENGTH = 8_000  # characters
+MAX_CONTENT_LENGTH = 10_000  # characters
 THREAD_ID_PATTERN = re.compile(r"^[a-zA-Z0-9\-_]{1,128}$")

+# Rate limiting: max 10 messages per 10-second window, per thread
+_RATE_LIMIT_MAX = 10
+_RATE_LIMIT_WINDOW = 10.0
+_thread_timestamps: dict[str, list[float]] = defaultdict(list)
+

 async def handle_user_message(
    ws: WebSocket,
@@ -197,6 +206,9 @@ async def dispatch_message(
    callback_handler: TokenUsageCallbackHandler,
    raw_data: str,
    interrupt_manager: InterruptManager | None = None,
+    analytics_recorder: AnalyticsRecorder | None = None,
+    conversation_tracker: ConversationTrackerProtocol | None = None,
+    pool: Any = None,
 ) -> None:
    """Parse and route an incoming WebSocket message."""
    if len(raw_data) > MAX_MESSAGE_SIZE:
@@ -205,10 +217,14 @@ async def dispatch_message(

    try:
        data = json.loads(raw_data)
-    except json.JSONDecodeError:
+    except (json.JSONDecodeError, ValueError):
        await _send_json(ws, {"type": "error", "message": "Invalid JSON"})
        return

+    if not isinstance(data, dict):
+        await _send_json(ws, {"type": "error", "message": "Invalid JSON: expected object"})
+        return
+
    msg_type = data.get("type")
    thread_id = data.get("thread_id", "")

@@ -222,16 +238,36 @@ async def dispatch_message(

    if msg_type == "message":
        content = data.get("content", "")
-        if not content:
+        if not content or not content.strip():
            await _send_json(ws, {"type": "error", "message": "Missing message content"})
            return
        if len(content) > MAX_CONTENT_LENGTH:
            await _send_json(ws, {"type": "error", "message": "Message content too long"})
            return
+
+        # Rate limiting check
+        now = time.time()
+        timestamps = _thread_timestamps[thread_id]
+        cutoff = now - _RATE_LIMIT_WINDOW
+        _thread_timestamps[thread_id] = [t for t in timestamps if t >= cutoff]
+        if len(_thread_timestamps[thread_id]) >= _RATE_LIMIT_MAX:
+            await _send_json(ws, {"type": "error", "message": "Rate limit exceeded"})
+            return
+        _thread_timestamps[thread_id].append(now)
+
        await handle_user_message(
            ws, graph, session_manager, callback_handler, thread_id, content,
            interrupt_manager=interrupt_manager,
        )
+        await _fire_and_forget_tracking(
+            thread_id=thread_id,
+            pool=pool,
+            analytics_recorder=analytics_recorder,
+            conversation_tracker=conversation_tracker,
+            agent_name=None,
+            tokens=0,
+            cost=0.0,
+        )

    elif msg_type == "interrupt_response":
        approved = data.get("approved", False)
@@ -244,6 +280,36 @@ async def dispatch_message(
        await _send_json(ws, {"type": "error", "message": "Unknown message type"})


+async def _fire_and_forget_tracking(
+    thread_id: str,
+    pool: Any,
+    analytics_recorder: Any | None,
+    conversation_tracker: Any | None,
+    agent_name: str | None,
+    tokens: int,
+    cost: float,
+) -> None:
+    """Best-effort tracking; awaited inline (not detached), errors are logged, never raised."""
+    try:  # step 1: conversation lifecycle; guarded separately from the analytics step
+        if conversation_tracker is not None and pool is not None:  # skipped unless both are set
+            await conversation_tracker.ensure_conversation(pool, thread_id)
+            await conversation_tracker.record_turn(pool, thread_id, agent_name, tokens, cost)
+    except Exception:  # deliberately broad: a tracker failure must not reach the caller
+        logger.exception("Conversation tracker error for thread %s (suppressed)", thread_id)
+
+    try:  # step 2: analytics event; still runs when step 1 failed or was skipped
+        if analytics_recorder is not None:  # note: no pool is passed to the recorder here
+            await analytics_recorder.record(
+                thread_id=thread_id,
+                event_type="message",
+                agent_name=agent_name,
+                tokens_used=tokens,
+                cost_usd=cost,
+            )
+    except Exception:  # deliberately broad: an analytics failure must not reach the caller
+        logger.exception("Analytics recorder error for thread %s (suppressed)", thread_id)
+
 def _has_interrupt(state: Any) -> bool:
    """Check if the graph state has a pending interrupt."""
    tasks = getattr(state, "tasks", ())