refactor: engineering improvements -- API versioning, structured logging, Alembic, error standardization, test coverage

- API versioning: all REST endpoints prefixed with /api/v1/
- Structured logging: replaced stdlib logging with structlog (console/JSON modes)
- Alembic migrations: versioned DB schema with initial migration
- Error standardization: global exception handlers for consistent envelope format
- Interrupt cleanup: asyncio background task for expired interrupt removal
- Integration tests: +30 tests (analytics, replay, openapi, error, session APIs)
- Frontend tests: +57 tests (all components, pages, useWebSocket hook)
- Backend: 557 tests, 89.75% coverage | Frontend: 80 tests, 16 test files
This commit is contained in:
Yaojia Wang
2026-04-06 23:19:29 +02:00
parent af53111928
commit f0699436c5
59 changed files with 2846 additions and 149 deletions

View File

@@ -16,7 +16,7 @@ if TYPE_CHECKING:
from psycopg_pool import AsyncConnectionPool
router = APIRouter(
prefix="/api/analytics",
prefix="/api/v1/analytics",
tags=["analytics"],
dependencies=[Depends(require_admin_api_key)],
)

View File

@@ -2,14 +2,14 @@
from __future__ import annotations
import logging
import secrets
from typing import Annotated
import structlog
from fastapi import Depends, HTTPException, Query, Request, WebSocket, status
from fastapi.security import APIKeyHeader
logger = logging.getLogger(__name__)
logger = structlog.get_logger()
_API_KEY_HEADER = APIKeyHeader(name="X-API-Key", auto_error=False)

View File

@@ -32,6 +32,8 @@ class Settings(BaseSettings):
template_name: str = ""
log_format: str = "console" # "console" for dev, "json" for production
admin_api_key: str = ""
anthropic_api_key: str = ""

View File

@@ -2,6 +2,7 @@
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING
from langgraph.checkpoint.postgres.aio import AsyncPostgresSaver
@@ -88,6 +89,17 @@ async def create_checkpointer(pool: AsyncConnectionPool) -> AsyncPostgresSaver:
return checkpointer
def run_alembic_migrations(database_url: str) -> None:
    """Apply all pending Alembic migrations up to the latest revision ("head").

    Args:
        database_url: SQLAlchemy-style connection URL that the migration
            scripts should run against; injected into the Alembic config so
            alembic.ini does not need a hard-coded URL.
    """
    # Imported lazily so Alembic is only required when migrations actually run.
    from alembic import command
    from alembic.config import Config

    ini_path = Path(__file__).parent.parent / "alembic.ini"
    cfg = Config(str(ini_path))
    cfg.set_main_option("sqlalchemy.url", database_url)
    command.upgrade(cfg, "head")
async def setup_app_tables(pool: AsyncConnectionPool) -> None:
"""Create application-specific tables and apply migrations."""
async with pool.connection() as conn:

View File

@@ -3,14 +3,14 @@
from __future__ import annotations
import asyncio
import logging
from dataclasses import dataclass
from typing import Protocol
import httpx
import structlog
from pydantic import BaseModel
logger = logging.getLogger(__name__)
logger = structlog.get_logger()
class EscalationPayload(BaseModel, frozen=True):

View File

@@ -2,7 +2,6 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING
from langchain.agents import create_agent
@@ -18,7 +17,9 @@ if TYPE_CHECKING:
from app.intent import IntentClassifier
from app.registry import AgentRegistry
logger = logging.getLogger(__name__)
import structlog
logger = structlog.get_logger()
SUPERVISOR_PROMPT = (
"You are a customer support supervisor. "

View File

@@ -2,7 +2,6 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Protocol
from pydantic import BaseModel
@@ -12,7 +11,9 @@ if TYPE_CHECKING:
from app.registry import AgentConfig
logger = logging.getLogger(__name__)
import structlog
logger = structlog.get_logger()
CLASSIFICATION_PROMPT = (
"You are an intent classifier for a customer support system.\n"

View File

@@ -0,0 +1,57 @@
"""Structured logging configuration using structlog."""
from __future__ import annotations
import logging
import sys
import structlog
def configure_logging(log_format: str = "console") -> None:
    """Configure structlog with stdlib-logging integration.

    All output — from structlog-native loggers AND from libraries that use
    plain stdlib ``logging`` (uvicorn, httpx, ...) — is funneled through one
    root handler on stdout and rendered by the same renderer.

    Args:
        log_format: "console" for human-readable dev output,
            "json" for machine-parseable production output.
    """
    # Processors shared between structlog-native events and "foreign" stdlib
    # records. filter_by_level is deliberately NOT in this list: in the
    # foreign_pre_chain ProcessorFormatter invokes processors with
    # ``logger=None``, which filter_by_level cannot handle.
    pre_chain: list[structlog.types.Processor] = [
        structlog.contextvars.merge_contextvars,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.processors.UnicodeDecoder(),
    ]

    if log_format == "json":
        renderer: structlog.types.Processor = structlog.processors.JSONRenderer()
    else:
        renderer = structlog.dev.ConsoleRenderer()

    structlog.configure(
        processors=[
            # Drop events below the stdlib logger's level as early as possible.
            structlog.stdlib.filter_by_level,
            *pre_chain,
            structlog.stdlib.ProcessorFormatter.wrap_for_formatter,
        ],
        logger_factory=structlog.stdlib.LoggerFactory(),
        wrapper_class=structlog.stdlib.BoundLogger,
        cache_logger_on_first_use=True,
    )

    formatter = structlog.stdlib.ProcessorFormatter(
        # Fix: without foreign_pre_chain, records emitted via plain stdlib
        # logging would skip the shared processors and be rendered without
        # timestamp, level, or logger name.
        foreign_pre_chain=pre_chain,
        processors=[
            structlog.stdlib.ProcessorFormatter.remove_processors_meta,
            renderer,
        ],
    )

    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(formatter)
    root_logger = logging.getLogger()
    # Replace (don't stack) any pre-existing handlers so logs aren't duplicated
    # when configure_logging runs more than once (e.g. app reload).
    root_logger.handlers.clear()
    root_logger.addHandler(handler)
    root_logger.setLevel(logging.INFO)

View File

@@ -2,25 +2,30 @@
from __future__ import annotations
import logging
import asyncio
import contextlib
from contextlib import asynccontextmanager
from pathlib import Path
from typing import TYPE_CHECKING
from fastapi import Depends, FastAPI, Query, WebSocket, WebSocketDisconnect
from fastapi import FastAPI, HTTPException, Query, WebSocket, WebSocketDisconnect
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
from app.analytics.api import router as analytics_router
from app.analytics.event_recorder import PostgresAnalyticsRecorder
from app.api_utils import envelope
from app.callbacks import TokenUsageCallbackHandler
from app.config import Settings
from app.conversation_tracker import PostgresConversationTracker
from app.db import create_checkpointer, create_pool, setup_app_tables
from app.db import create_checkpointer, create_pool, run_alembic_migrations
from app.escalation import NoOpEscalator, WebhookEscalator
from app.graph import build_graph
from app.intent import LLMIntentClassifier
from app.interrupt_manager import InterruptManager
from app.llm import create_llm
from app.logging_config import configure_logging
from app.openapi.review_api import router as openapi_router
from app.registry import AgentRegistry
from app.replay.api import router as replay_router
@@ -31,19 +36,44 @@ from app.ws_handler import dispatch_message
if TYPE_CHECKING:
from collections.abc import AsyncGenerator
logger = logging.getLogger(__name__)
import structlog
logger = structlog.get_logger()
AGENTS_YAML = Path(__file__).parent.parent / "agents.yaml"
FRONTEND_DIST = Path(__file__).parent.parent.parent / "frontend" / "dist"
async def _interrupt_cleanup_loop(
interrupt_manager: InterruptManager,
interval: int = 60,
) -> None:
"""Periodically remove expired interrupts in the background.
Runs until cancelled. Catches all exceptions to prevent the task
from dying unexpectedly.
"""
while True:
await asyncio.sleep(interval)
try:
expired = interrupt_manager.cleanup_expired()
if expired:
logger.info(
"Cleaned up %d expired interrupt(s)",
len(expired),
)
except Exception:
logger.exception("Error during interrupt cleanup")
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
settings = Settings()
configure_logging(settings.log_format)
pool = await create_pool(settings)
checkpointer = await create_checkpointer(pool)
await setup_app_tables(pool)
run_alembic_migrations(settings.database_url)
# Load agents from template or default YAML
if settings.template_name:
@@ -89,8 +119,16 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
settings.template_name or "(default)",
)
cleanup_task = asyncio.create_task(
_interrupt_cleanup_loop(interrupt_manager),
)
yield
cleanup_task.cancel()
with contextlib.suppress(asyncio.CancelledError):
await cleanup_task
await pool.close()
@@ -103,7 +141,35 @@ app.include_router(replay_router)
app.include_router(analytics_router)
@app.get("/api/health")
@app.exception_handler(HTTPException)
async def http_exception_handler(request, exc):  # type: ignore[no-untyped-def]
    """Convert HTTPException into the project's standard response envelope.

    Preserves the exception's status code; its ``detail`` becomes the
    envelope's error field.
    """
    body = envelope(None, success=False, error=exc.detail)
    return JSONResponse(status_code=exc.status_code, content=body)
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request, exc):  # type: ignore[no-untyped-def]
    """Wrap request-validation failures in the standard envelope (HTTP 422).

    The stringified exception is used as the error detail so the response
    stays JSON-serializable regardless of the validation context.
    """
    body = envelope(None, success=False, error=str(exc))
    return JSONResponse(status_code=422, content=body)
@app.exception_handler(Exception)
async def general_exception_handler(request, exc):  # type: ignore[no-untyped-def]
    """Last-resort handler: log the full traceback, return an opaque 500.

    Clients only ever see a generic message — stack traces and internal
    details never leak into the response body.
    """
    logger.exception("Unhandled exception: %s", exc)
    body = envelope(None, success=False, error="Internal server error")
    return JSONResponse(status_code=500, content=body)
@app.get("/api/v1/health")
def health_check() -> dict:
    """Liveness probe for load balancers and monitoring systems."""
    payload = {"status": "ok", "version": _VERSION}
    return payload

View File

@@ -8,13 +8,14 @@ classifier and an LLM-backed classifier with heuristic fallback.
from __future__ import annotations
import json
import logging
import re
from typing import Protocol
import structlog
from app.openapi.models import ClassificationResult, EndpointInfo
logger = logging.getLogger(__name__)
logger = structlog.get_logger()
_WRITE_METHODS = frozenset({"POST", "PUT", "PATCH", "DELETE"})
_INTERRUPT_METHODS = frozenset({"POST", "PUT", "PATCH", "DELETE"})

View File

@@ -6,10 +6,11 @@ Each stage updates the job status and calls the on_progress callback.
from __future__ import annotations
import logging
from collections.abc import Callable
from dataclasses import replace
import structlog
from app.openapi.classifier import ClassifierProtocol, HeuristicClassifier
from app.openapi.fetcher import fetch_spec
from app.openapi.models import ImportJob
@@ -17,7 +18,7 @@ from app.openapi.parser import parse_endpoints
from app.openapi.ssrf import DEFAULT_POLICY, SSRFPolicy
from app.openapi.validator import validate_spec
logger = logging.getLogger(__name__)
logger = structlog.get_logger()
ProgressCallback = Callable[[str, ImportJob], None] | None

View File

@@ -10,11 +10,11 @@ Exposes endpoints for:
from __future__ import annotations
import asyncio
import logging
import re
import uuid
from typing import Literal
import structlog
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
from pydantic import BaseModel, field_validator
@@ -23,10 +23,10 @@ from app.openapi.generator import generate_agent_yaml, generate_tool_code
from app.openapi.importer import ImportOrchestrator
from app.openapi.models import ClassificationResult, ImportJob
logger = logging.getLogger(__name__)
logger = structlog.get_logger()
router = APIRouter(
prefix="/api/openapi",
prefix="/api/v1/openapi",
tags=["openapi"],
dependencies=[Depends(require_admin_api_key)],
)

View File

@@ -16,7 +16,7 @@ if TYPE_CHECKING:
from psycopg_pool import AsyncConnectionPool
router = APIRouter(
prefix="/api",
prefix="/api/v1",
tags=["replay"],
dependencies=[Depends(require_admin_api_key)],
)

View File

@@ -2,11 +2,11 @@
from __future__ import annotations
import logging
import structlog
from app.replay.models import ReplayStep, StepType
logger = logging.getLogger(__name__)
logger = structlog.get_logger()
_EMPTY_TIMESTAMP = "1970-01-01T00:00:00Z"

View File

@@ -3,7 +3,6 @@
from __future__ import annotations
import json
import logging
import re
import time
from collections import defaultdict
@@ -21,7 +20,9 @@ if TYPE_CHECKING:
from app.session_manager import SessionManager
from app.ws_context import WebSocketContext
logger = logging.getLogger(__name__)
import structlog
logger = structlog.get_logger()
MAX_MESSAGE_SIZE = 32_768 # 32 KB
MAX_CONTENT_LENGTH = 10_000 # characters