refactor: engineering improvements -- API versioning, structured logging, Alembic, error standardization, test coverage

- API versioning: all REST endpoints prefixed with /api/v1/
- Structured logging: replaced stdlib logging with structlog (console/JSON modes)
- Alembic migrations: versioned DB schema with initial migration
- Error standardization: global exception handlers for consistent envelope format
- Interrupt cleanup: asyncio background task for expired interrupt removal
- Integration tests: +30 tests (analytics, replay, openapi, error, session APIs)
- Frontend tests: +57 tests (all components, pages, useWebSocket hook)
- Backend: 557 tests, 89.75% coverage | Frontend: 80 tests, 16 test files
This commit is contained in:
Yaojia Wang
2026-04-06 23:19:29 +02:00
parent af53111928
commit f0699436c5
59 changed files with 2846 additions and 149 deletions

View File

@@ -16,7 +16,7 @@ if TYPE_CHECKING:
from psycopg_pool import AsyncConnectionPool
router = APIRouter(
prefix="/api/analytics",
prefix="/api/v1/analytics",
tags=["analytics"],
dependencies=[Depends(require_admin_api_key)],
)

View File

@@ -2,14 +2,14 @@
from __future__ import annotations
import logging
import secrets
from typing import Annotated
import structlog
from fastapi import Depends, HTTPException, Query, Request, WebSocket, status
from fastapi.security import APIKeyHeader
logger = logging.getLogger(__name__)
logger = structlog.get_logger()
_API_KEY_HEADER = APIKeyHeader(name="X-API-Key", auto_error=False)

View File

@@ -32,6 +32,8 @@ class Settings(BaseSettings):
template_name: str = ""
log_format: str = "console" # "console" for dev, "json" for production
admin_api_key: str = ""
anthropic_api_key: str = ""

View File

@@ -2,6 +2,7 @@
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING
from langgraph.checkpoint.postgres.aio import AsyncPostgresSaver
@@ -88,6 +89,17 @@ async def create_checkpointer(pool: AsyncConnectionPool) -> AsyncPostgresSaver:
return checkpointer
def run_alembic_migrations(database_url: str) -> None:
    """Apply all pending Alembic migrations up to the latest revision ("head").

    Args:
        database_url: SQLAlchemy-style connection URL that the migration
            scripts should run against; injected into the Alembic config so
            alembic.ini does not need a hard-coded URL.
    """
    # Imported lazily so Alembic is only required when migrations actually run.
    from alembic import command
    from alembic.config import Config

    ini_path = Path(__file__).parent.parent / "alembic.ini"
    cfg = Config(str(ini_path))
    cfg.set_main_option("sqlalchemy.url", database_url)
    command.upgrade(cfg, "head")
async def setup_app_tables(pool: AsyncConnectionPool) -> None:
"""Create application-specific tables and apply migrations."""
async with pool.connection() as conn:

View File

@@ -3,14 +3,14 @@
from __future__ import annotations
import asyncio
import logging
from dataclasses import dataclass
from typing import Protocol
import httpx
import structlog
from pydantic import BaseModel
logger = logging.getLogger(__name__)
logger = structlog.get_logger()
class EscalationPayload(BaseModel, frozen=True):

View File

@@ -2,7 +2,6 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING
from langchain.agents import create_agent
@@ -18,7 +17,9 @@ if TYPE_CHECKING:
from app.intent import IntentClassifier
from app.registry import AgentRegistry
logger = logging.getLogger(__name__)
import structlog
logger = structlog.get_logger()
SUPERVISOR_PROMPT = (
"You are a customer support supervisor. "

View File

@@ -2,7 +2,6 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Protocol
from pydantic import BaseModel
@@ -12,7 +11,9 @@ if TYPE_CHECKING:
from app.registry import AgentConfig
logger = logging.getLogger(__name__)
import structlog
logger = structlog.get_logger()
CLASSIFICATION_PROMPT = (
"You are an intent classifier for a customer support system.\n"

View File

@@ -0,0 +1,57 @@
"""Structured logging configuration using structlog."""
from __future__ import annotations
import logging
import sys
import structlog
def configure_logging(log_format: str = "console") -> None:
    """Configure structlog with stdlib-logging integration.

    All output — from structlog-native loggers AND from libraries that use
    plain stdlib ``logging`` (uvicorn, httpx, ...) — is funneled through one
    root handler on stdout and rendered by the same renderer.

    Args:
        log_format: "console" for human-readable dev output,
            "json" for machine-parseable production output.
    """
    # Processors shared between structlog-native events and "foreign" stdlib
    # records. filter_by_level is deliberately NOT in this list: in the
    # foreign_pre_chain ProcessorFormatter invokes processors with
    # ``logger=None``, which filter_by_level cannot handle.
    pre_chain: list[structlog.types.Processor] = [
        structlog.contextvars.merge_contextvars,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.processors.UnicodeDecoder(),
    ]

    if log_format == "json":
        renderer: structlog.types.Processor = structlog.processors.JSONRenderer()
    else:
        renderer = structlog.dev.ConsoleRenderer()

    structlog.configure(
        processors=[
            # Drop events below the stdlib logger's level as early as possible.
            structlog.stdlib.filter_by_level,
            *pre_chain,
            structlog.stdlib.ProcessorFormatter.wrap_for_formatter,
        ],
        logger_factory=structlog.stdlib.LoggerFactory(),
        wrapper_class=structlog.stdlib.BoundLogger,
        cache_logger_on_first_use=True,
    )

    formatter = structlog.stdlib.ProcessorFormatter(
        # Fix: without foreign_pre_chain, records emitted via plain stdlib
        # logging would skip the shared processors and be rendered without
        # timestamp, level, or logger name.
        foreign_pre_chain=pre_chain,
        processors=[
            structlog.stdlib.ProcessorFormatter.remove_processors_meta,
            renderer,
        ],
    )

    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(formatter)
    root_logger = logging.getLogger()
    # Replace (don't stack) any pre-existing handlers so logs aren't duplicated
    # when configure_logging runs more than once (e.g. app reload).
    root_logger.handlers.clear()
    root_logger.addHandler(handler)
    root_logger.setLevel(logging.INFO)

View File

@@ -2,25 +2,30 @@
from __future__ import annotations
import logging
import asyncio
import contextlib
from contextlib import asynccontextmanager
from pathlib import Path
from typing import TYPE_CHECKING
from fastapi import Depends, FastAPI, Query, WebSocket, WebSocketDisconnect
from fastapi import FastAPI, HTTPException, Query, WebSocket, WebSocketDisconnect
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
from app.analytics.api import router as analytics_router
from app.analytics.event_recorder import PostgresAnalyticsRecorder
from app.api_utils import envelope
from app.callbacks import TokenUsageCallbackHandler
from app.config import Settings
from app.conversation_tracker import PostgresConversationTracker
from app.db import create_checkpointer, create_pool, setup_app_tables
from app.db import create_checkpointer, create_pool, run_alembic_migrations
from app.escalation import NoOpEscalator, WebhookEscalator
from app.graph import build_graph
from app.intent import LLMIntentClassifier
from app.interrupt_manager import InterruptManager
from app.llm import create_llm
from app.logging_config import configure_logging
from app.openapi.review_api import router as openapi_router
from app.registry import AgentRegistry
from app.replay.api import router as replay_router
@@ -31,19 +36,44 @@ from app.ws_handler import dispatch_message
if TYPE_CHECKING:
from collections.abc import AsyncGenerator
logger = logging.getLogger(__name__)
import structlog
logger = structlog.get_logger()
AGENTS_YAML = Path(__file__).parent.parent / "agents.yaml"
FRONTEND_DIST = Path(__file__).parent.parent.parent / "frontend" / "dist"
async def _interrupt_cleanup_loop(
interrupt_manager: InterruptManager,
interval: int = 60,
) -> None:
"""Periodically remove expired interrupts in the background.
Runs until cancelled. Catches all exceptions to prevent the task
from dying unexpectedly.
"""
while True:
await asyncio.sleep(interval)
try:
expired = interrupt_manager.cleanup_expired()
if expired:
logger.info(
"Cleaned up %d expired interrupt(s)",
len(expired),
)
except Exception:
logger.exception("Error during interrupt cleanup")
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
settings = Settings()
configure_logging(settings.log_format)
pool = await create_pool(settings)
checkpointer = await create_checkpointer(pool)
await setup_app_tables(pool)
run_alembic_migrations(settings.database_url)
# Load agents from template or default YAML
if settings.template_name:
@@ -89,8 +119,16 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
settings.template_name or "(default)",
)
cleanup_task = asyncio.create_task(
_interrupt_cleanup_loop(interrupt_manager),
)
yield
cleanup_task.cancel()
with contextlib.suppress(asyncio.CancelledError):
await cleanup_task
await pool.close()
@@ -103,7 +141,35 @@ app.include_router(replay_router)
app.include_router(analytics_router)
@app.get("/api/health")
@app.exception_handler(HTTPException)
async def http_exception_handler(request, exc):  # type: ignore[no-untyped-def]
    """Convert HTTPException into the project's standard response envelope.

    Preserves the exception's status code; its ``detail`` becomes the
    envelope's error field.
    """
    body = envelope(None, success=False, error=exc.detail)
    return JSONResponse(status_code=exc.status_code, content=body)
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request, exc):  # type: ignore[no-untyped-def]
    """Wrap request-validation failures in the standard envelope (HTTP 422).

    The stringified exception is used as the error detail so the response
    stays JSON-serializable regardless of the validation context.
    """
    body = envelope(None, success=False, error=str(exc))
    return JSONResponse(status_code=422, content=body)
@app.exception_handler(Exception)
async def general_exception_handler(request, exc):  # type: ignore[no-untyped-def]
    """Last-resort handler: log the full traceback, return an opaque 500.

    Clients only ever see a generic message — stack traces and internal
    details never leak into the response body.
    """
    logger.exception("Unhandled exception: %s", exc)
    body = envelope(None, success=False, error="Internal server error")
    return JSONResponse(status_code=500, content=body)
@app.get("/api/v1/health")
def health_check() -> dict:
    """Liveness probe for load balancers and monitoring systems."""
    payload = {"status": "ok", "version": _VERSION}
    return payload

View File

@@ -8,13 +8,14 @@ classifier and an LLM-backed classifier with heuristic fallback.
from __future__ import annotations
import json
import logging
import re
from typing import Protocol
import structlog
from app.openapi.models import ClassificationResult, EndpointInfo
logger = logging.getLogger(__name__)
logger = structlog.get_logger()
_WRITE_METHODS = frozenset({"POST", "PUT", "PATCH", "DELETE"})
_INTERRUPT_METHODS = frozenset({"POST", "PUT", "PATCH", "DELETE"})

View File

@@ -6,10 +6,11 @@ Each stage updates the job status and calls the on_progress callback.
from __future__ import annotations
import logging
from collections.abc import Callable
from dataclasses import replace
import structlog
from app.openapi.classifier import ClassifierProtocol, HeuristicClassifier
from app.openapi.fetcher import fetch_spec
from app.openapi.models import ImportJob
@@ -17,7 +18,7 @@ from app.openapi.parser import parse_endpoints
from app.openapi.ssrf import DEFAULT_POLICY, SSRFPolicy
from app.openapi.validator import validate_spec
logger = logging.getLogger(__name__)
logger = structlog.get_logger()
ProgressCallback = Callable[[str, ImportJob], None] | None

View File

@@ -10,11 +10,11 @@ Exposes endpoints for:
from __future__ import annotations
import asyncio
import logging
import re
import uuid
from typing import Literal
import structlog
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
from pydantic import BaseModel, field_validator
@@ -23,10 +23,10 @@ from app.openapi.generator import generate_agent_yaml, generate_tool_code
from app.openapi.importer import ImportOrchestrator
from app.openapi.models import ClassificationResult, ImportJob
logger = logging.getLogger(__name__)
logger = structlog.get_logger()
router = APIRouter(
prefix="/api/openapi",
prefix="/api/v1/openapi",
tags=["openapi"],
dependencies=[Depends(require_admin_api_key)],
)

View File

@@ -16,7 +16,7 @@ if TYPE_CHECKING:
from psycopg_pool import AsyncConnectionPool
router = APIRouter(
prefix="/api",
prefix="/api/v1",
tags=["replay"],
dependencies=[Depends(require_admin_api_key)],
)

View File

@@ -2,11 +2,11 @@
from __future__ import annotations
import logging
import structlog
from app.replay.models import ReplayStep, StepType
logger = logging.getLogger(__name__)
logger = structlog.get_logger()
_EMPTY_TIMESTAMP = "1970-01-01T00:00:00Z"

View File

@@ -3,7 +3,6 @@
from __future__ import annotations
import json
import logging
import re
import time
from collections import defaultdict
@@ -21,7 +20,9 @@ if TYPE_CHECKING:
from app.session_manager import SessionManager
from app.ws_context import WebSocketContext
logger = logging.getLogger(__name__)
import structlog
logger = structlog.get_logger()
MAX_MESSAGE_SIZE = 32_768 # 32 KB
MAX_CONTENT_LENGTH = 10_000 # characters