feat: add t-SNE stock clustering and similarity search (TDD)

Adds two endpoints. POST /portfolio/cluster: t-SNE + KMeans clustering by return similarity; maps stocks to 2D coordinates with cluster labels. POST /portfolio/similar: finds the most/least similar stocks by return correlation against a target symbol. Implementation: sklearn TSNE (method=exact) + KMeans with auto n_clusters; jitter handling for the identical-returns edge case; 33 new tests (17 service unit + 16 route integration); all 503 tests passing.
2026-03-19 22:53:27 +01:00
parent 9ee3ec9b4e
commit 4915f1bae4
4 changed files with 759 additions and 1 deletions
--- a/portfolio_service.py
+++ b/portfolio_service.py
@@ -1,7 +1,8 @@
-"""Portfolio optimization: HRP, correlation matrix, risk parity."""
+"""Portfolio optimization: HRP, correlation matrix, risk parity, t-SNE clustering."""

 import asyncio
 import logging
+from math import isqrt
 from typing import Any

 import numpy as np
@@ -220,3 +221,152 @@ async def compute_risk_parity(
        "risk_contributions": risk_contributions,
        "method": "risk_parity",
    }
+
+
+def _auto_n_clusters(n: int) -> int:
+    """Pick a default cluster count: floor(sqrt(n)), but never fewer than 2."""
+    return max(isqrt(n), 2)
+
+
+def _run_tsne_kmeans(
+    returns_matrix: np.ndarray, n_clusters: int
+) -> tuple[np.ndarray, np.ndarray]:
+    """Embed a (n_symbols x n_days) returns matrix in 2D, then cluster it.
+
+    Applies t-SNE to the rows and KMeans to the embedded points. Returns
+    (coords, labels), coords having shape (n_symbols, 2). Blocking and
+    CPU-heavy: callers are expected to run it via asyncio.to_thread.
+    """
+    from sklearn.cluster import KMeans
+    from sklearn.manifold import TSNE
+
+    seed = 42  # fixed seed shared by the noise, t-SNE and KMeans
+    # Identical return series make the problem numerically singular, so
+    # perturb every entry with negligible Gaussian noise (std 1e-10).
+    noise = np.random.default_rng(seed).normal(0, 1e-10, returns_matrix.shape)
+    embedder = TSNE(
+        n_components=2,
+        perplexity=min(5, returns_matrix.shape[0] - 1),
+        random_state=seed,
+        method="exact",
+    )
+    coords = embedder.fit_transform(returns_matrix + noise)
+    clusterer = KMeans(n_clusters=n_clusters, random_state=seed, n_init="auto")
+    return coords, clusterer.fit_predict(coords)
+
+
+async def cluster_stocks(
+    symbols: list[str],
+    days: int = 180,
+    n_clusters: int | None = None,
+) -> dict[str, Any]:
+    """Cluster stocks by return similarity using t-SNE + KMeans.
+
+    Args:
+        symbols: Ticker symbols to cluster; at least 3 are required.
+        days: Number of historical trading days to use.
+        n_clusters: Number of clusters. Defaults to max(2, floor(sqrt(n)))
+            where n is the number of symbols that have return data.
+
+    Returns:
+        Dict with keys ``symbols``, ``coordinates``, ``clusters``,
+        ``method``, ``n_clusters``, and ``days``.
+
+    Raises:
+        ValueError: Fewer than 3 symbols, no usable price data, or an
+            ``n_clusters`` outside 1..n.
+    """
+    if len(symbols) < 3:
+        raise ValueError("cluster_stocks requires at least 3 symbols")
+
+    prices = await fetch_historical_prices(symbols, days=days)
+    if prices.empty:
+        raise ValueError("No price data available for the given symbols")
+
+    returns = _compute_returns(prices)
+    available = list(returns.columns)
+    n = len(available)
+    # Fail fast with a clear message rather than an opaque sklearn error.
+    if n < 2 or returns.empty:
+        raise ValueError("Not enough return data to cluster the given symbols")
+
+    k = n_clusters if n_clusters is not None else _auto_n_clusters(n)
+    if not 1 <= k <= n:  # KMeans cannot form more clusters than samples
+        raise ValueError(f"n_clusters must be between 1 and {n}, got {k}")
+
+    # (n_symbols x n_days) matrix; a missing return is treated as 0.
+    matrix = returns.T.fillna(0).values.astype(float)
+    coords, labels = await asyncio.to_thread(_run_tsne_kmeans, matrix, k)
+
+    coordinates: list[dict[str, Any]] = []
+    clusters: dict[str, list[str]] = {}
+    for sym, (x, y), label in zip(available, coords, labels):
+        cluster_id = int(label)
+        coordinates.append(
+            {"symbol": sym, "x": float(x), "y": float(y), "cluster": cluster_id}
+        )
+        clusters.setdefault(str(cluster_id), []).append(sym)
+
+    return {
+        "symbols": available,
+        "coordinates": coordinates,
+        "clusters": clusters,
+        "method": "t-SNE + KMeans",
+        "n_clusters": k,
+        "days": days,
+    }
+
+
+async def find_similar_stocks(
+    symbol: str,
+    universe: list[str],
+    days: int = 180,
+    top_n: int = 5,
+) -> dict[str, Any]:
+    """Find stocks most/least similar to a target by return correlation.
+
+    Args:
+        symbol: Target ticker symbol.
+        universe: Candidate symbols; the target and duplicates are skipped.
+        days: Number of historical trading days to use.
+        top_n: Size of each result list; negative values are treated as 0.
+
+    Returns:
+        Dict with keys ``symbol``, ``most_similar``, ``least_similar``.
+
+    Raises:
+        ValueError: No price data available, or target symbol missing from data.
+    """
+    # De-duplicate (order-preserving) so no peer is fetched or ranked twice.
+    candidates = [s for s in dict.fromkeys(universe) if s != symbol]
+    prices = await fetch_historical_prices([symbol, *candidates], days=days)
+    if prices.empty:
+        raise ValueError("No price data available for the given symbols")
+
+    returns = _compute_returns(prices)
+    # Also check returns so a bare KeyError cannot escape from returns[symbol].
+    if symbol not in prices.columns or symbol not in returns.columns:
+        raise ValueError(
+            f"{symbol} not found in price data; it may have no available history"
+        )
+
+    target_returns = returns[symbol]
+    correlations: list[dict[str, Any]] = []
+    for peer in candidates:
+        if peer not in returns.columns:
+            continue
+        corr_val = float(target_returns.corr(returns[peer]))
+        if not np.isnan(corr_val):  # undefined, e.g. for a constant series
+            correlations.append({"symbol": peer, "correlation": corr_val})
+    correlations.sort(key=lambda e: e["correlation"], reverse=True)
+
+    # Clamp: a negative top_n would otherwise slice off the tail instead.
+    n = max(0, min(top_n, len(correlations)))
+    most_similar = correlations[:n]
+    least_similar = sorted(correlations, key=lambda e: e["correlation"])[:n]
+
+    return {
+        "symbol": symbol,
+        "most_similar": most_similar,
+        "least_similar": least_similar,
+    }