feat: add t-SNE stock clustering and similarity search (TDD)
Adds two new endpoints. POST /portfolio/cluster runs t-SNE + KMeans clustering by return similarity and maps stocks to 2D coordinates with cluster labels. POST /portfolio/similar finds the most and least similar stocks by return correlation against a target symbol. Implementation: sklearn TSNE (method=exact) + KMeans with auto n_clusters; jitter handling for the identical-returns edge case; 33 new tests (17 service unit + 16 route integration); all 503 tests passing.
This commit is contained in:
@@ -1,7 +1,8 @@
|
||||
"""Portfolio optimization: HRP, correlation matrix, risk parity."""
|
||||
"""Portfolio optimization: HRP, correlation matrix, risk parity, t-SNE clustering."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from math import isqrt
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
@@ -220,3 +221,152 @@ async def compute_risk_parity(
|
||||
"risk_contributions": risk_contributions,
|
||||
"method": "risk_parity",
|
||||
}
|
||||
|
||||
|
||||
def _auto_n_clusters(n: int) -> int:
    """Pick a default cluster count for ``n`` items.

    The count is the integer square root of ``n``, raised to 2 whenever the
    root would be smaller than that.
    """
    root = isqrt(n)
    return root if root > 2 else 2
|
||||
|
||||
|
||||
def _run_tsne_kmeans(
    returns_matrix: np.ndarray, n_clusters: int
) -> tuple[np.ndarray, np.ndarray]:
    """Embed return series in 2D with t-SNE, then group the points with KMeans.

    CPU-heavy: caller must wrap in asyncio.to_thread.

    Args:
        returns_matrix: Array of shape (n_symbols, n_days), one row per
            symbol. At least 2 rows are required.
        n_clusters: Number of KMeans clusters; must satisfy
            ``1 <= n_clusters <= n_symbols``.

    Returns:
        ``(coords, labels)`` where ``coords`` has shape (n_symbols, 2) and
        ``labels`` holds one cluster label per row of ``returns_matrix``.

    Raises:
        ValueError: ``returns_matrix`` is not 2-D, has fewer than 2 rows, or
            ``n_clusters`` is outside ``[1, n_symbols]``.
    """
    # Validate before any heavy work: otherwise a bad n_clusters is only
    # rejected by KMeans after the expensive exact t-SNE fit has completed,
    # and a single row yields perplexity 0, which t-SNE rejects.
    if returns_matrix.ndim != 2:
        raise ValueError(
            f"returns_matrix must be 2-D (n_symbols x n_days), got {returns_matrix.ndim}-D"
        )

    n_samples = returns_matrix.shape[0]
    if n_samples < 2:
        raise ValueError(
            f"t-SNE clustering requires at least 2 return series, got {n_samples}"
        )
    if not 1 <= n_clusters <= n_samples:
        raise ValueError(
            f"n_clusters must be between 1 and {n_samples}, got {n_clusters}"
        )

    # Imported lazily so the module can be loaded without pulling in sklearn.
    from sklearn.cluster import KMeans
    from sklearn.manifold import TSNE

    # t-SNE requires perplexity < n_samples; cap at 5 for small universes.
    perplexity = min(5, n_samples - 1)

    # Add tiny noise to prevent numerical singularity when returns are
    # identical. The generator is seeded so results stay reproducible.
    rng = np.random.default_rng(42)
    jittered = returns_matrix + rng.normal(0, 1e-10, returns_matrix.shape)

    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42, method="exact")
    coords = tsne.fit_transform(jittered)

    # Cluster in the 2D embedding space, not on the raw return series.
    km = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
    labels = km.fit_predict(coords)

    return coords, labels
|
||||
|
||||
|
||||
async def cluster_stocks(
|
||||
symbols: list[str],
|
||||
days: int = 180,
|
||||
n_clusters: int | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""Cluster stocks by return similarity using t-SNE + KMeans.
|
||||
|
||||
Args:
|
||||
symbols: List of ticker symbols. Minimum 3, maximum 50.
|
||||
days: Number of historical trading days to use.
|
||||
n_clusters: Number of clusters. Defaults to floor(sqrt(n_symbols)).
|
||||
|
||||
Returns:
|
||||
Dict with keys ``symbols``, ``coordinates``, ``clusters``,
|
||||
``method``, ``n_clusters``, and ``days``.
|
||||
|
||||
Raises:
|
||||
ValueError: Fewer than 3 symbols, or no price data available.
|
||||
"""
|
||||
if len(symbols) < 3:
|
||||
raise ValueError("cluster_stocks requires at least 3 symbols")
|
||||
|
||||
prices = await fetch_historical_prices(symbols, days=days)
|
||||
if prices.empty:
|
||||
raise ValueError("No price data available for the given symbols")
|
||||
|
||||
returns = _compute_returns(prices)
|
||||
available = list(returns.columns)
|
||||
n = len(available)
|
||||
|
||||
k = n_clusters if n_clusters is not None else _auto_n_clusters(n)
|
||||
|
||||
# Build (n_symbols x n_days) matrix; fill NaN with column mean
|
||||
matrix = returns[available].T.fillna(0).values.astype(float)
|
||||
|
||||
coords, labels = await asyncio.to_thread(_run_tsne_kmeans, matrix, k)
|
||||
|
||||
coordinates = [
|
||||
{
|
||||
"symbol": sym,
|
||||
"x": float(coords[i, 0]),
|
||||
"y": float(coords[i, 1]),
|
||||
"cluster": int(labels[i]),
|
||||
}
|
||||
for i, sym in enumerate(available)
|
||||
]
|
||||
|
||||
clusters: dict[str, list[str]] = {}
|
||||
for sym, label in zip(available, labels):
|
||||
key = str(int(label))
|
||||
clusters.setdefault(key, []).append(sym)
|
||||
|
||||
return {
|
||||
"symbols": available,
|
||||
"coordinates": coordinates,
|
||||
"clusters": clusters,
|
||||
"method": "t-SNE + KMeans",
|
||||
"n_clusters": k,
|
||||
"days": days,
|
||||
}
|
||||
|
||||
|
||||
async def find_similar_stocks(
    symbol: str,
    universe: list[str],
    days: int = 180,
    top_n: int = 5,
) -> dict[str, Any]:
    """Find stocks most/least similar to a target by return correlation.

    Args:
        symbol: Target ticker symbol.
        universe: List of candidate symbols to compare against. Duplicates
            and the target itself are ignored.
        days: Number of historical trading days to use.
        top_n: Number of most- and least-similar stocks to return. Values
            below zero are treated as zero.

    Returns:
        Dict with keys ``symbol``, ``most_similar``, ``least_similar``. Each
        list holds ``{"symbol", "correlation"}`` entries; ``most_similar`` is
        ordered by descending correlation and ``least_similar`` by ascending
        correlation. Peers whose correlation is NaN are omitted.

    Raises:
        ValueError: No price data available, or target symbol missing from data.
    """
    # De-duplicate candidates (order preserved) and drop the target so no peer
    # is requested or reported twice.
    candidates = [s for s in dict.fromkeys(universe) if s != symbol]
    prices = await fetch_historical_prices([symbol, *candidates], days=days)

    if prices.empty:
        raise ValueError("No price data available for the given symbols")

    missing_target_msg = (
        f"{symbol} not found in price data; it may have no available history"
    )
    if symbol not in prices.columns:
        raise ValueError(missing_target_msg)

    returns = _compute_returns(prices)
    # NOTE(review): _compute_returns is not visible here; if it can drop
    # columns, this keeps the documented ValueError instead of a KeyError.
    if symbol not in returns.columns:
        raise ValueError(missing_target_msg)

    target_returns = returns[symbol]

    correlations: list[dict[str, Any]] = []
    for peer in candidates:
        if peer not in returns.columns:
            continue
        corr_val = float(target_returns.corr(returns[peer]))
        # NaN means the correlation is undefined for this pair; skip it.
        if not np.isnan(corr_val):
            correlations.append({"symbol": peer, "correlation": corr_val})

    correlations.sort(key=lambda e: e["correlation"], reverse=True)

    # Clamp to [0, len]: a negative top_n would otherwise slice as [:-k] and
    # silently return the wrong subset.
    n = max(0, min(top_n, len(correlations)))
    most_similar = correlations[:n]
    least_similar = sorted(correlations, key=lambda e: e["correlation"])[:n]

    return {
        "symbol": symbol,
        "most_similar": most_similar,
        "least_similar": least_similar,
    }
|
||||
|
||||
Reference in New Issue
Block a user