2 new endpoints: - POST /portfolio/cluster - t-SNE + KMeans clustering by return similarity. Maps stocks to 2D coordinates with cluster labels. - POST /portfolio/similar - find most/least similar stocks by return correlation against a target symbol. Implementation: - sklearn TSNE (method=exact) + KMeans with auto n_clusters - Jitter handling for identical returns edge case - 33 new tests (17 service unit + 16 route integration) - All 503 tests passing
373 lines
11 KiB
Python
373 lines
11 KiB
Python
"""Portfolio optimization: HRP, correlation matrix, risk parity, t-SNE clustering."""
|
|
|
|
import asyncio
|
|
import logging
|
|
from math import isqrt
|
|
from typing import Any
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
from obb_utils import fetch_historical
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def fetch_historical_prices(symbols: list[str], days: int = 365) -> pd.DataFrame:
    """Fetch closing prices for multiple symbols and return as a DataFrame.

    Columns are symbol names; rows are date strings. Symbols with no data —
    or whose fetch raised — are skipped rather than failing the whole batch.

    Args:
        symbols: Ticker symbols to fetch.
        days: Number of historical days to request per symbol.

    Returns:
        DataFrame of closing prices (columns = symbols). Empty DataFrame
        if no symbol yielded any data.
    """
    tasks = [fetch_historical(sym, days=days) for sym in symbols]
    # return_exceptions=True so one failing provider call does not abort the
    # entire batch; failures are logged and that symbol is simply skipped.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    price_series: dict[str, pd.Series] = {}
    for sym, result in zip(symbols, results):
        if isinstance(result, BaseException):
            logger.warning("Fetch failed for %s, skipping: %s", sym, result)
            continue
        if result is None or result.results is None:
            logger.warning("No historical data for %s, skipping", sym)
            continue
        rows = result.results
        if not rows:
            continue
        dates = []
        closes = []
        for row in rows:
            d = getattr(row, "date", None)
            c = getattr(row, "close", None)
            # Only keep rows where both date and close are present.
            if d is not None and c is not None:
                dates.append(str(d))
                closes.append(float(c))
        if dates:
            price_series[sym] = pd.Series(closes, index=dates)

    if not price_series:
        return pd.DataFrame()

    df = pd.DataFrame(price_series)
    # Drop dates where every symbol is NaN (no data at all for that row).
    df = df.dropna(how="all")
    return df
|
|
|
|
|
|
def _compute_returns(prices: pd.DataFrame) -> pd.DataFrame:
|
|
"""Compute daily log returns from a price DataFrame."""
|
|
return prices.pct_change().dropna()
|
|
|
|
|
|
def _inverse_volatility_weights(returns: pd.DataFrame) -> dict[str, float]:
|
|
"""Compute inverse-volatility weights."""
|
|
vols = returns.std()
|
|
inv_vols = 1.0 / vols
|
|
weights = inv_vols / inv_vols.sum()
|
|
return {sym: float(w) for sym, w in weights.items()}
|
|
|
|
|
|
def _hrp_weights(returns: pd.DataFrame) -> dict[str, float]:
|
|
"""Compute Hierarchical Risk Parity weights via scipy clustering.
|
|
|
|
Falls back to inverse-volatility if scipy is unavailable.
|
|
"""
|
|
symbols = list(returns.columns)
|
|
n = len(symbols)
|
|
|
|
if n == 1:
|
|
return {symbols[0]: 1.0}
|
|
|
|
try:
|
|
from scipy.cluster.hierarchy import linkage, leaves_list
|
|
from scipy.spatial.distance import squareform
|
|
|
|
corr = returns.corr().fillna(0).values
|
|
# Convert correlation to distance: d = sqrt(0.5 * (1 - corr))
|
|
dist = np.sqrt(np.clip(0.5 * (1 - corr), 0, 1))
|
|
np.fill_diagonal(dist, 0.0)
|
|
condensed = squareform(dist)
|
|
link = linkage(condensed, method="single")
|
|
order = leaves_list(link)
|
|
sorted_symbols = [symbols[i] for i in order]
|
|
except ImportError:
|
|
logger.warning("scipy not available; using inverse-volatility for HRP")
|
|
return _inverse_volatility_weights(returns)
|
|
|
|
cov = returns.cov().values
|
|
|
|
def _bisect_weights(items: list[str]) -> dict[str, float]:
|
|
if len(items) == 1:
|
|
return {items[0]: 1.0}
|
|
mid = len(items) // 2
|
|
left_items = items[:mid]
|
|
right_items = items[mid:]
|
|
left_idx = [sorted_symbols.index(s) for s in left_items]
|
|
right_idx = [sorted_symbols.index(s) for s in right_items]
|
|
|
|
def _cluster_var(idx: list[int]) -> float:
|
|
sub_cov = cov[np.ix_(idx, idx)]
|
|
w = np.ones(len(idx)) / len(idx)
|
|
return float(w @ sub_cov @ w)
|
|
|
|
v_left = _cluster_var(left_idx)
|
|
v_right = _cluster_var(right_idx)
|
|
total = v_left + v_right
|
|
alpha = 1.0 - v_left / total if total > 0 else 0.5
|
|
|
|
w_left = _bisect_weights(left_items)
|
|
w_right = _bisect_weights(right_items)
|
|
|
|
result = {}
|
|
for sym, w in w_left.items():
|
|
result[sym] = w * (1.0 - alpha)
|
|
for sym, w in w_right.items():
|
|
result[sym] = w * alpha
|
|
return result
|
|
|
|
raw = _bisect_weights(sorted_symbols)
|
|
total = sum(raw.values())
|
|
return {sym: float(w / total) for sym, w in raw.items()}
|
|
|
|
|
|
async def optimize_hrp(symbols: list[str], days: int = 365) -> dict[str, Any]:
    """Compute Hierarchical Risk Parity portfolio weights.

    Args:
        symbols: List of ticker symbols (1-50).
        days: Number of historical days to use.

    Returns:
        Dict with keys ``weights`` (symbol -> float) and ``method``.

    Raises:
        ValueError: If symbols is empty or no price data is available.
    """
    if not symbols:
        raise ValueError("symbols must not be empty")

    price_frame = await fetch_historical_prices(symbols, days=days)
    if price_frame.empty:
        raise ValueError("No price data available for the given symbols")

    daily_returns = _compute_returns(price_frame)
    return {
        "weights": _hrp_weights(daily_returns),
        "method": "hrp",
    }
|
|
|
|
|
|
async def compute_correlation(
    symbols: list[str], days: int = 365
) -> dict[str, Any]:
    """Compute correlation matrix for a list of symbols.

    Args:
        symbols: List of ticker symbols (1-50).
        days: Number of historical days to use.

    Returns:
        Dict with keys ``symbols`` (list) and ``matrix`` (list of lists).

    Raises:
        ValueError: If symbols is empty or no price data is available.
    """
    if not symbols:
        raise ValueError("symbols must not be empty")

    price_frame = await fetch_historical_prices(symbols, days=days)
    if price_frame.empty:
        raise ValueError("No price data available for the given symbols")

    daily_returns = _compute_returns(price_frame)
    # NaN correlations (e.g. zero-variance series) are reported as 0.
    corr_frame = daily_returns.corr().fillna(0)

    return {
        "symbols": list(daily_returns.columns),
        "matrix": corr_frame.values.tolist(),
    }
|
|
|
|
|
|
async def compute_risk_parity(
    symbols: list[str], days: int = 365
) -> dict[str, Any]:
    """Compute equal risk contribution (inverse-volatility) weights.

    Args:
        symbols: List of ticker symbols (1-50).
        days: Number of historical days to use.

    Returns:
        Dict with keys ``weights``, ``risk_contributions``, and ``method``.

    Raises:
        ValueError: If symbols is empty or no price data is available.
    """
    if not symbols:
        raise ValueError("symbols must not be empty")

    price_frame = await fetch_historical_prices(symbols, days=days)
    if price_frame.empty:
        raise ValueError("No price data available for the given symbols")

    daily_returns = _compute_returns(price_frame)
    weights = _inverse_volatility_weights(daily_returns)

    # Risk contributions: w_i * sigma_i / sum(w_j * sigma_j)
    vols = daily_returns.std()
    weighted_risk = {sym: w * float(vols[sym]) for sym, w in weights.items()}
    total_risk = sum(weighted_risk.values())

    if total_risk > 0:
        risk_contributions = {
            sym: contribution / total_risk
            for sym, contribution in weighted_risk.items()
        }
    else:
        # Degenerate case (zero aggregate risk): attribute risk equally.
        equal_share = 1.0 / len(weights)
        risk_contributions = {sym: equal_share for sym in weights}

    return {
        "weights": weights,
        "risk_contributions": risk_contributions,
        "method": "risk_parity",
    }
|
|
|
|
|
|
def _auto_n_clusters(n: int) -> int:
|
|
"""Return a sensible default cluster count: max(2, floor(sqrt(n)))."""
|
|
return max(2, isqrt(n))
|
|
|
|
|
|
def _run_tsne_kmeans(
    returns_matrix: np.ndarray, n_clusters: int
) -> tuple[np.ndarray, np.ndarray]:
    """Run t-SNE then KMeans on a (n_symbols x n_days) returns matrix.

    Args:
        returns_matrix: One row per symbol, one column per trading day.
        n_clusters: Requested cluster count; clamped to the number of rows
            so KMeans is never asked for more clusters than samples.

    Returns:
        (coords, labels) where coords has shape (n_symbols, 2) and labels
        has shape (n_symbols,).

    CPU-heavy: caller must wrap in asyncio.to_thread.
    """
    from sklearn.cluster import KMeans
    from sklearn.manifold import TSNE

    n_samples = returns_matrix.shape[0]
    # t-SNE requires perplexity < n_samples; 5 is a reasonable cap for the
    # small universes this service handles.
    perplexity = min(5, n_samples - 1)
    # KMeans raises if n_clusters > n_samples; clamp defensively.
    n_clusters = max(1, min(n_clusters, n_samples))

    # Add tiny noise to prevent numerical singularity when returns are identical
    rng = np.random.default_rng(42)
    jittered = returns_matrix + rng.normal(0, 1e-10, returns_matrix.shape)

    # method="exact" avoids Barnes-Hut restrictions on tiny sample counts;
    # fixed random_state keeps the layout deterministic across calls.
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42, method="exact")
    coords = tsne.fit_transform(jittered)

    km = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
    labels = km.fit_predict(coords)

    return coords, labels
|
|
|
|
|
|
async def cluster_stocks(
    symbols: list[str],
    days: int = 180,
    n_clusters: int | None = None,
) -> dict[str, Any]:
    """Cluster stocks by return similarity using t-SNE + KMeans.

    Args:
        symbols: List of ticker symbols. Minimum 3, maximum 50.
        days: Number of historical trading days to use.
        n_clusters: Number of clusters. Defaults to floor(sqrt(n_symbols)).
            Clamped to the number of symbols with usable data.

    Returns:
        Dict with keys ``symbols``, ``coordinates``, ``clusters``,
        ``method``, ``n_clusters``, and ``days``.

    Raises:
        ValueError: Fewer than 3 symbols requested, fewer than 3 symbols
            with usable price data, or no price data available at all.
    """
    if len(symbols) < 3:
        raise ValueError("cluster_stocks requires at least 3 symbols")

    prices = await fetch_historical_prices(symbols, days=days)
    if prices.empty:
        raise ValueError("No price data available for the given symbols")

    returns = _compute_returns(prices)
    available = list(returns.columns)
    n = len(available)

    # Symbols may have been dropped for lack of data; re-check the minimum
    # so t-SNE/KMeans receive enough samples to produce meaningful output.
    if n < 3:
        raise ValueError("cluster_stocks requires at least 3 symbols")

    k = n_clusters if n_clusters is not None else _auto_n_clusters(n)
    # KMeans cannot produce more clusters than samples.
    k = min(k, n)

    # Build (n_symbols x n_days) matrix; fill any remaining NaN with 0
    # (treated as a flat-return day) so t-SNE receives finite input.
    matrix = returns[available].T.fillna(0).values.astype(float)

    coords, labels = await asyncio.to_thread(_run_tsne_kmeans, matrix, k)

    coordinates = [
        {
            "symbol": sym,
            "x": float(coords[i, 0]),
            "y": float(coords[i, 1]),
            "cluster": int(labels[i]),
        }
        for i, sym in enumerate(available)
    ]

    # Group symbols by cluster label (stringified for JSON-friendly keys).
    clusters: dict[str, list[str]] = {}
    for sym, label in zip(available, labels):
        key = str(int(label))
        clusters.setdefault(key, []).append(sym)

    return {
        "symbols": available,
        "coordinates": coordinates,
        "clusters": clusters,
        "method": "t-SNE + KMeans",
        "n_clusters": k,
        "days": days,
    }
|
|
|
|
|
|
async def find_similar_stocks(
    symbol: str,
    universe: list[str],
    days: int = 180,
    top_n: int = 5,
) -> dict[str, Any]:
    """Find stocks most/least similar to a target by return correlation.

    Args:
        symbol: Target ticker symbol.
        universe: List of candidate symbols to compare against.
        days: Number of historical trading days to use.
        top_n: Number of most- and least-similar stocks to return.

    Returns:
        Dict with keys ``symbol``, ``most_similar``, ``least_similar``.

    Raises:
        ValueError: No price data available, or target symbol missing from data.
    """
    candidates = [s for s in universe if s != symbol]
    prices = await fetch_historical_prices([symbol] + candidates, days=days)

    if prices.empty:
        raise ValueError("No price data available for the given symbols")

    if symbol not in prices.columns:
        raise ValueError(
            f"{symbol} not found in price data; it may have no available history"
        )

    returns = _compute_returns(prices)
    target_returns = returns[symbol]

    # Pairwise correlation of every surviving peer against the target;
    # NaN correlations (e.g. constant series) are dropped.
    correlations: list[dict[str, Any]] = []
    for peer in candidates:
        if peer not in returns.columns:
            continue
        corr_val = float(target_returns.corr(returns[peer]))
        if not np.isnan(corr_val):
            correlations.append({"symbol": peer, "correlation": corr_val})

    by_corr_desc = sorted(
        correlations, key=lambda e: e["correlation"], reverse=True
    )
    by_corr_asc = sorted(by_corr_desc, key=lambda e: e["correlation"])
    n = min(top_n, len(correlations))

    return {
        "symbol": symbol,
        "most_similar": by_corr_desc[:n],
        "least_similar": by_corr_asc[:n],
    }
|