"""Portfolio optimization: HRP, correlation matrix, risk parity, t-SNE clustering.""" import asyncio import logging from math import isqrt from typing import Any import numpy as np import pandas as pd from obb_utils import fetch_historical logger = logging.getLogger(__name__) async def fetch_historical_prices(symbols: list[str], days: int = 365) -> pd.DataFrame: """Fetch closing prices for multiple symbols and return as a DataFrame. Columns are symbol names; rows are dates. Symbols with no data are skipped. """ tasks = [fetch_historical(sym, days=days) for sym in symbols] results = await asyncio.gather(*tasks) price_series: dict[str, pd.Series] = {} for sym, result in zip(symbols, results): if result is None or result.results is None: logger.warning("No historical data for %s, skipping", sym) continue rows = result.results if not rows: continue dates = [] closes = [] for row in rows: d = getattr(row, "date", None) c = getattr(row, "close", None) if d is not None and c is not None: dates.append(str(d)) closes.append(float(c)) if dates: price_series[sym] = pd.Series(closes, index=dates) if not price_series: return pd.DataFrame() df = pd.DataFrame(price_series) df = df.dropna(how="all") return df def _compute_returns(prices: pd.DataFrame) -> pd.DataFrame: """Compute daily log returns from a price DataFrame.""" return prices.pct_change().dropna() def _inverse_volatility_weights(returns: pd.DataFrame) -> dict[str, float]: """Compute inverse-volatility weights.""" vols = returns.std() inv_vols = 1.0 / vols weights = inv_vols / inv_vols.sum() return {sym: float(w) for sym, w in weights.items()} def _hrp_weights(returns: pd.DataFrame) -> dict[str, float]: """Compute Hierarchical Risk Parity weights via scipy clustering. Falls back to inverse-volatility if scipy is unavailable. """ symbols = list(returns.columns) n = len(symbols) if n == 1: return {symbols[0]: 1.0} try: from scipy.cluster.hierarchy import linkage, leaves_list from scipy.spatial.distance import squareform corr = returns.corr().fillna(0).values # Convert correlation to distance: d = sqrt(0.5 * (1 - corr)) dist = np.sqrt(np.clip(0.5 * (1 - corr), 0, 1)) np.fill_diagonal(dist, 0.0) condensed = squareform(dist) link = linkage(condensed, method="single") order = leaves_list(link) sorted_symbols = [symbols[i] for i in order] except ImportError: logger.warning("scipy not available; using inverse-volatility for HRP") return _inverse_volatility_weights(returns) cov = returns.cov().values def _bisect_weights(items: list[str]) -> dict[str, float]: if len(items) == 1: return {items[0]: 1.0} mid = len(items) // 2 left_items = items[:mid] right_items = items[mid:] left_idx = [sorted_symbols.index(s) for s in left_items] right_idx = [sorted_symbols.index(s) for s in right_items] def _cluster_var(idx: list[int]) -> float: sub_cov = cov[np.ix_(idx, idx)] w = np.ones(len(idx)) / len(idx) return float(w @ sub_cov @ w) v_left = _cluster_var(left_idx) v_right = _cluster_var(right_idx) total = v_left + v_right alpha = 1.0 - v_left / total if total > 0 else 0.5 w_left = _bisect_weights(left_items) w_right = _bisect_weights(right_items) result = {} for sym, w in w_left.items(): result[sym] = w * (1.0 - alpha) for sym, w in w_right.items(): result[sym] = w * alpha return result raw = _bisect_weights(sorted_symbols) total = sum(raw.values()) return {sym: float(w / total) for sym, w in raw.items()} async def optimize_hrp(symbols: list[str], days: int = 365) -> dict[str, Any]: """Compute Hierarchical Risk Parity portfolio weights. Args: symbols: List of ticker symbols (1-50). days: Number of historical days to use. Returns: Dict with keys ``weights`` (symbol -> float) and ``method``. Raises: ValueError: If symbols is empty or no price data is available. """ if not symbols: raise ValueError("symbols must not be empty") prices = await fetch_historical_prices(symbols, days=days) if prices.empty: raise ValueError("No price data available for the given symbols") returns = _compute_returns(prices) weights = _hrp_weights(returns) return {"weights": weights, "method": "hrp"} async def compute_correlation( symbols: list[str], days: int = 365 ) -> dict[str, Any]: """Compute correlation matrix for a list of symbols. Args: symbols: List of ticker symbols (1-50). days: Number of historical days to use. Returns: Dict with keys ``symbols`` (list) and ``matrix`` (list of lists). Raises: ValueError: If symbols is empty or no price data is available. """ if not symbols: raise ValueError("symbols must not be empty") prices = await fetch_historical_prices(symbols, days=days) if prices.empty: raise ValueError("No price data available for the given symbols") returns = _compute_returns(prices) available = list(returns.columns) corr = returns.corr().fillna(0) matrix = corr.values.tolist() return {"symbols": available, "matrix": matrix} async def compute_risk_parity( symbols: list[str], days: int = 365 ) -> dict[str, Any]: """Compute equal risk contribution (inverse-volatility) weights. Args: symbols: List of ticker symbols (1-50). days: Number of historical days to use. Returns: Dict with keys ``weights``, ``risk_contributions``, and ``method``. Raises: ValueError: If symbols is empty or no price data is available. """ if not symbols: raise ValueError("symbols must not be empty") prices = await fetch_historical_prices(symbols, days=days) if prices.empty: raise ValueError("No price data available for the given symbols") returns = _compute_returns(prices) weights = _inverse_volatility_weights(returns) # Risk contributions: w_i * sigma_i / sum(w_j * sigma_j) vols = returns.std() weighted_risk = {sym: weights[sym] * float(vols[sym]) for sym in weights} total_risk = sum(weighted_risk.values()) if total_risk > 0: risk_contributions = {sym: v / total_risk for sym, v in weighted_risk.items()} else: n = len(weights) risk_contributions = {sym: 1.0 / n for sym in weights} return { "weights": weights, "risk_contributions": risk_contributions, "method": "risk_parity", } def _auto_n_clusters(n: int) -> int: """Return a sensible default cluster count: max(2, floor(sqrt(n))).""" return max(2, isqrt(n)) def _run_tsne_kmeans( returns_matrix: np.ndarray, n_clusters: int ) -> tuple[np.ndarray, np.ndarray]: """Run t-SNE then KMeans on a (n_symbols x n_days) returns matrix. Returns (coords, labels) where coords has shape (n_symbols, 2). CPU-heavy: caller must wrap in asyncio.to_thread. """ from sklearn.cluster import KMeans from sklearn.manifold import TSNE n_samples = returns_matrix.shape[0] perplexity = min(5, n_samples - 1) # Add tiny noise to prevent numerical singularity when returns are identical rng = np.random.default_rng(42) jittered = returns_matrix + rng.normal(0, 1e-10, returns_matrix.shape) tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42, method="exact") coords = tsne.fit_transform(jittered) km = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto") labels = km.fit_predict(coords) return coords, labels async def cluster_stocks( symbols: list[str], days: int = 180, n_clusters: int | None = None, ) -> dict[str, Any]: """Cluster stocks by return similarity using t-SNE + KMeans. Args: symbols: List of ticker symbols. Minimum 3, maximum 50. days: Number of historical trading days to use. n_clusters: Number of clusters. Defaults to floor(sqrt(n_symbols)). Returns: Dict with keys ``symbols``, ``coordinates``, ``clusters``, ``method``, ``n_clusters``, and ``days``. Raises: ValueError: Fewer than 3 symbols, or no price data available. """ if len(symbols) < 3: raise ValueError("cluster_stocks requires at least 3 symbols") prices = await fetch_historical_prices(symbols, days=days) if prices.empty: raise ValueError("No price data available for the given symbols") returns = _compute_returns(prices) available = list(returns.columns) n = len(available) k = n_clusters if n_clusters is not None else _auto_n_clusters(n) # Build (n_symbols x n_days) matrix; fill NaN with column mean matrix = returns[available].T.fillna(0).values.astype(float) coords, labels = await asyncio.to_thread(_run_tsne_kmeans, matrix, k) coordinates = [ { "symbol": sym, "x": float(coords[i, 0]), "y": float(coords[i, 1]), "cluster": int(labels[i]), } for i, sym in enumerate(available) ] clusters: dict[str, list[str]] = {} for sym, label in zip(available, labels): key = str(int(label)) clusters.setdefault(key, []).append(sym) return { "symbols": available, "coordinates": coordinates, "clusters": clusters, "method": "t-SNE + KMeans", "n_clusters": k, "days": days, } async def find_similar_stocks( symbol: str, universe: list[str], days: int = 180, top_n: int = 5, ) -> dict[str, Any]: """Find stocks most/least similar to a target by return correlation. Args: symbol: Target ticker symbol. universe: List of candidate symbols to compare against. days: Number of historical trading days to use. top_n: Number of most- and least-similar stocks to return. Returns: Dict with keys ``symbol``, ``most_similar``, ``least_similar``. Raises: ValueError: No price data available, or target symbol missing from data. """ all_symbols = [symbol] + [s for s in universe if s != symbol] prices = await fetch_historical_prices(all_symbols, days=days) if prices.empty: raise ValueError("No price data available for the given symbols") if symbol not in prices.columns: raise ValueError( f"{symbol} not found in price data; it may have no available history" ) returns = _compute_returns(prices) target_returns = returns[symbol] peers = [s for s in universe if s in returns.columns and s != symbol] correlations: list[dict[str, Any]] = [] for peer in peers: corr_val = float(target_returns.corr(returns[peer])) if not np.isnan(corr_val): correlations.append({"symbol": peer, "correlation": corr_val}) correlations.sort(key=lambda e: e["correlation"], reverse=True) n = min(top_n, len(correlations)) most_similar = correlations[:n] least_similar = sorted(correlations, key=lambda e: e["correlation"])[:n] return { "symbol": symbol, "most_similar": most_similar, "least_similar": least_similar, }