2 new endpoints: - POST /portfolio/cluster - t-SNE + KMeans clustering by return similarity. Maps stocks to 2D coordinates with cluster labels. - POST /portfolio/similar - find most/least similar stocks by return correlation against a target symbol. Implementation: - sklearn TSNE (method=exact) + KMeans with auto n_clusters - Jitter handling for identical returns edge case - 33 new tests (17 service unit + 16 route integration) - All 503 tests passing
373 lines
11 KiB
Python
373 lines
11 KiB
Python
"""Portfolio optimization: HRP, correlation matrix, risk parity, t-SNE clustering."""
|
|
|
|
import asyncio
|
|
import logging
|
|
from math import isqrt
|
|
from typing import Any
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
from obb_utils import fetch_historical
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def fetch_historical_prices(symbols: list[str], days: int = 365) -> pd.DataFrame:
    """Fetch closing prices for multiple symbols and return as a DataFrame.

    Columns are symbol names; rows are date strings. Symbols with no data —
    or whose fetch raised — are skipped rather than failing the whole batch.

    Args:
        symbols: Ticker symbols to fetch.
        days: Number of historical days to request per symbol.

    Returns:
        DataFrame of closing prices (columns = symbols). Empty DataFrame
        if no symbol yielded any data.
    """
    tasks = [fetch_historical(sym, days=days) for sym in symbols]
    # return_exceptions=True so one failing provider call does not abort the
    # entire batch; failures are logged and that symbol is simply skipped.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    price_series: dict[str, pd.Series] = {}
    for sym, result in zip(symbols, results):
        if isinstance(result, BaseException):
            logger.warning("Fetch failed for %s, skipping: %s", sym, result)
            continue
        if result is None or result.results is None:
            logger.warning("No historical data for %s, skipping", sym)
            continue
        rows = result.results
        if not rows:
            continue
        dates = []
        closes = []
        for row in rows:
            d = getattr(row, "date", None)
            c = getattr(row, "close", None)
            # Only keep rows where both date and close are present.
            if d is not None and c is not None:
                dates.append(str(d))
                closes.append(float(c))
        if dates:
            price_series[sym] = pd.Series(closes, index=dates)

    if not price_series:
        return pd.DataFrame()

    df = pd.DataFrame(price_series)
    # Drop dates where every symbol is NaN (no data at all for that row).
    df = df.dropna(how="all")
    return df
|
|
|
|
|
|
def _compute_returns(prices: pd.DataFrame) -> pd.DataFrame:
|
|
"""Compute daily log returns from a price DataFrame."""
|
|
return prices.pct_change().dropna()
|
|
|
|
|
|
def _inverse_volatility_weights(returns: pd.DataFrame) -> dict[str, float]:
|
|
"""Compute inverse-volatility weights."""
|
|
vols = returns.std()
|
|
inv_vols = 1.0 / vols
|
|
weights = inv_vols / inv_vols.sum()
|
|
return {sym: float(w) for sym, w in weights.items()}
|
|
|
|
|
|
def _hrp_weights(returns: pd.DataFrame) -> dict[str, float]:
|
|
"""Compute Hierarchical Risk Parity weights via scipy clustering.
|
|
|
|
Falls back to inverse-volatility if scipy is unavailable.
|
|
"""
|
|
symbols = list(returns.columns)
|
|
n = len(symbols)
|
|
|
|
if n == 1:
|
|
return {symbols[0]: 1.0}
|
|
|
|
try:
|
|
from scipy.cluster.hierarchy import linkage, leaves_list
|
|
from scipy.spatial.distance import squareform
|
|
|
|
corr = returns.corr().fillna(0).values
|
|
# Convert correlation to distance: d = sqrt(0.5 * (1 - corr))
|
|
dist = np.sqrt(np.clip(0.5 * (1 - corr), 0, 1))
|
|
np.fill_diagonal(dist, 0.0)
|
|
condensed = squareform(dist)
|
|
link = linkage(condensed, method="single")
|
|
order = leaves_list(link)
|
|
sorted_symbols = [symbols[i] for i in order]
|
|
except ImportError:
|
|
logger.warning("scipy not available; using inverse-volatility for HRP")
|
|
return _inverse_volatility_weights(returns)
|
|
|
|
cov = returns.cov().values
|
|
|
|
def _bisect_weights(items: list[str]) -> dict[str, float]:
|
|
if len(items) == 1:
|
|
return {items[0]: 1.0}
|
|
mid = len(items) // 2
|
|
left_items = items[:mid]
|
|
right_items = items[mid:]
|
|
left_idx = [sorted_symbols.index(s) for s in left_items]
|
|
right_idx = [sorted_symbols.index(s) for s in right_items]
|
|
|
|
def _cluster_var(idx: list[int]) -> float:
|
|
sub_cov = cov[np.ix_(idx, idx)]
|
|
w = np.ones(len(idx)) / len(idx)
|
|
return float(w @ sub_cov @ w)
|
|
|
|
v_left = _cluster_var(left_idx)
|
|
v_right = _cluster_var(right_idx)
|
|
total = v_left + v_right
|
|
alpha = 1.0 - v_left / total if total > 0 else 0.5
|
|
|
|
w_left = _bisect_weights(left_items)
|
|
w_right = _bisect_weights(right_items)
|
|
|
|
result = {}
|
|
for sym, w in w_left.items():
|
|
result[sym] = w * (1.0 - alpha)
|
|
for sym, w in w_right.items():
|
|
result[sym] = w * alpha
|
|
return result
|
|
|
|
raw = _bisect_weights(sorted_symbols)
|
|
total = sum(raw.values())
|
|
return {sym: float(w / total) for sym, w in raw.items()}
|
|
|
|
|
|
async def optimize_hrp(symbols: list[str], days: int = 365) -> dict[str, Any]:
    """Compute Hierarchical Risk Parity portfolio weights.

    Args:
        symbols: List of ticker symbols (1-50).
        days: Number of historical days to use.

    Returns:
        Dict with keys ``weights`` (symbol -> float) and ``method``.

    Raises:
        ValueError: If symbols is empty or no price data is available.
    """
    if not symbols:
        raise ValueError("symbols must not be empty")

    price_frame = await fetch_historical_prices(symbols, days=days)
    if price_frame.empty:
        raise ValueError("No price data available for the given symbols")

    daily_returns = _compute_returns(price_frame)
    return {
        "weights": _hrp_weights(daily_returns),
        "method": "hrp",
    }
|
|
|
|
|
|
async def compute_correlation(
    symbols: list[str], days: int = 365
) -> dict[str, Any]:
    """Compute correlation matrix for a list of symbols.

    Args:
        symbols: List of ticker symbols (1-50).
        days: Number of historical days to use.

    Returns:
        Dict with keys ``symbols`` (list) and ``matrix`` (list of lists).

    Raises:
        ValueError: If symbols is empty or no price data is available.
    """
    if not symbols:
        raise ValueError("symbols must not be empty")

    price_frame = await fetch_historical_prices(symbols, days=days)
    if price_frame.empty:
        raise ValueError("No price data available for the given symbols")

    daily_returns = _compute_returns(price_frame)
    # NaN correlations (e.g. zero-variance series) are reported as 0.
    corr_frame = daily_returns.corr().fillna(0)

    return {
        "symbols": list(daily_returns.columns),
        "matrix": corr_frame.values.tolist(),
    }
|
|
|
|
|
|
async def compute_risk_parity(
    symbols: list[str], days: int = 365
) -> dict[str, Any]:
    """Compute equal risk contribution (inverse-volatility) weights.

    Args:
        symbols: List of ticker symbols (1-50).
        days: Number of historical days to use.

    Returns:
        Dict with keys ``weights``, ``risk_contributions``, and ``method``.

    Raises:
        ValueError: If symbols is empty or no price data is available.
    """
    if not symbols:
        raise ValueError("symbols must not be empty")

    price_frame = await fetch_historical_prices(symbols, days=days)
    if price_frame.empty:
        raise ValueError("No price data available for the given symbols")

    daily_returns = _compute_returns(price_frame)
    weights = _inverse_volatility_weights(daily_returns)

    # Risk contributions: w_i * sigma_i / sum(w_j * sigma_j)
    vols = daily_returns.std()
    weighted_risk = {sym: w * float(vols[sym]) for sym, w in weights.items()}
    total_risk = sum(weighted_risk.values())

    if total_risk > 0:
        risk_contributions = {
            sym: contribution / total_risk
            for sym, contribution in weighted_risk.items()
        }
    else:
        # Degenerate case (zero aggregate risk): attribute risk equally.
        equal_share = 1.0 / len(weights)
        risk_contributions = {sym: equal_share for sym in weights}

    return {
        "weights": weights,
        "risk_contributions": risk_contributions,
        "method": "risk_parity",
    }
|
|
|
|
|
|
def _auto_n_clusters(n: int) -> int:
|
|
"""Return a sensible default cluster count: max(2, floor(sqrt(n)))."""
|
|
return max(2, isqrt(n))
|
|
|
|
|
|
def _run_tsne_kmeans(
    returns_matrix: np.ndarray, n_clusters: int
) -> tuple[np.ndarray, np.ndarray]:
    """Run t-SNE then KMeans on a (n_symbols x n_days) returns matrix.

    Args:
        returns_matrix: One row per symbol, one column per trading day.
        n_clusters: Requested cluster count; clamped to the number of rows
            so KMeans is never asked for more clusters than samples.

    Returns:
        (coords, labels) where coords has shape (n_symbols, 2) and labels
        has shape (n_symbols,).

    CPU-heavy: caller must wrap in asyncio.to_thread.
    """
    from sklearn.cluster import KMeans
    from sklearn.manifold import TSNE

    n_samples = returns_matrix.shape[0]
    # t-SNE requires perplexity < n_samples; 5 is a reasonable cap for the
    # small universes this service handles.
    perplexity = min(5, n_samples - 1)
    # KMeans raises if n_clusters > n_samples; clamp defensively.
    n_clusters = max(1, min(n_clusters, n_samples))

    # Add tiny noise to prevent numerical singularity when returns are identical
    rng = np.random.default_rng(42)
    jittered = returns_matrix + rng.normal(0, 1e-10, returns_matrix.shape)

    # method="exact" avoids Barnes-Hut restrictions on tiny sample counts;
    # fixed random_state keeps the layout deterministic across calls.
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42, method="exact")
    coords = tsne.fit_transform(jittered)

    km = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
    labels = km.fit_predict(coords)

    return coords, labels
|
|
|
|
|
|
async def cluster_stocks(
    symbols: list[str],
    days: int = 180,
    n_clusters: int | None = None,
) -> dict[str, Any]:
    """Cluster stocks by return similarity using t-SNE + KMeans.

    Args:
        symbols: List of ticker symbols. Minimum 3, maximum 50.
        days: Number of historical trading days to use.
        n_clusters: Number of clusters. Defaults to floor(sqrt(n_symbols)).
            Clamped to the number of symbols with usable data.

    Returns:
        Dict with keys ``symbols``, ``coordinates``, ``clusters``,
        ``method``, ``n_clusters``, and ``days``.

    Raises:
        ValueError: Fewer than 3 symbols requested, fewer than 3 symbols
            with usable price data, or no price data available at all.
    """
    if len(symbols) < 3:
        raise ValueError("cluster_stocks requires at least 3 symbols")

    prices = await fetch_historical_prices(symbols, days=days)
    if prices.empty:
        raise ValueError("No price data available for the given symbols")

    returns = _compute_returns(prices)
    available = list(returns.columns)
    n = len(available)

    # Symbols may have been dropped for lack of data; re-check the minimum
    # so t-SNE/KMeans receive enough samples to produce meaningful output.
    if n < 3:
        raise ValueError("cluster_stocks requires at least 3 symbols")

    k = n_clusters if n_clusters is not None else _auto_n_clusters(n)
    # KMeans cannot produce more clusters than samples.
    k = min(k, n)

    # Build (n_symbols x n_days) matrix; fill any remaining NaN with 0
    # (treated as a flat-return day) so t-SNE receives finite input.
    matrix = returns[available].T.fillna(0).values.astype(float)

    coords, labels = await asyncio.to_thread(_run_tsne_kmeans, matrix, k)

    coordinates = [
        {
            "symbol": sym,
            "x": float(coords[i, 0]),
            "y": float(coords[i, 1]),
            "cluster": int(labels[i]),
        }
        for i, sym in enumerate(available)
    ]

    # Group symbols by cluster label (stringified for JSON-friendly keys).
    clusters: dict[str, list[str]] = {}
    for sym, label in zip(available, labels):
        key = str(int(label))
        clusters.setdefault(key, []).append(sym)

    return {
        "symbols": available,
        "coordinates": coordinates,
        "clusters": clusters,
        "method": "t-SNE + KMeans",
        "n_clusters": k,
        "days": days,
    }
|
|
|
|
|
|
async def find_similar_stocks(
    symbol: str,
    universe: list[str],
    days: int = 180,
    top_n: int = 5,
) -> dict[str, Any]:
    """Find stocks most/least similar to a target by return correlation.

    Args:
        symbol: Target ticker symbol.
        universe: List of candidate symbols to compare against.
        days: Number of historical trading days to use.
        top_n: Number of most- and least-similar stocks to return.

    Returns:
        Dict with keys ``symbol``, ``most_similar``, ``least_similar``.

    Raises:
        ValueError: No price data available, or target symbol missing from data.
    """
    candidates = [s for s in universe if s != symbol]
    prices = await fetch_historical_prices([symbol] + candidates, days=days)

    if prices.empty:
        raise ValueError("No price data available for the given symbols")

    if symbol not in prices.columns:
        raise ValueError(
            f"{symbol} not found in price data; it may have no available history"
        )

    returns = _compute_returns(prices)
    target_returns = returns[symbol]

    # Pairwise correlation of every surviving peer against the target;
    # NaN correlations (e.g. constant series) are dropped.
    correlations: list[dict[str, Any]] = []
    for peer in candidates:
        if peer not in returns.columns:
            continue
        corr_val = float(target_returns.corr(returns[peer]))
        if not np.isnan(corr_val):
            correlations.append({"symbol": peer, "correlation": corr_val})

    by_corr_desc = sorted(
        correlations, key=lambda e: e["correlation"], reverse=True
    )
    by_corr_asc = sorted(by_corr_desc, key=lambda e: e["correlation"])
    n = min(top_n, len(correlations))

    return {
        "symbol": symbol,
        "most_similar": by_corr_desc[:n],
        "least_similar": by_corr_asc[:n],
    }
|