Source code for cnvlib.segmetrics

"""Robust metrics to evaluate performance of copy number estimates."""

from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Any, Union

# import pandas as pd
import numpy as np
from scipy import stats

from . import descriptives

if TYPE_CHECKING:
    from collections.abc import Callable
    from collections.abc import Iterator
    from cnvlib.cnary import CopyNumArray
    from numpy import float64, ndarray
    from pandas.core.series import Series



[docs]
def do_segmetrics(
    cnarr: CopyNumArray,
    segarr: CopyNumArray,
    location_stats: Union[tuple[()], list[str]] = (),
    spread_stats: Union[tuple[()], list[str]] = (),
    interval_stats: Union[tuple[()], list[str]] = (),
    alpha: float = 0.05,
    bootstraps: int = 100,
    smoothed: bool = False,
    skip_low: bool = False,
) -> CopyNumArray:
    """Compute segment-level metrics from bin-level log2 ratios.

    Parameters
    ----------
    cnarr : CopyNumArray
        Bin-level copy number data.
    segarr : CopyNumArray
        Segmented copy number data.
    location_stats : list of str, optional
        Location statistics to compute: 'mean', 'median', 'mode', 'p_ttest'.
        Default is empty tuple.
    spread_stats : list of str, optional
        Spread statistics to compute: 'stdev', 'mad', 'mse', 'iqr', 'bivar', 'sem'.
        Default is empty tuple.
    interval_stats : list of str, optional
        Interval statistics to compute: 'ci' (confidence interval),
        'pi' (prediction interval). Default is empty tuple.
    alpha : float, optional
        Significance level for confidence/prediction intervals. Default is 0.05.
    bootstraps : int, optional
        Number of bootstrap iterations for confidence intervals. Default is 100.
    smoothed : bool, optional
        Use smoothed bootstrap for confidence intervals. Default is False.
    skip_low : bool, optional
        Skip bins with low coverage. Default is False.

    Returns
    -------
    CopyNumArray
        Segmented data with additional statistical columns.
    """
    # Silence sem's "Degrees of freedom <= 0 for slice"; NaN is OK
    import warnings

    warnings.simplefilter("ignore", RuntimeWarning)

    stat_funcs = {
        "mean": np.mean,
        "median": np.median,
        "mode": descriptives.modal_location,
        "p_ttest": lambda a: stats.ttest_1samp(a, 0.0, nan_policy="omit")[1],
        "stdev": np.std,
        "mad": descriptives.median_absolute_deviation,
        "mse": descriptives.mean_squared_error,
        "iqr": descriptives.interquartile_range,
        "bivar": descriptives.biweight_midvariance,
        "sem": stats.sem,
        "ci": make_ci_func(alpha, bootstraps, smoothed),
        "pi": make_pi_func(alpha),
    }

    if skip_low:
        cnarr = cnarr.drop_low_coverage()
    bins_log2s = list(cnarr.iter_ranges_of(segarr, "log2", "outer", True))

    segarr = segarr.copy()
    if location_stats:
        # Measures of location
        for statname in location_stats:
            func = stat_funcs[statname]
            segarr[statname] = np.fromiter(
                map(func, bins_log2s), np.float64, len(segarr)
            )
    # Measures of spread
    if spread_stats:
        deviations = (bl - sl for bl, sl in zip(bins_log2s, segarr["log2"], strict=False))
        if len(spread_stats) > 1:
            deviations = list(deviations)
        for statname in spread_stats:
            func = stat_funcs[statname]
            segarr[statname] = np.fromiter(
                map(func, deviations), np.float64, len(segarr)
            )
    # Interval calculations
    weights = cnarr["weight"]
    if "ci" in interval_stats:
        segarr["ci_lo"], segarr["ci_hi"] = calc_intervals(
            bins_log2s, weights, stat_funcs["ci"]
        )
    if "pi" in interval_stats:
        segarr["pi_lo"], segarr["pi_hi"] = calc_intervals(
            bins_log2s, weights, stat_funcs["pi"]
        )

    return segarr




[docs]
def make_ci_func(alpha: float, bootstraps: int, smoothed: bool) -> Callable:
    def ci_func(ser, wt):
        return confidence_interval_bootstrap(ser, wt, alpha, bootstraps, smoothed)

    return ci_func




[docs]
def make_pi_func(alpha: float) -> Callable:
    """Prediction interval, estimated by percentiles."""
    # ENH: weighted percentile
    pct_lo = 100 * alpha / 2
    pct_hi = 100 * (1 - alpha / 2)

    def pi_func(ser, _w):
        return np.percentile(ser, [pct_lo, pct_hi])

    return pi_func




[docs]
def calc_intervals(
    bins_log2s: list[Series], weights: Series, func: Callable
) -> tuple[ndarray, ndarray]:
    """Compute a stat that yields intervals (low & high values)"""
    out_vals_lo = np.repeat(np.nan, len(bins_log2s))
    out_vals_hi = np.repeat(np.nan, len(bins_log2s))
    for i, ser in enumerate(bins_log2s):
        if len(ser):
            wt = weights[ser.index]
            assert (wt.index == ser.index).all()
            out_vals_lo[i], out_vals_hi[i] = func(ser.values, wt.values)
    return out_vals_lo, out_vals_hi




[docs]
def confidence_interval_bootstrap(
    values: ndarray,
    weights: ndarray,
    alpha: float,
    bootstraps: int = 100,
    smoothed: bool = False,
) -> ndarray:
    """Confidence interval for segment mean log2 value, estimated by bootstrap."""
    if not 0 < alpha < 1:
        raise ValueError(f"alpha must be between 0 and 1; got {alpha}")
    if bootstraps <= 2 / alpha:
        new_boots = int(np.ceil(2 / alpha))
        logging.warning(
            "%d bootstraps not enough to estimate CI alpha level %f; increasing to %d",
            bootstraps,
            alpha,
            new_boots,
        )
        bootstraps = new_boots
    # Bootstrap for CI
    k = len(values)
    if k < 2:
        return np.repeat(values[0], 2)

    rng = np.random.default_rng(0xA5EED)
    rand_indices = rng.integers(0, k, size=(bootstraps, k))
    samples = ((np.take(values, idx), np.take(weights, idx)) for idx in rand_indices)
    if smoothed:
        samples = _smooth_samples_by_weight(values, samples)
    # Recalculate segment means
    seg_means = (np.average(val, weights=wt) for val, wt in samples)
    bootstrap_dist = np.fromiter(seg_means, np.float64, bootstraps)
    alphas = np.array([alpha / 2, 1 - alpha / 2])
    if not smoothed:
        # alphas = _bca_correct_alpha(values, weights, bootstrap_dist, alphas)
        pass
    ci = np.percentile(bootstrap_dist, list(100 * alphas))
    return ci



def _smooth_samples_by_weight(
    values: ndarray, samples: Iterator[Any]
) -> list[tuple[ndarray, ndarray]]:
    """Add Gaussian noise to each bootstrap replicate.

    The result is used to compute a "smoothed bootstrap," where the added noise
    ensures that for small samples (e.g. number of bins in the segment) the
    bootstrapped CI is close to the standard error of the mean, as it should be.
    Conceptually, sample from a KDE instead of the values themselves.

    This addresses the issue that small segments (#bins < #replicates) don't
    fully represent the underlying distribution, in particular the extreme
    values, so the CI is too narrow. For single-bin segments in particular,
    the confidence interval will always have zero width unless the samples are
    smoothed.

    Standard deviation of the noise added to each bin comes from each bin's
    weight, which is an estimate of (1-variance).

    Parameters
    ----------
    values : np.ndarray
        Original log2 values within the segment.
    samples : list of np.ndarray
        Bootstrap replicates as (value_sample, weight_sample).

    Returns
    -------
    `samples` with random N(0, pop_sd) added to each value, and
    weights unchanged.
    """
    k = len(values)
    # KDE bandwidth narrows for larger sample sizes
    # Following Silverman's Rule and Polansky 1995,
    # but requiring k=1 -> bw=1 for consistency
    bw = k ** (-1 / 4)
    rng = np.random.default_rng()
    samples = [
        (v + (bw * np.sqrt(1 - w) * rng.standard_normal(k)), w) for v, w in samples
    ]
    return samples


def _bca_correct_alpha(values, weights, bootstrap_dist, alphas):
    """Bias Corrected & Accellerated (BCa) bootstrap adjustment.

    See: Efron 1987, "Better Bootstrap Confidence Intervals"
    http://www.tandfonline.com/doi/abs/10.1080/01621459.1987.10478410

    Ported from R package "bootstrap" function "bcanon".
    """
    n_boots = len(bootstrap_dist)
    orig_mean = np.average(values, weights=weights)
    logging.warning(
        "boot samples less: %s / %s", (bootstrap_dist < orig_mean).sum(), n_boots
    )
    n_boots_below = (bootstrap_dist < orig_mean).sum()
    if n_boots_below == 0:
        logging.warning("boots mean %s, orig mean %s", bootstrap_dist.mean(), orig_mean)
    else:
        logging.warning("boot samples less: %s / %s", n_boots_below, n_boots)
    z0 = stats.norm.ppf((bootstrap_dist < orig_mean).sum() / n_boots)
    zalpha = stats.norm.ppf(alphas)
    # Jackknife influence values
    u = np.array(
        [
            np.average(
                np.concatenate([values[:i], values[i + 1 :]]),
                weights=np.concatenate([weights[:i], weights[i + 1 :]]),
            )
            for i in range(len(values))
        ]
    )
    uu = u.mean() - u
    acc = (u**3).sum() / (6 * (uu**2).sum() ** 1.5)
    alphas = stats.norm.cdf(z0 + (z0 + zalpha) / (1 - acc * (z0 + zalpha)))
    logging.warning(
        "New alphas: %s -- via z0=%s, za=%s, acc=%s", alphas, z0, zalpha, acc
    )
    if not 0 < alphas[0] < 1 and 0 < alphas[1] < 1:
        raise ValueError(f"CI alphas should be in (0,1); got {alphas}")
    return alphas



[docs]
def segment_mean(cnarr: CopyNumArray, skip_low: bool = False) -> Union[float64, float]:
    """Weighted average of bin log2 values."""
    if skip_low:
        cnarr = cnarr.drop_low_coverage()
    if len(cnarr) == 0:
        return np.nan
    if "weight" in cnarr and cnarr["weight"].any():
        return np.average(cnarr["log2"], weights=cnarr["weight"])
    return cnarr["log2"].mean()
Source code for cnvlib.segmetrics

CNVkit

Navigation

Related Topics