Source code for cnvlib.segmetrics

"""Robust metrics to evaluate performance of copy number estimates.
"""
import logging

import numpy as np
# import pandas as pd
from scipy import stats

from . import descriptives


def do_segmetrics(cnarr, segarr, location_stats=(), spread_stats=(),
                  interval_stats=(), alpha=.05, bootstraps=100,
                  smoothed=False):
    """Compute segment-level metrics from bin-level log2 ratios."""
    # Silence sem's "Degrees of freedom <= 0 for slice"; NaN is OK
    import warnings
    warnings.simplefilter('ignore', RuntimeWarning)

    stat_funcs = {
        'mean': np.mean,
        'median': np.median,
        'mode': descriptives.modal_location,
        'p_ttest': lambda a: stats.ttest_1samp(a, 0.0, nan_policy='omit')[1],
        'stdev': np.std,
        'mad': descriptives.median_absolute_deviation,
        'mse': descriptives.mean_squared_error,
        'iqr': descriptives.interquartile_range,
        'bivar': descriptives.biweight_midvariance,
        'sem': stats.sem,
        'ci': make_ci_func(alpha, bootstraps, smoothed),
        'pi': make_pi_func(alpha),
    }

    bins_log2s = list(cnarr.iter_ranges_of(segarr, 'log2', 'outer', True))
    segarr = segarr.copy()
    if location_stats:
        # Measures of location
        for statname in location_stats:
            func = stat_funcs[statname]
            segarr[statname] = np.fromiter(map(func, bins_log2s),
                                           np.float_, len(segarr))
    # Measures of spread
    if spread_stats:
        deviations = (bl - sl for bl, sl in zip(bins_log2s, segarr['log2']))
        if len(spread_stats) > 1:
            deviations = list(deviations)
        for statname in spread_stats:
            func = stat_funcs[statname]
            segarr[statname] = np.fromiter(map(func, deviations),
                                           np.float_, len(segarr))
    # Interval calculations
    weights = cnarr['weight']
    if 'ci' in interval_stats:
        segarr['ci_lo'], segarr['ci_hi'] = calc_intervals(bins_log2s, weights,
                                                          stat_funcs['ci'])
    if 'pi' in interval_stats:
        segarr['pi_lo'], segarr['pi_hi'] = calc_intervals(bins_log2s, weights,
                                                          stat_funcs['pi'])
    return segarr
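
# Illustrative usage (a sketch, not part of the module; the .cnr/.cns file
# names are hypothetical, though cnvlib.read and the stat names above are
# taken from this codebase):
#
#     import cnvlib
#     cnarr = cnvlib.read("sample.cnr")   # bin-level log2 ratios
#     segarr = cnvlib.read("sample.cns")  # segmented calls
#     metrics = do_segmetrics(cnarr, segarr,
#                             location_stats=['mean', 'median'],
#                             spread_stats=['stdev', 'mad'],
#                             interval_stats=['ci'],
#                             alpha=0.05, bootstraps=100)
#     # `metrics` is a copy of segarr with the requested columns added,
#     # e.g. 'mean', 'median', 'stdev', 'mad', 'ci_lo', 'ci_hi'.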

def make_ci_func(alpha, bootstraps, smoothed):
    """Return a function that estimates a confidence interval by bootstrap."""
    def ci_func(ser, wt):
        return confidence_interval_bootstrap(ser, wt, alpha, bootstraps,
                                             smoothed)
    return ci_func

def make_pi_func(alpha):
    """Prediction interval, estimated by percentiles."""
    # ENH: weighted percentile
    pct_lo = 100 * alpha / 2
    pct_hi = 100 * (1 - alpha / 2)
    def pi_func(ser, _w):
        return np.percentile(ser, [pct_lo, pct_hi])
    return pi_func
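
# Worked example: with alpha=0.05, pct_lo=2.5 and pct_hi=97.5, so the
# prediction interval spans the 2.5th and 97.5th percentiles of the bin-level
# log2 values (the sample array below is made up; weights are ignored):
#
#     pi_func = make_pi_func(0.05)
#     lo, hi = pi_func(np.array([-0.1, 0.0, 0.1, 0.4]), None)
#     # equivalent to np.percentile([-0.1, 0.0, 0.1, 0.4], [2.5, 97.5])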

def calc_intervals(bins_log2s, weights, func):
    """Compute a stat that yields intervals (low & high values)."""
    out_vals_lo = np.repeat(np.nan, len(bins_log2s))
    out_vals_hi = np.repeat(np.nan, len(bins_log2s))
    for i, ser in enumerate(bins_log2s):
        if len(ser):
            wt = weights[ser.index]
            assert (wt.index == ser.index).all()
            out_vals_lo[i], out_vals_hi[i] = func(ser.values, wt.values)
    return out_vals_lo, out_vals_hi

def confidence_interval_bootstrap(values, weights, alpha, bootstraps=100,
                                  smoothed=False):
    """Confidence interval for segment mean log2 value, estimated by bootstrap."""
    if not 0 < alpha < 1:
        raise ValueError("alpha must be between 0 and 1; got %s" % alpha)
    if bootstraps <= 2 / alpha:
        new_boots = int(np.ceil(2 / alpha))
        logging.warning("%d bootstraps not enough to estimate CI alpha level "
                        "%f; increasing to %d", bootstraps, alpha, new_boots)
        bootstraps = new_boots
    # Bootstrap for CI
    k = len(values)
    if k < 2:
        return np.repeat(values[0], 2)

    np.random.seed(0xA5EED)
    rand_indices = np.random.randint(0, k, size=(bootstraps, k))
    samples = ((np.take(values, idx), np.take(weights, idx))
               for idx in rand_indices)
    if smoothed:
        samples = _smooth_samples_by_weight(values, samples)
    # Recalculate segment means
    seg_means = (np.average(val, weights=wt) for val, wt in samples)
    bootstrap_dist = np.fromiter(seg_means, np.float_, bootstraps)
    alphas = np.array([alpha / 2, 1 - alpha / 2])
    if not smoothed:
        # alphas = _bca_correct_alpha(values, weights, bootstrap_dist, alphas)
        pass
    ci = np.percentile(bootstrap_dist, list(100 * alphas))
    return ci
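
# A self-contained sketch on simulated bins (the values and weights below are
# made up; with alpha=0.05, any bootstraps value above 2/alpha = 40 is used
# unchanged):
#
#     values = np.random.randn(20) * 0.1 + 0.5    # 20 bins around log2 = 0.5
#     weights = np.ones(20)
#     lo, hi = confidence_interval_bootstrap(values, weights,
#                                            alpha=0.05, bootstraps=100)
#     # (lo, hi) typically brackets the weighted mean of the bins, and the
#     # interval narrows as the number of bins grows, roughly tracking the
#     # standard error of the mean.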

def _smooth_samples_by_weight(values, samples):
    """Add Gaussian noise to each bootstrap replicate.

    The result is used to compute a "smoothed bootstrap," where the added
    noise ensures that for small samples (e.g. number of bins in the segment)
    the bootstrapped CI is close to the standard error of the mean, as it
    should be. Conceptually, sample from a KDE instead of the values
    themselves.

    This addresses the issue that small segments (#bins < #replicates) don't
    fully represent the underlying distribution, in particular the extreme
    values, so the CI is too narrow. For single-bin segments in particular,
    the confidence interval will always have zero width unless the samples
    are smoothed.

    The standard deviation of the noise added to each bin comes from that
    bin's weight, which is an estimate of (1 - variance).

    Parameters
    ----------
    values : np.ndarray
        Original log2 values within the segment.
    samples : iterable of (np.ndarray, np.ndarray)
        Bootstrap replicates as (value_sample, weight_sample) pairs.

    Returns
    -------
    list of (np.ndarray, np.ndarray)
        `samples` with random N(0, pop_sd) noise added to each value, and
        weights unchanged.
    """
    k = len(values)
    # KDE bandwidth narrows for larger sample sizes,
    # following Silverman's Rule and Polansky 1995,
    # but requiring k=1 -> bw=1 for consistency.
    bw = k ** (-1/4)
    samples = [(v + (bw * np.sqrt(1 - w) * np.random.randn(k)), w)
               for v, w in samples]
    return samples


def _bca_correct_alpha(values, weights, bootstrap_dist, alphas):
    """Bias-Corrected & Accelerated (BCa) bootstrap adjustment.

    See: Efron 1987, "Better Bootstrap Confidence Intervals"
    http://www.tandfonline.com/doi/abs/10.1080/01621459.1987.10478410

    Ported from the R package "bootstrap", function "bcanon".
    """
    n_boots = len(bootstrap_dist)
    orig_mean = np.average(values, weights=weights)
    n_boots_below = (bootstrap_dist < orig_mean).sum()
    if n_boots_below == 0:
        logging.warning("boots mean %s, orig mean %s",
                        bootstrap_dist.mean(), orig_mean)
    else:
        logging.warning("boot samples less: %s / %s", n_boots_below, n_boots)
    z0 = stats.norm.ppf(n_boots_below / n_boots)
    zalpha = stats.norm.ppf(alphas)
    # Jackknife influence values
    u = np.array([np.average(np.concatenate([values[:i], values[i+1:]]),
                             weights=np.concatenate([weights[:i],
                                                     weights[i+1:]]))
                  for i in range(len(values))])
    uu = u.mean() - u
    acc = (uu**3).sum() / (6 * (uu**2).sum()**1.5)
    alphas = stats.norm.cdf(z0 + (z0 + zalpha) / (1 - acc * (z0 + zalpha)))
    logging.warning("New alphas: %s -- via z0=%s, za=%s, acc=%s",
                    alphas, z0, zalpha, acc)
    if not (0 < alphas[0] < 1 and 0 < alphas[1] < 1):
        raise ValueError("CI alphas should be in (0,1); got %s" % alphas)
    return alphas
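
# The adjusted quantiles above follow Efron's BCa formula: with bias
# correction z0 = Phi^-1(#{theta* < theta_hat} / B) and acceleration `acc`
# from the jackknife influence values,
#
#     alpha' = Phi( z0 + (z0 + z_alpha) / (1 - acc * (z0 + z_alpha)) )
#
# which is the stats.norm.cdf(...) expression in the code. The returned
# alphas would then serve as the percentile cutoffs on the bootstrap
# distribution in confidence_interval_bootstrap above, where the call is
# currently commented out.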

def segment_mean(cnarr, skip_low=False):
    """Weighted average of bin log2 values."""
    if skip_low:
        cnarr = cnarr.drop_low_coverage()
    if len(cnarr) == 0:
        return np.nan
    if 'weight' in cnarr and cnarr['weight'].any():
        return np.average(cnarr['log2'], weights=cnarr['weight'])
    return cnarr['log2'].mean()
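
# Equivalent calculation by hand for the weighted branch (the arrays below
# are made up):
#
#     log2s = np.array([0.1, 0.3, 0.2])
#     weights = np.array([1.0, 0.5, 0.5])
#     np.average(log2s, weights=weights)
#     # == (0.1*1.0 + 0.3*0.5 + 0.2*0.5) / 2.0 == 0.175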