"""Robust metrics to evaluate performance of copy number estimates.
"""
import logging
import numpy as np
# import pandas as pd
from scipy import stats
from . import descriptives
[docs]def do_segmetrics(cnarr, segarr, location_stats=(), spread_stats=(),
interval_stats=(), alpha=.05, bootstraps=100, smoothed=False):
"""Compute segment-level metrics from bin-level log2 ratios."""
# Silence sem's "Degrees of freedom <= 0 for slice"; NaN is OK
import warnings
warnings.simplefilter('ignore', RuntimeWarning)
stat_funcs = {
'mean': np.mean,
'median': np.median,
'mode': descriptives.modal_location,
'p_ttest': lambda a: stats.ttest_1samp(a, 0.0, nan_policy='omit')[1],
'stdev': np.std,
'mad': descriptives.median_absolute_deviation,
'mse': descriptives.mean_squared_error,
'iqr': descriptives.interquartile_range,
'bivar': descriptives.biweight_midvariance,
'sem': stats.sem,
'ci': make_ci_func(alpha, bootstraps, smoothed),
'pi': make_pi_func(alpha),
}
bins_log2s = list(cnarr.iter_ranges_of(segarr, 'log2', 'outer', True))
segarr = segarr.copy()
if location_stats:
# Measures of location
for statname in location_stats:
func = stat_funcs[statname]
segarr[statname] = np.fromiter(map(func, bins_log2s),
np.float_, len(segarr))
# Measures of spread
if spread_stats:
deviations = (bl - sl for bl, sl in zip(bins_log2s, segarr['log2']))
if len(spread_stats) > 1:
deviations = list(deviations)
for statname in spread_stats:
func = stat_funcs[statname]
segarr[statname] = np.fromiter(map(func, deviations),
np.float_, len(segarr))
# Interval calculations
weights = cnarr['weight']
if 'ci' in interval_stats:
segarr['ci_lo'], segarr['ci_hi'] = calc_intervals(bins_log2s, weights,
stat_funcs['ci'])
if 'pi' in interval_stats:
segarr['pi_lo'], segarr['pi_hi'] = calc_intervals(bins_log2s, weights,
stat_funcs['pi'])
return segarr
[docs]def make_ci_func(alpha, bootstraps, smoothed):
def ci_func(ser, wt):
return confidence_interval_bootstrap(ser, wt, alpha, bootstraps,
smoothed)
return ci_func
[docs]def make_pi_func(alpha):
"""Prediction interval, estimated by percentiles."""
# ENH: weighted percentile
pct_lo = 100 * alpha / 2
pct_hi = 100 * (1 - alpha / 2)
def pi_func(ser, _w):
return np.percentile(ser, [pct_lo, pct_hi])
return pi_func
[docs]def calc_intervals(bins_log2s, weights, func):
"""Compute a stat that yields intervals (low & high values)"""
out_vals_lo = np.repeat(np.nan, len(bins_log2s))
out_vals_hi = np.repeat(np.nan, len(bins_log2s))
for i, ser in enumerate(bins_log2s):
if len(ser):
wt = weights[ser.index]
assert (wt.index == ser.index).all()
out_vals_lo[i], out_vals_hi[i] = func(ser.values, wt.values)
return out_vals_lo, out_vals_hi
[docs]def confidence_interval_bootstrap(values, weights, alpha, bootstraps=100, smoothed=False):
"""Confidence interval for segment mean log2 value, estimated by bootstrap."""
if not 0 < alpha < 1:
raise ValueError("alpha must be between 0 and 1; got %s" % alpha)
if bootstraps <= 2 / alpha:
new_boots = int(np.ceil(2 / alpha))
logging.warning("%d bootstraps not enough to estimate CI alpha level "
"%f; increasing to %d", bootstraps, alpha, new_boots)
bootstraps = new_boots
# Bootstrap for CI
k = len(values)
if k < 2:
return np.repeat(values[0], 2)
np.random.seed(0xA5EED)
rand_indices = np.random.randint(0, k, size=(bootstraps, k))
samples = ((np.take(values, idx), np.take(weights, idx))
for idx in rand_indices)
if smoothed:
samples = _smooth_samples_by_weight(values, samples)
# Recalculate segment means
seg_means = (np.average(val, weights=wt)
for val, wt in samples)
bootstrap_dist = np.fromiter(seg_means, np.float_, bootstraps)
alphas = np.array([alpha / 2, 1 - alpha / 2])
if not smoothed:
# alphas = _bca_correct_alpha(values, weights, bootstrap_dist, alphas)
pass
ci = np.percentile(bootstrap_dist, list(100 * alphas))
return ci
def _smooth_samples_by_weight(values, samples):
"""Add Gaussian noise to each bootstrap replicate.
The result is used to compute a "smoothed bootstrap," where the added noise
ensures that for small samples (e.g. number of bins in the segment) the
bootstrapped CI is close to the standard error of the mean, as it should be.
Conceptually, sample from a KDE instead of the values themselves.
This addresses the issue that small segments (#bins < #replicates) don't
fully represent the underlying distribution, in particular the extreme
values, so the CI is too narrow. For single-bin segments in particular,
the confidence interval will always have zero width unless the samples are
smoothed.
Standard deviation of the noise added to each bin comes from each bin's
weight, which is an estimate of (1-variance).
Parameters
----------
values : np.ndarray
Original log2 values within the segment.
samples : list of np.ndarray
Bootstrap replicates as (value_sample, weight_sample).
Returns
-------
`samples` with random N(0, pop_sd) added to each value, and
weights unchanged.
"""
k = len(values)
# KDE bandwidth narrows for larger sample sizes
# Following Silverman's Rule and Polansky 1995,
# but requiring k=1 -> bw=1 for consistency
bw = k ** (-1/4)
samples = [(v + (bw * np.sqrt(1-w) * np.random.randn(k)), w)
for v, w in samples]
return samples
def _bca_correct_alpha(values, weights, bootstrap_dist, alphas):
"""Bias Corrected & Accellerated (BCa) bootstrap adjustment.
See: Efron 1987, "Better Bootstrap Confidence Intervals"
http://www.tandfonline.com/doi/abs/10.1080/01621459.1987.10478410
Ported from R package "bootstrap" function "bcanon".
"""
n_boots = len(bootstrap_dist)
orig_mean = np.average(values, weights=weights)
logging.warning("boot samples less: %s / %s",
(bootstrap_dist < orig_mean).sum(),
n_boots)
n_boots_below = (bootstrap_dist < orig_mean).sum()
if n_boots_below == 0:
logging.warning("boots mean %s, orig mean %s",
bootstrap_dist.mean(), orig_mean)
else:
logging.warning("boot samples less: %s / %s",
n_boots_below, n_boots)
z0 = stats.norm.ppf((bootstrap_dist < orig_mean).sum() / n_boots)
zalpha = stats.norm.ppf(alphas)
# Jackknife influence values
u = np.array([np.average(np.concatenate([values[:i], values[i+1:]]),
weights=np.concatenate([weights[:i],
weights[i+1:]]))
for i in range(len(values))])
uu = u.mean() - u
acc = (u**3).sum() / (6 * (uu**2).sum()**1.5)
alphas = stats.norm.cdf(z0 + (z0 + zalpha)
/ (1 - acc * (z0 + zalpha)))
logging.warning("New alphas: %s -- via z0=%s, za=%s, acc=%s",
alphas, z0, zalpha, acc)
if not 0 < alphas[0] < 1 and 0 < alphas[1] < 1:
raise ValueError("CI alphas should be in (0,1); got %s" % alphas)
return alphas
[docs]def segment_mean(cnarr, skip_low=False):
"""Weighted average of bin log2 values."""
if skip_low:
cnarr = cnarr.drop_low_coverage()
if len(cnarr) == 0:
return np.nan
if 'weight' in cnarr and cnarr['weight'].any():
return np.average(cnarr['log2'], weights=cnarr['weight'])
return cnarr['log2'].mean()