"""Robust metrics to evaluate performance of copy number estimates.
"""
from __future__ import absolute_import, division, print_function
from builtins import zip
import logging
import numpy as np
import pandas as pd
from scipy import stats
from .descriptives import (biweight_midvariance, interquartile_range,
                           median_absolute_deviation)


def do_metrics(cnarrs, segments=None, skip_low=False):
    """Compute coverage deviations and other metrics for self-evaluation."""
    # Catch if passed args are single CopyNumArrays instead of lists
    from .cnary import CopyNumArray as CNA
    if isinstance(cnarrs, CNA):
        cnarrs = [cnarrs]
    if isinstance(segments, CNA):
        segments = [segments]
    elif segments is None:
        segments = [None]
    else:
        segments = list(segments)
    if skip_low:
        cnarrs = (cna.drop_low_coverage() for cna in cnarrs)
    rows = ((cna.meta.get("filename", cna.sample_id),
             len(seg) if seg is not None else '-'
             ) + ests_of_scale(cna.residuals(seg))
            for cna, seg in zip_repeater(cnarrs, segments))
    colnames = ["sample", "segments", "stdev", "mad", "iqr", "bivar"]
    return pd.DataFrame.from_records(rows, columns=colnames)
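

# A hedged usage sketch, not part of the original module: how `do_metrics`
# might be called for one sample. The file names are hypothetical, and
# `cnvlib.read` is assumed to be the package's CNA file loader.
def _example_do_metrics():
    from cnvlib import read
    cnarr = read("Sample.cnr")     # bin-level copy ratios
    segments = read("Sample.cns")  # segmented copy ratios
    # One row per sample: sample, segments, stdev, mad, iqr, bivar
    return do_metrics(cnarr, segments, skip_low=True)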


def zip_repeater(iterable, repeatable):
    """Repeat a single segmentation to match the number of copy ratio inputs."""
    rpt_len = len(repeatable)
    if rpt_len == 1:
        rpt = repeatable[0]
        for it in iterable:
            yield it, rpt
    else:
        i = -1
        for i, (it, rpt) in enumerate(zip(iterable, repeatable)):
            yield it, rpt
        # Require lengths to match
        if i + 1 != rpt_len:
            raise ValueError("Number of unsegmented and segmented input files "
                             "did not match (%d vs. %d)" % (i + 1, rpt_len))
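

# A minimal sketch of `zip_repeater`'s pairing behavior, using toy strings in
# place of CopyNumArrays: a lone segmentation is reused for every input, and
# unequal list lengths raise ValueError.
def _example_zip_repeater():
    # One segmentation, three samples -> the segmentation repeats
    pairs = list(zip_repeater(["cnr_a", "cnr_b", "cnr_c"], ["segs"]))
    assert pairs == [("cnr_a", "segs"), ("cnr_b", "segs"), ("cnr_c", "segs")]
    # Two samples vs. three segmentations -> ValueError (2 vs. 3)
    try:
        list(zip_repeater(["cnr_a", "cnr_b"], ["s1", "s2", "s3"]))
    except ValueError:
        pass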


def ests_of_scale(deviations):
    """Estimators of scale: standard deviation, MAD, biweight midvariance.

    Calculates all of these values for an array of deviations and returns them
    as a tuple.
    """
    std = np.std(deviations, dtype=np.float64)
    mad = median_absolute_deviation(deviations)
    iqr = interquartile_range(deviations)
    biw = biweight_midvariance(deviations)
    return (std, mad, iqr, biw)
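

# A quick sketch with synthetic deviations: all four estimators measure the
# spread of the residuals, but the robust ones (MAD, IQR, biweight
# midvariance) are far less affected by the two injected outliers than the
# standard deviation is.
def _example_ests_of_scale():
    devs = np.random.RandomState(1).normal(0, 0.3, size=500)
    devs[:2] = [3.0, -3.0]  # outliers inflate the standard deviation most
    return ests_of_scale(devs)  # -> (std, mad, iqr, biw)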


def segment_mean(cnarr, skip_low=False):
    """Weighted average of bin log2 values."""
    if skip_low:
        cnarr = cnarr.drop_low_coverage()
    if len(cnarr) == 0:
        return np.nan
    if 'weight' in cnarr:
        return np.average(cnarr['log2'], weights=cnarr['weight'])
    return cnarr['log2'].mean()
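

# A small sketch of `segment_mean` on a plain DataFrame, which supports the
# same `len`, `in`, and column access that a CopyNumArray slice does here.
def _example_segment_mean():
    bins = pd.DataFrame({"log2": [0.1, 0.3, -0.2],
                         "weight": [1.0, 0.5, 0.25]})
    wmean = segment_mean(bins)            # weighted: np.average by 'weight'
    umean = segment_mean(bins[["log2"]])  # unweighted: plain column mean
    return wmean, umean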


# Intervals

def confidence_interval_bootstrap(bins, alpha, bootstraps=100, smoothed=True):
    """Confidence interval for segment mean log2 value, estimated by bootstrap."""
    if not 0 < alpha < 1:
        raise ValueError("alpha must be between 0 and 1; got %s" % alpha)
    if bootstraps <= 2 / alpha:
        new_boots = int(np.ceil(2 / alpha))
        logging.warning("%d bootstraps not enough to estimate CI alpha level "
                        "%f; increasing to %d", bootstraps, alpha, new_boots)
        bootstraps = new_boots
    # Bootstrap for CI
    k = len(bins)
    if k < 2:
        return np.array([bins["log2"][0], bins["log2"][0]])
    np.random.seed(0xA5EED)
    rand_indices = np.random.randint(0, k, (bootstraps, k))
    samples = [bins.data.take(idx) for idx in rand_indices]
    if smoothed:
        # samples = _smooth_samples(bins, samples, alpha)
        pass
    # Recalculate segment means
    bootstrap_dist = np.array([segment_mean(samp) for samp in samples])
    alphas = np.array([alpha / 2, 1 - alpha / 2])
    if not smoothed:
        # alphas = _bca_correct_alpha(bins, bootstrap_dist, alphas)
        pass
    ci = np.percentile(bootstrap_dist, list(100 * alphas))
    return ci
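

# A hedged sketch of calling `confidence_interval_bootstrap` without a full
# CopyNumArray: `_ToyBins` is a hypothetical stand-in providing only the
# `.data` DataFrame, `len`, and column access that the function touches.
class _ToyBins(object):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, key):
        return self.data[key]


def _example_confidence_interval():
    log2 = np.random.RandomState(0).normal(-0.5, 0.25, size=100)
    bins = _ToyBins(pd.DataFrame({"log2": log2}))
    # 95% CI for the segment mean; should be tight around -0.5
    return confidence_interval_bootstrap(bins, alpha=0.05)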


def _smooth_samples(bins, samples, alpha):
    """Resample from a kernel density estimate instead of the original data."""
    k = len(bins)
    # Estimate KDE bandwidth (Polansky 1995)
    resids = bins['log2'] - bins['log2'].mean()
    s_hat = 1/k * (resids**2).sum()       # sigma^2 = E[(X-theta)^2]
    y_hat = 1/k * abs((resids**3).sum())  # gamma = E[(X-theta)^3]
    z = stats.norm.ppf(alpha / 2)  # or alpha?
    bw = k**(-1/4) * np.sqrt(y_hat*(z**2 + 2) / (3*s_hat*z))
    # NB: or, Silverman's Rule for KDE bandwidth (roughly):
    #   std = interquartile_range(bins['log2']) / 1.34
    #   bw = std * (k*3/4) ** (-1/5)
    if bw > 0:
        # Drawing from the KDE = resampling + adding bandwidth-scaled noise
        samples = [samp.assign(log2=lambda x:
                               x['log2'] + bw * np.random.randn(k))
                   for samp in samples]
        logging.debug("Smoothing worked for this segment (bw=%s)", bw)
    else:
        logging.debug("Smoothing not needed for this segment (bw=%s)", bw)
    return samples
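

# A sketch of the smoothed-bootstrap idea behind `_smooth_samples`: drawing
# from a Gaussian KDE is equivalent to resampling the data and adding noise
# scaled by the bandwidth. The bandwidth here is a toy stand-in, not the
# Polansky estimate computed above.
def _example_smoothed_resample():
    rng = np.random.RandomState(0)
    values = rng.normal(0.0, 0.3, size=50)
    bw = 0.05  # toy bandwidth
    idx = rng.randint(0, len(values), size=len(values))
    return values[idx] + bw * rng.randn(len(values))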


def _bca_correct_alpha(bins, bootstrap_dist, alphas):
    # BCa correction (Efron 1987, "Better Bootstrap Confidence Intervals")
    # http://www.tandfonline.com/doi/abs/10.1080/01621459.1987.10478410
    # Ported from R package "bootstrap" function "bcanon"
    n_boots = len(bootstrap_dist)
    orig_mean = segment_mean(bins)
    n_boots_below = (bootstrap_dist < orig_mean).sum()
    if n_boots_below == 0:
        logging.warning("boots mean %s, orig mean %s",
                        bootstrap_dist.mean(), orig_mean)
    else:
        logging.warning("boot samples less: %s / %s",
                        n_boots_below, n_boots)
    z0 = stats.norm.ppf(n_boots_below / n_boots)
    zalpha = stats.norm.ppf(alphas)
    # Jackknife influence values
    u = np.array([segment_mean(bins.concat([bins[:i], bins[i+1:]]))
                  for i in range(len(bins))])
    uu = u.mean() - u
    # Acceleration: skewness of the jackknife influence values
    acc = (uu**3).sum() / (6 * (uu**2).sum()**1.5)
    alphas = stats.norm.cdf(z0 + (z0 + zalpha)
                            / (1 - acc * (z0 + zalpha)))
    logging.warning("New alphas: %s -- via z0=%s, za=%s, acc=%s",
                    alphas, z0, zalpha, acc)
    if not (0 < alphas[0] < 1 and 0 < alphas[1] < 1):
        raise ValueError("CI alphas should be in (0,1); got %s" % alphas)
    return alphas
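

# A toy illustration of the BCa adjustment implemented above: the bias
# correction z0 and the acceleration `acc` shift the nominal alpha levels
# before bootstrap percentiles are taken. All input values here are made up.
def _example_bca_alphas():
    z0 = stats.norm.ppf(0.45)  # 45% of bootstrap means below the observed mean
    zalpha = stats.norm.ppf(np.array([0.025, 0.975]))
    acc = 0.02                 # mild skew in the jackknife influence values
    return stats.norm.cdf(z0 + (z0 + zalpha) / (1 - acc * (z0 + zalpha)))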


def prediction_interval(bins, alpha):
    """Prediction interval, estimated by percentiles."""
    pct_lo = 100 * alpha / 2
    pct_hi = 100 * (1 - alpha / 2)
    # ENH: weighted percentile
    return np.percentile(bins['log2'], [pct_lo, pct_hi])
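

# A short sketch contrasting the prediction interval with the bootstrap CI:
# the PI covers individual bin values, so on the same bins it is much wider
# than the CI of the segment mean.
def _example_prediction_interval():
    log2 = np.random.RandomState(0).normal(-0.5, 0.25, size=200)
    bins = pd.DataFrame({"log2": log2})
    return prediction_interval(bins, alpha=0.05)  # roughly (-0.99, -0.01)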