"""Z-test for single-bin copy number alterations."""
import logging
import numpy as np
import pandas as pd
from scipy.stats import norm
from . import params, segfilters
def do_bintest(cnarr, segments=None, alpha=0.005, target_only=False):
    """Get a probability for each bin based on its Z-score.

    Adds a column of p-values to the input .cnr. With `segments`, the Z-score
    is relative to the enclosing segment's mean; otherwise it is relative to 0.

    Bin p-values are corrected for multiple hypothesis testing by the
    Benjamini-Hochberg method.

    Returns: bins where the corrected probability is below `alpha`.
    """
    cnarr = cnarr.copy()
    # Subtract segment means, if given, so that we report only the CNA bins
    # that weren't already detected (including exon-size CNAs within a
    # larger-scale, smaller-amplitude CNA)
    resid = cnarr.residuals(segments)
    if not resid.index.is_unique:
        # Overlapping segments, maybe?
        dup_idx = resid.index.duplicated(keep=False)
        logging.warning("Segments may overlap at %d bins; dropping duplicate values",
                        dup_idx.sum())
        logging.debug("Duplicated indices: %s",
                      " ".join(map(str, resid.index[dup_idx][:50])))
        resid = resid[~resid.index.duplicated()]
        cnarr = cnarr.as_dataframe(cnarr.data.loc[resid.index])
    if len(cnarr) != len(resid):
        logging.info("Segments do not cover all bins (%d), only %d of them",
                     len(cnarr), len(resid))
        cnarr = cnarr.as_dataframe(cnarr.data.loc[resid.index])
    cnarr['log2'] = resid
    if target_only:
        antitarget_idx = cnarr['gene'].isin(params.ANTITARGET_ALIASES)
        if antitarget_idx.any():
            logging.info("Ignoring %d off-target bins", antitarget_idx.sum())
            # NB: bins no longer match the original input
            cnarr = cnarr[~antitarget_idx]
    cnarr['p_bintest'] = z_prob(cnarr)
    is_sig = cnarr['p_bintest'] < alpha
    logging.info("Significant hits in %d/%d bins (%.3g%%)",
                 is_sig.sum(), len(is_sig),
                 100 * is_sig.sum() / len(is_sig))
    # if segments:
    #     return spike_into_segments(cnarr, segments, is_sig)
    # May be empty
    hits = cnarr[is_sig]
    return hits
def z_prob(cnarr):
    """Calculate a z-test p-value at each bin."""
    # Bin weights ~ 1 - variance; bin log2 values are already centered at 0.0
    sd = np.sqrt(1 - cnarr['weight'])
    # Convert to Z-scores
    z = cnarr['log2'] / sd
    # Two-sided survival function (1 - CDF) probability
    p = 2. * norm.cdf(-np.abs(z))
    # Similar to the above -- which is better?
    # p2 = 2 * norm.pdf(cnarr['log2'], loc=0, scale=sd)
    # if not np.allclose(p, p2):
    #     print("Max diff:", np.abs(p - p2).max())
    #     print("Median diff:", np.median(np.abs(p - p2)))
    #     print("Ratio:", (p / p2).mean())
    # Correct for multiple hypothesis tests
    return p_adjust_bh(p)
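
# A worked toy example of the statistic above (illustrative numbers, not from
# real data): a bin with log2 = 0.8 and weight = 0.9 implies
# sd = sqrt(1 - 0.9) ~= 0.316, so z = 0.8 / 0.316 ~= 2.53 and a two-sided
# p-value of about 0.011 before the BH correction:
#
#     >>> sd = np.sqrt(1 - 0.9)
#     >>> round(float(2 * norm.cdf(-abs(0.8 / sd))), 3)
#     0.011
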
def p_adjust_bh(p):
    """Benjamini-Hochberg p-value correction for multiple hypothesis testing."""
    p = np.asarray(p, dtype=float)  # NB: np.asfarray was removed in NumPy 2.0
    # Rank p-values from largest to smallest
    by_descend = p.argsort()[::-1]
    by_orig = by_descend.argsort()
    # BH scaling factor n/k, where k is each p-value's ascending rank
    steps = float(len(p)) / np.arange(len(p), 0, -1)
    # Running minimum enforces monotonicity of the adjusted values
    q = np.minimum(1, np.minimum.accumulate(steps * p[by_descend]))
    # Restore the original ordering
    return q[by_orig]
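
# A small sanity check of the procedure above (illustrative values): with
# n = 4, each sorted p-value is scaled by n/k and the running minimum is
# applied, so the three small p-values all adjust to ~0.04:
#
#     >>> p_adjust_bh([0.01, 0.02, 0.03, 0.5])  # doctest: +SKIP
#     array([0.04, 0.04, 0.04, 0.5 ])
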
def spike_into_segments(cnarr, segments, is_sig):
    """Splice significant hits into the given segments."""
    if is_sig.any():
        # NB: residuals() above ensures hits all occur within segments
        cnarr['is_sig'] = is_sig
        chunks = []
        for segment, seghits in cnarr.by_ranges(segments, keep_empty=True):
            if seghits['is_sig'].any():
                # Merge each run of adjacent non-significant bins within this
                # segment, leaving the significant hits as single-bin segments:
                # each hit gets a unique odd label and each run of
                # non-significant bins its own even label, so runs separated
                # by a hit are not merged together
                levels = seghits['is_sig'].cumsum() * 2 - seghits['is_sig']
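                # Toy illustration of the labeling (assumed values):
                # for is_sig = [F, T, F, F, T, F],
                # cumsum() * 2 - is_sig gives [0, 1, 2, 2, 3, 4] --
                # two single-bin hit groups (1 and 3) and three
                # non-significant runs (0, 2, and 4):
                #
                #     >>> s = pd.Series([False, True, False, False, True, False])
                #     >>> (s.cumsum() * 2 - s).tolist()
                #     [0, 1, 2, 2, 3, 4]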
                chunks.append(seghits.data
                              .assign(_levels=levels)
                              .groupby('_levels', sort=False)
                              .apply(segfilters.squash_region)
                              .reset_index(drop=True))
            else:
                # Keep this segment as-is
                chunks.append(pd.DataFrame.from_records(
                    [segment], columns=segments.data.columns))
        return cnarr.as_dataframe(pd.concat(chunks, sort=False))
    else:
        # Nothing to do
        return segments