# Source code for cnvlib.segmentation

"""Segmentation of copy number values."""
from __future__ import absolute_import, division, print_function
from builtins import map, zip

import locale
import logging
import math
import os.path
import tempfile

import numpy as np
import pandas as pd
from Bio._py3k import StringIO
from skgenome import tabio
from skgenome.intersect import iter_slices

from .. import core, parallel, params, smoothing, vary
from ..cnary import CopyNumArray as CNA
from ..segfilters import squash_by_groups
from . import cbs, flasso, haar, hmm, none

def do_segmentation(cnarr, method, threshold=None, variants=None,
                    skip_low=False, skip_outliers=10, min_weight=0,
                    save_dataframe=False, rlibpath=None,
                    rscript_path="Rscript", processes=1):
    """Infer copy number segments from the given coverage table.

    Dispatches to `_do_segmentation`: either directly, for methods that
    cannot be parallelized, or once per chromosome arm in a worker pool.

    Returns a CopyNumArray of segments, or, if `save_dataframe`, a 2-tuple
    of the CopyNumArray and the raw R dataframe output as a string.
    """
    if variants:
        variants = variants.heterozygous()
    if not threshold:
        # Per-method default significance thresholds
        threshold = {'cbs': 0.0001,
                     'flasso': 0.0001,
                     'haar': 0.0001,
                    }.get(method)
    msg = "Segmenting with method " + repr(method)
    if threshold is not None:
        if method.startswith('hmm'):
            # For HMM methods, "threshold" is reused as a window size
            msg += ", smoothing window size %s," % threshold
        else:
            msg += ", significance threshold %s," % threshold
    msg += " in %s processes" % processes
    logging.info(msg)
    # NB: parallel cghFLasso segfaults in R ('memory not mapped'),
    # even when run on a single chromosome
    if method == 'flasso' or method.startswith('hmm'):
        # ENH segment p/q arms separately
        #   -> assign separate identifiers via chrom name suffix?
        cna = _do_segmentation(cnarr, method, threshold, variants, skip_low,
                               skip_outliers, min_weight, save_dataframe,
                               rlibpath, rscript_path)
        if save_dataframe:
            cna, rstr = cna
            rstr = _to_str(rstr)
    else:
        with parallel.pick_pool(processes) as pool:
            rets = list(pool.map(_ds,
                                 ((ca, method, threshold, variants, skip_low,
                                   skip_outliers, min_weight, save_dataframe,
                                   rlibpath, rscript_path)
                                  for _, ca in cnarr.by_arm())))
        if save_dataframe:
            # rets is a list of (CNA, R dataframe string) -- unpack
            rets, r_dframe_strings = zip(*rets)
            # Strip the header line from all but the first dataframe,
            # then combine into a single dataframe string
            r_dframe_strings = map(_to_str, r_dframe_strings)
            rstr = [next(r_dframe_strings)]
            rstr.extend(r[r.index('\n') + 1:] for r in r_dframe_strings)
            rstr = "".join(rstr)
        cna = cnarr.concat(rets)
    cna.sort_columns()
    if save_dataframe:
        return cna, rstr
    return cna
def _to_str(s, enc=locale.getpreferredencoding()):
    """Decode `s` with the locale's preferred encoding if it is bytes."""
    if isinstance(s, bytes):
        return s.decode(enc)
    return s


def _ds(args):
    """Wrapper for parallel map"""
    return _do_segmentation(*args)


def _do_segmentation(cnarr, method, threshold, variants=None,
                     skip_low=False, skip_outliers=10, min_weight=0,
                     save_dataframe=False, rlibpath=None,
                     rscript_path="Rscript"):
    """Infer copy number segments from the given coverage table."""
    if not len(cnarr):
        return cnarr

    filtered_cn = cnarr.copy()
    # Filter out bins with no or near-zero sequencing coverage
    if skip_low:
        filtered_cn = filtered_cn.drop_low_coverage(verbose=False)
    # Filter by distance from rolling quantiles
    if skip_outliers:
        filtered_cn = drop_outliers(filtered_cn, 50, skip_outliers)
    # Filter by bin weights
    if min_weight:
        weight_too_low = (filtered_cn["weight"] < min_weight).fillna(True)
    else:
        weight_too_low = (filtered_cn["weight"] == 0).fillna(True)
    n_weight_too_low = weight_too_low.sum() if len(weight_too_low) else 0
    if n_weight_too_low:
        filtered_cn = filtered_cn[~weight_too_low]
        if min_weight:
            logging.debug("Dropped %d bins with weight below %s",
                          n_weight_too_low, min_weight)
        else:
            logging.debug("Dropped %d bins with zero weight",
                          n_weight_too_low)

    if len(filtered_cn) != len(cnarr):
        msg = ("Dropped %d / %d bins"
               % (len(cnarr) - len(filtered_cn), len(cnarr)))
        if cnarr["chromosome"].iat[0] == cnarr["chromosome"].iat[-1]:
            msg += " on chromosome " + str(cnarr["chromosome"].iat[0])
        logging.info(msg)
    if not len(filtered_cn):
        return filtered_cn

    seg_out = ""
    if method == 'haar':
        segarr = haar.segment_haar(filtered_cn, threshold)
    elif method == 'none':
        segarr = none.segment_none(filtered_cn)
    elif method.startswith('hmm'):
        segarr = hmm.segment_hmm(filtered_cn, method, threshold)
    elif method in ('cbs', 'flasso'):
        # Run R scripts to calculate copy number segments
        rscript = {'cbs': cbs.CBS_RSCRIPT,
                   'flasso': flasso.FLASSO_RSCRIPT,
                  }[method]
        filtered_cn['start'] += 1  # Convert to 1-indexed coordinates for R
        with tempfile.NamedTemporaryFile(suffix='.cnr', mode="w+t") as tmp:
            # TODO tabio.write(filtered_cn, tmp, 'seg')
            filtered_cn.data.to_csv(tmp, index=False, sep='\t',
                                    float_format='%.6g', mode="w+t")
            tmp.flush()
            script_strings = {
                'probes_fname': tmp.name,
                'sample_id': cnarr.sample_id,
                'threshold': threshold,
                'rlibpath': ('.libPaths(c("%s"))' % rlibpath
                             if rlibpath else ''),
            }
            with core.temp_write_text(rscript % script_strings,
                                      mode='w+t') as script_fname:
                seg_out = core.call_quiet(rscript_path, '--vanilla',
                                          script_fname)
        # Convert R dataframe contents (SEG) to a proper CopyNumArray
        # NB: Automatically shifts 'start' back from 1- to 0-indexed
        segarr = tabio.read(StringIO(seg_out.decode()), "seg", into=CNA)
        if method == 'flasso':
            # Merge adjacent bins with same log2 value into segments
            if 'weight' in filtered_cn:
                segarr['weight'] = filtered_cn['weight']
            else:
                segarr['weight'] = 1.0
            segarr = squash_by_groups(segarr, segarr['log2'], by_arm=True)
    else:
        raise ValueError("Unknown method %r" % method)

    segarr.meta = cnarr.meta.copy()
    if variants and not method.startswith('hmm'):
        # Re-segment the variant allele freqs within each segment
        newsegs = [haar.variants_in_segment(subvarr, segment,
                                            0.01 * threshold)
                   for segment, subvarr in variants.by_ranges(segarr)]
        segarr = segarr.as_dataframe(pd.concat(newsegs))
        segarr['baf'] = variants.baf_by_ranges(segarr)

    segarr = transfer_fields(segarr, cnarr)
    if save_dataframe:
        return segarr, seg_out
    else:
        return segarr
def drop_outliers(cnarr, width, factor):
    """Drop outlier bins with log2 ratios too far from the trend line.

    Outliers are the log2 values `factor` times the 90th quantile of absolute
    deviations from the rolling average, within a window of given `width`. The
    90th quantile is about 1.97 standard deviations if the log2 values are
    Gaussian, so this is similar to calling outliers `factor` * 1.97 standard
    deviations from the rolling mean. For a window size of 50, the breakdown
    point is 2.5 outliers within a window, which is plenty robust for our
    needs.
    """
    if not len(cnarr):
        return cnarr
    # Flag outliers per-chromosome so rolling windows don't span chromosomes
    outlier_mask = np.concatenate([
        smoothing.rolling_outlier_quantile(subarr['log2'], width, .95, factor)
        for _chrom, subarr in cnarr.by_chromosome()])
    n_outliers = outlier_mask.sum()
    if n_outliers:
        logging.info("Dropped %d outlier bins:\n%s%s", n_outliers,
                     cnarr[outlier_mask].data.head(20),
                     "\n..." if n_outliers > 20 else "")
    return cnarr[~outlier_mask]
def transfer_fields(segments, cnarr, ignore=params.IGNORE_GENE_NAMES):
    """Map gene names, weights, depths from `cnarr` bins to `segarr` segments.

    Segment gene name is the comma-separated list of bin gene names. Segment
    weight is the sum of bin weights, and depth is the (weighted) mean of bin
    depths.

    Also: Post-process segmentation output.

    1. Ensure every chromosome has at least one segment.
    2. Ensure first and last segment ends match 1st/last bin ends
       (but keep log2 as-is).
    """
    def make_null_segment(chrom, orig_start, orig_end):
        """Closes over 'segments'."""
        # NOTE(review): returns a bare tuple of row values in the segments'
        # column order, not a CopyNumArray -- confirm callers handle this
        vals = {'chromosome': chrom,
                'start': orig_start,
                'end': orig_end,
                'gene': '-',
                'depth': 0.0,
                'log2': 0.0,
                'probes': 0.0,
                'weight': 0.0,
               }
        row_vals = tuple(vals[c] for c in segments.data.columns)
        return row_vals

    if not len(cnarr):
        # This Should Never Happen (TM)
        # raise RuntimeError("No bins for:\n" + str(segments.data))
        logging.warn("No bins for:\n%s", segments.data)
        return segments

    # Adjust segment endpoints to cover the chromosome arm's original bins
    # (Stretch first and last segment endpoints to match first/last bins)
    bins_chrom = cnarr.chromosome.iat[0]
    bins_start = cnarr.start.iat[0]
    bins_end = cnarr.end.iat[-1]
    if not len(segments):
        # All bins in this chromosome arm were dropped: make a dummy segment
        return make_null_segment(bins_chrom, bins_start, bins_end)
    segments.start.iat[0] = bins_start
    segments.end.iat[-1] = bins_end

    # Aggregate segment depths, weights, gene names
    # ENH refactor so that np/pd access is encapsulated in skgenome
    # NB: `+=` on the (presumably tuple) default builds a new object, so the
    # default argument itself is not mutated across calls
    ignore += params.ANTITARGET_ALIASES
    assert bins_chrom == segments.chromosome.iat[0]
    cdata = cnarr.data
    if 'depth' not in cdata.columns:
        # Derive absolute depth from the log2 ratio when not provided
        cdata['depth'] = np.exp2(cnarr['log2'].values)
    bin_genes = cdata['gene'].values
    bin_weights = cdata['weight'].values if 'weight' in cdata.columns else None
    bin_depths = cdata['depth'].values
    seg_genes = ['-'] * len(segments)
    seg_weights = np.zeros(len(segments))
    seg_depths = np.zeros(len(segments))
    for i, bin_idx in enumerate(iter_slices(cdata, segments.data,
                                            'outer', False)):
        if bin_weights is not None:
            # Weight = sum of bin weights; depth = weighted mean of bin depths
            seg_wt = bin_weights[bin_idx].sum()
            if seg_wt > 0:
                seg_dp = np.average(bin_depths[bin_idx],
                                    weights=bin_weights[bin_idx])
            else:
                seg_dp = 0.0
        else:
            # No weights available: weight = bin count, depth = plain mean
            bin_count = len(cdata.iloc[bin_idx])
            seg_wt = float(bin_count)
            seg_dp = bin_depths[bin_idx].mean()
        subgenes = [g for g in pd.unique(bin_genes[bin_idx])
                    if g not in ignore]
        seg_genes[i] = ",".join(subgenes) if subgenes else '-'
        seg_weights[i] = seg_wt
        seg_depths[i] = seg_dp
    segments.data = segments.data.assign(gene=seg_genes,
                                         weight=seg_weights,
                                         depth=seg_depths)
    return segments