Source code for cnvlib.batch

"""The 'batch' command."""
import logging
import os

from matplotlib import pyplot
from skgenome import tabio, GenomicArray as GA

from . import (access, antitarget, autobin, bintest, call, core, coverage,
               diagram, fix, parallel, reference, scatter, segmentation,
               segmetrics, target)
from .cmdutil import read_cna


[docs]def batch_make_reference(normal_bams, target_bed, antitarget_bed, male_reference, fasta, annotate, short_names, target_avg_size, access_bed, antitarget_avg_size, antitarget_min_size, output_reference, output_dir, processes, by_count, method, do_cluster): """Build the CN reference from normal samples, targets and antitargets.""" if method in ("wgs", "amplicon"): if antitarget_bed: raise ValueError("%r protocol: antitargets should not be " "given/specified." % method) if access_bed and target_bed and access_bed != target_bed: raise ValueError("%r protocol: targets and access should not be " "different." % method) bait_arr = None if method == "wgs": if not annotate: # TODO check if target_bed has gene names logging.warning("WGS protocol: recommend '--annotate' option " "(e.g. refFlat.txt) to help locate genes " "in output files.") access_arr = None if not target_bed: # TODO - drop weird contigs before writing, see antitargets.py if access_bed: target_bed = access_bed elif fasta: # Run 'access' on the fly access_arr = access.do_access(fasta) # Take filename base from FASTA, lacking any other clue target_bed = os.path.splitext(os.path.basename(fasta) )[0] + ".bed" tabio.write(access_arr, target_bed, "bed3") else: raise ValueError("WGS protocol: need to provide --targets, " "--access, or --fasta options.") # Tweak default parameters if not target_avg_size: if normal_bams: # Calculate bin size from .bai & access if fasta and not access_arr: # Calculate wgs depth from all # sequencing-accessible area (it doesn't take that long # compared to WGS coverage); user-provided access might be # something else that excludes a significant number of # mapped reads. access_arr = access.do_access(fasta) if access_arr: autobin_args = ['wgs', None, access_arr] else: # Don't assume the given targets/access covers the whole # genome; use autobin sampling to estimate bin size, as we # do for amplicon bait_arr = tabio.read_auto(target_bed) autobin_args = ['amplicon', bait_arr] # Choose median-size normal bam or tumor bam bam_fname = autobin.midsize_file(normal_bams) (wgs_depth, target_avg_size), _ = autobin.do_autobin( bam_fname, *autobin_args, bp_per_bin=50000., fasta=fasta) logging.info("WGS average depth %.2f --> using bin size %d", wgs_depth, target_avg_size) else: # This bin size is OK down to 10x target_avg_size = 5000 # To make temporary filenames for processed targets or antitargets tgt_name_base, _tgt_ext = os.path.splitext(os.path.basename(target_bed)) if output_dir: tgt_name_base = os.path.join(output_dir, tgt_name_base) # Pre-process baits/targets new_target_fname = tgt_name_base + '.target.bed' if bait_arr is None: bait_arr = tabio.read_auto(target_bed) target_arr = target.do_target(bait_arr, annotate, short_names, True, **({'avg_size': target_avg_size} if target_avg_size else {})) tabio.write(target_arr, new_target_fname, 'bed4') target_bed = new_target_fname if not antitarget_bed: # Devise a temporary antitarget filename antitarget_bed = tgt_name_base + '.antitarget.bed' if method == "hybrid": # Build antitarget BED from the given targets anti_kwargs = {} if access_bed: anti_kwargs['access'] = tabio.read_auto(access_bed) if antitarget_avg_size: anti_kwargs['avg_bin_size'] = antitarget_avg_size if antitarget_min_size: anti_kwargs['min_bin_size'] = antitarget_min_size anti_arr = antitarget.do_antitarget(target_arr, **anti_kwargs) else: # No antitargets for wgs, amplicon anti_arr = GA([]) tabio.write(anti_arr, antitarget_bed, "bed4") if len(normal_bams) == 0: logging.info("Building a flat reference...") ref_arr = reference.do_reference_flat(target_bed, antitarget_bed, fasta, male_reference) else: logging.info("Building a copy number reference from normal samples...") # Run coverage on all normals with parallel.pick_pool(processes) as pool: tgt_futures = [] anti_futures = [] procs_per_cnn = max(1, processes // (2 * len(normal_bams))) for nbam in normal_bams: sample_id = core.fbase(nbam) sample_pfx = os.path.join(output_dir, sample_id) tgt_futures.append( pool.submit(batch_write_coverage, target_bed, nbam, sample_pfx + '.targetcoverage.cnn', by_count, procs_per_cnn, fasta)) anti_futures.append( pool.submit(batch_write_coverage, antitarget_bed, nbam, sample_pfx + '.antitargetcoverage.cnn', by_count, procs_per_cnn, fasta)) target_fnames = [tf.result() for tf in tgt_futures] antitarget_fnames = [af.result() for af in anti_futures] # Build reference from *.cnn ref_arr = reference.do_reference(target_fnames, antitarget_fnames, fasta, male_reference, None, do_gc=True, do_edge=(method == "hybrid"), do_rmask=True, do_cluster=do_cluster) if not output_reference: output_reference = os.path.join(output_dir, "reference.cnn") core.ensure_path(output_reference) tabio.write(ref_arr, output_reference) return output_reference, target_bed, antitarget_bed
[docs]def batch_write_coverage(bed_fname, bam_fname, out_fname, by_count, processes, fasta): """Run coverage on one sample, write to file.""" cnarr = coverage.do_coverage(bed_fname, bam_fname, by_count, 0, processes, fasta) tabio.write(cnarr, out_fname) return out_fname
[docs]def batch_run_sample(bam_fname, target_bed, antitarget_bed, ref_fname, output_dir, male_reference, plot_scatter, plot_diagram, rscript_path, by_count, skip_low, seq_method, segment_method, processes, do_cluster, fasta=None): """Run the pipeline on one BAM file.""" # ENH - return probes, segments (cnarr, segarr) logging.info("Running the CNVkit pipeline on %s ...", bam_fname) sample_id = core.fbase(bam_fname) sample_pfx = os.path.join(output_dir, sample_id) raw_tgt = coverage.do_coverage(target_bed, bam_fname, by_count, 0, processes, fasta) tabio.write(raw_tgt, sample_pfx + '.targetcoverage.cnn') raw_anti = coverage.do_coverage(antitarget_bed, bam_fname, by_count, 0, processes, fasta) tabio.write(raw_anti, sample_pfx + '.antitargetcoverage.cnn') cnarr = fix.do_fix(raw_tgt, raw_anti, read_cna(ref_fname), do_gc=True, do_edge=(seq_method == "hybrid"), do_rmask=True, do_cluster=do_cluster) tabio.write(cnarr, sample_pfx + '.cnr') logging.info("Segmenting %s.cnr ...", sample_pfx) segments = segmentation.do_segmentation(cnarr, segment_method, rscript_path=rscript_path, skip_low=skip_low, processes=processes, **({'threshold': 1e-6} if seq_method == 'wgs' else {})) logging.info("Post-processing %s.cns ...", sample_pfx) # TODO/ENH take centering shift & apply to .cnr for use in segmetrics seg_metrics = segmetrics.do_segmetrics(cnarr, segments, interval_stats=['ci'], alpha=0.5, smoothed=True) tabio.write(seg_metrics, sample_pfx + '.cns') # Remove likely false-positive breakpoints seg_call = call.do_call(seg_metrics, method="none", filters=['ci']) # Calculate another segment-level test p-value seg_alltest = segmetrics.do_segmetrics(cnarr, seg_call, location_stats=['p_ttest']) # Finally, assign absolute copy number values to each segment seg_alltest.center_all("median") seg_final = call.do_call(seg_alltest, method="threshold") tabio.write(seg_final, sample_pfx + '.call.cns') # Test for single-bin CNVs separately seg_bintest = bintest.do_bintest(cnarr, seg_call, target_only=True) tabio.write(seg_bintest, sample_pfx + '.bintest.cns') if plot_scatter: scatter.do_scatter(cnarr, seg_final) pyplot.savefig(sample_pfx + '-scatter.png', format='png', bbox_inches="tight") logging.info("Wrote %s-scatter.png", sample_pfx) if plot_diagram: is_xx = cnarr.guess_xx(male_reference) outfname = sample_pfx + '-diagram.pdf' diagram.create_diagram(cnarr.shift_xx(male_reference, is_xx), seg_final.shift_xx(male_reference, is_xx), 0.5, 3, outfname) logging.info("Wrote %s", outfname)