"""The 'batch' command."""
import logging
import os
from matplotlib import pyplot
from skgenome import tabio, GenomicArray as GA
from . import (access, antitarget, autobin, bintest, call, core, coverage,
diagram, fix, parallel, reference, scatter, segmentation,
segmetrics, target)
from .cmdutil import read_cna
def batch_make_reference(normal_bams, target_bed, antitarget_bed,
                         male_reference, fasta, annotate, short_names,
                         target_avg_size, access_bed, antitarget_avg_size,
                         antitarget_min_size, output_reference, output_dir,
                         processes, by_count, method, do_cluster):
    """Build the CN reference from normal samples, targets and antitargets.

    Parameters
    ----------
    normal_bams : list
        Paths to normal-sample BAM files; if empty (or None), a "flat"
        reference with neutral expected coverage is built instead.
    target_bed, antitarget_bed : str or None
        Paths to the target/antitarget BED files. Either may be None, in
        which case it is derived here (targets from `access_bed`/`fasta`
        for WGS; antitargets computed from the targets for hybrid capture).
    male_reference : bool
        Shift chrX log2 values assuming a male reference.
    fasta : str or None
        Reference genome FASTA, used for GC/RepeatMasker correction and,
        for WGS, to compute sequencing-accessible regions on the fly.
    annotate : str or None
        Gene annotation file (e.g. refFlat.txt) for labeling target bins.
    short_names : bool
        Shorten/clean up target interval labels.
    target_avg_size : int or None
        Average target bin size; if None it is estimated via `autobin`
        (WGS/amplicon) or left to `target.do_target` defaults.
    access_bed : str or None
        Sequencing-accessible regions BED.
    antitarget_avg_size, antitarget_min_size : int or None
        Antitarget bin sizing knobs, forwarded to `antitarget.do_antitarget`.
    output_reference : str or None
        Output filename for the reference .cnn; defaults to
        "<output_dir>/reference.cnn".
    output_dir : str
        Directory for intermediate and output files.
    processes : int
        Number of parallel workers for per-sample coverage calculation.
    by_count : bool
        Count read midpoints instead of computing read depths.
    method : str
        Sequencing protocol: "hybrid", "wgs", or "amplicon".
    do_cluster : bool
        Cluster the normal samples when building the reference.

    Returns
    -------
    tuple of (str, str, str)
        Paths to the written reference .cnn, the processed target BED,
        and the (possibly empty) antitarget BED.

    Raises
    ------
    ValueError
        If options inconsistent with the chosen `method` are given, or if
        the WGS protocol has no way to determine target regions.
    """
    if method in ("wgs", "amplicon"):
        # These protocols tile targets over the whole accessible genome /
        # amplicons; separate antitargets make no sense here.
        if antitarget_bed:
            raise ValueError("%r protocol: antitargets should not be "
                             "given/specified." % method)
        if access_bed and target_bed and access_bed != target_bed:
            raise ValueError("%r protocol: targets and access should not be "
                             "different." % method)

    bait_arr = None
    if method == "wgs":
        if not annotate:
            # TODO check if target_bed has gene names
            logging.warning("WGS protocol: recommend '--annotate' option "
                            "(e.g. refFlat.txt) to help locate genes "
                            "in output files.")
        access_arr = None
        if not target_bed:
            # TODO - drop weird contigs before writing, see antitargets.py
            if access_bed:
                target_bed = access_bed
            elif fasta:
                # Run 'access' on the fly
                access_arr = access.do_access(fasta)
                # Take filename base from FASTA, lacking any other clue
                target_bed = os.path.splitext(os.path.basename(fasta)
                                             )[0] + ".bed"
                tabio.write(access_arr, target_bed, "bed3")
            else:
                raise ValueError("WGS protocol: need to provide --targets, "
                                 "--access, or --fasta options.")

        # Tweak default parameters
        if not target_avg_size:
            if normal_bams:
                # Calculate bin size from .bai & access
                if fasta and not access_arr:
                    # Calculate wgs depth from all
                    # sequencing-accessible area (it doesn't take that long
                    # compared to WGS coverage); user-provided access might be
                    # something else that excludes a significant number of
                    # mapped reads.
                    access_arr = access.do_access(fasta)
                if access_arr:
                    autobin_args = ['wgs', None, access_arr]
                else:
                    # Don't assume the given targets/access covers the whole
                    # genome; use autobin sampling to estimate bin size, as we
                    # do for amplicon
                    bait_arr = tabio.read_auto(target_bed)
                    autobin_args = ['amplicon', bait_arr]
                # Choose median-size normal bam or tumor bam
                bam_fname = autobin.midsize_file(normal_bams)
                (wgs_depth, target_avg_size), _ = autobin.do_autobin(
                    bam_fname, *autobin_args, bp_per_bin=50000., fasta=fasta)
                logging.info("WGS average depth %.2f --> using bin size %d",
                             wgs_depth, target_avg_size)
            else:
                # This bin size is OK down to 10x
                target_avg_size = 5000

    # To make temporary filenames for processed targets or antitargets
    tgt_name_base, _tgt_ext = os.path.splitext(os.path.basename(target_bed))
    if output_dir:
        tgt_name_base = os.path.join(output_dir, tgt_name_base)

    # Pre-process baits/targets
    new_target_fname = tgt_name_base + '.target.bed'
    if bait_arr is None:
        bait_arr = tabio.read_auto(target_bed)
    target_arr = target.do_target(bait_arr, annotate, short_names, True,
                                  **({'avg_size': target_avg_size}
                                     if target_avg_size
                                     else {}))
    tabio.write(target_arr, new_target_fname, 'bed4')
    target_bed = new_target_fname

    if not antitarget_bed:
        # Devise a temporary antitarget filename
        antitarget_bed = tgt_name_base + '.antitarget.bed'
        if method == "hybrid":
            # Build antitarget BED from the given targets
            anti_kwargs = {}
            if access_bed:
                anti_kwargs['access'] = tabio.read_auto(access_bed)
            if antitarget_avg_size:
                anti_kwargs['avg_bin_size'] = antitarget_avg_size
            if antitarget_min_size:
                anti_kwargs['min_bin_size'] = antitarget_min_size
            anti_arr = antitarget.do_antitarget(target_arr, **anti_kwargs)
        else:
            # No antitargets for wgs, amplicon
            anti_arr = GA([])
        tabio.write(anti_arr, antitarget_bed, "bed4")

    if not normal_bams:
        logging.info("Building a flat reference...")
        ref_arr = reference.do_reference_flat(target_bed, antitarget_bed, fasta,
                                              male_reference)
    else:
        logging.info("Building a copy number reference from normal samples...")
        # Run coverage on all normals
        with parallel.pick_pool(processes) as pool:
            tgt_futures = []
            anti_futures = []
            # Split workers between the two coverage jobs per sample
            procs_per_cnn = max(1, processes // (2 * len(normal_bams)))
            for nbam in normal_bams:
                sample_id = core.fbase(nbam)
                sample_pfx = os.path.join(output_dir, sample_id)
                tgt_futures.append(
                    pool.submit(batch_write_coverage,
                                target_bed, nbam,
                                sample_pfx + '.targetcoverage.cnn',
                                by_count, procs_per_cnn, fasta))
                anti_futures.append(
                    pool.submit(batch_write_coverage,
                                antitarget_bed, nbam,
                                sample_pfx + '.antitargetcoverage.cnn',
                                by_count, procs_per_cnn, fasta))

        target_fnames = [tf.result() for tf in tgt_futures]
        antitarget_fnames = [af.result() for af in anti_futures]
        # Build reference from *.cnn
        ref_arr = reference.do_reference(target_fnames, antitarget_fnames,
                                         fasta, male_reference, None,
                                         do_gc=True,
                                         do_edge=(method == "hybrid"),
                                         do_rmask=True,
                                         do_cluster=do_cluster)
    if not output_reference:
        output_reference = os.path.join(output_dir, "reference.cnn")
    core.ensure_path(output_reference)
    tabio.write(ref_arr, output_reference)
    return output_reference, target_bed, antitarget_bed
def batch_write_coverage(bed_fname, bam_fname, out_fname, by_count, processes,
                         fasta):
    """Run coverage on one sample, write to file.

    Parameters
    ----------
    bed_fname : str
        Target or antitarget BED of regions to compute coverage over.
    bam_fname : str
        Aligned-reads BAM for one sample.
    out_fname : str
        Output .cnn filename.
    by_count : bool
        Count read midpoints instead of computing read depths.
    processes : int
        Number of subprocesses for the coverage calculation.
    fasta : str or None
        Reference genome FASTA.

    Returns
    -------
    str
        The output filename `out_fname`, for convenient use with futures.
    """
    cnarr = coverage.do_coverage(bed_fname, bam_fname, by_count, 0, processes,
                                 fasta)
    tabio.write(cnarr, out_fname)
    return out_fname
def batch_run_sample(bam_fname, target_bed, antitarget_bed, ref_fname,
                     output_dir, male_reference, plot_scatter, plot_diagram,
                     rscript_path, by_count, skip_low, seq_method,
                     segment_method, processes, do_cluster, fasta=None):
    """Run the pipeline on one BAM file.

    Executes coverage -> fix -> segment -> segmetrics -> call -> bintest on a
    single sample, writing .cnn/.cnr/.cns/.call.cns/.bintest.cns files (and
    optional scatter/diagram plots) under `output_dir`.

    Parameters
    ----------
    bam_fname : str
        The sample's aligned-reads BAM.
    target_bed, antitarget_bed : str
        Processed target and antitarget BEDs (as built by
        `batch_make_reference`).
    ref_fname : str
        Copy number reference .cnn filename.
    output_dir : str
        Directory for all per-sample output files.
    male_reference : bool
        Assume a male reference for chrX adjustment in plotting.
    plot_scatter, plot_diagram : bool
        Also emit -scatter.png / -diagram.pdf plots.
    rscript_path : str
        Path to the Rscript executable, for R-based segmentation methods.
    by_count : bool
        Count read midpoints instead of computing read depths.
    skip_low : bool
        Drop very-low-coverage bins before segmentation.
    seq_method : str
        Sequencing protocol: "hybrid", "wgs", or "amplicon".
    segment_method : str
        Segmentation algorithm name, forwarded to
        `segmentation.do_segmentation`.
    processes : int
        Number of subprocesses for coverage and segmentation.
    do_cluster : bool
        Use clustered reference columns in bias correction.
    fasta : str, optional
        Reference genome FASTA.
    """
    # ENH - return probes, segments (cnarr, segarr)
    logging.info("Running the CNVkit pipeline on %s ...", bam_fname)
    sample_id = core.fbase(bam_fname)
    sample_pfx = os.path.join(output_dir, sample_id)

    # Raw bin-level coverages for targets and antitargets
    raw_tgt = coverage.do_coverage(target_bed, bam_fname, by_count, 0,
                                   processes, fasta)
    tabio.write(raw_tgt, sample_pfx + '.targetcoverage.cnn')

    raw_anti = coverage.do_coverage(antitarget_bed, bam_fname, by_count, 0,
                                    processes, fasta)
    tabio.write(raw_anti, sample_pfx + '.antitargetcoverage.cnn')

    # Normalize to the reference, correcting systematic biases; edge
    # correction only makes sense for hybrid capture
    cnarr = fix.do_fix(raw_tgt, raw_anti, read_cna(ref_fname),
                       do_gc=True, do_edge=(seq_method == "hybrid"),
                       do_rmask=True, do_cluster=do_cluster)
    tabio.write(cnarr, sample_pfx + '.cnr')

    logging.info("Segmenting %s.cnr ...", sample_pfx)
    segments = segmentation.do_segmentation(cnarr, segment_method,
                                            rscript_path=rscript_path,
                                            skip_low=skip_low,
                                            processes=processes,
                                            # WGS: many bins, so use a much
                                            # stricter significance threshold
                                            **({'threshold': 1e-6}
                                               if seq_method == 'wgs'
                                               else {}))

    logging.info("Post-processing %s.cns ...", sample_pfx)
    # TODO/ENH take centering shift & apply to .cnr for use in segmetrics
    seg_metrics = segmetrics.do_segmetrics(cnarr, segments,
                                           interval_stats=['ci'], alpha=0.5,
                                           smoothed=True)
    tabio.write(seg_metrics, sample_pfx + '.cns')

    # Remove likely false-positive breakpoints
    seg_call = call.do_call(seg_metrics, method="none", filters=['ci'])
    # Calculate another segment-level test p-value
    seg_alltest = segmetrics.do_segmetrics(cnarr, seg_call,
                                           location_stats=['p_ttest'])
    # Finally, assign absolute copy number values to each segment
    seg_alltest.center_all("median")
    seg_final = call.do_call(seg_alltest, method="threshold")
    tabio.write(seg_final, sample_pfx + '.call.cns')

    # Test for single-bin CNVs separately
    seg_bintest = bintest.do_bintest(cnarr, seg_call, target_only=True)
    tabio.write(seg_bintest, sample_pfx + '.bintest.cns')

    if plot_scatter:
        scatter.do_scatter(cnarr, seg_final)
        pyplot.savefig(sample_pfx + '-scatter.png', format='png',
                       bbox_inches="tight")
        logging.info("Wrote %s-scatter.png", sample_pfx)

    if plot_diagram:
        is_xx = cnarr.guess_xx(male_reference)
        outfname = sample_pfx + '-diagram.pdf'
        diagram.create_diagram(cnarr.shift_xx(male_reference, is_xx),
                               seg_final.shift_xx(male_reference, is_xx),
                               0.5, 3, outfname)
        logging.info("Wrote %s", outfname)