"""Supporting functions for the 'antitarget' command."""
from __future__ import absolute_import, division
import logging
import math
import os.path
import time
from Bio._py3k import map, zip
import pysam
from .cnary import CopyNumArray as CNA
from .rary import RegionArray as RA
from .core import fbase
from .params import NULL_LOG2_COVERAGE, READ_LEN
[docs]def interval_coverages(bed_fname, bam_fname, by_count, min_mapq):
"""Calculate log2 coverages in the BAM file at each interval."""
start_time = time.time()
# Skip processing if the BED file is empty
with open(bed_fname) as bed_handle:
for line in bed_handle:
if line.strip():
break
else:
logging.info("Skip processing %s with empty regions file %s",
os.path.basename(bam_fname), bed_fname)
return CNA.from_rows([], meta_dict={'sample_id': fbase(bam_fname)})
# Calculate average read depth in each bin
ic_func = (interval_coverages_count if by_count
else interval_coverages_pileup)
results = ic_func(bed_fname, bam_fname, min_mapq)
read_counts, cna_rows = zip(*list(results))
# Log some stats
tot_time = time.time() - start_time
tot_reads = sum(read_counts)
logging.info("Time: %.3f seconds (%d reads/sec, %s bins/sec)",
tot_time,
int(round(tot_reads / tot_time, 0)),
int(round(len(read_counts) / tot_time, 0)))
logging.info("Summary: #bins=%d, #reads=%d, "
"mean=%.4f, min=%s, max=%s ",
len(read_counts),
tot_reads,
(tot_reads / len(read_counts)),
min(read_counts),
max(read_counts))
tot_mapped_reads = bam_total_reads(bam_fname)
if tot_mapped_reads:
logging.info("Percent reads in regions: %.3f (of %d mapped)",
100. * tot_reads / tot_mapped_reads,
tot_mapped_reads)
else:
logging.info("(Couldn't calculate total number of mapped reads)")
return CNA.from_rows(list(cna_rows),
meta_dict={'sample_id': fbase(bam_fname)})
[docs]def interval_coverages_count(bed_fname, bam_fname, min_mapq):
"""Calculate log2 coverages in the BAM file at each interval."""
bamfile = pysam.Samfile(bam_fname, 'rb')
for chrom, subregions in RA.read(bed_fname).by_chromosome():
logging.info("Processing chromosome %s of %s",
chrom, os.path.basename(bam_fname))
for _chrom, start, end, name, in subregions.coords(["name"]):
count, depth = region_depth_count(bamfile, chrom, start, end,
min_mapq)
yield [count,
(chrom, start, end, name,
math.log(depth, 2) if depth else NULL_LOG2_COVERAGE)]
[docs]def region_depth_count(bamfile, chrom, start, end, min_mapq):
"""Calculate depth of a region via pysam count.
i.e. counting the number of read starts in a region, then scaling for read
length and region width to estimate depth.
Coordinates are 0-based, per pysam.
"""
def filter_read(read):
"""True if the given read should be counted towards coverage."""
return not (read.is_duplicate
or read.is_secondary
or read.is_unmapped
or read.is_qcfail
or read.mapq < min_mapq)
# Count the number of read midpoints in the interval
count = sum((start <= (read.pos + .5*read.rlen) <= end and filter_read(read))
for read in bamfile.fetch(reference=chrom,
start=start, end=end))
# Scale read counts to region length
# Depth := #Bases / Span
depth = (READ_LEN * count / (end - start)
if end > start else 0)
return count, depth
[docs]def interval_coverages_pileup(bed_fname, bam_fname, min_mapq):
"""Calculate log2 coverages in the BAM file at each interval."""
logging.info("Processing reads in %s", os.path.basename(bam_fname))
for chrom, start, end, name, count, depth in bedcov(bed_fname, bam_fname,
min_mapq):
yield [count,
(chrom, start, end, name,
math.log(depth, 2) if depth else NULL_LOG2_COVERAGE)]
[docs]def bedcov(bed_fname, bam_fname, min_mapq):
"""Calculate depth of all regions in a BED file via samtools (pysam) bedcov.
i.e. mean pileup depth across each region.
"""
# Count bases in each region; exclude low-MAPQ reads
if min_mapq > 0:
bedcov_args = ['-Q', str(min_mapq)]
else:
bedcov_args = []
try:
lines = pysam.bedcov(bed_fname, bam_fname, *bedcov_args)
except pysam.SamtoolsError as exc:
raise ValueError("Failed processing %r coverages in %r regions. PySAM error: %s"
% (bam_fname, bed_fname, exc))
if not lines:
raise ValueError("BED file %r sequence IDs don't match any in BAM file %r"
% (bed_fname, bam_fname))
# Return an iterable...
if isinstance(lines, basestring):
lines = lines.splitlines()
for line in lines:
fields = line.split('\t')
if len(fields) == 5:
chrom, start_s, end_s, name, basecount_s = fields
elif len(fields) == 4:
chrom, start_s, end_s, basecount_s = fields
name = "-"
else:
raise RuntimeError("Bad line from bedcov:\n" + line)
start, end, basecount = map(int, (start_s, end_s, basecount_s.strip()))
span = end - start
if span > 0:
# Algebra from above
count = basecount / READ_LEN
mean_depth = basecount / span
else:
# User-supplied bins might be oddly constructed
count = mean_depth = 0
yield chrom, start, end, name, count, mean_depth
[docs]def bam_total_reads(bam_fname):
"""Count the total number of mapped reads in a BAM file.
Uses the BAM index to do this quickly.
"""
lines = pysam.idxstats(bam_fname)
if isinstance(lines, basestring):
lines = lines.splitlines()
tot_mapped_reads = 0
for line in lines:
_seqname, _seqlen, nmapped, _nunmapped = line.split()
tot_mapped_reads += int(nmapped)
return tot_mapped_reads