Source code for cnvlib.coverage

"""Supporting functions for the 'antitarget' command."""
from __future__ import absolute_import, division
import math
import os.path
import time
from itertools import groupby

from Bio._py3k import map, zip
import pysam

from . import ngfrills
from .ngfrills import echo

from .params import NULL_LOG2_COVERAGE, READ_LEN


[docs]def interval_coverages(bed_fname, bam_fname, by_count): """Calculate log2 coverages in the BAM file at each interval.""" start_time = time.time() # Skip processing if the BED file is empty with open(bed_fname) as bed_handle: for line in bed_handle: if line.strip(): break else: echo("Skip processing", os.path.basename(bam_fname), "with empty regions file", bed_fname) return [] # Calculate average read depth in each bin ic_func = (interval_coverages_count if by_count else interval_coverages_pileup) results = ic_func(bed_fname, bam_fname) read_counts, cna_rows = zip(*list(results)) # Log some stats tot_time = time.time() - start_time tot_reads = sum(read_counts) echo("Time: %.3f seconds (%d reads/sec, %s bins/sec)" % (tot_time, int(round(tot_reads / tot_time, 0)), int(round(len(read_counts) / tot_time, 0)))) echo("Summary:", "#bins=%d," % len(read_counts), "#reads=%d," % tot_reads, "mean=%.4f," % (tot_reads / len(read_counts)), "min=%s," % min(read_counts), "max=%s" % max(read_counts)) tot_mapped_reads = bam_total_reads(bam_fname) if tot_mapped_reads: echo("On-target percentage: %.3f (of %d mapped)" % (100. * tot_reads / tot_mapped_reads, tot_mapped_reads)) else: echo("(Couldn't calculate total number of mapped reads)") return list(cna_rows)
[docs]def interval_coverages_count(bed_fname, bam_fname): """Calculate log2 coverages in the BAM file at each interval.""" bamfile = pysam.Samfile(bam_fname, 'rb') # Parse the BED lines and group them by chromosome # (efficient if records are already sorted by chromosome) for chrom, rows_iter in groupby(ngfrills.parse_regions(bed_fname), key=lambda r: r[0]): # Thunk and reshape this chromosome's intervals echo("Processing chromosome", chrom, "of", os.path.basename(bam_fname)) _chroms, starts, ends, names = zip(*rows_iter) counts_depths = [region_depth_count(bamfile, chrom, s, e) for s, e in zip(starts, ends)] for start, end, name, (count, depth) in zip(starts, ends, names, counts_depths): yield [count, (chrom, start, end, name, math.log(depth, 2) if depth else NULL_LOG2_COVERAGE)]
[docs]def region_depth_count(bamfile, chrom, start, end): """Calculate depth of a region via pysam count. i.e. counting the number of read starts in a region, then scaling for read length and region width to estimate depth. Coordinates are 0-based, per pysam. """ # Count the number of read midpoints in the interval count = sum((start <= (read.pos + .5*read.rlen) <= end and filter_read(read)) for read in bamfile.fetch(reference=chrom, start=start, end=end)) # Scale read counts to region length # Depth := #Bases / Span depth = (READ_LEN * count / (end - start) if end > start else 0) return count, depth
[docs]def interval_coverages_pileup(bed_fname, bam_fname): """Calculate log2 coverages in the BAM file at each interval.""" echo("Processing reads in", os.path.basename(bam_fname)) for chrom, start, end, name, count, depth in bedcov(bed_fname, bam_fname): yield [count, (chrom, start, end, name, math.log(depth, 2) if depth else NULL_LOG2_COVERAGE)]
[docs]def bedcov(bed_fname, bam_fname): """Calculate depth of all regions in a BED file via samtools (pysam) bedcov. i.e. mean pileup depth across each region. """ # Count bases in each region; exclude 0-MAPQ reads try: lines = pysam.bedcov(bed_fname, bam_fname, '-Q', '1') except pysam.SamtoolsError, exc: raise ValueError("Failed processing %r coverages in %r regions. PySAM error: %s" % (bam_fname, bed_fname, exc)) if not lines: raise ValueError("BED file %r sequence IDs don't match any in BAM file %r" % (bed_fname, bam_fname)) # Return an iterable... for line in lines: try: chrom, start_s, end_s, name, basecount_s = line.split('\t') except: raise RuntimeError("Bad line from bedcov:\n" + line) start, end, basecount = map(int, (start_s, end_s, basecount_s.strip())) span = end - start if span > 0: # Algebra from above count = basecount / READ_LEN mean_depth = basecount / span else: # User-supplied bins might be oddly constructed count = mean_depth = 0 yield chrom, start, end, name, count, mean_depth
[docs]def filter_column(col): """Count the number of filtered reads in a pileup column.""" return sum(filter_read(read.alignment) for read in col.pileups)
[docs]def filter_read(read): """True if the given read should be counted towards coverage.""" return not (read.is_duplicate or read.is_secondary or read.is_unmapped or read.is_qcfail or read.mapq == 0 )
[docs]def bam_total_reads(bam_fname): """Count the total number of mapped reads in a BAM file. Uses the BAM index to do this quickly. """ lines = pysam.idxstats(bam_fname) tot_mapped_reads = 0 for line in lines: _seqname, _seqlen, nmapped, _nunmapped = line.split() tot_mapped_reads += int(nmapped) return tot_mapped_reads