Source code for cnvlib.autobin

"""Estimate reasonable bin sizes from BAM read counts or depths."""
from __future__ import absolute_import, division, print_function
import logging
import os
import tempfile

import numpy as np
import pandas as pd
from skgenome import tabio, GenomicArray as GA

from . import coverage, samutil
from .antitarget import compare_chrom_names
from .descriptives import weighted_median


def midsize_file(fnames):
    """Select the median-size file from several given filenames.

    If an even number of files is given, selects the file just below
    the median.
    """
    return sorted(fnames, key=lambda f: os.stat(f).st_size
                  )[len(fnames) // 2 - 1]

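# Illustration only, not part of cnvlib: a minimal check of the index
# arithmetic above, using a plain list of sizes in place of os.stat() results.
def _demo_midsize_index():
    sizes = [100, 200, 300, 400]   # four "files", already sorted by size
    # len // 2 - 1 picks the entry just below the median for an even count.
    assert sorted(sizes)[len(sizes) // 2 - 1] == 200
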
def do_autobin(bam_fname, method, targets=None, access=None,
               bp_per_bin=100000., target_min_size=20, target_max_size=50000,
               antitarget_min_size=500, antitarget_max_size=1000000):
    """Quickly calculate reasonable bin sizes from BAM read counts.

    Parameters
    ----------
    bam_fname : string
        BAM filename.
    method : string
        One of: 'wgs' (whole-genome sequencing), 'amplicon' (targeted
        amplicon capture), 'hybrid' (hybridization capture).
    targets : GenomicArray
        Targeted genomic regions (for 'hybrid' and 'amplicon').
    access : GenomicArray
        Sequencing-accessible regions of the reference genome (for 'hybrid'
        and 'wgs').
    bp_per_bin : int
        Desired number of sequencing read nucleotide bases mapped to each
        bin.

    Returns
    -------
    2-tuple of 2-tuples:
        ((target depth, target avg. bin size),
         (antitarget depth, antitarget avg. bin size))
    """
    if method in ('amplicon', 'hybrid'):
        if targets is None or not len(targets):
            raise ValueError("Target regions are required for method %r "
                             "but were not provided." % method)

    # Closes over bp_per_bin
    def depth2binsize(depth, min_size, max_size):
        if depth:
            bin_size = int(round(bp_per_bin / depth))
            if bin_size < min_size:
                logging.info("Limiting est. bin size %d to given min. %d",
                             bin_size, min_size)
                bin_size = min_size
            elif bin_size > max_size:
                logging.info("Limiting est. bin size %d to given max. %d",
                             bin_size, max_size)
                bin_size = max_size
            return bin_size

    samutil.ensure_bam_index(bam_fname)
    rc_table = samutil.idxstats(bam_fname, drop_unmapped=True)
    read_len = samutil.get_read_length(bam_fname)
    logging.info("Estimated read length %s", read_len)

    # Dispatch
    if method == 'amplicon':
        # From BAM index:
        #   rc_table = update_chrom_length(rc_table, targets)
        #   tgt_depth = average_depth(rc_table, read_len)
        # By sampling:
        tgt_depth = sample_region_cov(bam_fname, targets)
        anti_depth = None
    elif method == 'hybrid':
        tgt_depth, anti_depth = hybrid(rc_table, read_len, bam_fname, targets,
                                       access)
    elif method == 'wgs':
        if access is not None and len(access):
            rc_table = update_chrom_length(rc_table, access)
        tgt_depth = average_depth(rc_table, read_len)
        anti_depth = None

    # Clip bin sizes to specified ranges
    tgt_bin_size = depth2binsize(tgt_depth, target_min_size, target_max_size)
    anti_bin_size = depth2binsize(anti_depth, antitarget_min_size,
                                  antitarget_max_size)
    return ((tgt_depth, tgt_bin_size),
            (anti_depth, anti_bin_size))

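# Illustration only, not part of cnvlib: the core arithmetic of the
# depth2binsize closure above. At ~30x mean depth and the default 100,000
# read bases per bin, the estimated bin size is about 3,333 bp, then clipped
# to the given bounds (here the default target min/max sizes).
def _demo_depth2binsize():
    bp_per_bin, depth = 100000., 30.0
    bin_size = int(round(bp_per_bin / depth))    # -> 3333
    bin_size = min(max(bin_size, 20), 50000)     # clip to [20, 50000]
    assert bin_size == 3333
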
def hybrid(rc_table, read_len, bam_fname, targets, access=None):
    """Hybrid capture sequencing."""
    # Identify off-target regions
    if access is None:
        access = idxstats2ga(rc_table, bam_fname)
        # Verify BAM chromosome names match those in target BED
        compare_chrom_names(access, targets)
    antitargets = access.subtract(targets)
    # Only examine chromosomes present in all 2-3 input datasets
    rc_table, targets, antitargets = shared_chroms(rc_table, targets,
                                                   antitargets)
    # Deal with targets
    target_depth = sample_region_cov(bam_fname, targets)
    # Antitargets: subtract captured reads from total
    target_length = region_size_by_chrom(targets)['length']
    target_reads = (target_length * target_depth / read_len).values
    anti_table = update_chrom_length(rc_table, antitargets)
    anti_table = anti_table.assign(mapped=anti_table.mapped - target_reads)
    anti_depth = average_depth(anti_table, read_len)
    return target_depth, anti_depth

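# Illustration only, not part of cnvlib: toy numbers for the read-subtraction
# step above. Reads attributed to the baits (length * depth / read length)
# are removed from a chromosome's mapped total before estimating off-target
# depth over the remaining accessible footprint.
def _demo_offtarget_reads():
    read_len = 100
    target_length, target_depth = 2e5, 300.0   # 200 kbp of baits at 300x
    target_reads = target_length * target_depth / read_len   # 600,000 reads
    mapped_total = 1e6
    assert mapped_total - target_reads == 4e5  # reads left for antitargets
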
# ---

def average_depth(rc_table, read_length):
    """Estimate the average read depth across the genome.

    Returns
    -------
    float
        Median of the per-chromosome mean read depths, weighted by
        chromosome size.
    """
    mean_depths = read_length * rc_table.mapped / rc_table.length
    return weighted_median(mean_depths, rc_table.length)

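# Illustration only, not part of cnvlib: toy per-chromosome numbers for the
# estimate above. The lower weighted median is computed inline here as a
# sketch; descriptives.weighted_median may interpolate ties differently.
def _demo_average_depth():
    rc = pd.DataFrame({'length': [2e8, 1e8],      # chromosome sizes (bp)
                       'mapped': [6e6, 1e6]})     # mapped reads each
    mean_depths = 100 * rc.mapped / rc.length     # [3.0, 1.0] at 100 bp reads
    order = mean_depths.sort_values().index
    cum_weight = rc.length[order].cumsum()
    wmedian = mean_depths[order][cum_weight >= rc.length.sum() / 2].iloc[0]
    # The 200 Mbp chromosome holds two-thirds of the weight, so its depth wins.
    assert wmedian == 3.0
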
def idxstats2ga(table, bam_fname):
    return GA(table.assign(start=0, end=table.length)
              .loc[:, ('chromosome', 'start', 'end')],
              meta_dict={'filename': bam_fname})

def sample_region_cov(bam_fname, regions, max_num=100):
    """Calculate read depth in a randomly sampled subset of regions."""
    midsize_regions = sample_midsize_regions(regions, max_num)
    with tempfile.NamedTemporaryFile(suffix='.bed', mode='w+t') as f:
        tabio.write(regions.as_dataframe(midsize_regions), f, 'bed4')
        f.flush()
        table = coverage.bedcov(f.name, bam_fname, 0)
    # Mean read depth across all sampled regions
    return table.basecount.sum() / (table.end - table.start).sum()

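# Illustration only, not part of cnvlib: the pooled-depth formula above is
# total bases over total span, so longer sampled regions carry more weight
# than a plain mean of per-region depths would give them.
def _demo_pooled_depth():
    table = pd.DataFrame({'start': [0, 0], 'end': [1000, 3000],
                          'basecount': [2000, 12000]})
    depth = table.basecount.sum() / (table.end - table.start).sum()
    assert depth == 3.5               # not mean(2.0, 4.0) == 3.0
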
def sample_midsize_regions(regions, max_num):
    """Randomly select a subset of up to `max_num` regions."""
    sizes = regions.end - regions.start
    lo_size, hi_size = np.percentile(sizes[sizes > 0], [25, 75])
    midsize_regions = regions.data[(sizes >= lo_size) & (sizes <= hi_size)]
    if len(midsize_regions) > max_num:
        midsize_regions = midsize_regions.sample(max_num,
                                                 random_state=0xA5EED)
    return midsize_regions

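# Illustration only, not part of cnvlib: the interquartile filter above on a
# made-up size distribution. Zero-length intervals are excluded before the
# percentiles are taken, and both size outliers are dropped.
def _demo_midsize_filter():
    regions = pd.DataFrame({'start': [0] * 5,
                            'end': [10, 100, 200, 300, 5000]})
    sizes = regions.end - regions.start
    lo, hi = np.percentile(sizes[sizes > 0], [25, 75])   # -> 100.0, 300.0
    kept = regions[(sizes >= lo) & (sizes <= hi)]
    assert list(kept.end) == [100, 200, 300]
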
def shared_chroms(*tables):
    """Intersection of DataFrame .chromosome values."""
    chroms = tables[0].chromosome.drop_duplicates()
    for tab in tables[1:]:
        if tab is not None:
            new_chroms = tab.chromosome.drop_duplicates()
            chroms = chroms[chroms.isin(new_chroms)]
    return [None if tab is None else tab[tab.chromosome.isin(chroms)]
            for tab in tables]

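# Illustration only, not part of cnvlib: shared_chroms on two toy tables.
# Plain DataFrames stand in for the idxstats table and GenomicArrays here;
# only a .chromosome column and boolean indexing are required.
def _demo_shared_chroms():
    t1 = pd.DataFrame({'chromosome': ['chr1', 'chr2', 'chr3']})
    t2 = pd.DataFrame({'chromosome': ['chr2', 'chr3', 'chrX']})
    out1, out2, out3 = shared_chroms(t1, t2, None)
    assert list(out1.chromosome) == ['chr2', 'chr3']  # intersection only
    assert out3 is None                               # None passes through
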
def update_chrom_length(rc_table, regions):
    if regions is not None and len(regions):
        chrom_sizes = region_size_by_chrom(regions)
        rc_table = rc_table.merge(chrom_sizes, on='chromosome', how='inner')
        # Keep 'length_y' (total size of the given regions) in place of
        # 'length_x' (full chromosome length from the BAM index).
        rc_table['length'] = rc_table['length_y']
        rc_table = rc_table.drop(['length_x', 'length_y'], axis=1)
    return rc_table

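# Illustration only, not part of cnvlib: the suffix handling above on a toy
# merge. Both frames carry a 'length' column, so pandas appends its default
# '_x'/'_y' suffixes; the region total ('length_y') is the one kept.
def _demo_length_suffixes():
    rc = pd.DataFrame({'chromosome': ['chr1'], 'mapped': [1000],
                       'length': [2000000]})
    sizes = pd.DataFrame({'chromosome': ['chr1'], 'length': [50000]})
    merged = rc.merge(sizes, on='chromosome', how='inner')
    assert set(merged.columns) >= {'length_x', 'length_y'}
    merged['length'] = merged['length_y']     # keep the region total
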
def region_size_by_chrom(regions):
    chromgroups = regions.data.groupby('chromosome', sort=False)
    # sizes = chromgroups.apply(total_region_size)  # XXX
    sizes = [total_region_size(g) for _key, g in chromgroups]
    return pd.DataFrame({'chromosome': regions.chromosome.drop_duplicates(),
                         'length': sizes})

def total_region_size(regions):
    """Aggregate area of all genomic ranges in `regions`."""
    return (regions.end - regions.start).sum()

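# Illustration only, not part of cnvlib: region_size_by_chrom on a tiny
# GenomicArray, assuming GA() accepts a plain chromosome/start/end DataFrame
# as the skgenome import at the top suggests.
def _demo_region_sizes():
    df = pd.DataFrame({'chromosome': ['chr1', 'chr1', 'chr2'],
                       'start': [0, 100, 0],
                       'end': [50, 250, 300]})
    out = region_size_by_chrom(GA(df))
    assert list(out.length) == [200, 300]   # chr1: 50 + 150; chr2: 300
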