Source code for cnvlib.autobin

"""Estimate reasonable bin sizes from BAM read counts or depths."""
import logging
import os
import tempfile

import numpy as np
import pandas as pd
from skgenome import tabio, GenomicArray as GA

from . import coverage, samutil
from .antitarget import compare_chrom_names
from .descriptives import weighted_median


def midsize_file(fnames):
    """Select the median-size file from several given filenames.

    If an even number of files is given, selects the file just below the
    median.
    """
    # Index (len - 1) // 2 picks the true median for an odd count and the
    # file just below the median for an even count.
    return sorted(fnames, key=lambda f: os.stat(f).st_size
                  )[(len(fnames) - 1) // 2]
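
# Illustrative sketch (hypothetical filenames, not part of this module):
# given three BAMs of sizes 1 GB, 2 GB and 8 GB, the 2 GB file is the
# median and is returned; with four files, the one just below the median
# is returned.
# >>> midsize_file(['small.bam', 'medium.bam', 'large.bam'])  # doctest: +SKIP
# 'medium.bam'
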
def do_autobin(bam_fname, method, targets=None, access=None,
               bp_per_bin=100000., target_min_size=20, target_max_size=50000,
               antitarget_min_size=500, antitarget_max_size=1000000,
               fasta=None):
    """Quickly calculate reasonable bin sizes from BAM read counts.

    Parameters
    ----------
    bam_fname : string
        BAM filename.
    method : string
        One of: 'wgs' (whole-genome sequencing), 'amplicon' (targeted
        amplicon capture), 'hybrid' (hybridization capture).
    targets : GenomicArray
        Targeted genomic regions (for 'hybrid' and 'amplicon').
    access : GenomicArray
        Sequencing-accessible regions of the reference genome (for 'hybrid'
        and 'wgs').
    bp_per_bin : int
        Desired number of sequencing read nucleotide bases mapped to each
        bin.

    Returns
    -------
    2-tuple of 2-tuples:
        ((target depth, target avg. bin size),
         (antitarget depth, antitarget avg. bin size))
    """
    if method in ('amplicon', 'hybrid'):
        if targets is None or not len(targets):
            raise ValueError("Target regions are required for method %r "
                             "but were not provided." % method)

    # Closes over bp_per_bin
    def depth2binsize(depth, min_size, max_size):
        if depth:
            bin_size = int(round(bp_per_bin / depth))
            if bin_size < min_size:
                logging.info("Limiting est. bin size %d to given min. %d",
                             bin_size, min_size)
                bin_size = min_size
            elif bin_size > max_size:
                logging.info("Limiting est. bin size %d to given max. %d",
                             bin_size, max_size)
                bin_size = max_size
            return bin_size

    samutil.ensure_bam_index(bam_fname)
    rc_table = samutil.idxstats(bam_fname, drop_unmapped=True, fasta=fasta)
    read_len = samutil.get_read_length(bam_fname, fasta=fasta)
    logging.info("Estimated read length %s", read_len)

    # Dispatch
    if method == 'amplicon':
        # From BAM index
        # rc_table = update_chrom_length(rc_table, targets)
        # tgt_depth = average_depth(rc_table, read_len)
        # By sampling
        tgt_depth = sample_region_cov(bam_fname, targets, fasta=fasta)
        anti_depth = None
    elif method == 'hybrid':
        tgt_depth, anti_depth = hybrid(rc_table, read_len, bam_fname,
                                       targets, access, fasta)
    elif method == 'wgs':
        if access is not None and len(access):
            rc_table = update_chrom_length(rc_table, access)
        tgt_depth = average_depth(rc_table, read_len)
        anti_depth = None

    # Clip bin sizes to specified ranges
    tgt_bin_size = depth2binsize(tgt_depth, target_min_size, target_max_size)
    anti_bin_size = depth2binsize(anti_depth, antitarget_min_size,
                                  antitarget_max_size)
    return ((tgt_depth, tgt_bin_size),
            (anti_depth, anti_bin_size))
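
# Usage sketch (hypothetical file paths, not from this module): load the
# bait intervals with skgenome.tabio, then let do_autobin pick bin sizes.
# With the default bp_per_bin=100000 and an estimated on-target depth of
# 400x, the target bin size works out to round(100000 / 400) = 250 bp.
#
#     from skgenome import tabio
#     baits = tabio.read('baits.bed', 'bed')
#     ((tgt_depth, tgt_bin_size),
#      (anti_depth, anti_bin_size)) = do_autobin('sample.bam', 'hybrid',
#                                                targets=baits)
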
def hybrid(rc_table, read_len, bam_fname, targets, access=None, fasta=None):
    """Hybrid capture sequencing."""
    # Identify off-target regions
    if access is None:
        access = idxstats2ga(rc_table, bam_fname)
    # Verify BAM chromosome names match those in target BED
    compare_chrom_names(access, targets)
    antitargets = access.subtract(targets)
    # Only examine chromosomes present in all 2-3 input datasets
    rc_table, targets, antitargets = shared_chroms(rc_table, targets,
                                                   antitargets)
    # Deal with targets
    target_depth = sample_region_cov(bam_fname, targets, fasta=fasta)
    # Antitargets: subtract captured reads from total
    target_length = region_size_by_chrom(targets)['length']
    target_reads = (target_length * target_depth / read_len).values
    anti_table = update_chrom_length(rc_table, antitargets)
    anti_table = anti_table.assign(mapped=anti_table.mapped - target_reads)
    anti_depth = average_depth(anti_table, read_len)
    return target_depth, anti_depth
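
# Worked example of the read-subtraction step above (toy numbers, for
# illustration only): if a chromosome carries 1,000,000 mapped reads of
# length 100 bp, with 1 Mbp of targets sampled at 50x depth, then
# target_reads = 1e6 * 50 / 100 = 500,000 reads are attributed to the
# targets, leaving the other 500,000 spread over the antitarget regions.
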
# ---
def average_depth(rc_table, read_length):
    """Estimate the average read depth across the genome.

    Returns
    -------
    float
        Median of the per-chromosome mean read depths, weighted by
        chromosome size.
    """
    mean_depths = read_length * rc_table.mapped / rc_table.length
    return weighted_median(mean_depths, rc_table.length)
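
# Worked example (toy numbers): with 100 bp reads and two chromosomes of
# lengths [1e6, 2e6] carrying [10000, 30000] mapped reads, the per-chromosome
# mean depths are 100*10000/1e6 = 1.0x and 100*30000/2e6 = 1.5x; the
# length-weighted median is 1.5x, since the 2 Mbp chromosome holds more
# than half of the total weight.
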
def idxstats2ga(table, bam_fname):
    """Convert an `idxstats` read-count table to whole-chromosome regions."""
    return GA(table.assign(start=0, end=table.length)
              .loc[:, ('chromosome', 'start', 'end')],
              meta_dict={'filename': bam_fname})
def sample_region_cov(bam_fname, regions, max_num=100, fasta=None):
    """Calculate read depth in a randomly sampled subset of regions."""
    midsize_regions = sample_midsize_regions(regions, max_num)
    with tempfile.NamedTemporaryFile(suffix='.bed', mode='w+t') as f:
        tabio.write(regions.as_dataframe(midsize_regions), f, 'bed4')
        f.flush()
        table = coverage.bedcov(f.name, bam_fname, 0, fasta)
    # Mean read depth across all sampled regions
    return table.basecount.sum() / (table.end - table.start).sum()
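
# Worked example of the final ratio (toy numbers): two sampled regions of
# 150 bp and 250 bp with basecounts of 30,000 and 50,000 give a mean depth
# of (30000 + 50000) / (150 + 250) = 200x.
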
def sample_midsize_regions(regions, max_num):
    """Randomly select a subset of up to `max_num` regions."""
    sizes = regions.end - regions.start
    lo_size, hi_size = np.percentile(sizes[sizes > 0], [25, 75])
    midsize_regions = regions.data[(sizes >= lo_size) & (sizes <= hi_size)]
    if len(midsize_regions) > max_num:
        midsize_regions = midsize_regions.sample(max_num,
                                                 random_state=0xA5EED)
    return midsize_regions
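
# Illustration (hypothetical sizes): given region sizes
# [10, 100, 120, 150, 900], the interquartile filter keeps the mid-sized
# regions (100-150 bp) and drops the 10 bp and 900 bp outliers before
# sampling up to `max_num` of them; the fixed random_state makes the
# subsample reproducible across runs.
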
def shared_chroms(*tables):
    """Intersection of DataFrame .chromosome values."""
    chroms = tables[0].chromosome.drop_duplicates()
    for tab in tables[1:]:
        if tab is not None:
            new_chroms = tab.chromosome.drop_duplicates()
            chroms = chroms[chroms.isin(new_chroms)]
    return [None if tab is None else tab[tab.chromosome.isin(chroms)]
            for tab in tables]
def update_chrom_length(rc_table, regions):
    """Replace chromosome lengths with the total size of covered regions."""
    if regions is not None and len(regions):
        chrom_sizes = region_size_by_chrom(regions)
        rc_table = rc_table.merge(chrom_sizes, on='chromosome', how='inner')
        # Keep the regions' total size (length_y from the merge), not the
        # full chromosome length (length_x)
        rc_table['length'] = rc_table['length_y']
        rc_table = rc_table.drop(['length_x', 'length_y'], axis=1)
    return rc_table
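
# Illustration (toy numbers): for a chromosome of full length 100 Mbp with
# 2 Mbp of accessible or targeted regions, the merge replaces length=1e8
# with length=2e6, so a later average_depth() call divides mapped reads by
# the covered bases rather than the whole chromosome.
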
def region_size_by_chrom(regions):
    """Total size of the covered regions on each chromosome."""
    chromgroups = regions.data.groupby('chromosome', sort=False)
    # sizes = chromgroups.apply(total_region_size)  # XXX
    sizes = [total_region_size(g) for _key, g in chromgroups]
    return pd.DataFrame({'chromosome': regions.chromosome.drop_duplicates(),
                         'length': sizes})
def total_region_size(regions):
    """Aggregate area of all genomic ranges in `regions`."""
    return (regions.end - regions.start).sum()
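
# Example (toy coordinates): regions spanning (0, 100) and (200, 350)
# cover (100 - 0) + (350 - 200) = 250 bp in total.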