Source code for cnvlib.autobin

"""Estimate reasonable bin sizes from BAM read counts or depths."""
import logging
import os
import tempfile

import numpy as np
import pandas as pd
from skgenome import tabio, GenomicArray as GA

from . import coverage, samutil
from .antitarget import compare_chrom_names
from .descriptives import weighted_median


[docs]def midsize_file(fnames):
    """Select the median-size file from several given filenames.

    If an even number of files is given, selects the file just below the median.
    """
    assert fnames, 'No files provided to calculate the median size.'
    return sorted(fnames, key=lambda f: os.stat(f).st_size)[(len(fnames) - 1) // 2]


[docs]def do_autobin(bam_fname, method, targets=None, access=None,
               bp_per_bin=100000., target_min_size=20, target_max_size=50000,
               antitarget_min_size=500, antitarget_max_size=1000000, fasta=None):
    """Quickly calculate reasonable bin sizes from BAM read counts.

    Parameters
    ----------
    bam_fname : string
        BAM filename.
    method : string
        One of: 'wgs' (whole-genome sequencing), 'amplicon' (targeted amplicon
        capture), 'hybrid' (hybridization capture).
    targets : GenomicArray
        Targeted genomic regions (for 'hybrid' and 'amplicon').
    access : GenomicArray
        Sequencing-accessible regions of the reference genome (for 'hybrid' and
        'wgs').
    bp_per_bin : int
        Desired number of sequencing read nucleotide bases mapped to each bin.

    Returns
    -------
    2-tuple of 2-tuples:
        ((target depth, target avg. bin size),
         (antitarget depth, antitarget avg. bin size))
    """
    if method in ('amplicon', 'hybrid'):
        if targets is None:
            raise ValueError("Target regions are required for method %r "
                             "but were not provided." % method)
        if not len(targets):
            raise ValueError("Target regions are required for method %r "
                             "but were not provided." % method)

    # Closes over bp_per_bin
    def depth2binsize(depth, min_size, max_size):
        if depth:
            bin_size = int(round(bp_per_bin / depth))
            if bin_size < min_size:
                logging.info("Limiting est. bin size %d to given min. %d",
                             bin_size, min_size)
                bin_size = min_size
            elif bin_size > max_size:
                logging.info("Limiting est. bin size %d to given max. %d",
                             bin_size, max_size)
                bin_size = max_size
            return bin_size

    samutil.ensure_bam_index(bam_fname)
    rc_table = samutil.idxstats(bam_fname, drop_unmapped=True, fasta=fasta)
    read_len = samutil.get_read_length(bam_fname, fasta=fasta)
    logging.info("Estimated read length %s", read_len)

    # Dispatch
    if method == 'amplicon':
        # From BAM index
        # rc_table = update_chrom_length(rc_table, targets)
        # tgt_depth = average_depth(rc_table, read_len)
        # By sampling
        tgt_depth = sample_region_cov(bam_fname, targets, fasta=fasta)
        anti_depth = None
    elif method == 'hybrid':
        tgt_depth, anti_depth = hybrid(rc_table, read_len, bam_fname, targets,
                                       access, fasta)
    elif method == 'wgs':
        if access is not None and len(access):
            rc_table = update_chrom_length(rc_table, access)
        tgt_depth = average_depth(rc_table, read_len)
        anti_depth = None

    # Clip bin sizes to specified ranges
    tgt_bin_size = depth2binsize(tgt_depth, target_min_size, target_max_size)
    anti_bin_size = depth2binsize(anti_depth, antitarget_min_size,
                                  antitarget_max_size)
    return ((tgt_depth, tgt_bin_size),
            (anti_depth, anti_bin_size))


[docs]def hybrid(rc_table, read_len, bam_fname, targets, access=None, fasta=None):
    """Hybrid capture sequencing."""
    # Identify off-target regions
    if access is None:
        access = idxstats2ga(rc_table, bam_fname)
        # Verify BAM chromosome names match those in target BED
        compare_chrom_names(access, targets)
    antitargets = access.subtract(targets)
    # Only examine chromosomes present in all 2-3 input datasets
    rc_table, targets, antitargets = shared_chroms(rc_table, targets,
                                                   antitargets)
    # Deal with targets
    target_depth = sample_region_cov(bam_fname, targets, fasta=fasta)
    # Antitargets: subtract captured reads from total
    target_length = region_size_by_chrom(targets)['length']
    target_reads = (target_length * target_depth / read_len).values
    anti_table = update_chrom_length(rc_table, antitargets)
    anti_table = anti_table.assign(mapped=anti_table.mapped - target_reads)
    anti_depth = average_depth(anti_table, read_len)
    return target_depth, anti_depth


# ---

[docs]def average_depth(rc_table, read_length):
    """Estimate the average read depth across the genome.

    Returns
    -------
    float
        Median of the per-chromosome mean read depths, weighted by chromosome
        size.
    """
    mean_depths = read_length * rc_table.mapped / rc_table.length
    return weighted_median(mean_depths, rc_table.length)


[docs]def idxstats2ga(table, bam_fname):
    return GA(table.assign(start=0, end=table.length)
              .loc[:, ('chromosome', 'start', 'end')],
              meta_dict={'filename': bam_fname})


[docs]def sample_region_cov(bam_fname, regions, max_num=100, fasta=None):
    """Calculate read depth in a randomly sampled subset of regions."""
    midsize_regions = sample_midsize_regions(regions, max_num)
    with tempfile.NamedTemporaryFile(suffix='.bed', mode='w+t') as f:
        tabio.write(regions.as_dataframe(midsize_regions), f, 'bed4')
        f.flush()
        table = coverage.bedcov(f.name, bam_fname, 0, fasta)
    # Mean read depth across all sampled regions
    return table.basecount.sum() / (table.end - table.start).sum()


[docs]def sample_midsize_regions(regions, max_num):
    """Randomly select a subset of up to `max_num` regions."""
    sizes = regions.end - regions.start
    lo_size, hi_size = np.percentile(sizes[sizes > 0], [25, 75])
    midsize_regions = regions.data[(sizes >= lo_size) & (sizes <= hi_size)]
    if len(midsize_regions) > max_num:
        midsize_regions = midsize_regions.sample(max_num, random_state=0xA5EED)
    return midsize_regions


[docs]def shared_chroms(*tables):
    """Intersection of DataFrame .chromosome values."""
    chroms = tables[0].chromosome.drop_duplicates()
    for tab in tables[1:]:
        if tab is not None:
            new_chroms = tab.chromosome.drop_duplicates()
            chroms = chroms[chroms.isin(new_chroms)]
    return [None if tab is None else tab[tab.chromosome.isin(chroms)]
            for tab in tables]


[docs]def update_chrom_length(rc_table, regions):
    if regions is not None and len(regions):
        chrom_sizes = region_size_by_chrom(regions)
        rc_table = rc_table.merge(chrom_sizes, on='chromosome', how='inner')
        rc_table['length'] = rc_table['length_y'] # ?
        rc_table = rc_table.drop(['length_x', 'length_y'], axis=1)
    return rc_table


[docs]def region_size_by_chrom(regions):
    chromgroups = regions.data.groupby('chromosome', sort=False)
    # sizes = chromgroups.apply(total_region_size) # XXX
    sizes = [total_region_size(g) for _key, g in chromgroups]
    return pd.DataFrame({'chromosome': regions.chromosome.drop_duplicates(),
                         'length': sizes})


[docs]def total_region_size(regions):
    """Aggregate area of all genomic ranges in `regions`."""
    return (regions.end - regions.start).sum()