Source code for cnvlib.plots

"""Plotting utilities."""
from __future__ import absolute_import, division
from builtins import str
from past.builtins import basestring

import collections
import logging
import math

import numpy as np

from . import core, params


MB = 1e-6  # To rescale from bases to megabases


[docs]def plot_x_dividers(axis, chrom_sizes, pad=None):
    """Plot vertical dividers and x-axis labels given the chromosome sizes.

    Draws vertical black lines between each chromosome, with padding.
    Labels each chromosome range with the chromosome name, centered in the
    region, under a tick.
    Sets the x-axis limits to the covered range.

    Returns
    -------
    OrderedDict
        A table of the x-position offsets of each chromosome.
    """
    assert isinstance(chrom_sizes, collections.OrderedDict)
    if pad is None:
        pad = 0.003 * sum(chrom_sizes.values())
    x_dividers = []
    x_centers = []
    x_starts = collections.OrderedDict()
    curr_offset = pad
    for label, size in list(chrom_sizes.items()):
        x_starts[label] = curr_offset
        x_centers.append(curr_offset + 0.5 * size)
        x_dividers.append(curr_offset + size + pad)
        curr_offset += size + 2 * pad

    axis.set_xlim(0, curr_offset)
    for xposn in x_dividers[:-1]:
        axis.axvline(x=xposn, color='k')
    # Use chromosome names as x-axis labels (instead of base positions)
    axis.set_xticks(x_centers)
    axis.set_xticklabels(list(chrom_sizes.keys()), rotation=60)
    axis.tick_params(labelsize='small')
    axis.tick_params(axis='x', length=0)
    axis.get_yaxis().tick_left()

    return x_starts


# ________________________________________
# Internal supporting functions

[docs]def chromosome_sizes(probes, to_mb=False):
    """Create an ordered mapping of chromosome names to sizes."""
    chrom_sizes = collections.OrderedDict()
    for chrom, rows in probes.by_chromosome():
        chrom_sizes[chrom] = rows['end'].max()
        if to_mb:
            chrom_sizes[chrom] *= MB
    return chrom_sizes


[docs]def partition_by_chrom(chrom_snvs):
    """Group the tumor shift values by chromosome (for statistical testing)."""
    chromnames = set(chrom_snvs.keys())
    bins = {key: {'thisbin': [], 'otherbins': []}
            for key in chrom_snvs}
    for thischrom, snvs in chrom_snvs.items():
        shiftvals = np.array([abs(v[2]) for v in snvs])
        bins[thischrom]['thisbin'].extend(shiftvals)
        for otherchrom in chromnames:
            if otherchrom == thischrom:
                continue
            bins[otherchrom]['otherbins'].extend(shiftvals)
    return bins


[docs]def test_loh(bins, alpha=0.0025):
    """Test each chromosome's SNP shifts and the combined others'.

    The statistical test is Mann-Whitney, a one-sided non-parametric test for
    difference in means.
    """
    # TODO - this doesn't work right if there are many shifted regions
    try:
        from scipy import stats
    except ImportError:
        # SciPy not installed; can't test for significance
        return []

    significant_chroms = []
    for chrom, partitions in bins.items():
        these_shifts = np.array(partitions['thisbin'], np.float_)
        other_shifts = np.array(partitions['otherbins'], np.float_)
        if len(these_shifts) < 20:
            logging.info("Too few points (%d) to test chrom %s",
                         len(these_shifts), chrom)
        elif these_shifts.mean() > other_shifts.mean():
            logging.debug("\nThese ~= %f (N=%d), Other ~= %f (N=%d)",
                          these_shifts.mean(), len(these_shifts),
                          other_shifts.mean(), len(other_shifts))
            u, prob = stats.mannwhitneyu(these_shifts, other_shifts)
            logging.info("Mann-Whitney - %s: u=%s, p=%s", chrom, u, prob)
            if prob < alpha:
                significant_chroms.append(chrom)

    return significant_chroms


# ________________________________________
# Utilies used by other modules


[docs]def cvg2rgb(cvg, desaturate):
    """Choose a shade of red or blue representing log2-coverage value."""
    cutoff = 1.33  # Values above this magnitude are shown with max intensity
    x = min(abs(cvg) / cutoff, 1.0)
    if desaturate:
        # Adjust intensity sigmoidally -- reduce near 0, boost near 1
        # Exponent <1 shifts the fixed point leftward (from x=0.5)
        x = ((1. - math.cos(x * math.pi)) / 2.) ** 0.8
        # Slight desaturation of colors at lower coverage
        s = x**1.2
    else:
        s = x
    if cvg < 0:
        rgb = (1 - s, 1 - s, 1 - .25*x)  # Blueish
    else:
        rgb = (1 - .25*x, 1 - s, 1 - s)  # Reddish
    return rgb


# XXX should this be a CopyNumArray method?
# or: use by_genes internally
# or: have by_genes use this internally
[docs]def gene_coords_by_name(probes, names):
    """Find the chromosomal position of each named gene in probes.

    Returns
    -------
    dict
        Of: {chromosome: [(start, end, gene name), ...]}
    """
    # Create an index of gene names
    gene_index = collections.defaultdict(set)
    for i, gene in enumerate(probes['gene']):
        for gene_name in gene.split(','):
            if gene_name in names:
                gene_index[gene_name].add(i)
    # Retrieve coordinates by name
    all_coords = collections.defaultdict(lambda : collections.defaultdict(set))
    for name in names:
        gene_probes = probes.data.take(sorted(gene_index.get(name, [])))
        if not len(gene_probes):
            raise ValueError("No targeted gene named '%s' found" % name)
        # Find the genomic range of this gene's probes
        start = gene_probes['start'].min()
        end = gene_probes['end'].max()
        chrom = core.check_unique(gene_probes['chromosome'], name)
        # Deduce the unique set of gene names for this region
        orig_names = set()
        for oname in set(gene_probes['gene']):
            orig_names.update(oname.split(','))
        all_coords[chrom][start, end].update(orig_names)
    # Consolidate each region's gene names into a string
    uniq_coords = {}
    for chrom, hits in all_coords.items():
        uniq_coords[chrom] = [(start, end, ",".join(sorted(orig_names)))
                             for (start, end), orig_names in hits.items()]
    return uniq_coords


[docs]def gene_coords_by_range(probes, chrom, start, end,
                         ignore=params.IGNORE_GENE_NAMES):
    """Find the chromosomal position of all genes in a range.

    Returns
    -------
    dict
        Of: {chromosome: [(start, end, gene), ...]}
    """
    ignore += ('Background',)
    # Tabulate the genes in the selected region
    genes = collections.OrderedDict()
    for row in probes.in_range(chrom, start, end):
        name = str(row.gene)
        if name in ignore:
            continue
        if name in genes:
            genes[name][1] = row.end
        else:
            genes[name] = [row.start, row.end]
    # Reorganize the data structure
    return {chrom: [(gstart, gend, name)
                    for name, (gstart, gend) in list(genes.items())]}