Source code for cnvlib.heatmap

"""The 'heatmap' command."""
import collections
import logging

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap

from skgenome.rangelabel import unpack_range
from . import plots


cna2df = lambda cna: cna.data.loc[:, ['start', 'end', 'log2']]

[docs]def do_heatmap(cnarrs, show_range=None, do_desaturate=False, by_bin=False, delim_sampl=False, vertical=False, title=None, ax=None): """Plot copy number for multiple samples as a heatmap.""" if ax is None: _fig, axis = plt.subplots() else: axis = ax # List sample names on the appropriate axis. if not vertical: axis.set_yticks([i + 0.5 for i in range(len(cnarrs))]) axis.set_yticklabels([c.sample_id for c in cnarrs]) axis.set_ylim(0, len(cnarrs)) axis.set_ylabel('Samples') else: axis.set_xticks([i + 0.5 for i in range(len(cnarrs))]) axis.set_xticklabels([c.sample_id for c in cnarrs], rotation=90) axis.set_xlim(0, len(cnarrs)) axis.set_xlabel('Samples') if hasattr(axis, 'set_facecolor'): # matplotlib >= 2.0 axis.set_facecolor('#DDDDDD') else: # Older matplotlib. axis.set_axis_bgcolor('#DDDDDD') if by_bin and show_range: try: a_cnarr = next(c for c in cnarrs if 'probes' not in c) except StopIteration: r_chrom, r_start, r_end = unpack_range(show_range) if r_start is not None or r_end is not None: raise ValueError( 'Need at least 1 .cnr input file if --by-bin (by_bin) and --chromosome (show_range) are both used ' 'to specify a sub-chromosomal region.' ) else: logging.info('Using sample {} to map {} to bin coordinates'.format(a_cnarr.sample_id, show_range)) r_chrom, r_start, r_end = plots.translate_region_to_bins(show_range, a_cnarr) else: r_chrom, r_start, r_end = unpack_range(show_range) if r_start is not None or r_end is not None: logging.info('Showing log2 ratios in range {}:{}-{}'.format(r_chrom, r_start or 0, r_end or '*')) elif r_chrom: logging.info('Showing log2 ratios on chromosome {}'.format(r_chrom)) # Group each file's probes/segments by chromosome sample_data = [collections.defaultdict(list) for _c in cnarrs] # Calculate the size (max endpoint value) of each chromosome chrom_sizes = collections.OrderedDict() for i, cnarr in enumerate(cnarrs): if by_bin: cnarr = plots.update_binwise_positions_simple(cnarr) if r_chrom: subcna = cnarr.in_range(r_chrom, r_start, r_end, mode='trim') sample_data[i][r_chrom] = cna2df(subcna) chrom_sizes[r_chrom] = max(subcna.end.iat[-1] if subcna else 0, chrom_sizes.get(r_chrom, 0)) else: for chrom, subcna in cnarr.by_chromosome(): sample_data[i][chrom] = cna2df(subcna) chrom_sizes[chrom] = max(subcna.end.iat[-1] if subcna else 0, chrom_sizes.get(r_chrom, 0)) dict_log2 = collections.OrderedDict() if show_range: # Lay out only the selected chromosome # Set x-axis the chromosomal positions (in Mb), title as the selection if by_bin: MB = 1 axis.set_xlabel('Position (bin)') else: MB = plots.MB axis.set_xlabel('Position (Mb)') axis.set_title(show_range) axis.tick_params(which='both', direction='out') axis.get_xaxis().tick_bottom() axis.get_yaxis().tick_left() if not vertical: axis.set_xlim((r_start or 0) * MB, (r_end or chrom_sizes[r_chrom]) * MB) else: axis.set_ylim((r_start or 0) * MB, (r_end or chrom_sizes[r_chrom]) * MB) # Plot the individual probe/segment coverages for i, sample in enumerate(sample_data): sampl_crow = sample[r_chrom] if not len(sampl_crow): logging.warning('Sample #{} has no data points in selection {}', i+1, show_range) sampl_crow['start'] *= MB sampl_crow['end'] *= MB dict_log2[i] = sampl_crow.set_index(['start', 'end']).log2 else: # Lay out chromosome dividers and x-axis labels # (Just enough padding to avoid overlap with the divider line) chrom_offsets = plots.plot_chromosome_dividers(axis, chrom_sizes, 1, along='y' if vertical else 'x') # Plot the individual probe/segment coverages for i, sample in enumerate(sample_data): all_crows = [] for chrom, curr_offset in chrom_offsets.items(): crow = sample[chrom] if len(crow): crow['start'] += curr_offset crow['end'] += curr_offset all_crows.append(crow) else: logging.warning('Sample #%d has no datapoints', i+1) sampl_crow = pd.concat(all_crows, axis='index') dict_log2[i] = sampl_crow.set_index(['start', 'end']).log2 log2_df = pd.DataFrame.from_dict(dict_log2) # Need to explicitly insert NaN-filled rows in-between 2 discontiguous intervals log2_df.reset_index(inplace=True) compt = 0 for i in range(1, len(log2_df.index)): end_previous = log2_df.iloc[i-1].end start_current = log2_df.iloc[i].start if end_previous != start_current: # Discontiguous. compt += 1 log2_df.loc[i-1+0.5, :] = [end_previous, start_current] + [np.nan] * len(cnarrs) log2_df.sort_index(inplace=True) logging.debug('Inserted {} empty intervals (log2 = NaN for all samples).'.format(compt)) # If no data for all samples, return an empty plot. Without this, further log2_df.end.iat[-1] causes an IndexError. if not len(log2_df): return axis # If shading='flat' (which is default) the dimensions of X and Y should be one greater than those of C. start2plt = np.array(log2_df.start.to_list() + [log2_df.end.iat[-1]]) sampl2plt = np.array(range(len(cnarrs) + 1)) if not vertical: dat2plot = log2_df.drop(['start', 'end'], axis='columns').transpose() x_pcolor, y_pcolor = start2plt, sampl2plt else: dat2plot = log2_df.drop(['start', 'end'], axis='columns') x_pcolor, y_pcolor = sampl2plt, start2plt cmap = ListedColormap([plots.cvg2rgb(x, do_desaturate) for x in np.linspace(-1.33, 1.33, 200)]) im = axis.pcolormesh(x_pcolor, y_pcolor, dat2plot, vmin=-1.33, vmax=1.33, cmap=cmap) cbar = plt.colorbar(im, ax=axis, fraction=0.04, pad=0.03, shrink=0.6) cbar.set_label('log2', labelpad=0) if delim_sampl: delim_method = axis.axvline if vertical else axis.axhline for i in range(len(cnarrs)): delim_method(i, color='k') axis.invert_yaxis() if title: axis.set_title(title) return axis