Source code for cnvlib.scatter

"""The 'scatter' command for rendering copy number as scatter plots."""

from __future__ import annotations
import collections
import logging

import numpy as np
from matplotlib import pyplot
from skgenome.rangelabel import unpack_range

from . import core, params, plots
from .plots import MB
from .cnary import CopyNumArray as CNA
from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    from cnvlib.cnary import CopyNumArray
    from matplotlib.axes._axes import Axes
    from matplotlib.figure import Figure

HIGHLIGHT_COLOR = "gold"
POINT_COLOR = "#606060"
SEG_COLOR = "darkorange"
TREND_COLOR = "#A0A0A0"


[docs] def do_scatter( cnarr: CopyNumArray, segments: Optional[CopyNumArray] = None, variants: None = None, show_range: None = None, show_gene: None = None, do_trend: bool = False, by_bin: bool = False, window_width: float = 1e6, y_min: None = None, y_max: None = None, fig_size: None = None, antitarget_marker: None = None, segment_color: str = SEG_COLOR, title: None = None, ) -> Figure: """Plot probe log2 coverages and segmentation calls together. Parameters ---------- cnarr : CopyNumArray Bin-level copy number data to plot. segments : CopyNumArray, optional Segmented copy number data to overlay. variants : VariantArray, optional Variant allele frequency data to plot. show_range : str, optional Genomic range to display (e.g., "chr1:1000000-2000000"). show_gene : str, optional Gene name to zoom into. do_trend : bool, optional Plot a smoothed trendline. Default is False. by_bin : bool, optional Plot by bin index rather than genomic coordinates. Default is False. window_width : float, optional Smoothing window width in bases (or bins if by_bin=True). Default is 1e6. y_min : float, optional Minimum y-axis value. y_max : float, optional Maximum y-axis value. fig_size : tuple of float, optional Figure size as (width, height) in inches. antitarget_marker : str, optional Marker style for antitarget bins. segment_color : str, optional Color for segment lines. Default is SEG_COLOR. title : str, optional Plot title. Returns ------- matplotlib.figure.Figure The scatter plot figure. """ if by_bin: bp_per_bin = sum(c.end.iat[-1] for _, c in cnarr.by_chromosome()) / len(cnarr) window_width /= bp_per_bin show_range_bins = plots.translate_region_to_bins(show_range, cnarr) cnarr, segments, variants = plots.update_binwise_positions( cnarr, segments, variants ) global MB orig_mb = MB MB = 1 if not show_gene and not show_range: fig = genome_scatter( cnarr, segments, variants, do_trend, y_min, y_max, title, segment_color ) else: if by_bin: show_range = show_range_bins fig = chromosome_scatter( cnarr, segments, variants, show_range, show_gene, antitarget_marker, do_trend, by_bin, window_width, y_min, y_max, title, segment_color, ) if by_bin: # Reset to avoid permanently altering the value of cnvlib.scatter.MB MB = orig_mb if fig_size: width, height = fig_size fig.set_size_inches(w=width, h=height) return fig
# === Genome-level scatter plots ===
[docs] def genome_scatter( cnarr: CopyNumArray, segments: Optional[CopyNumArray] = None, variants: None = None, do_trend: bool = False, y_min: None = None, y_max: None = None, title: None = None, segment_color: str = SEG_COLOR, ) -> Figure: """Plot all chromosomes, concatenated on one plot.""" if (cnarr or segments) and variants: # Lay out top 3/5 for the CN scatter, bottom 2/5 for SNP plot axgrid = pyplot.GridSpec(5, 1, hspace=0.85) axis = pyplot.subplot(axgrid[:3]) axis2 = pyplot.subplot(axgrid[3:], sharex=axis) # Place chromosome labels between the CNR and SNP plots axis2.tick_params(labelbottom=False) chrom_sizes = plots.chromosome_sizes(cnarr or segments) axis2 = snv_on_genome( axis2, variants, chrom_sizes, segments, do_trend, segment_color ) else: _fig, axis = pyplot.subplots() if title is None: title = (cnarr or segments or variants).sample_id if cnarr or segments: axis.set_title(title) axis = cnv_on_genome( axis, cnarr, segments, do_trend, y_min, y_max, segment_color ) else: axis.set_title(f"Variant allele frequencies: {title}") chrom_sizes = collections.OrderedDict( (chrom, subarr["end"].max()) for chrom, subarr in variants.by_chromosome() ) axis = snv_on_genome( axis, variants, chrom_sizes, segments, do_trend, segment_color ) return axis.get_figure()
[docs] def cnv_on_genome( axis: Axes, probes: CopyNumArray, segments: CopyNumArray, do_trend: bool = False, y_min: None = None, y_max: None = None, segment_color: str = SEG_COLOR, ) -> Axes: """Plot bin ratios and/or segments for all chromosomes on one plot.""" # Configure axes etc. axis.axhline(color="k") axis.set_ylabel("Copy ratio (log2)") if not (y_min and y_max): if segments: # Auto-scale y-axis according to segment mean-coverage values # (Avoid spuriously low log2 values in HLA and chrY) low_chroms = segments.chromosome.isin(("6", "chr6", "Y", "chrY")) seg_auto_vals = segments[~low_chroms]["log2"].dropna() if not y_min: y_min = ( np.nanmin([seg_auto_vals.min() - 0.2, -1.5]) if len(seg_auto_vals) else -2.5 ) if not y_max: y_max = ( np.nanmax([seg_auto_vals.max() + 0.2, 1.5]) if len(seg_auto_vals) else 2.5 ) else: if not y_min: y_min = -2.5 if not y_max: y_max = 2.5 axis.set_ylim(y_min, y_max) # Group probes by chromosome (to calculate plotting coordinates) if probes: chrom_sizes = plots.chromosome_sizes(probes) chrom_probes = dict(probes.by_chromosome()) # Precalculate smoothing window size so all chromosomes have similar # degree of smoothness # NB: Target panel has ~1k bins/chrom. -> 250-bin window # Exome: ~10k bins/chrom. -> 2500-bin window window_size = round(0.15 * len(probes) / probes.chromosome.nunique()) else: chrom_sizes = plots.chromosome_sizes(segments) # Same for segment calls chrom_segs = dict(segments.by_chromosome()) if segments else {} # Plot points & segments x_starts = plots.plot_chromosome_dividers(axis, chrom_sizes) for chrom, x_offset in x_starts.items(): if probes and chrom in chrom_probes: subprobes = chrom_probes[chrom] x = 0.5 * (subprobes["start"] + subprobes["end"]) + x_offset axis.scatter( x, subprobes["log2"], marker=".", color=POINT_COLOR, edgecolor="none", alpha=0.2, ) if do_trend: # ENH break trendline by chromosome arm boundaries? # Here and in subsequent occurrences, it's important to use snap=False # to avoid short lines/segment disappearing when saving as PNG. # See also: https://github.com/etal/cnvkit/issues/604 axis.plot( x, subprobes.smooth_log2(), color=POINT_COLOR, linewidth=2, zorder=-1, snap=False, ) if chrom in chrom_segs: for seg in chrom_segs[chrom]: color = choose_segment_color(seg, segment_color) axis.plot( (seg.start + x_offset, seg.end + x_offset), (seg.log2, seg.log2), color=color, linewidth=3, solid_capstyle="round", snap=False, ) return axis
[docs] def snv_on_genome(axis, variants, chrom_sizes, segments, do_trend, segment_color): """Plot a scatter-plot of SNP chromosomal positions and shifts.""" axis.set_ylim(0.0, 1.0) axis.set_ylabel("VAF") x_starts = plots.plot_chromosome_dividers(axis, chrom_sizes) # Calculate the coordinates of plot components chrom_snvs = dict(variants.by_chromosome()) if segments: chrom_segs = dict(segments.by_chromosome()) elif do_trend: # Pretend a single segment covers each chromosome chrom_segs = {chrom: None for chrom in chrom_snvs} else: chrom_segs = {} for chrom, x_offset in x_starts.items(): if chrom not in chrom_snvs: continue snvs = chrom_snvs[chrom] # Plot the points axis.scatter( snvs["start"].values + x_offset, snvs["alt_freq"].values, color=POINT_COLOR, edgecolor="none", alpha=0.2, marker=".", ) # Trend bars: always calculated, only shown on request if chrom in chrom_segs: # Draw average VAF within each segment segs = chrom_segs[chrom] for seg, v_freq in get_segment_vafs(snvs, segs): if seg: posn = [seg.start + x_offset, seg.end + x_offset] color = choose_segment_color( seg, segment_color, default_bright=False ) else: posn = [snvs.start.iat[0] + x_offset, snvs.start.iat[-1] + x_offset] color = TREND_COLOR axis.plot( posn, [v_freq, v_freq], color=color, linewidth=2, zorder=-1, solid_capstyle="round", snap=False, ) return axis
# === Chromosome-level scatter plots ===
[docs] def chromosome_scatter( cnarr, segments, variants, show_range, show_gene, antitarget_marker, do_trend, by_bin, window_width, y_min, y_max, title, segment_color, ): """Plot a specified region on one chromosome. Possibilities:: Options | Shown ------------ | -------- -c | -g | Genes | Region ------- | -- | ----- | ------ - | + | given | auto: gene(s) + margin chr | - | none | whole chrom chr | + | given | whole chrom chr:s-e | - | all | given chr:s-e | + | given | given """ sel_probes, sel_segs, sel_snvs, window_coords, genes, chrom = select_range_genes( cnarr, segments, variants, show_range, show_gene, window_width ) # Create plots if cnarr or segments: # Plot CNVs at chromosome level if variants: # Lay out top 3/5 for the CN scatter, bottom 2/5 for SNP plot axgrid = pyplot.GridSpec(5, 1, hspace=0.5) axis = pyplot.subplot(axgrid[:3]) axis2 = pyplot.subplot(axgrid[3:], sharex=axis) # Plot allele freqs for only the selected region snv_on_chromosome( axis2, sel_snvs, sel_segs, genes, do_trend, by_bin, segment_color ) else: _fig, axis = pyplot.subplots() if by_bin: axis.set_xlabel("Position (bin)") else: axis.set_xlabel("Position (Mb)") axis = cnv_on_chromosome( axis, sel_probes, sel_segs, genes, antitarget_marker=antitarget_marker, do_trend=do_trend, x_limits=window_coords, y_min=y_min, y_max=y_max, segment_color=segment_color, ) elif variants: # Only plot SNVs in a single-panel layout _fig, axis = pyplot.subplots() axis = snv_on_chromosome( axis, sel_snvs, sel_segs, genes, do_trend, by_bin, segment_color ) if title is None: title = "%s %s" % ((cnarr or segments or variants).sample_id, chrom) axis.set_title(title) return axis.get_figure()
[docs] def select_range_genes(cnarr, segments, variants, show_range, show_gene, window_width): """Determine which datapoints to show based on the given options. Behaviors:: start/end show_gene + + given region + genes; err if any gene outside it - + window +/- around genes + - given region, highlighting any genes within it - - whole chromosome, no genes If `show_range` is a chromosome name only, no start/end positions, then the whole chromosome will be shown. If region start/end coordinates are given and `show_gene` is '' or ',' (or all commas, etc.), then instead of highlighting all genes in the selection, no genes will be highlighted. """ chrom, start, end = unpack_range(show_range) if start is None and end is None: # Either the specified range is only chrom, no start-end, or gene names # were given window_coords = () else: # Viewing region coordinates were specified -- take them as given # Fill in open-ended ranges' endpoints if start is None or start < 0: start = 0 if not end: # Default selection endpoint to the maximum chromosome position end = (cnarr or segments or variants).filter(chromosome=chrom).end.iat[-1] if end <= start: raise ValueError( f"Coordinate range {chrom}:{start}-{end} (from {show_range}) " + "has size <= 0" ) window_coords = (start, end) gene_ranges = [] if show_gene is None: if window_coords: if cnarr: # Highlight all genes within the given range gene_ranges = plots.gene_coords_by_range(cnarr, chrom, start, end)[ chrom ] if not gene_ranges and (end - start) < 10 * window_width: # No genes in the selected region, so if the selection is small # (i.e. <80% of the displayed window, <10x window padding), # highlight the selected region itself. # (To prevent this, use show_gene='' or window_width=0) logging.info( "No genes found in selection; will highlight the " "selected region itself instead" ) gene_ranges = [(start, end, "Selection")] window_coords = (max(0, start - window_width), end + window_width) else: gene_names = filter(None, show_gene.split(",")) if gene_names: # Scan for probes matching the specified gene(s) gene_coords = plots.gene_coords_by_name(cnarr or segments, gene_names) if len(gene_coords) > 1: raise ValueError( f"Genes {show_gene} are split across chromosomes " f"{list(gene_coords.keys())}" ) g_chrom, gene_ranges = gene_coords.popitem() if chrom: # Confirm that the selected chromosomes match core.assert_equal( "Chromosome also selected by region (-c) does not match", **{"chromosome": chrom, "gene(s)": g_chrom}, ) else: chrom = g_chrom gene_ranges.sort() if window_coords: # Verify all genes fit in the given window for gene_start, gene_end, gene_name in gene_ranges: if not (start <= gene_start and gene_end <= end): raise ValueError( f"Selected gene {gene_name} " + f"({chrom}:{gene_start}-{gene_end}) " + f"is outside specified region {show_range}" ) elif not show_range: # Set the display window to the selected genes +/- a margin window_coords = ( max(0, gene_ranges[0][0] - window_width), gene_ranges[-1][1] + window_width, ) # Prune plotted elements to the selected region sel_probes = cnarr.in_range(chrom, *window_coords) if cnarr else CNA([]) sel_segs = ( segments.in_range(chrom, *window_coords, mode="trim") if segments else CNA([]) ) sel_snvs = variants.in_range(chrom, *window_coords) if variants else None logging.info( "Showing %d probes and %d selected genes in region %s", len(sel_probes), len(gene_ranges), (chrom + ":{}-{}".format(*window_coords) if window_coords else chrom), ) return sel_probes, sel_segs, sel_snvs, window_coords, gene_ranges, chrom
[docs] def cnv_on_chromosome( axis, probes, segments, genes, antitarget_marker=None, do_trend=False, x_limits=None, y_min=None, y_max=None, segment_color=SEG_COLOR, ): """Draw a scatter plot of probe values with optional segments overlaid. Parameters ---------- genes : list Of tuples: (start, end, gene name) """ # TODO - allow plotting just segments without probes # Get scatter plot coordinates x = 0.5 * (probes["start"] + probes["end"]) * MB # bin midpoints y = probes["log2"] if "weight" in probes: w = 46 * probes["weight"] ** 2 + 2 else: w = np.repeat(30, len(x)) # Configure axes if not y_min: y_min = max(-5.0, min(y.min() - 0.1, -0.3)) if len(y) else -1.1 if not y_max: y_max = max(0.3, y.max() + (0.25 if genes else 0.1)) if len(y) else 1.1 if x_limits: x_min, x_max = x_limits axis.set_xlim(x_min * MB, x_max * MB) else: set_xlim_from(axis, probes, segments) setup_chromosome(axis, y_min, y_max, "Copy ratio (log2)") if genes: highlight_genes(axis, genes, min(2.4, y.max() + 0.1) if len(y) else 0.1) if antitarget_marker in (None, "o"): # Plot targets and antitargets with the same marker axis.scatter(x, y, w, color=POINT_COLOR, alpha=0.4, marker="o") else: # Use the given marker to plot antitargets separately x_fg = [] y_fg = [] w_fg = [] x_bg = [] y_bg = [] # w_bg = [] is_bg = probes["gene"].isin(params.ANTITARGET_ALIASES) for x_pt, y_pt, w_pt, is_bg_pt in zip(x, y, w, is_bg, strict=False): if is_bg_pt: x_bg.append(x_pt) y_bg.append(y_pt) # w_bg.append(w_pt) else: x_fg.append(x_pt) y_fg.append(y_pt) w_fg.append(w_pt) axis.scatter(x_fg, y_fg, w_fg, color=POINT_COLOR, alpha=0.4, marker="o") axis.scatter(x_bg, y_bg, color=POINT_COLOR, alpha=0.5, marker=antitarget_marker) # Add a local trend line if do_trend: axis.plot( x, probes.smooth_log2(), # .1), color=POINT_COLOR, linewidth=2, zorder=-1, snap=False, ) # Draw segments as horizontal lines if segments: for row in segments: color = choose_segment_color(row, segment_color) axis.plot( (row.start * MB, row.end * MB), (row.log2, row.log2), color=color, linewidth=4, solid_capstyle="round", snap=False, ) # Warn about segments masked by default pruning at 'y_min=-5': hidden_seg = segments.log2 < y_min if hidden_seg.sum(): logging.warning( "WARNING: With 'y_min=%s' %s segments are hidden" " --> Add parameter '--y-min %s' to see them", y_min, hidden_seg.sum(), np.floor(segments.log2.min()).astype(int), ) # Signal them as triangles crossing y-axis: x_hidden = segments.start[hidden_seg] * MB y_hidden = np.array([y_min] * len(x_hidden)) axis.scatter( x_hidden, y_hidden, marker="^", linewidth=3, snap=False, color=segment_color, edgecolor="none", clip_on=False, zorder=10, ) return axis
[docs] def snv_on_chromosome(axis, variants, segments, genes, do_trend, by_bin, segment_color): # TODO set x-limits if not already done for probes/segments # set_xlim_from(axis, None, segments, variants) # setup_chromosome(axis, 0.0, 1.0, "VAF") axis.set_ylim(0.0, 1.0) axis.set_ylabel("VAF") if by_bin: axis.set_xlabel("Position (bin)") else: axis.set_xlabel("Position (Mb)") axis.get_yaxis().tick_left() axis.get_xaxis().tick_top() axis.tick_params(which="both", direction="out", labelbottom=False, labeltop=False) x_mb = variants["start"].values * MB y = variants["alt_freq"].values axis.scatter(x_mb, y, color=POINT_COLOR, alpha=0.3) if segments or do_trend: # Draw average VAF within each segment for seg, v_freq in get_segment_vafs(variants, segments): if seg: posn = [seg.start * MB, seg.end * MB] color = choose_segment_color(seg, segment_color, default_bright=False) else: posn = [variants.start.iat[0] * MB, variants.start.iat[-1] * MB] color = TREND_COLOR axis.plot( posn, [v_freq, v_freq], color=color, linewidth=2, zorder=1, solid_capstyle="round", snap=False, ) if genes: highlight_genes(axis, genes, 0.9) return axis
[docs] def set_xlim_from(axis, probes=None, segments=None, variants=None) -> None: """Configure axes for plotting a single chromosome's data. Parameters ---------- probes : CopyNumArray segments : CopyNumArray variants : VariantArray All should already be subsetted to the region that will be plotted. """ min_x = np.inf max_x = 0 for arr in (probes, segments, variants): if arr and len(arr): max_x = max(max_x, arr.end.iat[-1]) min_x = min(min_x, arr.start.iat[0]) if max_x <= min_x: if min_x != np.inf: logging.warning( "WARNING: selection start %s > end %s; did you " "correctly sort the input file by genomic " "location?", min_x, max_x, ) raise ValueError( "No usable data points to plot out of " f"{len(probes) if probes else 0} probes, " f"{len(segments) if segments else 0} segments, " f"{len(variants) if variants else 0} variants" ) axis.set_xlim(min_x * MB, max_x * MB)
[docs] def setup_chromosome(axis, y_min=None, y_max=None, y_label=None) -> None: """Configure axes for plotting a single chromosome's data.""" if y_min and y_max: axis.set_ylim(y_min, y_max) if y_min < 0 < y_max: axis.axhline(color="k") if y_label: axis.set_ylabel(y_label) axis.tick_params(which="both", direction="out") axis.get_xaxis().tick_bottom() axis.get_yaxis().tick_left()
# === Shared ===
[docs] def choose_segment_color(segment, highlight_color, default_bright=True): """Choose a display color based on a segment's CNA status. Uses the fields added by the 'call' command. If these aren't present, use `highlight_color` for everything. For sex chromosomes, some single-copy deletions or gains might not be highlighted, since sample sex isn't used to infer the neutral ploidies. """ neutral_color = TREND_COLOR if "cn" not in segment._fields: # No 'call' info return highlight_color if default_bright else neutral_color # Detect copy number alteration expected_ploidies = {"chrY": (0, 1), "Y": (0, 1), "chrX": (1, 2), "X": (1, 2)} if segment.cn not in expected_ploidies.get(segment.chromosome, [2]): return highlight_color # Detect regions of allelic imbalance / LOH if ( segment.chromosome not in expected_ploidies and "cn1" in segment._fields and "cn2" in segment._fields and (segment.cn1 != segment.cn2) ): return highlight_color return neutral_color
[docs] def get_segment_vafs(variants, segments): """Group SNP allele frequencies by segment. Assume variants and segments were already subset to one chromosome. Yields ------ tuple (segment, value) """ if segments: chunks = variants.by_ranges(segments) else: # Fake segments cover the whole region chunks = [(None, variants)] for seg, seg_snvs in chunks: # ENH: seg_snvs.tumor_boost() freqs = seg_snvs["alt_freq"].values # Separately emit VAFs above and below .5 for plotting idx_above_mid = freqs > 0.5 for idx_vaf in (idx_above_mid, ~idx_above_mid): if sum(idx_vaf) > 1: yield (seg, np.median(freqs[idx_vaf]))
[docs] def highlight_genes(axis, genes, y_posn) -> None: """Show gene regions with background color and a text label.""" # Rotate text in proportion to gene density ngenes = len(genes) text_size = "small" if ngenes <= 6 else "x-small" if ngenes <= 3: text_rot = "horizontal" elif ngenes <= 6: text_rot = 30 elif ngenes <= 10: text_rot = 45 elif ngenes <= 20: text_rot = 60 else: text_rot = "vertical" for gene in genes: gene_start, gene_end, gene_name = gene # Highlight and label gene region # (rescale positions from bases to megabases) axis.axvspan( gene_start * MB, gene_end * MB, alpha=0.5, color=HIGHLIGHT_COLOR, zorder=-1 ) axis.text( 0.5 * (gene_start + gene_end) * MB, y_posn, gene_name, horizontalalignment="center", rotation=text_rot, size=text_size, )