
"""Export CNVkit objects and files to other formats."""
import logging
import time
from collections import OrderedDict as OD

import numpy as np
import pandas as pd
from skgenome import tabio

from . import call
from .cmdutil import read_cna
from ._version import __version__


def merge_samples(filenames):
    """Merge probe log2 values from multiple samples into a 2D table.

    Input: a list of .cnr/.cns filenames whose rows share bin coordinates.
    Output: a DataFrame of bin coordinates and labels, plus one log2 column
    per sample, keyed by sample ID.
    """
    def label_with_gene(cnarr):
        row2label = lambda row: "{}:{}-{}:{}".format(
            row.chromosome, row.start, row.end, row.gene)
        return cnarr.data.apply(row2label, axis=1)

    if not filenames:
        return []
    first_cnarr = read_cna(filenames[0])
    out_table = first_cnarr.data.reindex(
        columns=["chromosome", "start", "end", "gene"])
    out_table["label"] = label_with_gene(first_cnarr)
    out_table[first_cnarr.sample_id] = first_cnarr["log2"]
    for fname in filenames[1:]:
        cnarr = read_cna(fname)
        if not (len(cnarr) == len(out_table)
                and (label_with_gene(cnarr) == out_table["label"]).all()):
            raise ValueError("Mismatched row coordinates in %s" % fname)
        # Copy the next column by sample ID
        if cnarr.sample_id in out_table.columns:
            raise ValueError("Duplicate sample ID: %s" % cnarr.sample_id)
        out_table[cnarr.sample_id] = cnarr["log2"]
        del cnarr
    return out_table

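# Hypothetical usage sketch (filenames illustrative, not part of CNVkit):
#
#     table = merge_samples(["sampleA.cnr", "sampleB.cnr"])
#     # -> columns: chromosome, start, end, gene, label, sampleA, sampleB
#     # One log2 column per sample ID; all files must share bin coordinates,
#     # otherwise a ValueError is raised.
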
# Supported formats:
def fmt_cdt(sample_ids, table):
    """Format as CDT.

    See:
    - http://jtreeview.sourceforge.net/docs/JTVUserManual/ch02s11.html
    - http://www.eisenlab.org/FuzzyK/cdt.html
    """
    outheader = ['GID', 'CLID', 'NAME', 'GWEIGHT'] + sample_ids
    header2 = ['AID', '', '', '']
    header2.extend(['ARRY' + str(i).zfill(3) + 'X'
                    for i in range(len(sample_ids))])
    header3 = ['EWEIGHT', '', '', ''] + ['1'] * len(sample_ids)
    outrows = [header2, header3]
    outtable = pd.concat([
        pd.DataFrame.from_dict(OD([
            ("GID", pd.Series(table.index).apply(lambda x: "GENE%dX" % x)),
            ("CLID", pd.Series(table.index).apply(lambda x: "IMAGE:%d" % x)),
            ("NAME", table["label"]),
            ("GWEIGHT", 1),
        ])),
        table.drop(["chromosome", "start", "end", "gene", "label"],
                   axis=1)],
        axis=1)
    outrows.extend(outtable.itertuples(index=False))
    return outheader, outrows

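# For orientation, the first CDT rows for two samples would be laid out like
# this (tab-separated; sample IDs and values illustrative):
#
#   GID      CLID     NAME                GWEIGHT  S1        S2
#   AID                                            ARRY000X  ARRY001X
#   EWEIGHT                                        1         1
#   GENE0X   IMAGE:0  chr1:100-200:GENE1  1        -0.10     0.32
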
# TODO
def fmt_gct(sample_ids, table):
    raise NotImplementedError

def fmt_jtv(sample_ids, table):
    """Format for Java TreeView."""
    outheader = ["CloneID", "Name"] + sample_ids
    outtable = pd.concat([
        pd.DataFrame({
            "CloneID": "IMAGE:",
            "Name": table["label"],
        }),
        table.drop(["chromosome", "start", "end", "gene", "label"],
                   axis=1)],
        axis=1)
    outrows = outtable.itertuples(index=False)
    return outheader, outrows

# Special cases
def export_nexus_basic(cnarr):
    """Biodiscovery Nexus Copy Number "basic" format.

    Only represents one sample per file.
    """
    out_table = cnarr.data.reindex(columns=['chromosome', 'start', 'end',
                                            'gene', 'log2'])
    out_table['probe'] = cnarr.labels()
    return out_table

def export_nexus_ogt(cnarr, varr, min_weight=0.0):
    """Biodiscovery Nexus Copy Number "Custom-OGT" format.

    To create the b-allele frequencies column, alternate allele frequencies
    from the VCF are aligned to the .cnr file bins. Bins that contain no
    variants are left blank; if a bin contains multiple variants, the
    frequencies are all "mirrored" to be above or below 0.5 (majority rules),
    and the median of those values is taken.
    """
    if min_weight and "weight" in cnarr:
        mask_low_weight = (cnarr["weight"] < min_weight)
        logging.info("Dropping %d bins with weight below %f",
                     mask_low_weight.sum(), min_weight)
        cnarr.data = cnarr.data[~mask_low_weight]
    bafs = varr.baf_by_ranges(cnarr)
    logging.info("Placed %d variants into %d bins",
                 sum(~np.isnan(bafs)), len(cnarr))
    out_table = cnarr.data.reindex(columns=['chromosome', 'start', 'end',
                                            'log2'])
    out_table = out_table.rename(columns={
        "chromosome": "Chromosome",
        "start": "Position",
        "end": "Position",
        "log2": "Log R Ratio",
    })
    out_table["B-Allele Frequency"] = bafs
    return out_table

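# The "mirroring" described above, roughly -- a sketch for illustration only;
# the actual per-bin logic lives in the VariantArray class (baf_by_ranges):
#
#     def _mirrored_baf_median(bafs):
#         bafs = np.asarray(bafs, dtype=float)
#         if (bafs > 0.5).sum() >= (bafs < 0.5).sum():
#             bafs = np.where(bafs < 0.5, 1 - bafs, bafs)  # reflect up
#         else:
#             bafs = np.where(bafs > 0.5, 1 - bafs, bafs)  # reflect down
#         return np.median(bafs)
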
def export_seg(sample_fnames, chrom_ids=False):
    """SEG format for copy number segments.

    Segment breakpoints are not the same across samples, so samples are
    listed in serial with the sample ID as the left column.
    """
    dframes, sample_ids = zip(*(_load_seg_dframe_id(fname)
                                for fname in sample_fnames))
    out_table = tabio.seg.write_seg(dframes, sample_ids, chrom_ids)
    return out_table

def _load_seg_dframe_id(fname):
    segarr = read_cna(fname)
    assert segarr is not None
    assert segarr.data is not None
    assert segarr.sample_id is not None
    return segarr.data, segarr.sample_id


# _____________________________________________________________________________
# BED

def export_bed(segments, ploidy, is_reference_male, is_sample_female,
               label, show):
    """Convert a copy number array to a BED-like DataFrame.

    For each region in each sample (possibly filtered according to `show`),
    the columns are:

    - reference sequence name
    - start (0-indexed)
    - end
    - sample name or given label
    - integer copy number

    By default (show="ploidy"), skip regions where copy number is the default
    ploidy, i.e. equal to 2 or the value set by --ploidy. If show="variant",
    skip regions where copy number is neutral, i.e. equal to the reference
    ploidy on autosomes, or half that on sex chromosomes.
    """
    out = segments.data.reindex(columns=["chromosome", "start", "end"])
    out["label"] = label if label else segments["gene"]
    out["ncopies"] = (segments["cn"]
                      if "cn" in segments
                      else call.absolute_pure(segments, ploidy,
                                              is_reference_male)
                           .round().astype('int'))
    if show == "ploidy":
        # Skip regions of default ploidy
        out = out[out["ncopies"] != ploidy]
    elif show == "variant":
        # Skip regions of non-neutral copy number
        exp_copies = call.absolute_expect(segments, ploidy, is_sample_female)
        out = out[out["ncopies"] != exp_copies]
    return out

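# Usage sketch (argument values illustrative): with ploidy=2 and
# show="ploidy", segments called at 2 copies are dropped and 1- or 3-copy
# segments are kept; with show="variant", the expected copy number is
# computed per sample, so e.g. a single X copy in a male sample should not
# be reported as a variant.
#
#     bed_df = export_bed(segments, ploidy=2, is_reference_male=True,
#                         is_sample_female=False, label="S1", show="variant")
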
# _____________________________________________________________________________
# VCF

VCF_HEADER = """\
##fileformat=VCFv4.2
##fileDate={date}
##source=CNVkit v{version}
##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants">
##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">
##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Difference in length between REF and ALT alleles">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##INFO=<ID=FOLD_CHANGE,Number=1,Type=Float,Description="Fold change">
##INFO=<ID=FOLD_CHANGE_LOG,Number=1,Type=Float,Description="Log fold change">
##INFO=<ID=PROBES,Number=1,Type=Integer,Description="Number of probes in CNV">
##ALT=<ID=DEL,Description="Deletion">
##ALT=<ID=DUP,Description="Duplication">
##ALT=<ID=CNV,Description="Copy number variable region">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype quality">
##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events">
##FORMAT=<ID=CNQ,Number=1,Type=Float,Description="Copy number genotype quality for imprecise events">
""".format(date=time.strftime("%Y%m%d"), version=__version__)

# Example records:
# #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001
# 1 2827693 . CCGTGGATGCGGGGACCCGCATCCCCTCTCCCTTCACAGCTGAGTGACCCACATCCCCTCTCCCCTCGCA C . PASS SVTYPE=DEL;END=2827680;BKPTID=Pindel_LCS_D1099159;HOMLEN=1;HOMSEQ=C;SVLEN=-66 GT:GQ 1/1:13.9
# 2 321682 . T <DEL> 6 PASS IMPRECISE;SVTYPE=DEL;END=321887;SVLEN=-105;CIPOS=-56,20;CIEND=-10,62 GT:GQ 0/1:12
# 3 12665100 . A <DUP> 14 PASS IMPRECISE;SVTYPE=DUP;END=12686200;SVLEN=21100;CIPOS=-500,500;CIEND=-500,500 GT:GQ:CN:CNQ ./.:0:3:16.2
# 4 18665128 . T <DUP:TANDEM> 11 PASS IMPRECISE;SVTYPE=DUP;END=18665204;SVLEN=76;CIPOS=-10,10;CIEND=-10,10 GT:GQ:CN:CNQ ./.:0:5:8.3

def export_vcf(segments, ploidy, is_reference_male, is_sample_female,
               sample_id=None, cnarr=None):
    """Convert segments to Variant Call Format.

    For now, only 1 sample per VCF. (Overlapping CNVs seem tricky.)

    Spec: https://samtools.github.io/hts-specs/VCFv4.2.pdf
    """
    vcf_columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER",
                   "INFO", "FORMAT",
                   sample_id or segments.sample_id]
    if cnarr:
        segments = assign_ci_start_end(segments, cnarr)
    vcf_rows = segments2vcf(segments, ploidy, is_reference_male,
                            is_sample_female)
    table = pd.DataFrame.from_records(vcf_rows, columns=vcf_columns)
    vcf_body = table.to_csv(sep='\t', header=True, index=False,
                            float_format="%.3g")
    return VCF_HEADER, vcf_body

def assign_ci_start_end(segarr, cnarr):
    """Assign ci_start and ci_end fields to segments.

    Values for each segment indicate the CI boundary points within that
    segment, i.e. the right CI boundary for the left-side breakpoint
    (segment start), and left CI boundary for the right-side breakpoint
    (segment end).

    This is a little unintuitive because the CI refers to the breakpoint,
    not the segment, but we're storing the info in an array of segments.

    Calculation: Just use the boundaries of the bins left- and right-adjacent
    to each segment breakpoint.
    """
    lefts_rights = ((bins.end.iat[0], bins.start.iat[-1])
                    for _seg, bins in cnarr.by_ranges(segarr, mode="outer"))
    ci_lefts, ci_rights = zip(*lefts_rights)
    return segarr.as_dataframe(
        segarr.data.assign(ci_left=ci_lefts, ci_right=ci_rights))

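# E.g. (coordinates illustrative): if the first bin of a segment's "outer"
# window is [9_200, 9_800) and the last bin is [50_300, 50_700), the segment
# is annotated with ci_left=9_800 and ci_right=50_300 -- the bin edges
# nearest each breakpoint, which segments2vcf() later turns into
# CIPOS/CIEND margins.
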
def segments2vcf(segments, ploidy, is_reference_male, is_sample_female):
    """Convert copy number segments to VCF records."""
    out_dframe = segments.data.reindex(columns=["chromosome", "end", "log2",
                                                "probes"])
    out_dframe["start"] = segments.start.replace(0, 1)

    if "cn" in segments:
        out_dframe["ncopies"] = segments["cn"]
        abs_expect = call.absolute_expect(segments, ploidy, is_sample_female)
    else:
        abs_dframe = call.absolute_dataframe(segments, ploidy, 1.0,
                                             is_reference_male,
                                             is_sample_female)
        out_dframe["ncopies"] = abs_dframe["absolute"].round().astype('int')
        abs_expect = abs_dframe["expect"]
    idx_losses = (out_dframe["ncopies"] < abs_expect)

    svlen = segments.end - segments.start
    svlen[idx_losses] *= -1
    out_dframe["svlen"] = svlen

    out_dframe["svtype"] = "DUP"
    out_dframe.loc[idx_losses, "svtype"] = "DEL"

    out_dframe["format"] = "GT:GQ:CN:CNQ"
    out_dframe.loc[idx_losses, "format"] = "GT:GQ"  # :CN:CNQ ?

    if "ci_left" in segments and "ci_right" in segments:
        has_ci = True
        # Calculate fuzzy left & right coords for CIPOS and CIEND
        left_margin = segments["ci_left"].values - segments.start.values
        right_margin = segments.end.values - segments["ci_right"].values
        out_dframe["ci_pos_left"] = np.r_[0, -right_margin[:-1]]
        out_dframe["ci_pos_right"] = left_margin
        out_dframe["ci_end_left"] = right_margin
        out_dframe["ci_end_right"] = np.r_[left_margin[1:], 0]
    else:
        has_ci = False

    # Reformat this data to create INFO and genotype
    # TODO be more clever about this
    for out_row, abs_exp in zip(out_dframe.itertuples(index=False),
                                abs_expect):
        if (out_row.ncopies == abs_exp or
                # Survive files from buggy v0.7.1 (#53)
                not str(out_row.probes).isdigit()):
            # Skip regions of neutral copy number
            continue

        # or "CNV" for subclonal?
        if out_row.ncopies > abs_exp:
            genotype = "0/1:0:%d:%d" % (out_row.ncopies, out_row.probes)
        elif out_row.ncopies < abs_exp:
            # TODO XXX handle non-diploid ploidies, haploid chroms
            if out_row.ncopies == 0:
                # Complete deletion, 0 copies
                gt = "1/1"
            else:
                # Single copy deletion
                gt = "0/1"
            genotype = "%s:%d" % (gt, out_row.probes)

        fields = ["IMPRECISE",
                  "SVTYPE=%s" % out_row.svtype,
                  "END=%d" % out_row.end,
                  "SVLEN=%d" % out_row.svlen,
                  "FOLD_CHANGE=%f" % 2.0 ** out_row.log2,
                  "FOLD_CHANGE_LOG=%f" % out_row.log2,
                  "PROBES=%d" % out_row.probes,
                 ]
        if has_ci:
            fields.extend([
                "CIPOS=(%d,%d)" % (out_row.ci_pos_left, out_row.ci_pos_right),
                "CIEND=(%d,%d)" % (out_row.ci_end_left, out_row.ci_end_right),
            ])
        info = ";".join(fields)
        yield (out_row.chromosome, out_row.start, '.', 'N',
               "<%s>" % out_row.svtype, '.', '.', info, out_row.format,
               genotype)

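# For example (illustrative values): an autosomal segment with ncopies=3
# against an expected 2 copies yields ALT=<DUP> and genotype "0/1:0:3:25"
# (GT:GQ:CN:CNQ, with 25 probes), while ncopies=0 against expected 2 yields
# ALT=<DEL> with genotype "1/1:25" (GT:GQ).
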
# _____________________________________________________________________________
# GISTIC

def export_gistic_markers(cnr_fnames):
    """Generate a GISTIC 2.0 "markers" file from a set of .cnr files.

    GISTIC documentation:

    ftp://ftp.broadinstitute.org/pub/GISTIC2.0/GISTICDocumentation_standalone.htm
    http://genepattern.broadinstitute.org/ftp/distribution/genepattern/modules_public_server_doc/GISTIC2.pdf
    http://gdac.broadinstitute.org/runs/analyses__2013_05_23/reports/cancer/KICH-TP/CopyNumber_Gistic2/nozzle.html

    The markers file identifies the marker names and positions of the markers
    in the original dataset (before segmentation). It is a three-column,
    tab-delimited file with an optional header. The column headers are:

    (1) Marker Name
    (2) Chromosome
    (3) Marker Position (in bases)

    GISTIC also needs an accompanying SEG file generated from corresponding
    .cns files.
    """
    colnames = ["ID", "CHROM", "POS"]
    out_chunks = []
    # TODO since markers will mostly be the same,
    #   detect duplicates & exclude them
    # seen_marker_ids = None
    for fname in cnr_fnames:
        cna = read_cna(fname).autosomes()
        marker_ids = cna.labels()
        tbl = pd.concat([
            pd.DataFrame({
                "ID": marker_ids,
                "CHROM": cna.chromosome,
                "POS": cna.start + 1,
            }, columns=colnames),
            pd.DataFrame({
                "ID": marker_ids,
                "CHROM": cna.chromosome,
                "POS": cna.end,
            }, columns=colnames),
        ], ignore_index=True)
        out_chunks.append(tbl)
    return pd.concat(out_chunks).drop_duplicates()

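# Sample of the output (marker IDs come from cna.labels(); exact label format
# and positions here are illustrative). Each bin contributes two markers, one
# at its 1-based start and one at its end:
#
#   ID              CHROM  POS
#   chr1:3001-3300  chr1   3001
#   chr1:3001-3300  chr1   3300
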
# _____________________________________________________________________________
# THetA

def export_theta(tumor_segs, normal_cn):
    """Convert tumor segments and normal .cnr or reference .cnn to THetA input.

    Follows the THetA segmentation import script but avoids repeating the
    pileups, since we already have the mean depth of coverage in each target
    bin.

    The options for average depth of coverage and read length do not matter
    crucially for proper operation of THetA; increased read counts per bin
    simply increase the confidence of THetA's results.

    THetA2 input format is tabular, with columns:
        ID, chrm, start, end, tumorCount, normalCount

    where chromosome IDs ("chrm") are integers 1 through 24.
    """
    out_columns = ["#ID", "chrm", "start", "end", "tumorCount", "normalCount"]
    if not tumor_segs:
        return pd.DataFrame(columns=out_columns)

    # Drop any chromosomes that are not integer or XY
    # THetA hard-codes X & Y, so we can, too
    # xy_names = (["chrX", "chrY"]
    #             if tumor_segs.chromosome.iat[0].startswith('chr')
    #             else ["X", "Y"])
    # NB: THetA2 now apparently just drops X and Y (#153)
    xy_names = []
    tumor_segs = tumor_segs.autosomes(also=xy_names)
    if normal_cn:
        normal_cn = normal_cn.autosomes(also=xy_names)

    table = tumor_segs.data.reindex(columns=["start", "end"])
    # Convert chromosome names to 1-based integer indices
    chr2idx = {c: i + 1
               for i, c in enumerate(tumor_segs.chromosome.drop_duplicates())}
    table["chrm"] = tumor_segs.chromosome.replace(chr2idx)
    # Unique string identifier for each row, e.g. "start_1_93709:end_1_19208166"
    table["#ID"] = ["start_%d_%d:end_%d_%d" % (row.chrm, row.start,
                                               row.chrm, row.end)
                    for row in table.itertuples(index=False)]
    # Calculate/estimate per-segment read counts in tumor and normal samples
    ref_means, nbins = ref_means_nbins(tumor_segs, normal_cn)
    table["tumorCount"] = theta_read_counts(tumor_segs.log2, nbins)
    table["normalCount"] = theta_read_counts(ref_means, nbins)
    return table[out_columns]

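# A resulting row might look like this (read counts illustrative; the #ID
# string follows the format shown in the comment above):
#
#   #ID                           chrm  start  end       tumorCount  normalCount
#   start_1_93709:end_1_19208166  1     93709  19208166  251733      249841
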
def ref_means_nbins(tumor_segs, normal_cn):
    """Extract segments' reference mean log2 values and probe counts.

    Code paths::

        wt_mdn  wt_old  probes  norm -> norm, nbins
        +       *       *       -       0,    wt_mdn
        -       +       +       -       0,    wt_old * probes
        -       +       -       -       0,    wt_old * size?
        -       -       +       -       0,    probes
        -       -       -       -       0,    size?

        +       -       +       +       norm, probes
        +       -       -       +       norm, bin counts
        -       +       +       +       norm, probes
        -       +       -       +       norm, bin counts
        -       -       +       +       norm, probes
        -       -       -       +       norm, bin counts
    """
    if normal_cn:
        log2s_in_segs = [bins['log2']
                         for _seg, bins in normal_cn.by_ranges(tumor_segs)]
        # For the normal/reference bin count, take the mean of the bin values
        # within each segment so that segments match between tumor and normal.
        # ENH: weighted mean, like genemetrics
        ref_means = np.array([s.mean() for s in log2s_in_segs])
        if "probes" in tumor_segs:
            nbins = tumor_segs["probes"]
        else:
            nbins = np.array([len(s) for s in log2s_in_segs])
    else:
        # Assume neutral reference log2 across all segments
        # (weights already account for reference log2)
        ref_means = np.zeros(len(tumor_segs))
        if "weight" in tumor_segs and (tumor_segs["weight"] > 1.0).any():
            # Segment weights are already multiplied by probe counts
            nbins = tumor_segs["weight"]
            # Rescale to average 1.0 (we'll do the same below, too)
            nbins /= nbins.max() / nbins.mean()
        else:
            if "probes" in tumor_segs:
                nbins = tumor_segs["probes"]
            else:
                logging.warning("No probe counts in tumor segments file and "
                                "no normal reference given; guessing normal "
                                "read-counts-per-segment from segment sizes")
                sizes = tumor_segs.end - tumor_segs.start
                nbins = sizes / sizes.mean()
            if "weight" in tumor_segs:
                # Already checked -- these are old-style weights that were
                # not multiplied by `probes` originally
                nbins *= tumor_segs["weight"] / tumor_segs["weight"].mean()
    return ref_means, nbins

def theta_read_counts(log2_ratio, nbins,
                      # These scaling factors don't meaningfully affect
                      # THetA's calculation unless they're too small
                      avg_depth=500, avg_bin_width=200, read_len=100):
    """Calculate segments' read counts from log2-ratios.

    Math:
        nbases = read_length * read_count
    and
        nbases = bin_width * read_depth
    where
        read_depth = read_depth_ratio * avg_depth

    So:
        read_length * read_count = bin_width * read_depth
        read_count = bin_width * read_depth / read_length
    """
    read_depth = (2 ** log2_ratio) * avg_depth
    read_count = nbins * avg_bin_width * read_depth / read_len
    return read_count.round().fillna(0).astype('int')

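# Worked example with the default scaling factors (values illustrative):
# for a segment with nbins=100 and log2_ratio=1.0,
#   read_depth = 2**1.0 * 500 = 1000
#   read_count = 100 * 200 * 1000 / 100 = 200_000 reads
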
def export_theta_snps(varr):
    """Generate THetA's SNP per-allele read count "formatted.txt" files."""
    # Drop any chromosomes that are not integer or XY
    varr = varr.autosomes(also=(['chrX', 'chrY']
                                if varr.chromosome.iat[0].startswith('chr')
                                else ['X', 'Y']))
    # Skip indels
    varr = varr[(varr["ref"].str.len() == 1) &
                (varr["alt"].str.len() == 1)]
    # Drop rows with any NaN
    varr.data.dropna(subset=["depth", "alt_count"], inplace=True)
    if "n_depth" in varr and "n_alt_count" in varr:
        varr.data.dropna(subset=["n_depth", "n_alt_count"], inplace=True)
    # Avoid a weird situation I've seen on alt contigs
    varr = varr[varr["depth"] >= varr["alt_count"]]
    # Reformat for THetA2
    for depth_key, alt_key in (("depth", "alt_count"),
                               ("n_depth", "n_alt_count")):
        # Extract the SNP allele counts that THetA2 uses
        table = varr.data.reindex(columns=('chromosome', 'start',
                                           depth_key, alt_key))
        table = (table.assign(ref_depth=table[depth_key] - table[alt_key])
                 .reindex(columns=('chromosome', 'start', 'ref_depth',
                                   alt_key))
                 .dropna())
        table['ref_depth'] = table['ref_depth'].astype('int')
        table[alt_key] = table[alt_key].astype('int')
        table.columns = ["#Chrm", "Pos", "Ref_Allele", "Mut_Allele"]
        yield table

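# Each yielded table -- tumor first, then normal when n_depth/n_alt_count
# columns are present -- has this shape (counts illustrative):
#
#   #Chrm  Pos     Ref_Allele  Mut_Allele
#   1      881627  28          19
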
# _____________________________________________________________________________

EXPORT_FORMATS = {
    'cdt': fmt_cdt,
    # 'gct': fmt_gct,
    'gistic': export_gistic_markers,
    'jtv': fmt_jtv,
    'nexus-basic': export_nexus_basic,
    'nexus-ogt': export_nexus_ogt,
    'seg': export_seg,
    'theta': export_theta,
    'vcf': export_vcf,
}