Source code for cnvlib.export

"""Export CNVkit objects and files to other formats."""
from __future__ import absolute_import, division, print_function

import collections
import logging

import numpy as np
import pandas as pd
from Bio._py3k import map, range, zip

from . import call, core, params
from .cnary import CopyNumArray as CNA
from .vary import VariantArray as VA


[docs]def merge_samples(filenames): """Merge probe values from multiple samples into a 2D table (of sorts). Input: dict of {sample ID: (probes, values)} Output: list-of-tuples: (probe, log2 coverages...) """ def label_with_gene(cnarr): row2label = lambda row: "{}:{}-{}:{}".format( row.chromosome, row.start, row.end, row.gene) return cnarr.data.apply(row2label, axis=1) if not filenames: return [] first_cnarr = CNA.read(filenames[0]) out_table = first_cnarr.data.loc[:, ["chromosome", "start", "end", "gene"]] out_table["label"] = label_with_gene(first_cnarr) out_table[first_cnarr.sample_id] = first_cnarr["log2"] for fname in filenames[1:]: cnarr = CNA.read(fname) # Verify labels match labels = label_with_gene(cnarr) if not (labels == out_table["label"]).all(): raise ValueError("Mismatched row coordinates in %s" % fname) # Copy the next column by sample ID if cnarr.sample_id in out_table.columns: raise ValueError("Duplicate sample ID: %s" % cnarr.sample_id) out_table[cnarr.sample_id] = cnarr["log2"] del cnarr return out_table
# Supported formats:
[docs]def fmt_cdt(sample_ids, table): """Format as CDT.""" outheader = ['GID', 'CLID', 'NAME', 'GWEIGHT'] + sample_ids header2 = ['AID', '', '', ''] header2.extend(['ARRY' + str(i).zfill(3) + 'X' for i in range(len(sample_ids))]) outrows = [header2] outtable = pd.concat([ pd.DataFrame({ "GID": table.index.apply(lambda x: "GENE%dX" % x), "CLID": table.index.apply(lambda x: "IMAGE:%d" % x), "NAME": table["label"], "GWEIGHT": 1, }), table.drop(["chromosome", "start", "end", "gene", "label"], axis=1)], axis=1) outrows.extend(outtable.itertuples(index=False)) return outheader, outrows
# TODO
[docs]def fmt_gct(sample_ids, table): return NotImplemented
[docs]def fmt_jtv(sample_ids, table): """Format for Java TreeView.""" outheader = ["CloneID", "Name"] + sample_ids outtable = pd.concat([ pd.DataFrame({ "CloneID": "IMAGE:", "Name": table["label"], }), table.drop(["chromosome", "start", "end", "gene", "label"], axis=1)], axis=1) outrows = outtable.itertuples(index=False) return outheader, outrows
# Special cases
[docs]def export_nexus_basic(sample_fname): """Biodiscovery Nexus Copy Number "basic" format. Only represents one sample per file. """ cnarr = CNA.read(sample_fname) out_table = cnarr.data.loc[:, ['chromosome', 'start', 'end', 'gene', 'log2']] out_table['probe'] = cnarr.labels() return out_table
[docs]def export_nexus_ogt(sample_fname, vcf_fname): """Biodiscovery Nexus Copy Number "Custom-OGT" format. To create the b-allele frequencies column, alterate allele frequencies from the VCF are aligned to the .cnr file bins. Bins that contain no variants are left blank; if a bin contains multiple variants, then the frequencies are all "mirrored" to be above .5, then the median of those values is taken. """ def mirrored_baf_median(vals): shift = np.median(np.abs(vals - .5)) if np.median(vals) > .5: return .5 + shift else: return .5 - shift cnarr = CNA.read(sample_fname) varr = VA.read_vcf(vcf_fname, skip_hom=True, skip_somatic=True) bafs = cnarr.match_to_bins(varr, 'alt_freq', np.nan, summary_func=mirrored_baf_median) logging.info("Placed %d variants into %d bins", sum(~np.isnan(bafs)), len(cnarr)) out_table = cnarr.data.loc[:, ['chromosome', 'start', 'end', 'log2']] out_table = out_table.rename(columns={ "chromosome": "Chromosome", "start": "Position", "end": "Position", "log2": "Log R Ratio", }) out_table["B-Allele Frequency"] = bafs return out_table
[docs]def export_seg(sample_fnames): """SEG format for copy number segments. Segment breakpoints are not the same across samples, so samples are listed in serial with the sample ID as the left column. """ out_tables = [] chrom_ids = None for fname in sample_fnames: segments = CNA.read(fname) if chrom_ids is None: # Create & store chrom_ids = create_chrom_ids(segments) else: # Verify core.assert_equal("Segment chromosome names differ", previous=chrom_ids.keys(), current=create_chrom_ids(segments).keys()) table = segments.data.loc[:, ["start", "end"]] table["ID"] = segments.sample_id table["mean"] = segments.data["log2"] table["chromosome"] = [chrom_ids[chrom] for chrom in segments["chromosome"]] if "probes" in segments: table["num_probes"] = segments["probes"] sorted_cols = ["ID", "chromosome", "start", "end", "num_probes", "mean"] else: sorted_cols = ["ID", "chromosome", "start", "end", "mean"] out_tables.append(table.reindex(columns=sorted_cols)) return pd.concat(out_tables)
[docs]def create_chrom_ids(segments): """Map chromosome names to integers in the order encountered.""" mapping = collections.OrderedDict() curr_idx = 1 for chrom in segments.chromosome: if chrom not in mapping: mapping[chrom] = curr_idx curr_idx += 1 return mapping
# _____________________________________________________________________________ # BED
[docs]def export_bed(segments, ploidy, is_reference_male, is_sample_female, label, show): """Convert a copy number array to a BED-like DataFrame. For each region in each sample (possibly filtered according to `show`), the columns are: - reference sequence name - start (0-indexed) - end - sample name or given label - integer copy number By default (show="ploidy"), skip regions where copy number is the default ploidy, i.e. equal to 2 or the value set by --ploidy. If show="variant", skip regions where copy number is neutral, i.e. equal to the reference ploidy on autosomes, or half that on sex chromosomes. """ absolutes = call.absolute_pure(segments, ploidy, is_reference_male) out = segments.data.loc[:, ["chromosome", "start", "end"]] out["label"] = label out["ncopies"] = np.rint(absolutes) if show == "ploidy": # Skip regions of default ploidy out = out[out["ncopies"] != ploidy] elif show == "variant": # Skip regions of non-neutral copy number abs_dframe = call.absolute_dataframe(segments, ploidy, 1.0, is_reference_male, is_sample_female) out = out[out["ncopies"] != abs_dframe["expect"]] return out
# _____________________________________________________________________________ # VCF VCF_HEADER = """\ ##fileformat=VCFv4.0 ##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants"> ##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants"> ##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record"> ##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation"> ##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Difference in length between REF and ALT alleles"> ##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant"> ##ALT=<ID=DEL,Description="Deletion"> ##ALT=<ID=DUP,Description="Duplication"> ##ALT=<ID=CNV,Description="Copy number variable region"> ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> ##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype quality"> ##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events"> ##FORMAT=<ID=CNQ,Number=1,Type=Float,Description="Copy number genotype quality for imprecise events"> """ # #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 # 1 2827693 . CCGTGGATGCGGGGACCCGCATCCCCTCTCCCTTCACAGCTGAGTGACCCACATCCCCTCTCCCCTCGCA C . PASS SVTYPE=DEL;END=2827680;BKPTID=Pindel_LCS_D1099159;HOMLEN=1;HOMSEQ=C;SVLEN=-66 GT:GQ 1/1:13.9 # 2 321682 . T <DEL> 6 PASS IMPRECISE;SVTYPE=DEL;END=321887;SVLEN=-105;CIPOS=-56,20;CIEND=-10,62 GT:GQ 0/1:12 # 3 12665100 . A <DUP> 14 PASS IMPRECISE;SVTYPE=DUP;END=12686200;SVLEN=21100;CIPOS=-500,500;CIEND=-500,500 GT:GQ:CN:CNQ ./.:0:3:16.2 # 4 18665128 . T <DUP:TANDEM> 11 PASS IMPRECISE;SVTYPE=DUP;END=18665204;SVLEN=76;CIPOS=-10,10;CIEND=-10,10 GT:GQ:CN:CNQ ./.:0:5:8.3
[docs]def export_vcf(segments, ploidy, is_reference_male, is_sample_female, sample_id=None): """Convert segments to Variant Call Format. For now, only 1 sample per VCF. (Overlapping CNVs seem tricky.) Spec: https://samtools.github.io/hts-specs/VCFv4.2.pdf """ vcf_columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", sample_id or segments.sample_id] vcf_rows = segments2vcf(segments, ploidy, is_reference_male, is_sample_female) table = pd.DataFrame.from_records(vcf_rows, columns=vcf_columns) vcf_body = table.to_csv(sep='\t', header=True, index=False, float_format="%.3g") return VCF_HEADER, vcf_body
[docs]def segments2vcf(segments, ploidy, is_reference_male, is_sample_female): """Convert copy number segments to VCF records.""" out_dframe = segments.data.loc[:, ["chromosome", "end", "log2", "probes"]] abs_dframe = call.absolute_dataframe(segments, ploidy, 1.0, is_reference_male, is_sample_female) out_dframe["ncopies"] = np.rint(abs_dframe["absolute"]) idx_losses = (out_dframe["ncopies"] < abs_dframe["expect"]) starts = segments.start.copy() starts[starts == 0] = 1 out_dframe["start"] = starts svlen = segments.end - segments.start svlen[idx_losses] *= -1 out_dframe["svlen"] = svlen out_dframe["svtype"] = "DUP" out_dframe.loc[idx_losses, "svtype"] = "DEL" out_dframe["format"] = "GT:GQ:CN:CNQ" out_dframe.loc[idx_losses, "format"] = "GT:GQ" # :CN:CNQ ? # Reformat this data to create INFO and genotype # TODO be more clever about this for out_row, abs_row in zip(out_dframe.itertuples(index=False), abs_dframe.itertuples(index=False)): if (out_row.ncopies == abs_row.expect or # Survive files from buggy v0.7.1 (#53) not str(out_row.probes).isdigit()): # Skip regions of neutral copy number continue # or "CNV" for subclonal? if out_row.ncopies > abs_row.expect: genotype = "0/1:0:%d:%d" % (out_row.ncopies, out_row.probes) elif out_row.ncopies < abs_row.expect: # TODO XXX handle non-diploid ploidies, haploid chroms if out_row.ncopies == 0: # Complete deletion, 0 copies gt = "1/1" else: # Single copy deletion gt = "0/1" genotype = "%s:%d" % (gt, out_row.probes) info = ";".join(["IMPRECISE", "SVTYPE=%s" % out_row.svtype, "END=%d" % out_row.end, "SVLEN=%d" % out_row.svlen, # CIPOS=-56,20;CIEND=-10,62 ]) yield (out_row.chromosome, out_row.start, '.', 'N', "<%s>" % out_row.svtype, '.', '.', info, out_row.format, genotype)
# _____________________________________________________________________________ # THetA
[docs]def export_theta(tumor, reference): """Convert tumor segments and normal .cnr or reference .cnn to THetA input. Follows the THetA segmentation import script but avoid repeating the pileups, since we already have the mean depth of coverage in each target bin. The options for average depth of coverage and read length do not matter crucially for proper operation of THetA; increased read counts per bin simply increase the confidence of THetA's results. THetA2 input format is tabular, with columns: ID, chrm, start, end, tumorCount, normalCount where chromosome IDs ("chrm") are integers 1 through 24. """ tumor_segs = CNA.read(tumor) ref_cnarr = CNA.read(reference) # Capture parameters in a closure: avg_depth, avg_bin_width # (These two scaling factors don't meaningfully affect THetA's calculation # unless they're too small) avg_depth = 500 # Similar number of reads in on-, off-target bins; treat them equally avg_bin_width = 200 def log2ratio_to_count(log2_ratio, nbins): """Calculate a segment's read count from log2-ratio. Math: nbases = read_length * read_count and nbases = bin_width * read_depth where read_depth = read_depth_ratio * avg_depth So: read_length * read_count = bin_width * read_depth read_count = bin_width * read_depth / read_length """ read_depth = (2 ** log2_ratio) * avg_depth read_count = nbins * avg_bin_width * read_depth / params.READ_LEN return int(round(read_count)) outheader = ["#ID", "chrm", "start", "end", "tumorCount", "normalCount"] outrows = [] # Convert chromosome names to 1-based integer indices prev_chrom = None chrom_id = 0 for seg, subcnarr in ref_cnarr.by_ranges(tumor_segs): if seg.chromosome != prev_chrom: chrom_id += 1 prev_chrom = seg.chromosome fields = format_theta_row(seg, subcnarr, chrom_id, log2ratio_to_count) outrows.append(fields) return outheader, outrows
[docs]def format_theta_row(seg, cnarr, chrom_id, log2_to_count): """Convert a segment's info to a row of THetA input. For the normal/reference bin count, take the mean of the bin values within each segment so that segments match between tumor and normal. """ nbins = seg.probes if "probes" in seg else len(cnarr) tumor_count = log2_to_count(seg.log2, nbins) ref_count = log2_to_count(cnarr.log2.mean(), nbins) # e.g. "start_1_93709:end_1_19208166" row_id = ("start_%d_%d:end_%d_%d" % (chrom_id, seg.start, chrom_id, seg.end)) return (row_id, # ID chrom_id, # chrm seg.start, # start seg.end, # end tumor_count, # tumorCount ref_count # normalCount )
# _____________________________________________________________________________ EXPORT_FORMATS = { 'cdt': fmt_cdt, # 'gct': fmt_gct, 'jtv': fmt_jtv, 'nexus-basic': export_nexus_basic, 'nexus-ogt': export_nexus_ogt, 'seg': export_seg, 'theta': export_theta, 'vcf': export_vcf, }