Source code for cnvlib.importers

"""Import from other formats to the CNVkit format."""
from __future__ import absolute_import, division, print_function
import math
import os.path
import subprocess

from . import core
from .params import NULL_LOG2_COVERAGE
from .ngfrills import echo
from .cnarray import CopyNumArray as CNA


# __________________________________________________________________________
# import-picard

TOO_MANY_NO_COVERAGE = 100

[docs]def find_picard_files(file_and_dir_names): """Search the given paths for 'targetcoverage' CSV files. Per the convention we use in our Picard applets, the target coverage file names end with '.targetcoverage.csv'; anti-target coverages end with '.antitargetcoverage.csv'. """ filenames = [] for tgt in file_and_dir_names: if os.path.isdir(tgt): # Collect the target coverage files from this directory tree fnames = subprocess.check_output(['find', tgt, '-name', '*targetcoverage.csv'] ).splitlines() if not fnames: raise RuntimeError("Given directory %s does not contain any " "'*targetcoverage.csv' files." % tgt) filenames.extend(fnames) elif os.path.isfile(tgt): filenames.append(tgt) else: raise ValueError("Given path is neither a file nor a directory: %s" % tgt) filenames.sort() return filenames
[docs]def load_targetcoverage_csv(fname): """Parse a target or antitarget coverage file (.csv) into a CopyNumArray. These files are generated by Picard CalculateHsMetrics. The fields of the .csv files are actually separated by tabs, not commas. CSV column names: chrom (str), start, end, length (int), name (str), %gc, mean_coverage, normalized_coverage (float) """ cna_rows = [] no_cvg_cnt = 0 # Count probes with no coverage for row in core.parse_tsv(fname): assert len(row) >= 8, \ "Bad line in {}:\n{}".format(fname, row) chrom = row[0] start = int(row[1]) end = int(row[2]) gene = unpipe_name(row[4]) gc_val = float(row[5]) cvg = float(row[7]) # Coverage already normalized to avg. 1 by Picard if cvg == 0: no_cvg_cnt += 1 coverage = NULL_LOG2_COVERAGE else: coverage = math.log(cvg, 2) cna_rows.append((chrom, start, end, gene, coverage, gc_val)) if no_cvg_cnt > TOO_MANY_NO_COVERAGE: echo("*WARNING* Sample", fname, "has >", TOO_MANY_NO_COVERAGE, "probes with no coverage") pset = CNA.from_rows(core.fbase(fname), cna_rows, ('gc',)) pset.sort() return pset
[docs]def unpipe_name(name): """Fix the duplicated gene names Picard spits out. Return a string containing the single gene name, sans duplications and pipe characters. Picard CalculateHsMetrics combines the labels of overlapping intervals by joining all labels with '|', e.g. 'BRAF|BRAF' -- no two distinct targeted genes actually overlap, though, so these dupes are redundant. Also, in our convention, 'CGH' probes are selected intergenic regions, not meaningful gene names, so 'CGH|FOO' resolves as 'FOO'. """ gene_names = set(name.split('|')) if len(gene_names) > 1: if 'CGH' in gene_names and len(gene_names) == 2: gene_names.remove('CGH') else: echo("*WARNING* Ambiguous gene name:", name) return gene_names.pop() # __________________________________________________________________________ # import-seg
LOG2_10 = math.log(10, 2) # To convert log10 values to log2
[docs]def import_seg(segfname, chrom_names, chrom_prefix, from_log10): """Parse a SEG file. Emit pairs of (sample ID, CopyNumArray) Values are converted from log10 to log2. `chrom_names`: Map (string) chromosome IDs to names. (Applied before chrom_prefix.) e.g. {'23': 'X', '24': 'Y', '25': 'M'} `chrom_prefix`: prepend this string to chromosome names (usually 'chr' or None) """ curr_sample = None curr_rows = [] with open(segfname) as infile: lines = iter(infile) next(lines) # Skip the header for line in lines: sample, chrom, start, end, nprobes, mean = line.split() if chrom_names and chrom in chrom_names: chrom = chrom_names[chrom] if chrom_prefix: chrom = chrom_prefix + chrom mean = float(mean) if from_log10: mean *= LOG2_10 this_row = (chrom, int(start), int(end), ("G" if mean >= 0 else "L"), mean, int(nprobes)) if curr_sample != sample: if curr_sample is not None: assert len(curr_rows) # Emit the current set of segments as a sample yield CNA.from_rows(curr_sample, curr_rows, ('probes',)) # Reset curr_sample = sample curr_rows = [this_row] else: # Continue building this sample curr_rows.append(this_row) # Remainder if curr_sample is not None: assert len(curr_rows) # Emit the current set of segments as a sample yield CNA.from_rows(curr_sample, curr_rows, ('probes',)) # __________________________________________________________________________ # import-theta
[docs]def parse_theta_results(fname): """Parse THetA results into a data structure. Columns: NLL, mu, C, p* """ with open(fname) as handle: header = next(handle).rstrip().split('\t') body = next(handle).rstrip().split('\t') assert len(body) == len(header) == 4 # NLL nll = float(body[0]) # mu mu = body[1].split(',') mu_normal = float(mu[0]) mu_tumors = map(float, mu[1:]) # C copies = body[2].split(':') if len(mu_tumors) == 1: # 1D array of integers # Replace X with None for "missing" copies = [[int(c) if c.isdigit() else None for c in copies]] else: # List of lists of integer-or-None (usu. 2 x #segments) copies = [[int(c) if c.isdigit() else None for c in subcop] for subcop in zip(*[c.split(',') for c in copies])] # p* probs = body[3].split(',') if len(mu_tumors) == 1: # 1D array of floats, or None for "X" (missing/unknown) probs = [float(p) if not p.isalpha() else None for p in probs] else: probs = [[float(p) if not p.isalpha() else None for p in subprob] for subprob in zip(*[p.split(',') for p in probs])] return {"NLL": nll, "mu_normal": mu_normal, "mu_tumors": mu_tumors, "C": copies, "p*": probs}