"""Transform bait intervals into targets more suitable for CNVkit."""
from __future__ import division, absolute_import
import logging
import collections
from itertools import groupby
# _____________________________________________________________________________
# Assign refFlat gene names to target intervals
# ftp://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/
[docs]def add_refflat_names(region_rows, refflat_fname):
"""Apply RefSeq gene names to a list of targeted regions."""
return assign_names(region_rows, refflat_fname)
[docs]def assign_names(region_rows, refflat_fname, default_name='-'):
"""Replace the interval gene names with those at the same loc in refFlat.txt
"""
ref_genes = read_refflat_genes(refflat_fname)
for (chrom, strand), chr_rows in groupby(sorted(region_rows,
key=lambda row: (
row[0], # chromosome
row[4], # strand
row[1], # start
row[2])), # end
lambda row: (row[0], row[4])):
if strand == '.':
strand = "+|-"
if (chrom, strand) in ref_genes:
genes_in_chrom = iter(ref_genes[(chrom, strand)])
elif '|' in strand:
strands_with_genes = []
for std in strand.split('|'):
if (chrom, std) in ref_genes:
strands_with_genes.extend(ref_genes[(chrom, std)])
if strands_with_genes:
genes_in_chrom = iter(sorted(strands_with_genes))
else:
logging.info("Chromosome %s %s not in annotations",
chrom, strand)
continue
else:
logging.info("Chromosome %s %s not in annotations",
chrom, strand)
continue
ex_start, ex_end, ex_name = next(genes_in_chrom)
for row in chr_rows:
start, end = row[1:3]
if ex_end < start:
# Burn through genes until hitting or passing the interval
while ex_end < start:
try:
ex_start, ex_end, ex_name = next(genes_in_chrom)
except StopIteration:
# Interval is past the last annotated gene in chromosome
# Fake it...
ex_start, ex_end = end + 1, end + 2
ex_name = default_name
if ex_start > end:
# Interval is an unannotated intergenic region (we skipped it)
yield (chrom, start, end, default_name)
else:
assert ex_end > start or ex_start < end
# Overlap: Use this gene's name and emit the interval
yield (chrom, start, end, ex_name)
[docs]def read_refflat_genes(fname):
"""Parse genes; merge those with same name and overlapping regions.
Returns a dict of: {(chrom, strand): [(gene start, gene end, gene name), ...]}
"""
genedict = collections.defaultdict(list)
with open(fname) as genefile:
for line in genefile:
name, _rx, chrom, strand, start, end, _ex = parse_refflat_line(line)
# Skip antisense RNA annotations
if name.endswith('-AS1'):
continue
assert strand in '+-'
genedict[name].append((chrom, strand, start, end))
chromdict = collections.defaultdict(list)
for name, locs in genedict.iteritems():
locs = list(set(locs))
if len(locs) == 1:
chrom, strand, start, end = locs[0]
chromdict[(chrom, strand)].append((start, end, name))
else:
# Check locs for overlap; merge if so
locs.sort()
curr_chrom, curr_strand, curr_start, curr_end = locs[0]
curr_start, curr_end = int(curr_start), int(curr_end)
for chrom, strand, start, end in locs[1:]:
start, end = int(start), int(end)
if (chrom != curr_chrom or
strand != curr_strand or
start > curr_end + 1 or
end < curr_start - 1):
chromdict[(curr_chrom, curr_strand)].append(
(curr_start, curr_end, name))
curr_chrom, curr_start, curr_end = chrom, start, end
else:
curr_start = min(start, curr_start)
curr_end = max(end, curr_end)
# Emit the final interval in this group
chromdict[(curr_chrom, curr_strand)].append(
(curr_start, curr_end, name))
# Finally, sort the keys, and return the results
for key in list(chromdict):
chromdict[key].sort()
return chromdict
[docs]def parse_refflat_line(line):
"""Parse one line of refFlat.txt; return relevant info.
Pair up the exon start and end positions. Add strand direction to each
chromosome as a "+"/"-" suffix (it will be stripped off later).
"""
fields = line.split('\t')
name, refseq, chrom, strand = fields[:4]
start, end = fields[4:6]
# start, end = fields[6:8]
exon_count, exon_starts, exon_ends = fields[8:11]
exons = zip(exon_starts.rstrip(',').split(','),
exon_ends.rstrip(',').split(','))
assert len(exons) == int(exon_count), (
"Exon count mismatch at %s: file says %s, but counted %d intervals"
% (name, exon_count, len(exons)))
return name, refseq, chrom, strand, int(start), int(end), exons
# def cover_introns(regions):
# """Apply the name of the surrounding gene to intronic targets.
# If an unnamed target is surrounded by targets that have the same name,
# consider it an intron and apply that gene name to the unnamed target.
# Otherwise, leave it unnamed ("-").
# """
# last_gene = None
# last_chrom = None
# queue = []
# for chrom, start, end, name in regions:
# if chrom != last_chrom:
# # Queue was not intronic (3'-terminal of last chrom)
# for row in queue:
# yield row
# queue = []
# last_chrom = chrom
# last_gene = name if name != '-' else None
# # Track genes; smooth over intronic '-' regions
# if name == '-':
# # Not sure if intronic -- enqueue
# queue.append((chrom, start, end, name))
# else:
# if name == last_gene:
# # Queued row(s) were intronic
# if queue:
# for cm, st, ed, _nm in queue:
# yield cm, st, ed, name
# queue = []
# else:
# # Queued row(s) were intergenic
# for row in queue:
# yield row
# queue = []
# last_gene = name
# yield chrom, start, end, name
# # Remainder
# for row in queue:
# yield row
# _____________________________________________________________________________
# Clean/shorten interval names
[docs]def shorten_labels(interval_rows):
"""Reduce multi-accession interval labels to the minimum consistent.
So: BED or interval_list files have a label for every region. We want this
to be a short, unique string, like the gene name. But if an interval list is
instead a series of accessions, including additional accessions for
sub-regions of the gene, we can extract a single accession that covers the
maximum number of consecutive regions that share this accession.
e.g.
...
chr1 879125 879593 + mRNA|JX093079,ens|ENST00000342066,mRNA|JX093077,ref|SAMD11,mRNA|AF161376,mRNA|JX093104
chr1 880158 880279 + ens|ENST00000483767,mRNA|AF161376,ccds|CCDS3.1,ref|NOC2L
...
becomes:
chr1 879125 879593 + mRNA|AF161376
chr1 880158 880279 + mRNA|AF161376
"""
longest_name_len = 0
curr_names = set()
curr_gene_coords = []
for row in interval_rows:
next_coords = row[:-1]
next_names = set(row[-1].rstrip().split(','))
assert len(next_names)
overlap = curr_names.intersection(next_names)
if overlap:
# Continuing the same gene; update shared accessions
curr_names = filter_names(overlap)
curr_gene_coords.append(next_coords)
else:
# End of the old gene -- emit coords + shared name(s)
for coords in curr_gene_coords:
out_row = emit(coords, curr_names)
yield out_row
longest_name_len = max(longest_name_len, len(out_row[-1]))
# Start of a new gene
curr_gene_coords = [next_coords]
curr_names = next_names
# Final emission
for coords in curr_gene_coords:
out_row = emit(coords, curr_names)
yield out_row
longest_name_len = max(longest_name_len, len(out_row[-1]))
logging.info("Longest name length: %d", longest_name_len)
[docs]def filter_names(names, exclude=('mRNA',)):
"""Remove less-meaningful accessions from the given set."""
if len(names) > 1:
ok_names = set(n for n in names
if not any(n.startswith(ex) for ex in exclude))
if ok_names:
return ok_names
# Names are not filter-worthy; leave them as they are for now
return names
[docs]def emit(coords, names):
"""Try filtering names again. Format the row for yielding."""
try:
final_name = shortest_name(filter_names(names))
out_row = list(coords) + [final_name]
except ValueError:
raise ValueError(coords)
return out_row
[docs]def shortest_name(names):
"""Return the shortest trimmed name from the given set."""
name = min(names, key=len)
if len(name) > 2 and '|' in name[1:-1]:
# Split 'DB|accession' and extract the accession sans-DB
name = name.split('|')[-1]
return name
# _____________________________________________________________________________
# Split targets
# ENH - fix overlapping bins at midpoint of overlap? try to equalize bin size?
# - or just concatenate the bins & divide evenly?
# - do this for adjacent bins too?
# e.g. --balance-adjacent
[docs]def split_targets(region_rows, avg_size):
"""Split large tiled intervals into smaller, consecutive targets.
For each of the tiled regions:
- Divide into equal-size (tile_size/avg_size) portions
- Emit the (chrom, start, end) coords of each portion
Bin the regions according to avg_size.
"""
prev_chrom = None
prev_start = None
prev_end = -1
for chrom, start, end, name in region_rows:
if (chrom, start, end) == (prev_chrom, prev_start, prev_end):
# Skip duplicate regions, even if names differ
# ENH - Update the name?
logging.info("Duplicate row %s %s:%d-%d", name, chrom, start, end)
continue
if chrom == prev_chrom and start <= prev_end:
if end > prev_end:
# Partial overlap: already handled the initial part of this bin
start = prev_end
else:
# Complete overlap: nothing to do on this bin
logging.info("Bin %s %s:%s-%s fully covered by previous bin"
"%s:%s-%s",
name, chrom, start, end,
prev_chrom, prev_start, prev_end)
continue
prev_chrom = chrom
prev_start = start
span = end - start
if span >= avg_size * 1.5:
logging.info("Splitting: %s %s",
name.ljust(15), str(end - start + 1).rjust(6))
# Divide the background region into equal-sized bins
nbins = round(span / avg_size) or 1
bin_size = span / nbins
# Locate & emit background bins
bin_start = start
while bin_start < (end - 2):
bin_end = bin_start + bin_size
yield (chrom, int(bin_start), int(bin_end), name)
prev_end = int(bin_end)
bin_start = bin_end
else:
yield (chrom, start, end, name)
prev_end = end