"""Supporting functions for the 'antitarget' command."""
from __future__ import absolute_import, division
import sys
from itertools import groupby
from Bio._py3k import range
iteritems = (dict.iteritems if sys.version_info[0] < 3 else dict.items)
from . import core, ngfrills
from .params import INSERT_SIZE
[docs]def get_background(target_bed, access_bed, avg_bin_size, min_bin_size):
"""Generate background intervals from target intervals.
Procedure:
- Invert target intervals
- Subtract the inverted targets from accessible regions
- For each of the resulting regions:
- Shrink by a fixed margin on each end
- If it's smaller than min_bin_size, skip
- Divide into equal-size (region_size/avg_bin_size) portions
- Emit the (chrom, start, end) coords of each portion
"""
target_chroms = group_coords(ngfrills.parse_regions(target_bed, True))
if access_bed:
# Chromosome accessible sequence regions are given -- use them
access_chroms = group_coords(ngfrills.parse_regions(access_bed, True))
else:
# Chromosome accessible sequence regions not known -- use heuristics
# (chromosome length is endpoint of last probe; skip initial
# <magic number> of bases that are probably telomeric)
TELOMERE_SIZE = 150000
access_chroms = guess_chromosome_regions(target_chroms, TELOMERE_SIZE)
backgrounds = find_background_regions(access_chroms, target_chroms,
2 * INSERT_SIZE)
# Emit regions as antitarget bins according to avg_bin_size and min_bin_size
for chrom, start, end in sorted(backgrounds,
key=core.sorter_chrom_at(0)):
span = end - start
if span >= min_bin_size:
nbins = int(round(span / avg_bin_size)) or 1
if nbins == 1:
yield (chrom, start, end)
else:
# Divide the background region into equal-sized bins
bin_size = span / nbins
bin_start = start
bin_end = None
for i in range(1, nbins):
bin_end = start + int(i * bin_size)
yield (chrom, bin_start, bin_end)
bin_start = bin_end
yield (chrom, bin_start, end)
[docs]def group_coords(coordinates):
"""Group chromosomal coordinates into a dictionary."""
out = {}
for chrom, coords in groupby(coordinates, lambda cse: cse[0]):
out[chrom] = [(start, end) for _chrom, start, end in coords]
return out
[docs]def guess_chromosome_regions(target_chroms, telomere_size):
"""Determine (minimum) chromosome lengths from target coordinates."""
out = {}
for chrom, target_region in iteritems(target_chroms):
endpoint = max(end for _start, end in target_region)
# Put the tuple inside a list for compatibility w/ group_coords
out[chrom] = [(telomere_size, endpoint)]
return out
[docs]def find_background_regions(access_chroms, target_chroms, pad_size):
"""Take coordinates of accessible regions and targets; emit antitargets.
Note that a chromosome must be present in the target library in order to be
included in the antitargets generated here. So, if chrY is missing from
your output files, it's probably because it had no targets.
"""
for chrom in target_chroms:
target_regions = iter(target_chroms[chrom])
# Split each access_region at the targets it contains
tgt_start, tgt_end = next(target_regions)
assert tgt_start < tgt_end
for acc_start, acc_end in access_chroms[chrom]:
if acc_end - acc_start <= 2 * pad_size:
# Accessible region is way too small
continue
bg_start = acc_start + pad_size
while tgt_start < acc_end:
# There may be at least one more target in this region
if tgt_end + pad_size > bg_start:
# Yes, there is a target in this region
if tgt_start - pad_size > bg_start:
# Split the background region at this target
yield (chrom, bg_start, tgt_start - pad_size)
bg_start = tgt_end + pad_size
# Done splitting that target; is there another?
try:
tgt_start, tgt_end = next(target_regions)
except StopIteration:
# Ensure all the remaining access_regions are unbroken
tgt_start = tgt_end = float('Inf')
# No remaining targets in this region - emit the rest of it
if acc_end - pad_size - bg_start > 0:
yield (chrom, bg_start, acc_end - pad_size)