"""DataFrame-level intersection operations.
Calculate overlapping regions, similar to bedtools intersect.
The functions here operate on pandas DataFrame and Series instances, not
GenomicArray types.
"""
from __future__ import print_function, absolute_import, division
from builtins import zip
from past.builtins import basestring
import numpy as np
import pandas as pd
from .combiners import first_of, join_strings, make_const
[docs]def by_ranges(table, other, mode, keep_empty):
"""Group rows by another GenomicArray's bin coordinate ranges."""
for _chrom, bin_rows, src_rows in by_shared_chroms(other, table,
keep_empty):
if src_rows is not None:
subranges = iter_ranges(src_rows, None, bin_rows['start'],
bin_rows['end'], mode)
for bin_row, subrange in zip(bin_rows.itertuples(index=False),
subranges):
yield bin_row, subrange
elif keep_empty:
for bin_row in bin_rows.itertuples(index=False):
yield bin_row, [] # ENH: empty dframe matching table
[docs]def by_shared_chroms(table, other, keep_empty=True):
other_chroms = {c: o for c, o in other.groupby(['chromosome'], sort=False)}
for chrom, ctable in table.groupby(['chromosome'], sort=False):
if chrom in other_chroms:
otable = other_chroms[chrom]
yield chrom, ctable, otable
elif keep_empty:
yield chrom, ctable, None
[docs]def into_ranges(source, dest, src_col, default, summary_func):
"""Group a column in `source` by regions in `dest` and summarize."""
if not len(source) or not len(dest):
return dest
if summary_func is None:
# Choose a type-appropriate summary function
elem = source[src_col].iat[0]
if isinstance(elem, (basestring, np.string_)):
summary_func = join_strings
elif isinstance(elem, (float, np.float_)):
summary_func = np.nanmedian
else:
summary_func = first_of
elif not callable(summary_func):
# Just fill in the given value, I suppose.
summary_func = make_const(summary_func)
def series2value(ser):
if len(ser) == 1:
return ser.iat[0]
return summary_func(ser)
return pd.Series([(series2value(src_rows[src_col])
if len(src_rows) else default)
for _bin, src_rows in by_ranges(source, dest, 'outer',
True)])
[docs]def iter_ranges(table, chrom, starts, ends, mode):
"""Iterate through sub-ranges."""
assert mode in ('inner', 'outer', 'trim')
# Optional if we've already subsetted by chromosome (not checked!)
if chrom:
assert isinstance(chrom, basestring) # ENH: accept array?
try:
table = table[table['chromosome'] == chrom]
except KeyError:
raise KeyError("Chromosome %s is not in this probe set" % chrom)
# Edge cases
if not len(table) or (starts is None and ends is None):
yield table
raise StopIteration
# Don't be fooled by nested bins
if ((ends is not None and len(ends)) and
(starts is not None and len(starts))
) and not _monotonic(table.end):
# At least one bin is fully nested -- account for it
irange_func = _irange_nested
else:
irange_func = _irange_simple
for region_idx, start_val, end_val in irange_func(table, starts, ends, mode):
subtable = table.iloc[region_idx]
if mode == 'trim':
subtable = subtable.copy()
# Update 5' endpoints to the boundary
if start_val:
subtable.start = subtable.start.clip_lower(start_val)
# Update 3' endpoints to the boundary
if end_val:
subtable.end = subtable.end.clip_upper(end_val)
yield subtable
def _irange_simple(table, starts, ends, mode):
"""Slice subsets of table when regions are not nested."""
if starts is not None and len(starts):
if mode == 'inner':
# Only rows entirely after the start point
start_idxs = table.start.searchsorted(starts)
else:
# Include all rows overlapping the start point
start_idxs = table.end.searchsorted(starts, 'right')
else:
starts = np.zeros(len(ends) if ends is not None else 1,
dtype=np.int_)
start_idxs = starts.copy()
if ends is not None and len(ends):
if mode == 'inner':
end_idxs = table.end.searchsorted(ends, 'right')
else:
end_idxs = table.start.searchsorted(ends)
else:
end_idxs = np.repeat(len(table), len(starts))
ends = [None] * len(starts)
for start_idx, start_val, end_idx, end_val in zip(start_idxs, starts,
end_idxs, ends):
yield (slice(start_idx, end_idx), start_val, end_val)
def _irange_nested(table, starts, ends, mode):
"""Slice subsets of table when regions are nested."""
# ENH: Binary Interval Search (BITS) or Layer&Quinlan(2015)
assert len(starts) == len(ends) > 0
for start_val, end_val in zip(starts, ends):
# Mask of table rows to keep for this query region
region_mask = np.ones(len(table), dtype=np.bool_)
if start_val:
if mode == 'inner':
# Only rows entirely after the start point
start_idx = table.start.searchsorted(start_val)
region_mask[:start_idx.item()] = 0
else:
# Include all rows overlapping the start point
region_mask = (table.end.values > start_val)
if end_val is not None:
if mode == 'inner':
# Only rows up to the end point
region_mask &= (table.end.values <= end_val)
else:
# Include all rows overlapping the end point
end_idx = table.start.searchsorted(end_val)
region_mask[end_idx.item():] = 0
yield region_mask, start_val, end_val
[docs]def venn(table, other, mode):
# TODO -- implement 'venn' via fjoin algorithm
# 'cut' table at all 'other' boundaries
# -> extra column '_venn_':int (0, 1, 2)
# 0=self only, 1=both, 2=other only
# -> 'cut' just drops the '_venn_' column
# -> 'subtract' drops 1 and 2?
# (is that faster? prolly not)
# -> 'jaccard' does math with it...
return table
# Shim for pandas 0.18.1 (chapmanb/bcbio-nextgen#1836)
if hasattr(pd.Series, 'is_monotonic_increasing'):
def _monotonic(ser):
return ser.is_monotonic_increasing
else:
def _monotonic(ser):
return (np.diff(ser) >= 0).all()