Source code for skgenome.intersect

"""DataFrame-level intersection operations.

Calculate overlapping regions, similar to bedtools intersect.

The functions here operate on pandas DataFrame and Series instances, not
GenomicArray types.

"""
from __future__ import print_function, absolute_import, division
from builtins import range, zip
from past.builtins import basestring

import numpy as np
import pandas as pd
from pandas.core.index import Int64Index

from .combiners import first_of, join_strings, make_const


[docs]def by_ranges(table, other, mode, keep_empty): """Group rows by another GenomicArray's bin coordinate ranges.""" for _chrom, bin_rows, src_rows in by_shared_chroms(other, table, keep_empty): if src_rows is not None: subranges = iter_ranges(src_rows, None, bin_rows['start'], bin_rows['end'], mode) for bin_row, subrange in zip(bin_rows.itertuples(index=False), subranges): yield bin_row, subrange elif keep_empty: for bin_row in bin_rows.itertuples(index=False): yield bin_row, [] # ENH: empty dframe matching table
[docs]def by_shared_chroms(table, other, keep_empty=True): if table['chromosome'].is_unique and other['chromosome'].is_unique: yield table['chromosome'].iat[0], table, other # yield None, table, other else: other_chroms = {c: o for c, o in other.groupby(['chromosome'], sort=False)} for chrom, ctable in table.groupby(['chromosome'], sort=False): if chrom in other_chroms: otable = other_chroms[chrom] yield chrom, ctable, otable elif keep_empty: yield chrom, ctable, None
[docs]def into_ranges(source, dest, src_col, default, summary_func): """Group a column in `source` by regions in `dest` and summarize.""" if not len(source) or not len(dest): return dest if summary_func is None: # Choose a type-appropriate summary function elem = source[src_col].iat[0] if isinstance(elem, (basestring, np.string_)): summary_func = join_strings elif isinstance(elem, (float, np.float_)): summary_func = np.nanmedian else: summary_func = first_of elif not callable(summary_func): # Just fill in the given value, I suppose. summary_func = make_const(summary_func) def series2value(ser): if len(ser) == 0: return default if len(ser) == 1: return ser.iat[0] return summary_func(ser) column = source[src_col] result = [series2value(column[slc]) for slc in iter_slices(source, dest, 'outer', True)] return pd.Series(result)
[docs]def iter_ranges(table, chrom, starts, ends, mode): """Iterate through sub-ranges.""" assert mode in ('inner', 'outer', 'trim') if chrom: assert isinstance(chrom, basestring) # ENH: accept array? try: table = table[table['chromosome'] == chrom] except KeyError: raise KeyError("Chromosome %s is not in this probe set" % chrom) for region_idx, start_val, end_val in idx_ranges(table, None, starts, ends, 'inner' if mode == 'inner' else 'outer'): subtable = table.loc[region_idx] if mode == 'trim': subtable = subtable.copy() # Update 5' endpoints to the boundary if start_val: subtable.start = subtable.start.clip_lower(start_val) # Update 3' endpoints to the boundary if end_val: subtable.end = subtable.end.clip_upper(end_val) yield subtable
[docs]def iter_slices(table, other, mode, keep_empty): """Yields indices to extract ranges from `table`. Returns an iterable of integer arrays that can apply to Series objects, i.e. columns of `table`. These indices are of the DataFrame/Series' Index, not array coordinates -- so be sure to use DataFrame.loc, Series.loc, or Series getitem, as opposed to .iloc or indexing directly into Numpy arrays. """ for _c, bin_rows, src_rows in by_shared_chroms(other, table, keep_empty): if src_rows is None: # Emit empty indices since 'table' is missing this chromosome for _ in range(len(bin_rows)): yield Int64Index([]) else: for slc, _s, _e in idx_ranges(src_rows, None, bin_rows.start, bin_rows.end, mode): indices = src_rows.index[slc].values if keep_empty or len(indices): yield indices
[docs]def idx_ranges(table, chrom, starts, ends, mode): """Iterate through sub-ranges.""" assert mode in ('inner', 'outer') # Optional if we've already subsetted by chromosome (not checked!) if chrom: assert isinstance(chrom, basestring) # ENH: accept array? try: table = table[table['chromosome'] == chrom] except KeyError: raise KeyError("Chromosome %s is not in this probe set" % chrom) # Edge cases if not len(table) or (starts is None and ends is None): yield table.index, None, None else: # Don't be fooled by nested bins if ((ends is not None and len(ends)) and (starts is not None and len(starts)) ) and not _monotonic(table.end): # At least one bin is fully nested -- account for it irange_func = _irange_nested else: irange_func = _irange_simple for region_idx, start_val, end_val in irange_func(table, starts, ends, mode): yield region_idx, start_val, end_val
def _irange_simple(table, starts, ends, mode): """Slice subsets of table when regions are not nested.""" if starts is not None and len(starts): if mode == 'inner': # Only rows entirely after the start point start_idxs = table.start.searchsorted(starts) else: # Include all rows overlapping the start point start_idxs = table.end.searchsorted(starts, 'right') else: starts = np.zeros(len(ends) if ends is not None else 1, dtype=np.int_) start_idxs = starts.copy() if ends is not None and len(ends): if mode == 'inner': end_idxs = table.end.searchsorted(ends, 'right') else: end_idxs = table.start.searchsorted(ends) else: end_idxs = np.repeat(len(table), len(starts)) ends = [None] * len(starts) for start_idx, start_val, end_idx, end_val in zip(start_idxs, starts, end_idxs, ends): yield (slice(start_idx, end_idx), start_val, end_val) def _irange_nested(table, starts, ends, mode): """Slice subsets of table when regions are nested.""" # ENH: Binary Interval Search (BITS) or Layer&Quinlan(2015) assert len(starts) == len(ends) > 0 for start_val, end_val in zip(starts, ends): # Mask of table rows to keep for this query region region_mask = np.ones(len(table), dtype=np.bool_) if start_val: if mode == 'inner': # Only rows entirely after the start point start_idx = table.start.searchsorted(start_val) region_mask[:int(start_idx)] = 0 else: # Include all rows overlapping the start point region_mask = (table.end.values > start_val) if end_val is not None: if mode == 'inner': # Only rows up to the end point region_mask &= (table.end.values <= end_val) else: # Include all rows overlapping the end point end_idx = table.start.searchsorted(end_val) region_mask[int(end_idx):] = 0 yield region_mask, start_val, end_val
[docs]def venn(table, other, mode): # TODO -- implement 'venn' via fjoin algorithm # 'cut' table at all 'other' boundaries # -> extra column '_venn_':int (0, 1, 2) # 0=self only, 1=both, 2=other only # -> 'cut' just drops the '_venn_' column # -> 'subtract' drops 1 and 2? # (is that faster? probably not) # -> 'jaccard' does math with it... return table
# Shim for pandas 0.18.1 (chapmanb/bcbio-nextgen#1836) if hasattr(pd.Series, 'is_monotonic_increasing'): def _monotonic(ser): return ser.is_monotonic_increasing else: def _monotonic(ser): return (np.diff(ser) >= 0).all()