Source code for skgenome.intersect

"""DataFrame-level intersection operations.

Calculate overlapping regions, similar to bedtools intersect.

The functions here operate on pandas DataFrame and Series instances, not
GenomicArray types.

"""
import numpy as np
import pandas as pd
from pandas import Int64Index

from .combiners import first_of, join_strings, make_const


[docs]def by_ranges(table, other, mode, keep_empty): """Group rows by another GenomicArray's bin coordinate ranges.""" for _chrom, bin_rows, src_rows in by_shared_chroms(other, table, keep_empty): if src_rows is not None: subranges = iter_ranges(src_rows, None, bin_rows['start'], bin_rows['end'], mode) for bin_row, subrange in zip(bin_rows.itertuples(index=False), subranges): yield bin_row, subrange elif keep_empty: for bin_row in bin_rows.itertuples(index=False): yield bin_row, [] # ENH: empty dframe matching table
[docs]def by_shared_chroms(table, other, keep_empty=True): if table['chromosome'].is_unique and other['chromosome'].is_unique: yield table['chromosome'].iat[0], table, other # yield None, table, other else: other_chroms = {c: o for c, o in other.groupby(['chromosome'], sort=False)} for chrom, ctable in table.groupby(['chromosome'], sort=False): if chrom in other_chroms: otable = other_chroms[chrom] yield chrom, ctable, otable elif keep_empty: yield chrom, ctable, None
[docs]def into_ranges(source, dest, src_col, default, summary_func): """Group a column in `source` by regions in `dest` and summarize.""" if not len(source) or not len(dest): return dest if summary_func is None: # Choose a type-appropriate summary function elem = source[src_col].iat[0] if isinstance(elem, (str, np.string_)): summary_func = join_strings elif isinstance(elem, (float, np.float_)): summary_func = np.nanmedian else: summary_func = first_of elif not callable(summary_func): # Just fill in the given value, I suppose. summary_func = make_const(summary_func) def series2value(ser): if len(ser) == 0: return default if len(ser) == 1: return ser.iat[0] return summary_func(ser) column = source[src_col] result = [series2value(column[slc]) for slc in iter_slices(source, dest, 'outer', True)] return pd.Series(result)
[docs]def iter_ranges(table, chrom, starts, ends, mode): """Iterate through sub-ranges.""" assert mode in ('inner', 'outer', 'trim') # Optional if we've already subsetted by chromosome (not checked!) if chrom: assert isinstance(chrom, str) # ENH: accept array? try: table = table[table['chromosome'] == chrom] except KeyError: raise KeyError("Chromosome %s is not in this probe set" % chrom) for region_idx, start_val, end_val in idx_ranges(table, starts, ends, 'inner' if mode == 'inner' else 'outer'): subtable = table.iloc[region_idx] if mode == 'trim': subtable = subtable.copy() # Update 5' endpoints to the boundary if start_val: subtable.start = subtable.start.clip(lower=start_val) # Update 3' endpoints to the boundary if end_val: subtable.end = subtable.end.clip(upper=end_val) yield subtable
[docs]def iter_slices(table, other, mode, keep_empty): """Yields indices to extract ranges from `table`. Returns an iterable of integer arrays that can apply to Series objects, i.e. columns of `table`. These indices are of the DataFrame/Series' Index, not array coordinates -- so be sure to use DataFrame.loc, Series.loc, or Series getitem, as opposed to .iloc or indexing directly into Numpy arrays. """ for _c, bin_rows, src_rows in by_shared_chroms(other, table, keep_empty): if src_rows is None: # Emit empty indices since 'table' is missing this chromosome for _ in range(len(bin_rows)): yield Int64Index([]) else: for slc, _s, _e in idx_ranges(src_rows, bin_rows.start, bin_rows.end, mode): indices = src_rows.index[slc].values if keep_empty or len(indices): yield indices
[docs]def idx_ranges(table, starts, ends, mode): """Iterate through sub-ranges.""" assert mode in ('inner', 'outer') # Edge cases: when the `table` is either empty, or both `starts` and `ends` are None, we want to signal the calling # function to use the entire table. To do this, we return slice(None), which, when passed to either .loc or .iloc, # will do just this. We cannot pass table.index to accomplish this because it will not work with .iloc if the table # is already subset by chromosome. if not len(table) or (starts is None and ends is None): yield slice(None), None, None else: # Don't be fooled by nested bins if ((ends is not None and len(ends)) and (starts is not None and len(starts)) ) and not table.end.is_monotonic_increasing: # At least one bin is fully nested -- account for it irange_func = _irange_nested else: irange_func = _irange_simple for region_idx, start_val, end_val in irange_func(table, starts, ends, mode): yield region_idx, start_val, end_val
def _irange_simple(table, starts, ends, mode): """Slice subsets of table when regions are not nested.""" if starts is not None and len(starts): if mode == 'inner': # Only rows entirely after the start point start_idxs = table.start.searchsorted(starts) else: # Include all rows overlapping the start point start_idxs = table.end.searchsorted(starts, 'right') else: starts = np.zeros(len(ends) if ends is not None else 1, dtype=np.int_) start_idxs = starts.copy() if ends is not None and len(ends): if mode == 'inner': end_idxs = table.end.searchsorted(ends, 'right') else: end_idxs = table.start.searchsorted(ends) else: end_idxs = np.repeat(len(table), len(starts)) ends = [None] * len(starts) for start_idx, start_val, end_idx, end_val in zip(start_idxs, starts, end_idxs, ends): yield (slice(start_idx, end_idx), start_val, end_val) def _irange_nested(table, starts, ends, mode): """Slice subsets of table when regions are nested.""" # ENH: Binary Interval Search (BITS) or Layer&Quinlan(2015) assert len(starts) == len(ends) > 0 for start_val, end_val in zip(starts, ends): # Mask of table rows to keep for this query region region_mask = np.ones(len(table), dtype=np.bool_) if start_val: if mode == 'inner': # Only rows entirely after the start point start_idx = table.start.searchsorted(start_val) region_mask[:int(start_idx)] = 0 else: # Include all rows overlapping the start point region_mask = (table.end.values > start_val) if end_val is not None: if mode == 'inner': # Only rows up to the end point region_mask &= (table.end.values <= end_val) else: # Include all rows overlapping the end point end_idx = table.start.searchsorted(end_val) region_mask[int(end_idx):] = 0 yield region_mask, start_val, end_val
[docs]def venn(table, other, mode): # TODO -- implement 'venn' via fjoin algorithm # 'cut' table at all 'other' boundaries # -> extra column '_venn_':int (0, 1, 2) # 0=self only, 1=both, 2=other only # -> 'cut' just drops the '_venn_' column # -> 'subtract' drops 1 and 2? # (is that faster? probably not) # -> 'jaccard' does math with it... return table