Source code for skgenome.merge

"""DataFrame-level merging operations.

Merge overlapping regions into single rows, similar to bedtools merge.

The functions here operate on pandas DataFrame and Series instances, not
GenomicArray types.

"""
import itertools

import numpy as np
import pandas as pd

from .chromsort import sorter_chrom
from .combiners import get_combiners, first_of


[docs]def flatten(table, combine=None, split_columns=None): if not len(table): return table if (table.start.values[1:] >= table.end.cummax().values[:-1]).all(): return table # NB: Input rows and columns should already be sorted like this table = table.sort_values(['chromosome', 'start', 'end']) cmb = get_combiners(table, False, combine) out = (table.groupby(by='chromosome', as_index=False, group_keys=False, sort=False) .apply(_flatten_overlapping, cmb, split_columns) .reset_index(drop=True)) return out.reindex(out.chromosome.apply(sorter_chrom) .sort_values(kind='mergesort').index)
def _flatten_overlapping(table, combine, split_columns): """Merge overlapping regions within a chromosome/strand. Assume chromosome and (if relevant) strand are already identical, so only start and end coordinates are considered. """ if split_columns: row_groups = (tuple(_flatten_tuples_split(row_group, combine, split_columns)) for row_group in _nonoverlapping_groups(table, 0)) else: row_groups = (tuple(_flatten_tuples(row_group, combine)) for row_group in _nonoverlapping_groups(table, 0)) all_row_groups = itertools.chain(*row_groups) return pd.DataFrame.from_records(list(all_row_groups), columns=table.columns) def _flatten_tuples(keyed_rows, combine): """Divide multiple rows where they overlap. Parameters ---------- keyed_rows : iterable pairs of (non-overlapping-group index, overlapping rows) combine : dict Mapping of field names to functions applied to combine overlapping regions. Returns ------- DataFrame """ rows = [kr[1] for kr in keyed_rows] first_row = rows[0] if len(rows) == 1: yield first_row else: # TODO speed this up! Bottleneck is in dictcomp extra_cols = [x for x in first_row._fields[3:] if x in combine] breaks = sorted(set(itertools.chain(*[(r.start, r.end) for r in rows]))) for bp_start, bp_end in zip(breaks[:-1], breaks[1:]): # Find the row(s) overlapping this region # i.e. any not already seen and not already passed rows_in_play = [row for row in rows if row.start <= bp_start and row.end >= bp_end] # Combine the extra fields of the overlapping regions extra_fields = {key: combine[key]([getattr(r, key) for r in rows_in_play]) for key in extra_cols} yield first_row._replace(start=bp_start, end=bp_end, **extra_fields) def _flatten_tuples_split(keyed_rows, combine, split_columns): """Divide multiple rows where they overlap. Parameters ---------- keyed_rows : iterable pairs of (non-overlapping-group index, overlapping rows) combine : dict Mapping of field names to functions applied to combine overlapping regions. split_columns : list or tuple Field names where numeric values should be subdivided a region. Returns ------- DataFrame """ rows = [kr[1] for kr in keyed_rows] first_row = rows[0] if len(rows) == 1: yield first_row else: # TODO - use split_columns extra_cols = [x for x in first_row._fields[3:] if x in combine] breaks = sorted(set(itertools.chain(*[(r.start, r.end) for r in rows]))) for bp_start, bp_end in zip(breaks[:-1], breaks[1:]): # Find the row(s) overlapping this region # i.e. any not already seen and not already passed rows_in_play = [row for row in rows if row.start <= bp_start and row.end >= bp_end] # Combine the extra fields of the overlapping regions extra_fields = {key: combine[key]([getattr(r, key) for r in rows_in_play]) for key in extra_cols} yield first_row._replace(start=bp_start, end=bp_end, **extra_fields)
[docs]def merge(table, bp=0, stranded=False, combine=None): """Merge overlapping rows in a DataFrame.""" if not len(table): return table gap_sizes = table.start.values[1:] - table.end.cummax().values[:-1] if (gap_sizes > -bp).all(): return table if stranded: groupkey = ['chromosome', 'strand'] else: # NB: same gene name can appear on alt. contigs groupkey = ['chromosome'] table = table.sort_values(groupkey + ['start', 'end']) cmb = get_combiners(table, stranded, combine) out = (table.groupby(by=groupkey, as_index=False, group_keys=False, sort=False) .apply(_merge_overlapping, bp, cmb) .reset_index(drop=True)) # Re-sort chromosomes cleverly instead of lexicographically return out.reindex(out.chromosome.apply(sorter_chrom) .sort_values(kind='mergesort').index)
def _merge_overlapping(table, bp, combine): """Merge overlapping regions within a chromosome/strand. Assume chromosome and (if relevant) strand are already identical, so only start and end coordinates are considered. """ merged_rows = [_squash_tuples(row_group, combine) for row_group in _nonoverlapping_groups(table, bp)] return pd.DataFrame.from_records(merged_rows, columns=merged_rows[0]._fields) def _nonoverlapping_groups(table, bp): """Identify and enumerate groups of overlapping rows. That is, increment the group ID after each non-negative gap between intervals. Intervals (rows) will be merged if any bases overlap by at least `bp`. """ # Examples: # # gap? F T F F T T T F # group 0 1 1 2 3 3 3 3 4 # # gap? T F T T F F F T # group 0 0 1 1 1 2 3 4 4 gap_sizes = table.start.values[1:] - table.end.cummax().values[:-1] group_keys = np.r_[False, gap_sizes > (-bp)].cumsum() # NB: pandas groupby seems like the obvious choice over itertools, but it is # very slow -- probably because it traverses the whole table (i.e. # chromosome) again to select groups, redoing the various inferences that # would be worthwhile outside this inner loop. With itertools, we take # advantage of the grouping and sorting already done, and don't repeat # pandas' traversal and inferences. # ENH: Find & use a lower-level, 1-pass pandas function keyed_groups = zip(group_keys, table.itertuples(index=False)) return (row_group for _key, row_group in itertools.groupby(keyed_groups, first_of)) # Squash rows according to a given grouping criterion # XXX see also segfilter.py def _squash_tuples(keyed_rows, combine): """Combine multiple rows into one NamedTuple. Parameters ---------- keyed_rows : iterable pairs of (non-overlapping-group index, overlapping rows) combine : dict Returns ------- namedtuple """ rows = [kr[1] for kr in keyed_rows] #list(rows) firsttup = rows[0] if len(rows) == 1: return firsttup newfields = {key: combiner([getattr(r, key) for r in rows]) for key, combiner in combine.items()} return firsttup._replace(**newfields)