"""DataFrame-level merging operations.
Merge overlapping regions into single rows, similar to bedtools merge.
The functions here operate on pandas DataFrame and Series instances, not
GenomicArray types.
"""
import itertools
import numpy as np
import pandas as pd
from .chromsort import sorter_chrom
from .combiners import get_combiners, first_of
[docs]def flatten(table, combine=None, split_columns=None):
if not len(table):
return table
if (table.start.values[1:] >= table.end.cummax().values[:-1]).all():
return table
# NB: Input rows and columns should already be sorted like this
table = table.sort_values(['chromosome', 'start', 'end'])
cmb = get_combiners(table, False, combine)
out = (table.groupby(by='chromosome',
as_index=False, group_keys=False, sort=False)
.apply(_flatten_overlapping, cmb, split_columns)
.reset_index(drop=True))
return out.reindex(out.chromosome.apply(sorter_chrom)
.sort_values(kind='mergesort').index)
def _flatten_overlapping(table, combine, split_columns):
"""Merge overlapping regions within a chromosome/strand.
Assume chromosome and (if relevant) strand are already identical, so only
start and end coordinates are considered.
"""
if split_columns:
row_groups = (tuple(_flatten_tuples_split(row_group, combine,
split_columns))
for row_group in _nonoverlapping_groups(table, 0))
else:
row_groups = (tuple(_flatten_tuples(row_group, combine))
for row_group in _nonoverlapping_groups(table, 0))
all_row_groups = itertools.chain(*row_groups)
return pd.DataFrame.from_records(list(all_row_groups),
columns=table.columns)
def _flatten_tuples(keyed_rows, combine):
"""Divide multiple rows where they overlap.
Parameters
----------
keyed_rows : iterable
pairs of (non-overlapping-group index, overlapping rows)
combine : dict
Mapping of field names to functions applied to combine overlapping
regions.
Returns
-------
DataFrame
"""
rows = [kr[1] for kr in keyed_rows]
first_row = rows[0]
if len(rows) == 1:
yield first_row
else:
# TODO speed this up! Bottleneck is in dictcomp
extra_cols = [x for x in first_row._fields[3:] if x in combine]
breaks = sorted(set(itertools.chain(*[(r.start, r.end) for r in rows])))
for bp_start, bp_end in zip(breaks[:-1], breaks[1:]):
# Find the row(s) overlapping this region
# i.e. any not already seen and not already passed
rows_in_play = [row for row in rows
if row.start <= bp_start and row.end >= bp_end]
# Combine the extra fields of the overlapping regions
extra_fields = {key: combine[key]([getattr(r, key)
for r in rows_in_play])
for key in extra_cols}
yield first_row._replace(start=bp_start, end=bp_end,
**extra_fields)
def _flatten_tuples_split(keyed_rows, combine, split_columns):
"""Divide multiple rows where they overlap.
Parameters
----------
keyed_rows : iterable
pairs of (non-overlapping-group index, overlapping rows)
combine : dict
Mapping of field names to functions applied to combine overlapping
regions.
split_columns : list or tuple
Field names where numeric values should be subdivided a region.
Returns
-------
DataFrame
"""
rows = [kr[1] for kr in keyed_rows]
first_row = rows[0]
if len(rows) == 1:
yield first_row
else:
# TODO - use split_columns
extra_cols = [x for x in first_row._fields[3:] if x in combine]
breaks = sorted(set(itertools.chain(*[(r.start, r.end) for r in rows])))
for bp_start, bp_end in zip(breaks[:-1], breaks[1:]):
# Find the row(s) overlapping this region
# i.e. any not already seen and not already passed
rows_in_play = [row for row in rows
if row.start <= bp_start and row.end >= bp_end]
# Combine the extra fields of the overlapping regions
extra_fields = {key: combine[key]([getattr(r, key)
for r in rows_in_play])
for key in extra_cols}
yield first_row._replace(start=bp_start, end=bp_end,
**extra_fields)
[docs]def merge(table, bp=0, stranded=False, combine=None):
"""Merge overlapping rows in a DataFrame."""
if not len(table):
return table
gap_sizes = table.start.values[1:] - table.end.cummax().values[:-1]
if (gap_sizes > -bp).all():
return table
if stranded:
groupkey = ['chromosome', 'strand']
else:
# NB: same gene name can appear on alt. contigs
groupkey = ['chromosome']
table = table.sort_values(groupkey + ['start', 'end'])
cmb = get_combiners(table, stranded, combine)
out = (table.groupby(by=groupkey,
as_index=False, group_keys=False, sort=False)
.apply(_merge_overlapping, bp, cmb)
.reset_index(drop=True))
# Re-sort chromosomes cleverly instead of lexicographically
return out.reindex(out.chromosome.apply(sorter_chrom)
.sort_values(kind='mergesort').index)
def _merge_overlapping(table, bp, combine):
"""Merge overlapping regions within a chromosome/strand.
Assume chromosome and (if relevant) strand are already identical, so only
start and end coordinates are considered.
"""
merged_rows = [_squash_tuples(row_group, combine)
for row_group in _nonoverlapping_groups(table, bp)]
return pd.DataFrame.from_records(merged_rows,
columns=merged_rows[0]._fields)
def _nonoverlapping_groups(table, bp):
"""Identify and enumerate groups of overlapping rows.
That is, increment the group ID after each non-negative gap between
intervals. Intervals (rows) will be merged if any bases overlap by at least
`bp`.
"""
# Examples:
#
# gap? F T F F T T T F
# group 0 1 1 2 3 3 3 3 4
#
# gap? T F T T F F F T
# group 0 0 1 1 1 2 3 4 4
gap_sizes = table.start.values[1:] - table.end.cummax().values[:-1]
group_keys = np.r_[False, gap_sizes > (-bp)].cumsum()
# NB: pandas groupby seems like the obvious choice over itertools, but it is
# very slow -- probably because it traverses the whole table (i.e.
# chromosome) again to select groups, redoing the various inferences that
# would be worthwhile outside this inner loop. With itertools, we take
# advantage of the grouping and sorting already done, and don't repeat
# pandas' traversal and inferences.
# ENH: Find & use a lower-level, 1-pass pandas function
keyed_groups = zip(group_keys, table.itertuples(index=False))
return (row_group
for _key, row_group in itertools.groupby(keyed_groups, first_of))
# Squash rows according to a given grouping criterion
# XXX see also segfilter.py
def _squash_tuples(keyed_rows, combine):
"""Combine multiple rows into one NamedTuple.
Parameters
----------
keyed_rows : iterable
pairs of (non-overlapping-group index, overlapping rows)
combine : dict
Returns
-------
namedtuple
"""
rows = [kr[1] for kr in keyed_rows] #list(rows)
firsttup = rows[0]
if len(rows) == 1:
return firsttup
newfields = {key: combiner([getattr(r, key) for r in rows])
for key, combiner in combine.items()}
return firsttup._replace(**newfields)