Source code for skgenome.subdivide

"""DataFrame-level subdivide operation.

Split each region into similar-sized sub-regions.

The functions here operate on pandas DataFrame and Series instances, not
GenomicArray types.

"""
import logging

import pandas as pd

from .merge import merge


def subdivide(table, avg_size, min_size=0, verbose=False):
    """Split each region in `table` into similar-sized sub-regions.

    Thin DataFrame-level wrapper: the rows generated by `_split_targets` are
    collected into a new DataFrame carrying the same columns as the input.

    Parameters
    ----------
    table : pandas.DataFrame
        Input regions. Rows are read through their `start` and `end`
        fields by `_split_targets`.
    avg_size : int
        Target size of the sub-regions.
    min_size : int
        Regions smaller than this size are dropped. Default 0.
    verbose : bool
        Log a message for each region that gets subdivided.

    Returns
    -------
    pandas.DataFrame
        The subdivided regions, with columns `table.columns`.
    """
    pieces = _split_targets(table, avg_size, min_size, verbose)
    return pd.DataFrame.from_records(pieces, columns=table.columns)
def _split_targets(regions, avg_size, min_size, verbose):
    """Generate rows in which large regions are cut into consecutive bins.

    The input is first passed through `merge`; each resulting row is then
    either dropped, emitted unchanged, or replaced by several back-to-back
    rows that together cover the same span. Every field other than `start`
    and `end` is carried over from the row being split.

    Parameters
    ----------
    regions : pandas.DataFrame
        Table whose rows expose `start` and `end`. When `verbose` is set,
        `gene` (if that column is present) or `chromosome` is also read to
        label the log message.
    avg_size : int
        Split regions into equal-sized subregions of about this size.
        Specifically, subregions are no larger than 150% of this size, no
        smaller than 75% this size, and the average will approach this size
        when subdividing a large region.
    min_size : int
        Drop any regions smaller than this size.
    verbose : bool
        Print a log message when subdividing a region.

    Yields
    ------
    namedtuple
        One row per output bin, in the field layout produced by
        `itertuples(index=False)` on the merged table.
    """
    for region in merge(regions).itertuples(index=False):
        width = region.end - region.start
        if not (width >= min_size):
            # Below the size threshold: drop the region entirely
            continue

        # A ratio that rounds to zero still yields a single bin
        n_parts = int(round(width / avg_size)) or 1
        if n_parts == 1:
            yield region
            continue

        # Cut the region into n_parts pieces of (nearly) equal width
        step = width / n_parts
        if verbose:
            if 'gene' in regions:
                label = region.gene
            else:
                label = "%s:%d-%d" % (region.chromosome, region.start,
                                      region.end)
            message = "Splitting: {:30} {:7} / {} = {:.2f}".format(
                label, width, n_parts, step)
            logging.info(message)

        left = region.start
        for k in range(1, n_parts):
            right = region.start + int(k * step)
            yield region._replace(start=left, end=right)
            left = right
        # The final piece keeps the original end coordinate, absorbing any
        # remainder left over from truncating the cut points to integers
        yield region._replace(start=left)