Source code for skgenome.chromsort
"""Operations on chromosome/contig/sequence names."""
from __future__ import annotations
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from collections.abc import Iterable
from itertools import takewhile
import numpy as np
import pandas as pd
[docs]
def detect_big_chroms(sizes: Iterable[int]) -> tuple[int, int]:
    """Determine the number of "big" chromosomes from their lengths.

    The lengths are sorted from largest to smallest, and the first place where
    a length drops by more than half relative to the preceding (larger) one is
    taken as the boundary between "big" chromosomes and everything else.

    In the human genome, this is expected to count 24 "big" chromosomes, where
    the canonical chromosomes 1-22, X, and Y are considered "big", while
    mitochondria and the alternative contigs are not. This allows us to
    exclude the non-canonical chromosomes from an analysis where they're not
    relevant.

    Parameters
    ----------
    sizes : iterable of int
        Chromosome/contig lengths, in any order.

    Returns
    -------
    n_big : int
        Number of "big" chromosomes in the genome.
    thresh : int
        Length of the smallest "big" chromosome.
    """
    sizes = pd.Series(sizes).sort_values(ascending=False)
    # Relative drop between each pair of consecutive sizes (descending order),
    # expressed as a fraction of the larger of the two. The first element of
    # diff() is NaN (no predecessor), hence the [1:].
    reldiff = sizes.diff().abs().to_numpy()[1:] / sizes.to_numpy()[:-1]
    changepoints = np.nonzero(reldiff > 0.5)[0]
    # NB: test for "any changepoint found" via .size, not .any() --
    # changepoints holds *indices*, and .any() is False for the array [0],
    # i.e. when the big drop happens right after the single largest sequence.
    if changepoints.size:
        n_big = int(changepoints[0]) + 1
        thresh = sizes.iat[n_big - 1]
    else:
        # No large drop anywhere: every sequence counts as "big"
        n_big = len(sizes)
        thresh = sizes.to_numpy()[-1]
    return n_big, thresh
[docs]
def sorter_chrom(label: str) -> tuple[int, str]:
    """Build a sort key for a chromosome label.

    Purely numeric names sort first by their integer value, followed by the
    sex chromosomes "X" and "Y", then names with a single-character
    non-numeric suffix, and finally names with a longer suffix. A leading
    "chr" (matched case-insensitively) is dropped before the key is computed.

    E.g. chr1 < chr2 < chr10 < chrX < chrY < chrM

    Returns
    -------
    tuple of (int, str)
        Rank bucket (plus any leading number) and the non-numeric remainder
        of the name, suitable for use as a ``key=`` function result.
    """
    # Drop the "chr" prefix, if any
    name = label
    if label.lower().startswith("chr"):
        name = label[3:]

    # Sex chromosomes go after all purely numeric names
    if name in ("X", "Y"):
        return (1000, name)

    # Split into the leading run of digits and whatever follows it
    n_digits = sum(1 for _ in takewhile(str.isdigit, name))
    number = int(name[:n_digits]) if n_digits else 0
    suffix = name[n_digits:]

    if not suffix:
        # Plain numeric chromosome (or an empty name, which gets 0)
        return (number, "")
    # Single-letter names (e.g. "M") rank ahead of longer contig names
    offset = 2000 if len(suffix) == 1 else 3000
    return (offset + number, suffix)