Source code for cnvlib.antitarget

"""Supporting functions for the 'antitarget' command."""

from __future__ import annotations

import logging
import re
from typing import TYPE_CHECKING, Optional

from skgenome import GenomicArray as GA

from .params import INSERT_SIZE, MIN_REF_COVERAGE, ANTITARGET_NAME



[docs]
def do_antitarget(
    targets: GA,
    access: Optional[GA] = None,
    avg_bin_size: int = 150000,
    min_bin_size: Optional[int] = None,
) -> GA:
    """Derive off-target ("antitarget") bins from target regions.

    Parameters
    ----------
    targets : GenomicArray
        Target genomic regions.
    access : GenomicArray, optional
        Accessible genomic regions to constrain antitarget bins.
    avg_bin_size : int, optional
        Average size for antitarget bins. Default is 150000.
    min_bin_size : int, optional
        Minimum size for antitarget bins. If not specified, calculated as
        2 * avg_bin_size * (2^MIN_REF_COVERAGE).

    Returns
    -------
    GenomicArray
        Antitarget genomic regions.
    """
    if not min_bin_size:
        min_bin_size = 2 * int(avg_bin_size * (2**MIN_REF_COVERAGE))
    return get_antitargets(targets, access, avg_bin_size, min_bin_size)




[docs]
def get_antitargets(
    targets: GA, accessible: GA, avg_bin_size: int, min_bin_size: Optional[int]
) -> GA:
    """Generate antitarget intervals between/around target intervals.

    Procedure:

    - Invert target intervals
    - Subtract the inverted targets from accessible regions
    - For each of the resulting regions:

        - Shrink by a fixed margin on each end
        - If it's smaller than min_bin_size, skip
        - Divide into equal-size (region_size/avg_bin_size) portions
        - Emit the (chrom, start, end) coords of each portion
    """
    if accessible:
        # Chromosomes' accessible sequence regions are given -- use them
        accessible = drop_noncanonical_contigs(accessible, targets)
    else:
        # Chromosome accessible sequence regions not known -- use heuristics
        # (chromosome length is endpoint of last probe; skip initial
        # <magic number> of bases that are probably telomeric)
        TELOMERE_SIZE = 150000
        accessible = guess_chromosome_regions(targets, TELOMERE_SIZE)
    pad_size = 2 * INSERT_SIZE
    bg_arr = (
        accessible.resize_ranges(-pad_size)
        .subtract(targets.resize_ranges(pad_size))
        .subdivide(avg_bin_size, min_bin_size)
    )
    bg_arr["gene"] = ANTITARGET_NAME
    return bg_arr




[docs]
def drop_noncanonical_contigs(accessible: GA, targets: GA, verbose: bool = True) -> GA:
    """Drop contigs that are not targeted or canonical chromosomes.

    Antitargets will be binned over chromosomes that:

    - Appear in the sequencing-accessible regions of the reference genome
      sequence, and
    - Contain at least one targeted region, or
    - Are named like a canonical chromosome (1-22,X,Y for human)

    This allows antitarget binning to pick up canonical chromosomes that do not
    contain any targets, as well as non-canonical or oddly named chromosomes
    that were targeted.
    """
    # TODO - generalize: (garr, by="name", verbose=True):

    access_chroms, target_chroms = compare_chrom_names(accessible, targets)
    # Filter out untargeted alternative contigs and mitochondria
    untgt_chroms = access_chroms - target_chroms
    # Autosomes typically have numeric names, allosomes are X and Y
    if any(is_canonical_contig_name(c) for c in target_chroms):
        chroms_to_skip = [c for c in untgt_chroms if not is_canonical_contig_name(c)]
    else:
        # Alternative contigs have longer names -- skip them
        max_tgt_chr_name_len = max(map(len, target_chroms))
        chroms_to_skip = [c for c in untgt_chroms if len(c) > max_tgt_chr_name_len]
    if chroms_to_skip:
        logging.info(
            "Skipping untargeted chromosomes %s", " ".join(sorted(chroms_to_skip))
        )
        skip_idx = accessible.chromosome.isin(chroms_to_skip)
        accessible = accessible[~skip_idx]
    return accessible




[docs]
def compare_chrom_names(a_regions: GA, b_regions: GA) -> tuple[set, set]:
    """Check if chromosome names overlap, and preview each if not.

    This summary message will help triage cases of e.g. "chr1" vs. "1" in the
    two genomic datasets being compared.
    """
    a_chroms = set(a_regions.chromosome.unique())
    b_chroms = set(b_regions.chromosome.unique())
    if a_chroms and a_chroms.isdisjoint(b_chroms):
        msg = "Chromosome names do not match between files"
        a_fname = a_regions.meta.get("filename")
        b_fname = b_regions.meta.get("filename")
        if a_fname and b_fname:
            msg += f" {a_fname} and {b_fname}"
        msg += ": {} vs. {}".format(
            ", ".join(map(repr, sorted(a_chroms)[:3])),
            ", ".join(map(repr, sorted(b_chroms)[:3])),
        )
        raise ValueError(msg)
    return a_chroms, b_chroms




[docs]
def guess_chromosome_regions(targets: GA, telomere_size: int) -> GA:
    """Determine (minimum) chromosome lengths from target coordinates."""
    endpoints = [subarr.end.iat[-1] for _c, subarr in targets.by_chromosome()]
    whole_chroms = GA.from_columns(
        {
            "chromosome": targets.chromosome.drop_duplicates(),
            "start": telomere_size,
            "end": endpoints,
        }
    )
    return whole_chroms



# TODO - move to skgenome.chromsort

# CNVkit's original inclusion regex
re_canonical = re.compile(r"(chr)?(\d+|[XYxy])$")
# goleft indexcov's exclusion regex
re_noncanonical = re.compile(
    "|".join(
        (
            r"^chrEBV$",
            r"^NC|_random$",
            r"Un_",
            r"^HLA\-",
            r"_alt$",
            r"hap\d$",
            r"chrM",
            r"MT",
        )
    )
)



[docs]
def is_canonical_contig_name(name: str) -> bool:
    # return bool(re_canonical.match(name))
    return not re_noncanonical.search(name)



def _drop_short_contigs(garr: GA) -> GA:
    """Drop contigs that are much shorter than the others.

    Cutoff is where a contig is less than half the size of the next-shortest
    contig.
    """
    from .plots import chromosome_sizes
    from skgenome.chromsort import detect_big_chroms

    chrom_sizes = chromosome_sizes(garr)
    n_big, thresh = detect_big_chroms(chrom_sizes.values())
    chrom_names_to_keep = {c for c, s in chrom_sizes.items() if s >= thresh}
    assert len(chrom_names_to_keep) == n_big
    return garr[garr.chromosome.isin(chrom_names_to_keep)]
Source code for cnvlib.antitarget

CNVkit

Navigation

Related Topics