Source code for cnvlib.rna

"""RNA expression levels quantified per gene.

Process per-gene expression levels, or the equivalent, by cohort.
"""

import logging

import numpy as np
import pandas as pd
from scipy.stats import gmean

from .cnary import CopyNumArray as CNA
from .fix import center_by_window


NULL_LOG2_COVERAGE = -5



[docs]
def before(char):
    """Build a selector that keeps the text preceding a delimiter.

    The returned callable cuts its argument at the first occurrence of `char`
    and hands back the leading piece. Within this module it serves as a column
    converter, e.g. to strip the ".N" version suffix from Ensembl gene IDs.

    Parameters
    ----------
    char : str
        Delimiter to cut at.

    Returns
    -------
    function
        A one-argument callable mapping a string to the part in front of the
        first `char`; strings that do not contain `char` come back unchanged.

    Examples
    --------
    >>> strip_version = before(".")
    >>> strip_version("ENSG00000001.5")
    'ENSG00000001'
    >>> strip_version("simple")
    'simple'
    """
    def take_prefix(text):
        # maxsplit=1: only the first delimiter matters, the tail is discarded
        pieces = text.split(char, 1)
        return pieces[0]

    return take_prefix




[docs]
def filter_probes(sample_counts):
    """Keep only the genes that are expressed in most of the cohort.

    Quantification tables carry far more rows than there are protein-coding
    genes: control probes (e.g. spike-ins), genes on unplaced contigs,
    pseudo-genes and non-coding genes. Such rows are unhelpful for copy number
    inference, so rows whose expression is rarely detectable are removed here.

    Parameters
    ----------
    sample_counts : pandas.DataFrame
        Per-gene counts, one row per gene and one column per sample.

    Returns
    -------
    pandas.DataFrame
        The rows of `sample_counts` whose median across samples is >= 1.0.
    """
    # A median of at least 1 means at least half of the samples have
    # detectable transcript for the gene.
    keep = sample_counts.median(axis=1) >= 1.0
    n_dropped = (~keep).sum()
    n_total = len(keep)
    logging.info(
        "Dropping %d / %d rarely expressed genes from input samples",
        n_dropped,
        n_total,
    )
    return sample_counts[keep]




[docs]
def load_gene_info(gene_resource, corr_fname, default_r=0.1):
    """Load the per-gene annotation table, optionally merged with correlations.

    The gene resource is a tab-separated BioMart export listing Ensembl genes
    together with GC content, genomic coordinates, gene symbol, Entrez ID,
    transcript length and transcript support level. It can be taken from the
    resource bundle (reference genome hg19) or generated from BioMart.

    Parameters
    ----------
    gene_resource : str or file-like
        Tab-separated gene info table; its columns are renamed positionally to
        gene_id, gc, chromosome, start, end, gene, entrez_id, tx_length,
        tx_support.
    corr_fname : str or None
        Optional table of CNV-expression correlation coefficients keyed by
        Entrez ID. If falsy, no correlation columns are added.
    default_r : float
        Value assigned to the correlation columns for genes that are absent
        from the correlation table or whose Entrez ID is ambiguous.

    Returns
    -------
    pandas.DataFrame
        One row per Ensembl gene, indexed by 'gene_id', with 'entrez_id' as
        integers (0 where unknown).
    """
    # Positional names for the BioMart columns:
    #   "Gene stable ID" (Ensembl ID), "Gene % GC content",
    #   "Chromosome/scaffold name", "Gene start (bp)", "Gene end (bp)",
    #   "Gene name", "NCBI gene ID",
    #   "Transcript length (including UTRs and CDS)",
    #   "Transcript support level (TSL)"
    column_names = [
        "gene_id",
        "gc",
        "chromosome",
        "start",
        "end",
        "gene",
        "entrez_id",
        "tx_length",
        "tx_support",
    ]
    field_converters = {
        # Drop the ".N" version suffix from the Ensembl ID
        "gene_id": before("."),
        # "tslN" code -> integer
        "tx_support": tsl2int,
        # Percent -> fraction
        "gc": lambda x: float(x) / 100,
    }
    # NOTE(review): header=1 makes pandas treat the second line as the header
    # row that `names` replaces -- confirm this matches the resource layout.
    raw_info = pd.read_csv(
        gene_resource,
        sep="\t",
        header=1,
        names=column_names,
        converters=field_converters,
    )
    gene_info = raw_info.sort_values("gene_id")
    logging.info("Loaded %s with shape: %s", gene_resource, gene_info.shape)

    if corr_fname:
        corr_columns = ("pearson_r", "spearman_r", "kendall_t")
        merged = gene_info.join(
            load_cnv_expression_corr(corr_fname), on="entrez_id", how="left"
        )
        if not merged["gene_id"].is_unique:
            # One row per Ensembl ID, chosen with help of the HUGO gene name
            keep_idx = merged.groupby("gene_id").apply(dedupe_ens_hugo)
            merged = merged.loc[keep_idx]

        if not gene_info["entrez_id"].is_unique:
            # Entrez IDs shared by several Ensembl genes cannot be traced to a
            # single gene; treat the extras as having unknown correlation,
            # i.e. as if they were absent from the correlation table.
            ambiguous_idx = tuple(locate_entrez_dupes(merged))
            logging.info(
                "Resetting %d ambiguous genes' correlation coefficients to default %f",
                len(ambiguous_idx),
                default_r,
            )
            merged.loc[ambiguous_idx, corr_columns] = default_r

        # Genes without correlation info get the default coefficient
        gene_info = merged.fillna(dict.fromkeys(corr_columns, default_r))

    elif not gene_info["gene_id"].is_unique:
        # No correlation table: deduplicate without the HUGO name
        keep_idx = gene_info.groupby("gene_id").apply(dedupe_ens_no_hugo)
        gene_info = gene_info.loc[keep_idx]

    logging.info("Trimmed gene info table to shape: %s", gene_info.shape)
    assert gene_info["gene_id"].is_unique
    gene_info["entrez_id"] = gene_info["entrez_id"].fillna(0).astype("int")
    return gene_info.set_index("gene_id")




[docs]
def load_cnv_expression_corr(fname):
    """Read a table of CNV-expression correlation coefficients.

    The input is tab-delimited and must contain an 'Entrez_Gene_Id' column,
    which is parsed as integer and becomes the index of the result. NA
    detection is switched off while parsing, so blank fields are kept as-is
    rather than converted to NaN.

    Parameters
    ----------
    fname : str or file-like
        Location of the correlation coefficients table.

    Returns
    -------
    pandas.DataFrame
        The table indexed by Entrez gene ID; the remaining columns (e.g.
        'pearson_r', 'spearman_r', 'kendall_t') are passed through.

    See Also
    --------
    load_gene_info : Joins this table onto the gene annotation table.
    """
    index_column = "Entrez_Gene_Id"
    raw_table = pd.read_csv(
        fname,
        sep="\t",
        na_filter=False,
        dtype={index_column: int},
    )
    table = raw_table.set_index(index_column)
    logging.info("Loaded %s with shape: %s", fname, table.shape)
    return table




[docs]
def tsl2int(tsl):
    """Convert an Ensembl Transcript Support Level (TSL) code to an integer.

    The code has the format "tsl([1-5]|NA)", optionally followed by a suffix
    such as " (assigned to previous version _)".

    See: https://www.ensembl.org/Help/Glossary?id=492

    Parameters
    ----------
    tsl : str, float or None
        The TSL code. Missing values (None, any float NaN, the empty string,
        or "tslNA") are accepted.

    Returns
    -------
    int
        The numeric support level, or 0 if the level is missing / "NA".
    """
    # Missing values. NaN must be detected by value, not by membership in a
    # tuple: `x in (np.nan, ...)` only matches the np.nan object itself, so a
    # NaN produced elsewhere would slip through and fail on slicing below.
    if tsl is None:
        return 0
    if isinstance(tsl, (float, np.floating)) and np.isnan(tsl):
        return 0
    if tsl in ("", "tslNA"):
        return 0
    # Remove "tsl" prefix
    assert tsl[:3] == "tsl", f"Unexpected transcript support level: {tsl!r}"
    value = tsl[3:]
    # Remove possible suffix " (assigned to previous version _)"
    if len(value) > 2:
        value = value[:2].rstrip()
    if value == "NA":
        return 0
    return int(value)




[docs]
def dedupe_ens_hugo(dframe):
    """Pick the single "best" row label among rows sharing one Ensembl ID.

    The gene info table may list an Ensembl ID ('gene_id') several times,
    which would duplicate expression rows if joined naively, so exactly one
    row per Ensembl ID has to be chosen. Available keys are the Entrez ID
    ('entrez_id'), the Ensembl gene name ('gene') and the gene name from the
    correlation table ('hugo_gene').

    Entrez and Ensembl identifiers can be many-to-many (e.g. CALM1/2/3), but
    requiring the two gene names to agree usually leaves a single row: of the
    Entrez IDs linked to Ensembl's CALM1, only one is named CALM1 in the
    correlation table.

    Selection rules, in order:

    1. A group of one row is returned as-is.
    2. If exactly one row has matching 'gene' and 'hugo_gene', take it.
    3. If several rows match, choose among those with `dedupe_tx`.
    4. If none match, choose among all rows with `dedupe_tx`, which prefers
       the lowest Entrez ID -- well-characterized, protein-coding genes tend
       to have been accessioned first.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Rows of the gene info table belonging to a single Ensembl ID.

    Returns
    -------
    Index label of the selected row.
    """
    if len(dframe) == 1:
        return dframe.index[0]
    same_name = dframe[dframe["gene"] == dframe["hugo_gene"]]
    n_same = len(same_name)
    if n_same == 1:
        return same_name.index[0]
    # Several name matches -> tie-break among them; none -> among all rows
    candidates = same_name if n_same > 1 else dframe
    return dedupe_tx(candidates)




[docs]
def dedupe_ens_no_hugo(dframe):
    """Pick one row label per Ensembl ID without consulting HUGO gene names.

    A single-row group is returned directly; larger groups are resolved by
    `dedupe_tx` (Entrez ID, then transcript support and length).
    """
    return dframe.index[0] if len(dframe) == 1 else dedupe_tx(dframe)




[docs]
def dedupe_tx(dframe):
    """Select one row from a group of candidate gene/transcript rows.

    Rows are ranked by ascending 'entrez_id', then by descending 'tx_support',
    then by descending 'tx_length'; rows with missing values in a sort key are
    ranked last for that key. The label of the top-ranked row is returned.

    This is the final tie-breaker of Ensembl ID deduplication, applied after
    any filtering on gene names.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Candidate rows with 'entrez_id', 'tx_support' and 'tx_length' columns.

    Returns
    -------
    Index label (in the original table) of the chosen row.
    """
    # NB: Transcripts are many-to-1 vs. Ensembl ID, not Entrez ID, so each
    # unique Entrez ID here carries the same collection of transcripts.
    # NOTE(review): Ensembl defines tsl1 as the strongest support level, so
    # descending order on the integer code favors the numerically largest
    # code -- confirm this is the intended preference.
    sort_keys = ["entrez_id", "tx_support", "tx_length"]
    ascending_flags = [True, False, False]
    ranked = dframe.sort_values(
        sort_keys, ascending=ascending_flags, na_position="last"
    )
    return ranked.index[0]




[docs]
def locate_entrez_dupes(dframe):
    """Find surplus rows where one Entrez ID maps to several Ensembl IDs.

    Works like `dedupe_ens_hugo`, comparing the 'gene' and 'hugo_gene' names,
    but instead of emitting the row to keep it emits the labels of the extra
    rows. The caller resets the correlation values of those rows to a default,
    so that they look as if they had never appeared in the correlation table,
    while every entry is still retained in the gene info table.

    Within each group of rows sharing an 'entrez_id':

    - a single row is never reported;
    - if exactly one row has matching names, all other rows are reported;
    - otherwise the row with the lowest Ensembl ID ('gene_id') is kept --
      chosen among the name-matched rows if there are any, else among all
      rows -- and every other row is reported.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Table with 'entrez_id', 'gene', 'hugo_gene' and 'gene_id' columns.

    Yields
    ------
    Index labels of the rows considered ambiguous duplicates.
    """
    for _entrez, members in dframe.groupby("entrez_id"):
        if len(members) < 2:
            continue
        names_agree = members["gene"] == members["hugo_gene"]
        n_agree = names_agree.sum()
        if n_agree == 1:
            # Unambiguous winner: everything else is surplus
            yield from members.index[~names_agree]
            continue
        # Zero or several name matches: keep the lowest Ensembl ID
        pool = members[names_agree] if n_agree else members
        keeper = pool.sort_values("gene_id").index.to_numpy()[0]
        yield from (label for label in members.index if label != keeper)




[docs]
def align_gene_info_to_samples(gene_info, sample_counts, tx_lengths, normal_ids):
    """Align gene info with sample counts, then compute depths and weights.

    The two tables are restricted to the genes they share, sorted by genomic
    position, and filtered to genes with a positive transcript length. A
    per-gene 'weight' column is then added to the gene info.

    Parameters
    ----------
    gene_info : pandas.DataFrame
        Gene annotations indexed by gene ID; must provide 'chromosome',
        'start' and 'tx_length', and may provide 'spearman_r', 'pearson_r'
        and/or 'kendall_t'.
    sample_counts : pandas.DataFrame
        Read counts with genes as rows (same kind of index as `gene_info`)
        and samples as columns.
    tx_lengths : pandas.Series or None
        If given, replaces the 'tx_length' values from `gene_info`.
    normal_ids : list of str or None
        Sample IDs to use as the normal baseline; passed on to
        `normalize_read_depths`.

    Returns
    -------
    tuple
        ``(gene_info, sample_counts, sample_depths_log2)``: the aligned gene
        info (with 'weight'), the aligned counts, and the normalized log2
        read depths.

    Raises
    ------
    ValueError
        If the tables share no genes, or if no gene has a positive transcript
        length.
    """
    logging.debug(
        "Dimensions: gene_info=%s, sample_counts=%s",
        gene_info.shape,
        sample_counts.shape,
    )
    sc, gi = sample_counts.align(gene_info, join="inner", axis=0)

    # Check that alignment resulted in some genes
    if len(gi) == 0:
        # Stringify each label: gene IDs are not guaranteed to be strings, and
        # a TypeError raised while formatting would mask the actual problem.
        sample_preview = ", ".join(map(str, sample_counts.index[:5]))
        resource_preview = ", ".join(map(str, gene_info.index[:5]))
        raise ValueError(
            "No genes in common between sample data and gene resource file. "
            f"Sample data has {len(sample_counts)} genes, "
            f"gene resource has {len(gene_info)} genes. "
            "Check that gene IDs match between your input files and gene resource. "
            f"Sample gene IDs (first 5): {sample_preview}; "
            f"Gene resource IDs (first 5): {resource_preview}"
        )

    gi = gi.sort_values(by=["chromosome", "start"])
    sc = sc.loc[gi.index]

    if tx_lengths is not None:
        # Replace the existing tx_lengths from gene_resource
        # (RSEM results have this, TCGA gene counts don't)
        gi["tx_length"] = tx_lengths.loc[gi.index]

    # Validate transcript lengths before any calculations
    is_invalid = gi["tx_length"] <= 0
    if is_invalid.any():
        logging.warning(
            "Found %d genes with invalid transcript length (<= 0); filtering these out",
            is_invalid.sum(),
        )
        # Take a copy so that adding the 'weight' column below writes to an
        # independent table rather than to a slice of the unfiltered one.
        gi = gi[gi["tx_length"] > 0].copy()
        sc = sc.loc[gi.index]
        if len(gi) == 0:
            raise ValueError(
                "All genes have invalid transcript lengths (<= 0). "
                "Check your gene resource file or RSEM output."
            )

    # Calculate per-gene weights similarly to cnvlib.fix
    # NB: chrX doesn't need special handling because with X-inactivation,
    # expression should be similar in male and female samples, i.e. neutral is 0
    logging.info("Weighting genes with below-average read counts")
    gene_counts = sc.median(axis=1)
    # Genes at or above the 75th percentile of median counts get 1.0;
    # lower-count genes are down-weighted by the square root of the ratio.
    weights = [np.sqrt((gene_counts / gene_counts.quantile(0.75)).clip(upper=1))]

    logging.info("Calculating normalized gene read depths")
    sample_depths_log2 = normalize_read_depths(
        sc.divide(gi["tx_length"], axis=0), normal_ids
    )

    logging.info("Weighting genes by spread of read depths")
    gene_spreads = sample_depths_log2.std(axis=1)
    weights.append(gene_spreads)

    # Average whichever correlation coefficients the gene info provides
    corr_weights = []
    for corr_col in ("spearman_r", "pearson_r", "kendall_t"):
        if corr_col in gi:
            logging.info("Weighting genes by %s correlation coefficient", corr_col)
            corr_weights.append(gi[corr_col].values)
    if corr_weights:
        weights.append(np.vstack(corr_weights).mean(axis=0))

    # Combine the weight components by geometric mean, scaled so the max is 1
    weight = gmean(np.vstack(weights), axis=0)
    gi["weight"] = weight / weight.max()
    if gi["weight"].isna().all():
        # No usable weights could be computed; fall back to equal weights
        gi["weight"] = 1.0
    logging.debug(" --> final zeros: %d / %d", (gi["weight"] == 0).sum(), len(gi))
    return gi, sc, sample_depths_log2




[docs]
def normalize_read_depths(sample_depths, normal_ids):
    """Normalize read depths within each sample.

    Some samples have higher sequencing depth and therefore read depths need to
    be normalized within each sample. TCGA recommends an upper quartile
    normalization.

    After normalizing read depths within each sample, normalize (median-center)
    within each gene, across samples. The two steps are repeated for a fixed
    number of rounds.

    If normal sample IDs are given, each gene is additionally divided by its
    median depth across those normal samples.

    Finally, convert to log2 ratios.

    Parameters
    ----------
    sample_depths : pandas.DataFrame
        Read depths with genes as rows and samples as columns.
    normal_ids : list of str or None
        Column names of the samples to use as the normal baseline, if any.

    Returns
    -------
    pandas.DataFrame
        Normalized depths on a log2 scale, same shape as the input.

    Raises
    ------
    ValueError
        If the depths sum to zero or less, or if any of `normal_ids` is not a
        column of `sample_depths`.
    """
    if sample_depths.to_numpy().sum() <= 0:
        raise ValueError(
            "Sample read depths sum to zero or less. "
            f"Input shape: {sample_depths.shape}, "
            f"sum: {sample_depths.to_numpy().sum()}, "
            f"all values zero: {(sample_depths == 0).all().all()}, "
            f"has inf: {np.isinf(sample_depths.to_numpy()).any()}, "
            f"has nan: {sample_depths.isna().any().any()}. "
            "This likely indicates a problem with gene alignment or transcript lengths."
        )
    # fillna returns a new table, so the in-place division below does not
    # touch the caller's data
    sample_depths = sample_depths.fillna(0)
    for _i in range(4):
        # By-sample: 75%ile  among all genes
        q3 = sample_depths.quantile(0.75)
        sample_depths /= q3
        # By-gene: median among all samples
        sm = sample_depths.median(axis=1)
        sample_depths = sample_depths.divide(sm, axis=0)
    if normal_ids:
        # Use normal samples as a baseline for read depths
        normal_ids = pd.Series(normal_ids)
        is_known = normal_ids.isin(sample_depths.columns)
        if not is_known.all():
            # Report only the IDs that are actually absent. (Series.drop works
            # on index labels, not values, so it cannot be used to select them.)
            missing_ids = normal_ids[~is_known]
            raise ValueError(
                "Normal sample IDs not in samples: %s"
                % ", ".join(map(str, missing_ids))
            )
        normal_depths = sample_depths.loc[:, normal_ids]
        # The flag is hard-coded, so the alternate branch is currently unused
        use_median = True  # TODO - benchmark & pick one
        if use_median:
            # Simple approach: divide normalized (1=neutral) values by gene medians
            normal_avg = normal_depths.median(axis=1)
            sample_depths = sample_depths.divide(normal_avg, axis=0).clip(lower=0)
        else:
            # Alternate approach: divide their IQR
            # At each gene, sample depths above the normal sample depth 75%ile
            # are divided by the 75%ile, those below 25%ile are divided by
            # 25%ile, and those in between are reset to neutral (=1.0)
            n25, n75 = np.nanpercentile(normal_depths.to_numpy(), [25, 75], axis=1)
            below_idx = sample_depths.to_numpy() < n25[:, np.newaxis]
            above_idx = sample_depths.to_numpy() > n75[:, np.newaxis]
            factor = np.zeros_like(sample_depths.to_numpy())
            factor[below_idx] = (below_idx / n25[:, np.newaxis])[below_idx]
            factor[above_idx] = (above_idx / n75[:, np.newaxis])[above_idx]
            sample_depths *= factor
            sample_depths[~(below_idx | above_idx)] = 1.0
    # Finally, convert normalized read depths to log2 scale
    return safe_log2(sample_depths, NULL_LOG2_COVERAGE)




[docs]
def safe_log2(values, min_log2):
    """Take log2 of non-negative values without producing -inf for zeros.

    A small pseudocount of ``2**min_log2`` is added to every value before the
    logarithm, so an input of exactly 0 maps to `min_log2`. This is a soft
    floor rather than a hard clip: inputs close to 0 (in particular those
    below ``2**min_log2``) end up slightly above `min_log2`.

    Parameters
    ----------
    values : np.array
        Absolute-scale values to transform. Should be non-negative.
    min_log2 : float
        The log2-scale value that an input of zero is mapped to.

    Returns
    -------
    The log2-transformed values, in the same container type as `values`.
    """
    pseudocount = 2**min_log2
    shifted = values + pseudocount
    return np.log2(shifted)




[docs]
def attach_gene_info_to_cnr(sample_counts, sample_data_log2, gene_info, read_len=100):
    """Build one copy-number array per sample from counts plus gene info.

    For every sample, the gene annotation columns are combined with a 'depth'
    column (``read_len * count / tx_length``) and the sample's 'log2' column,
    and wrapped in a `CopyNumArray` tagged with the sample ID.

    Missing log2 values are filled with the lowest value observed for the
    same gene in the other samples; `NULL_LOG2_COVERAGE` is used only for
    genes that are missing in every sample.

    Parameters
    ----------
    sample_counts : pandas.DataFrame
        Read counts, genes as rows and samples as columns.
    sample_data_log2 : pandas.DataFrame
        Normalized log2 values with the same layout; its columns are paired
        with those of `sample_counts` by position.
    gene_info : pandas.DataFrame
        Gene annotations providing 'chromosome', 'start', 'end', 'gene', 'gc',
        'tx_length' and 'weight'.
    read_len : int
        Read length used to scale counts into depths.

    Yields
    ------
    CopyNumArray
        One array per sample, in column order.
    """
    kept_columns = ["chromosome", "start", "end", "gene", "gc", "tx_length", "weight"]
    cnr_info = gene_info.loc[:, kept_columns]
    # Per-gene floor: the smallest non-NA log2 value in the row, or the global
    # null value if the whole row is NA.
    row_floor = sample_data_log2.min(axis=1, skipna=True).fillna(NULL_LOG2_COVERAGE)
    assert not row_floor.hasnans, row_floor.head()
    paired_columns = zip(
        sample_counts.items(), sample_data_log2.items(), strict=False
    )
    for (sample_id, counts), (_log2_id, log2_values) in paired_columns:
        depth_col = (read_len * counts / cnr_info.tx_length).rename("depth")
        log2_col = log2_values.fillna(row_floor).rename("log2")
        table = pd.concat([cnr_info, depth_col, log2_col], axis=1)
        table = table.reset_index(drop=True)
        cnr = CNA(table, {"sample_id": sample_id})
        cnr.sort_columns()
        yield cnr




[docs]
def correct_cnr(cnr, do_gc, do_txlen, max_log2, diploid_parx_genome):
    """Center a sample's log2 values and apply optional bias corrections.

    Steps:

    1. Center the log2 values.
    2. If requested and the column is present, correct for GC content ('gc')
       and/or transcript length ('tx_length') with `center_by_window`, then
       re-center.
    3. If `max_log2` is given, cap log2 values above it at `max_log2`.

    Parameters
    ----------
    cnr : CopyNumArray
        The sample's copy number ratios.
    do_gc : bool
        Apply the GC-content correction.
    do_txlen : bool
        Apply the transcript-length correction.
    max_log2 : float or None
        Upper limit for log2 values; falsy disables the cap.
    diploid_parx_genome : str or None
        Passed through to `center_all` for both centering steps.

    Returns
    -------
    CopyNumArray
        The corrected array (a new object if a bias correction was applied).
    """
    cnr.center_all(diploid_parx_genome=diploid_parx_genome)
    # Biases, similar to stock CNVkit
    if any((do_gc, do_txlen)):
        if do_gc and "gc" in cnr:
            cnr = center_by_window(cnr, 0.1, cnr["gc"])
        if do_txlen and "tx_length" in cnr:
            cnr = center_by_window(cnr, 0.1, cnr["tx_length"])
        # Re-center with the same settings as the initial centering, so the
        # final baseline honors the caller's diploid_parx_genome choice.
        cnr.center_all(diploid_parx_genome=diploid_parx_genome)
    if max_log2:
        cnr[cnr["log2"] > max_log2, "log2"] = max_log2
    return cnr
Source code for cnvlib.rna

CNVkit

Navigation

Related Topics