Source code for cnvlib.import_rna

"""The 'import-rna' command."""

import logging
import os

import numpy as np
import pandas as pd

from . import rna


[docs] def do_import_rna( gene_count_fnames, in_format, gene_resource_fname, correlations_fname=None, normal_fnames=(), do_gc=True, do_txlen=True, max_log2=3, diploid_parx_genome=None, ): """Convert a cohort of per-gene read counts to CNVkit .cnr format. The expected data source is TCGA gene-level expression counts for individual samples, but other sources should be fine, too. Parameters ---------- gene_count_fnames : list of str Paths to gene count files for all samples. in_format : str Input file format: 'rsem' or 'counts'. gene_resource_fname : str Path to gene metadata/resource file. correlations_fname : str, optional Path to TCGA gene expression/CNV correlation profiles. normal_fnames : tuple of str, optional Paths to normal/control sample count files. do_gc : bool, optional Apply GC bias correction. Default is True. do_txlen : bool, optional Apply transcript length correction. Default is True. max_log2 : float, optional Maximum log2 ratio to cap values at. Default is 3. diploid_parx_genome : str, optional Reference genome name for pseudo-autosomal region handling (e.g., 'hg19', 'hg38'). Returns ------- tuple (all_data DataFrame, cnrs generator) - all_data: Summary table with gene info and log2-normalized values - cnrs: Generator of CopyNumArray objects, one per sample Raises ------ RuntimeError If input format is not recognized. """ # Deduplicate and ensure all normals are included in the analysis gene_count_fnames = sorted(set(list(gene_count_fnames) + list(normal_fnames))) if in_format == "rsem": sample_counts, tx_lengths = aggregate_rsem(gene_count_fnames) elif in_format == "counts": sample_counts = aggregate_gene_counts(gene_count_fnames) tx_lengths = None else: raise RuntimeError("Unrecognized input format name: {in_format!r}") sample_counts = rna.filter_probes(sample_counts) logging.info( "Loading gene metadata%s", " and TCGA gene expression/CNV profiles" if correlations_fname else "", ) gene_info = rna.load_gene_info(gene_resource_fname, correlations_fname) logging.info("Aligning gene info to sample gene counts") normal_ids = [os.path.basename(f).split(".")[0] for f in normal_fnames] gene_info, sample_counts, sample_data_log2 = rna.align_gene_info_to_samples( gene_info, sample_counts, tx_lengths, normal_ids ) # Summary table has log2-normalized values, not raw counts # ENH show both, with column header suffixes to distinguish? all_data = pd.concat([gene_info, sample_data_log2], axis=1) # CNVkit files have both absolute and log2-normalized read counts cnrs = rna.attach_gene_info_to_cnr(sample_counts, sample_data_log2, gene_info) cnrs = ( rna.correct_cnr(cnr, do_gc, do_txlen, max_log2, diploid_parx_genome) for cnr in cnrs ) return all_data, cnrs
[docs] def aggregate_gene_counts(filenames): prev_row_count = None sample_cols = {} for fname in filenames: d = pd.read_csv( fname, sep="\t", comment="_", header=None, names=["gene_id", "expected_count"], converters={"gene_id": rna.before(".")}, ).set_index("gene_id") # .drop(["__no_feature", "__ambiguous", "__too_low_aQual", # "__not_aligned", "__alignment_not_unique"])) if prev_row_count is None: prev_row_count = len(d) elif len(d) != prev_row_count: raise RuntimeError("Number of rows in each input file is not equal") sample_id = rna.before(".")(os.path.basename(fname)) sample_cols[sample_id] = d.expected_count.fillna(0) sample_counts = pd.DataFrame(sample_cols) return sample_counts
[docs] def aggregate_rsem(fnames): """Pull out the expected read counts from each RSEM file. The format of RSEM's ``*_rsem.genes.results`` output files is tab-delimited with a header row. We extract the Ensembl gene ID, expected read counts, and transcript lengths from each file. Returns ------- sample_counts : DataFrame Row index is Ensembl gene ID, column index is filename. tx_lengths : Series Gene lengths. """ prev_row_count = None sample_cols = {} length_cols = [] length_colname = "length" # or: 'effective_length' for fname in fnames: # NB: read_csv(index_col=_) works independently of combine=, dtype= # so index column needs to be set after parsing, not during. # https://github.com/pandas-dev/pandas/issues/9435 d = pd.read_csv( fname, sep="\t", usecols=["gene_id", length_colname, "expected_count"], # index_col='gene_id', converters={"gene_id": rna.before(".")}, ).set_index("gene_id") if prev_row_count is None: prev_row_count = len(d) elif len(d) != prev_row_count: raise RuntimeError("Number of rows in each input file is not equal") sample_id = rna.before(".")(os.path.basename(fname)) sample_cols[sample_id] = d.expected_count.fillna(0) length_cols.append(d[length_colname]) sample_counts = pd.DataFrame(sample_cols) tx_lengths = pd.Series( np.vstack(length_cols).mean(axis=0), index=sample_counts.index ) return sample_counts, tx_lengths