"""I/O for tabular formats of genomic data (regions or features).
"""
from __future__ import absolute_import, division, print_function
from past.builtins import basestring
import collections
import contextlib
import logging
import os
import re
import sys
import pandas as pd
from Bio.File import as_handle
from ..gary import GenomicArray as GA
from . import bedio, genepred, gff, picard, seg, tab, textcoord, vcfio
[docs]def read(infile, fmt="tab", into=None, sample_id=None, meta=None, **kwargs):
"""Read tabular data from a file or stream into a genome object.
Supported formats: see `READERS`
If a format supports multiple samples, return the sample specified by
`sample_id`, or if unspecified, return the first sample and warn if there
were other samples present in the file.
Parameters
----------
infile : handle or string
Filename or opened file-like object to read.
fmt : string
File format.
into : class
GenomicArray class or subclass to instantiate, overriding the
default for the target file format.
sample_id : string
Sample identifier.
meta : dict
Metadata, as arbitrary key-value pairs.
**kwargs :
Additional keyword arguments to the format-specific reader function.
Returns
-------
GenomicArray or subclass
The data from the given file instantiated as `into`, if specified, or
the default base class for the given file format (usually GenomicArray).
"""
from cnvlib.core import fbase
if fmt == 'auto':
return read_auto(infile)
elif fmt in READERS:
reader, suggest_into = READERS[fmt]
else:
raise ValueError("Unknown format: %s" % fmt)
if meta is None:
meta = {}
if "sample_id" not in meta:
if sample_id:
meta["sample_id"] = sample_id
elif isinstance(infile, basestring):
meta["sample_id"] = fbase(infile)
elif hasattr(infile, "name"):
meta["sample_id"] = fbase(infile.name)
else:
# meta["sample_id"] = "<unknown>"
pass
if "filename" not in meta:
if isinstance(infile, basestring):
meta["filename"] = infile
elif hasattr(infile, "name"):
meta["filename"] = infile.name
if fmt in ("seg", "vcf") and sample_id is not None:
# Multi-sample formats: choose one sample
kwargs["sample_id"] = sample_id
try:
dframe = reader(infile, **kwargs)
except pd.io.common.EmptyDataError:
# File is blank/empty, most likely
logging.info("Blank %s file?: %s", fmt, infile)
dframe = []
if fmt == "vcf":
from cnvlib.vary import VariantArray as VA
suggest_into = VA
result = (into or suggest_into)(dframe, meta)
result.sort_columns()
result.sort()
return result
# ENH CategoricalIndex ---
# if dframe:
# dframe['chromosome'] = pd.Categorical(dframe['chromosome'],
# dframe.chromosome.drop_duplicates(),
# ordered=True)
# Create a multi-index of genomic coordinates (like GRanges)
# dframe.set_index(['chromosome', 'start'], inplace=True)
[docs]def read_auto(infile):
"""Auto-detect a file's format and use an appropriate parser to read it."""
if not isinstance(infile, basestring) and not hasattr(infile, "seek"):
raise ValueError("Can only auto-detect format from filename or "
"seekable (local, on-disk) files, which %s is not"
% infile)
fmt = sniff_region_format(infile)
if hasattr(infile, "seek"):
infile.seek(0)
if fmt:
logging.info("Detected file format: " + fmt)
else:
# File is blank -- simple BED will handle this OK
fmt = "bed3"
return read(infile, fmt or 'tab')
READERS = {
# Format name, formatter, default target class
"auto": (read_auto, GA),
"bed": (bedio.read_bed, GA),
"bed3": (bedio.read_bed3, GA),
"bed4": (bedio.read_bed4, GA),
"bed6": (bedio.read_bed6, GA),
"gff": (gff.read_gff, GA),
"interval": (picard.read_interval, GA),
"refflat": (genepred.read_refflat, GA),
"picardhs": (picard.read_picard_hs, GA),
"seg": (seg.read_seg, GA),
"tab": (tab.read_tab, GA),
"text": (textcoord.read_text, GA),
"vcf": (vcfio.read_vcf, GA),
}
# _____________________________________________________________________
[docs]def write(garr, outfile=None, fmt="tab", verbose=True, **kwargs):
"""Write a genome object to a file or stream."""
formatter, show_header = WRITERS[fmt]
if fmt in ("seg", "vcf"):
kwargs["sample_id"] = garr.sample_id
dframe = formatter(garr.data, **kwargs)
with safe_write(outfile or sys.stdout, verbose=False) as handle:
dframe.to_csv(handle, header=show_header, index=False, sep='\t',
float_format='%.6g')
if verbose:
# Log the output path, if possible
if isinstance(outfile, basestring):
outfname = outfile
elif hasattr(outfile, 'name') and outfile not in (sys.stdout,
sys.stderr):
outfname = outfile.name
else:
# Probably stdout or stderr used in a pipeline -- don't pollute
return
logging.info("Wrote %s with %d regions", outfname, len(dframe))
WRITERS = {
# Format name, formatter, show header
"bed": (bedio.write_bed, False),
"bed3": (bedio.write_bed3, False),
"bed4": (bedio.write_bed4, False),
# "gff": (gff.write_gff, False),
"interval": (picard.write_interval, False),
"picardhs": (picard.write_picard_hs, True),
"seg": (seg.write_seg, True),
"tab": (tab.write_tab, True),
"text": (textcoord.write_text, False),
"vcf": (vcfio.write_vcf, True),
}
# _____________________________________________________________________
@contextlib.contextmanager
[docs]def safe_write(outfile, verbose=True):
"""Write to a filename or file-like object with error handling.
If given a file name, open it. If the path includes directories that don't
exist yet, create them. If given a file-like object, just pass it through.
"""
if isinstance(outfile, basestring):
dirname = os.path.dirname(outfile)
if dirname and not os.path.isdir(dirname):
os.mkdir(dirname)
logging.info("Created directory %s", dirname)
with open(outfile, 'w') as handle:
yield handle
else:
yield outfile
# Log the output path, if possible
if verbose:
if isinstance(outfile, basestring):
outfname = outfile
elif hasattr(outfile, 'name') and outfile not in (sys.stdout,
sys.stderr):
outfname = outfile.name
else:
# Probably stdout or stderr -- don't ruin the pipeline
return
logging.info("Wrote %s", outfname)
format_patterns = collections.OrderedDict([
# ('genepred', re.compile()),
# ('genepredext', re.compile()),
('text', re.compile(r'\w+:\d*-\d*.*')),
('tab', re.compile('\t'.join(('chromosome', 'start', 'end')))),
('interval', re.compile('\t'.join((
r'\w+', r'\d+', r'\d+', r'[.+-]', r'\S+$')))),
('refflat', re.compile('\t'.join((
r'\S+', r'\S+', r'\w+', r'[+-]',
r'\d+', r'\d+', r'\d+', r'\d+', r'\d+',
r'(\d+,)+', r'(\d+,)+$')))),
('gff', re.compile('\t'.join((
r'\w+', r'\S+', r'\w+', r'\d+', r'\d+',
r'\S+', r'[.?+-]', r'[012.]', r'.*')))),
('bed', re.compile('\t'.join((r'\w+', r'\d+', r'\d+')))),
])