Source code for pimlico.datatypes.r

# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html

import os

from pimlico.datatypes.base import PimlicoDatatypeWriter
from pimlico.datatypes.files import File


[docs]class RTabSeparatedValuesFile(File): """ Tabular data stored in a TSV file, suitable for reading in using R's `read.delim` function. """ datatype_name = "r_tsv" @property def absolute_path(self): return os.path.join(self.data_dir, "r_data.tsv")
[docs] def get_detailed_status(self): return ["Data path: %s" % self.absolute_path]
[docs]class RTabSeparatedValuesFileWriter(PimlicoDatatypeWriter): """ Writer for TSV files suitable for reading with R. If `headings` is specified, this is written as the first line of the file, so `headings=TRUE` should be used when reading into R. Double quotes (") in the fields will be replaced by double-double quotes (""), which R interprets as a double quote. Fields containing tabs will be surrounded by normal double quotes. When you read the data into R, the default value of `quotes` (") should therefore be fine. No escaping is performed on single quotes ('). """ def __init__(self, base_dir, headings=None, **kwargs): super(RTabSeparatedValuesFileWriter, self).__init__(base_dir, **kwargs) self.headings = headings self.file = None if self.headings is not None: self.row_length = len(self.headings) else: # Don't know row length until we write the first row self.row_length = None @property def absolute_path(self): return os.path.join(self.data_dir, "r_data.tsv") def __enter__(self): super(RTabSeparatedValuesFileWriter, self).__enter__() # Open the file for writing and write out the first line, if headings have been given self.file = open(self.absolute_path, "w") self.file.write((u"%s\n" % u"\t".join(self.headings)).encode("utf-8")) return self
[docs] def write_row(self, row): """ If elements are not of string type, they will be coerced to a string for writing. If you want to format them differently, do it before calling this method and pass in strings. """ if self.row_length is None: # Check all future rows have the same length self.row_length = len(row) elif self.row_length != len(row): raise ValueError("row has the wrong number of elements: %d, expected %d" % (len(row), self.row_length)) if self.file is None: raise IOError("cannot write row when file is not open: write_row() should be called within a *with* " "statement") row = [unicode(el) for el in row] # Escape any quotes in the fields row = [el.replace(u'"', u'""') for el in row] # Put quotes around any field include a tab row = [u'"%s"' % el if "\t" in el else el for el in row] self.file.write((u"%s\n" % u"\t".join(row)).encode("utf-8"))
def __exit__(self, exc_type, exc_val, exc_tb): super(RTabSeparatedValuesFileWriter, self).__exit__(exc_type, exc_val, exc_tb) if exc_type is None: # Close the file we opened self.file.close() self.file = None