# Source code for pimlico.datatypes.floats

# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html

"""
Similar to :mod:`pimlico.datatypes.ints`, but for lists of floats.

"""

import struct
from StringIO import StringIO

from pimlico.cli.browser.formatter import DocumentBrowserFormatter
from pimlico.datatypes.documents import RawDocumentType
from pimlico.datatypes.table import get_struct
from pimlico.datatypes.tar import TarredCorpus, TarredCorpusWriter, pass_up_invalid


class FloatListsDocumentType(RawDocumentType):
    """
    Document type whose raw data encodes a sequence of rows of floats.

    Each row is stored as a row-length header (read with the struct returned by
    ``get_struct(2, False, 1)``) followed by that many values, each packed with
    the struct format ``"<d"`` (little-endian C double).

    """
    formatters = [("float_lists", "pimlico.datatypes.floats.FloatListsFormatter")]

    def __init__(self, options, metadata):
        super(FloatListsDocumentType, self).__init__(options, metadata)
        # Struct for reading in individual floats (actually doubles)
        self.unpacker = struct.Struct("<d")
        self.value_size = self.unpacker.size
        # Struct for unpacking the row length at the start of each row
        self.length_unpacker = get_struct(2, False, 1)
        self.length_size = self.length_unpacker.size

    def process_document(self, data):
        """
        Decode a document's raw data into a list of rows, each a list of floats.

        :param data: raw (string) data of a single document
        :return: list of lists of floats
        :raises IOError: if the data is truncated or cannot be decoded
        """
        reader = StringIO(data)
        return list(self.read_rows(reader))

    def read_rows(self, reader):
        """
        Generator over the rows that can be read from ``reader``.

        :param reader: file-like object whose ``read(n)`` returns up to n bytes
            and an empty string at the end of the data
        :return: yields each row as a list of floats
        :raises IOError: if the data ends part-way through a row-length header
            or a row, or if a value cannot be decoded
        """
        while True:
            # First read an int that tells us how long the row is
            row_length_string = reader.read(self.length_size)
            if not row_length_string:
                # Reached end of file
                break
            try:
                row_length = self.length_unpacker.unpack(row_length_string)[0]
            except struct.error as e:
                # A partial header means the data was cut off: report it in the same
                # way as corrupt float data, instead of leaking a struct.error
                raise IOError("error interpreting row length: %s" % e)

            # Read the whole row, one float at a time
            row = []
            for _ in range(row_length):
                num_string = reader.read(self.value_size)
                if not num_string:
                    raise IOError("file ended mid-row")
                try:
                    num = self.unpacker.unpack(num_string)[0]
                except struct.error as e:
                    raise IOError("error interpreting float data: %s" % e)
                row.append(num)
            yield row
class FloatListsFormatter(DocumentBrowserFormatter):
    """
    Browser formatter for documents of type :class:`FloatListsDocumentType`.

    """
    DATATYPE = FloatListsDocumentType

    def format_document(self, doc):
        """
        Render a document as text: one line per list, with the values of each list
        separated by single spaces and shown to four decimal places.

        :param doc: iterable of iterables of floats
        :return: the formatted string
        """
        rendered_rows = []
        for row in doc:
            cells = ["%.4f" % value for value in row]
            rendered_rows.append(" ".join(cells))
        return "\n".join(rendered_rows)
class FloatListsDocumentCorpus(TarredCorpus):
    """
    Tarred corpus in which every document holds a number of lists of floats.

    In contrast to :class:`~pimlico.datatypes.table.IntegerTableDocumentCorpus`, the
    lists are not required to all have the same length. The price for this is a
    storage format (and probably I/O speed) that is not quite as efficient, though
    it still beats storing the numbers as strings or JSON objects.

    Values are stored as C doubles, taking 8 bytes each. There is currently no way
    to change this. Storing C floats instead would be an alternative, which would
    (almost) halve the storage size at the cost of precision.

    """
    data_point_type = FloatListsDocumentType
    datatype_name = "float_lists_corpus"
class FloatListsDocumentCorpusWriter(TarredCorpusWriter):
    """
    Corpus writer that encodes each document as a sequence of rows of floats.

    Every row is written as a row-length header (packed with the struct returned
    by ``get_struct(2, False, 1)``) followed by the row's values, each packed with
    the struct format ``"<d"``.

    """
    def __init__(self, base_dir, **kwargs):
        # Tell TarredCorpus not to encode/decode text data
        kwargs["encoding"] = None
        super(FloatListsDocumentCorpusWriter, self).__init__(base_dir, **kwargs)
        # Struct for writing individual floats (actually doubles)
        self.packer = struct.Struct("<d")
        # Prepare another for encoding the row lengths (ints)
        self.length_struct = get_struct(2, False, 1)

    @pass_up_invalid
    def document_to_raw_data(self, doc):
        """
        Encode a document as raw data.

        :param doc: iterable of rows, each a sized sequence of floats
        :return: the encoded data as a string
        :raises ValueError: if a row's length or one of its values cannot be packed
        """
        raw_data = StringIO()
        for row in doc:
            # Should be rows of floats
            try:
                raw_data.write(self.length_struct.pack(len(row)))
            except struct.error as e:
                # Reported separately from value errors, so that the message does not
                # blame the float format for a row length that cannot be encoded
                raise ValueError("error encoding length (%d) of float row %s: %s" % (len(row), row, e))

            try:
                for num in row:
                    raw_data.write(self.packer.pack(num))
            except struct.error as e:
                raise ValueError("error encoding float row %s using struct format %s: %s" %
                                 (row, self.packer.format, e))
        return raw_data.getvalue()
class FloatListDocumentType(RawDocumentType):
    """
    Counterpart of FloatListsDocumentType in which a whole document is one flat
    list of floats, instead of a list of rows.

    """
    def __init__(self, options, metadata):
        super(FloatListDocumentType, self).__init__(options, metadata)
        # One struct decodes a single value (format "<d", i.e. a double)
        float_struct = struct.Struct("<d")
        self.unpacker = float_struct
        self.value_size = float_struct.size

    def process_document(self, data):
        """
        Decode a document's raw data into a list of floats.
        """
        return list(self.read_floats(StringIO(data)))

    def read_floats(self, reader):
        """
        Generator over the floats read from ``reader``, which is consumed in chunks
        of ``self.value_size`` bytes until a read returns the empty string.

        Raises IOError if a chunk cannot be unpacked as a float.
        """
        # iter() with a sentinel keeps calling read() until it returns ""
        for chunk in iter(lambda: reader.read(self.value_size), ""):
            try:
                value = self.unpacker.unpack(chunk)[0]
            except struct.error as e:
                raise IOError("error interpreting float data: %s" % e)
            yield value
class FloatListDocumentCorpus(TarredCorpus):
    """
    Tarred corpus of float data, where every document is one sequence of floats.

    Each float is stored as a C double, which takes 8 bytes.

    """
    data_point_type = FloatListDocumentType
    datatype_name = "float_list_corpus"
class FloatListDocumentCorpusWriter(TarredCorpusWriter):
    """
    Corpus writer that encodes each document as a flat sequence of floats, each
    packed with the struct format ``"<d"``.

    """
    def __init__(self, base_dir, **kwargs):
        # The data is binary, so TarredCorpus must not encode/decode it as text
        kwargs.update(encoding=None)
        super(FloatListDocumentCorpusWriter, self).__init__(base_dir, **kwargs)
        # Packs a single value (format "<d", i.e. a double)
        self.packer = struct.Struct("<d")

    @pass_up_invalid
    def document_to_raw_data(self, doc):
        """
        Encode a document (an iterable of floats) as raw data.

        Raises ValueError if one of the values cannot be packed.
        """
        encoded = []
        for value in doc:
            try:
                encoded.append(self.packer.pack(value))
            except struct.error as e:
                raise ValueError("error encoding float data %s using struct format %s: %s" %
                                 (value, self.packer.format, e))
        # The packed values are byte strings: concatenate them in order
        return b"".join(encoded)