# Source code for pimlico.datatypes.table

# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html

import struct
from StringIO import StringIO

from pimlico.datatypes.documents import RawDocumentType
from pimlico.datatypes.tar import TarredCorpus, TarredCorpusWriter, pass_up_invalid

# Maps (num bytes, signed) -> struct format character for a single integer.
# Each width is listed once with its signed code; the unsigned code is the
# same letter in upper case.
BYTE_FORMATS = dict(
    ((num_bytes, is_signed), code)
    for num_bytes, signed_code in (
        (1, "b"),  # char
        (2, "h"),  # short
        (4, "l"),  # long
        (8, "q"),  # long long
    )
    for is_signed, code in ((True, signed_code), (False, signed_code.upper()))
)


def get_struct(bytes, signed, row_length):
    """
    Compile a ``struct.Struct`` that packs/unpacks one row of a table.

    A row is ``row_length`` integers laid out back to back, each one using the
    format character that ``BYTE_FORMATS`` gives for ``(bytes, signed)``.

    :param bytes: width of each integer in bytes (1, 2, 4 or 8)
    :param signed: True for signed integers, False for unsigned
    :param row_length: number of integers in every row
    :return: compiled ``struct.Struct`` for a whole row
    :raises ValueError: if ``(bytes, signed)`` has no entry in ``BYTE_FORMATS``
    """
    int_code = BYTE_FORMATS.get((bytes, signed))
    if int_code is None:
        raise ValueError("invalid specification for int format: signed=%s, bytes=%s. signed must be bool, "
                         "bytes in [1, 2, 4, 8]" % (signed, bytes))
    # "<" selects little-endian, standard-size, unpadded layout, so the row size
    # is exactly bytes * row_length. Compiling once makes repeated encoding faster.
    return struct.Struct("<" + int_code * row_length)
class IntegerTableDocumentType(RawDocumentType):
    """
    Document type for tables of integers stored as fixed-width binary rows.

    The row layout (bytes per int, signedness, ints per row) is read from the
    ``struct_format`` metadata entry, which holds those three values packed
    with the struct format ``"B?H"``.

    """
    def __init__(self, options, metadata):
        super(IntegerTableDocumentType, self).__init__(options, metadata)
        # Compiled lazily by the unpacker property, since it needs the metadata
        self._unpacker = None

    @property
    def unpacker(self):
        """
        Compiled ``struct.Struct`` for decoding a single row, built on first use
        from the ``struct_format`` metadata and cached afterwards.
        """
        # Only ready when we've got metadata (data_ready() == True)
        if self._unpacker is None:
            # Read the metadata to prepare the reader struct
            num_bytes, signed, row_length = struct.unpack("B?H", self.metadata["struct_format"])
            # Compile a struct for unpacking these quickly
            self._unpacker = get_struct(num_bytes, signed, row_length)
        return self._unpacker

    @property
    def row_size(self):
        """Number of bytes that one encoded row occupies."""
        return self.unpacker.size

    def process_document(self, data):
        """
        Decode a whole raw document into a list of rows.

        :param data: raw document contents, a whole number of encoded rows
        :return: list of rows, each a tuple of ints
        :raises IOError: if the data ends in a partial row or a row can't be decoded
        """
        reader = StringIO(data)
        return list(self.read_rows(reader))

    def read_rows(self, reader):
        """
        Generator that reads rows one at a time from a file-like object until
        it is exhausted.

        :param reader: object with a ``read(size)`` method returning raw row data
        :return: iterator over rows, each a tuple of ints
        :raises IOError: if the data ends in a partial row or a row can't be decoded
        """
        # The layout doesn't change while reading, so look these up once
        # instead of going through the properties on every row
        unpacker = self.unpacker
        row_size = unpacker.size

        while True:
            # Read data for a single row
            row_string = reader.read(row_size)
            if not row_string:
                # Reached end of file. A truthiness test rather than == "" so
                # that an empty byte string from a binary reader also counts
                break
            try:
                row = unpacker.unpack(row_string)
            except struct.error as e:
                if len(row_string) < row_size:
                    # Got a partial row at end of file
                    raise IOError("found partial row at end of file: last row has byte length %d, not %d" %
                                  (len(row_string), row_size))
                else:
                    raise IOError("error interpreting row: %s" % e)
            yield row
class IntegerTableDocumentCorpus(TarredCorpus):
    """
    Corpus of tabular integer data: each doc contains rows of ints, where each row contains the same
    number of values. This allows a more compact representation, which doesn't require converting the
    ints to strings or scanning for line ends, so is quite a bit quicker and results in much smaller
    file sizes. The downside is that the files are not human-readable.

    The width and signedness of the ints are chosen when the corpus is written. By default,
    IntegerTableDocumentCorpusWriter stores them as unsigned C long longs, which use 8 bytes. If you
    know you don't need ints this big, you can choose 1, 2 or 4 bytes. The ints may also be signed.

    """
    datatype_name = "integer_table_corpus"
    data_point_type = IntegerTableDocumentType
class IntegerTableDocumentCorpusWriter(TarredCorpusWriter):
    """
    Writer for corpora of integer tables, where every document is a sequence of
    rows of ``row_length`` ints, encoded as fixed-width binary values.

    The chosen layout is recorded in the ``struct_format`` metadata entry so that
    it can be recovered when the data is read back.

    """
    def __init__(self, base_dir, row_length, signed=False, bytes=8, **kwargs):
        """
        :param base_dir: passed through to the parent writer
        :param row_length: number of ints in every row
        :param signed: whether the ints are stored signed (default: unsigned)
        :param bytes: bytes used for each int: 1, 2, 4 or 8 (default: 8)
        :param kwargs: further options for the parent writer; ``encoding`` is
            always overridden with None
        :raises ValueError: if (bytes, signed) is not a supported int format
        """
        # Tell TarredCorpus not to encode/decode text data
        kwargs["encoding"] = None
        super(IntegerTableDocumentCorpusWriter, self).__init__(base_dir, **kwargs)
        self.row_length = row_length
        self.signed = signed
        self.bytes = bytes
        # Prepare a struct for efficiently encoding int rows as bytes
        self.struct = get_struct(bytes, signed, row_length)
        # Write the metadata to denote the representation format
        # NOTE(review): "B?H" has no byte-order prefix, so this uses the native byte order of the
        # writing machine, and "H" limits row_length to an unsigned short. Left unchanged, since
        # the reader must use exactly the same format to decode existing corpora
        self.metadata["struct_format"] = struct.pack("B?H", bytes, signed, row_length)

    @pass_up_invalid
    def document_to_raw_data(self, doc):
        """
        Encode a document as the concatenation of its binary-encoded rows.

        :param doc: iterable of rows, each a sequence of ``row_length`` ints
        :return: raw data for the whole document
        :raises ValueError: if a row has the wrong length or contains values that
            can't be encoded in the chosen int format
        """
        raw_data = StringIO()
        pack_row = self.struct.pack
        for row in doc:
            # Should be rows of ints of the correct length
            try:
                encoded_row = pack_row(*row)
            except struct.error as e:
                # Instead of checking the rows before encoding, catch any encoding errors and give
                # helpful messages
                if len(row) != self.row_length:
                    raise ValueError("tried to write a row of length %d to a table writer with row length %d" %
                                     (len(row), self.row_length))
                else:
                    raise ValueError("error encoding int row %s using struct format %s: %s" %
                                     (row, self.struct.format, e))
            raw_data.write(encoded_row)
        return raw_data.getvalue()