Source code for pimlico.datatypes.corpora.table

# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

"""
Corpora where each document is a table, i.e. a list of lists, where each
row has the same length and each column has a single datatype.
This is designed to be fast to read, but is not a very flexible datatype.

"""

from future import standard_library
standard_library.install_aliases()
from builtins import object

import struct
from io import BytesIO

from pimlico.datatypes.corpora.data_points import RawDocumentType
from pimlico.utils.core import cached_property

BYTE_FORMATS = {
    # (num bytes, signed)
    (1, True): "b",   # signed char
    (1, False): "B",  # unsigned char
    (2, True): "h",   # signed short
    (2, False): "H",  # unsigned short
    (4, True): "l",   # signed long
    (4, False): "L",  # unsigned long
    (8, True): "q",   # signed long long
    (8, False): "Q",  # unsigned long long
}
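
# A quick sanity check one can run (illustrative, not part of the module):
# with the little-endian "<" prefix used below, struct uses standard sizes,
# so each code packs to exactly the byte width given in the key:
#
#     >>> import struct
#     >>> [struct.calcsize("<" + code) for code in "bBhHlLqQ"]
#     [1, 1, 2, 2, 4, 4, 8, 8]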


def get_struct(bytes, signed, row_length):
    # Put together the formatting string for converting ints to bytes
    if (bytes, signed) not in BYTE_FORMATS:
        raise ValueError("invalid specification for int format: signed=%s, bytes=%s. signed must be bool, "
                         "bytes in [1, 2, 4, 8]" % (signed, bytes))
    format_string = "<" + BYTE_FORMATS[(bytes, signed)] * row_length
    # Compile the format for faster encoding
    return struct.Struct(format_string)
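
# Illustrative sketch (not part of the module) of what get_struct builds: for
# 2-byte unsigned ints and rows of 3 values, the format string is "<HHH", and
# the compiled Struct packs and unpacks whole rows at once:
#
#     >>> s = get_struct(2, False, 3)
#     >>> s.size
#     6
#     >>> s.pack(1, 2, 3)
#     b'\x01\x00\x02\x00\x03\x00'
#     >>> s.unpack(b'\x01\x00\x02\x00\x03\x00')
#     (1, 2, 3)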
class IntegerTableDocumentType(RawDocumentType):
    """
    Corpus of tabular integer data: each doc contains rows of ints, where each
    row contains the same number of values. This allows a more compact
    representation, which doesn't require converting the ints to strings or
    scanning for line ends, so is quite a bit quicker and results in much
    smaller file sizes. The downside is that the files are not human-readable.

    By default, the ints are stored as 8-byte (long long) values. If you know
    you don't need ints this big, you can choose to use 1, 2 or 4 bytes.
    By default, the ints are unsigned, but they may be signed.

    """
    metadata_defaults = dict(RawDocumentType.metadata_defaults, **{
        "bytes": (
            8,
            "Number of bytes to use to represent each int. Default: 8",
        ),
        "signed": (
            False,
            "Store signed integers. Default: False",
        ),
        "row_length": (
            1,
            "Row length - number of integers in each row. Default: 1",
        ),
    })
    data_point_type_supports_python2 = True
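
    # To illustrate the compactness claim above (illustrative numbers, not
    # part of the module): a row of 4 values stored with bytes=2 always takes
    # 8 bytes, while the same row written as text, e.g. b"1023 5 99 60000\n",
    # takes 16 bytes here and grows with the number of digits:
    #
    #     >>> get_struct(2, False, 4).size
    #     8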
    def reader_init(self, reader):
        super(IntegerTableDocumentType, self).reader_init(reader)
        self.bytes = self.metadata["bytes"]
        self.signed = self.metadata["signed"]
        self.row_length = self.metadata["row_length"]
        self.struct = get_struct(self.bytes, self.signed, self.row_length)

    def writer_init(self, writer):
        super(IntegerTableDocumentType, self).writer_init(writer)
        # Metadata should have been set by this point, using kwargs to override the defaults
        self.bytes = writer.metadata["bytes"]
        self.signed = writer.metadata["signed"]
        self.row_length = writer.metadata["row_length"]
        self.struct = get_struct(self.bytes, self.signed, self.row_length)

    class Document(object):
        keys = ["table"]

        def raw_to_internal(self, raw_data):
            reader = BytesIO(raw_data)
            table = list(self.read_rows(reader))
            return {
                "table": table,
            }

        @property
        def table(self):
            return self.internal_data["table"]

        @cached_property
        def row_size(self):
            return self.data_point_type.struct.size

        def read_rows(self, reader):
            while True:
                # Read data for a single row
                row_string = reader.read(self.row_size)
                if len(row_string) == 0:
                    # Reached end of file
                    break
                try:
                    row = self.data_point_type.struct.unpack(row_string)
                except struct.error as e:
                    if len(row_string) < self.row_size:
                        # Got a partial row at end of file
                        raise IOError("found partial row at end of file: last row has byte length %d, not %d" %
                                      (len(row_string), self.row_size))
                    else:
                        raise IOError("error interpreting row: %s" % e)
                yield row
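
        # The loop above just slices the stream into fixed-size chunks, one
        # per row. A standalone sketch of the same idea (hypothetical values,
        # not part of the module):
        #
        #     >>> s = get_struct(2, False, 3)
        #     >>> data = BytesIO(s.pack(1, 2, 3) + s.pack(4, 5, 6))
        #     >>> s.unpack(data.read(s.size))
        #     (1, 2, 3)
        #     >>> s.unpack(data.read(s.size))
        #     (4, 5, 6)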
        def internal_to_raw(self, internal_data):
            raw_data = BytesIO()
            for row in internal_data["table"]:
                # Should be rows of ints of the correct length
                try:
                    raw_data.write(bytes(self.data_point_type.struct.pack(*row)))
                except struct.error as e:
                    # Instead of checking the rows before encoding, catch any
                    # encoding errors and give helpful messages
                    if len(row) != self.data_point_type.row_length:
                        raise ValueError("tried to write a row of length %d to a table writer with row length %d" %
                                         (len(row), self.data_point_type.row_length))
                    else:
                        raise ValueError("error encoding int row %s using struct format %s: %s" %
                                         (row, self.data_point_type.struct.format, e))
            return raw_data.getvalue()
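
        # Round-trip sketch (hypothetical table, no Pimlico reader/writer
        # machinery): packing each row and concatenating, as internal_to_raw
        # does, then unpacking fixed-size chunks, as read_rows does, recovers
        # the original table:
        #
        #     >>> s = get_struct(1, True, 2)
        #     >>> table = [(-1, 5), (7, -8)]
        #     >>> raw = b"".join(s.pack(*row) for row in table)
        #     >>> [s.unpack(raw[i:i + s.size]) for i in range(0, len(raw), s.size)]
        #     [(-1, 5), (7, -8)]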