Source code for pimlico.datatypes.corpora.table

# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

"""
Corpora where each document is a table, i.e. a list of lists, where each
row has the same length and each column has a single datatype.
This is designed to be fast to read, but is not a very flexible datatype.

"""

from future import standard_library
standard_library.install_aliases()
from builtins import object

import struct
from io import BytesIO

from pimlico.datatypes.corpora.data_points import RawDocumentType
from pimlico.utils.core import cached_property

BYTE_FORMATS = {
    # (num bytes, signed)
    (1, True): "b",   # signed char
    (1, False): "B",  # unsigned char
    (2, True): "h",   # signed short
    (2, False): "H",  # unsigned short
    (4, True): "l",   # signed long
    (4, False): "L",  # unsigned long
    (8, True): "q",   # signed long long
    (8, False): "Q",  # unsigned long long
}
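
# A quick sanity check one can run (illustrative, not part of the module):
# with the little-endian "<" prefix used below, struct uses standard sizes,
# so each code packs to exactly the byte width given in the key:
#
#     >>> import struct
#     >>> [struct.calcsize("<" + code) for code in "bBhHlLqQ"]
#     [1, 1, 2, 2, 4, 4, 8, 8]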


def get_struct(bytes, signed, row_length):
    # Put together the formatting string for converting ints to bytes
    if (bytes, signed) not in BYTE_FORMATS:
        raise ValueError("invalid specification for int format: signed=%s, bytes=%s. signed must be bool, "
                         "bytes in [1, 2, 4, 8]" % (signed, bytes))
    format_string = "<" + BYTE_FORMATS[(bytes, signed)] * row_length
    # Compile the format for faster encoding
    return struct.Struct(format_string)
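
# Illustrative sketch (not part of the module) of what get_struct builds: for
# 2-byte unsigned ints and rows of 3 values, the format string is "<HHH", and
# the compiled Struct packs and unpacks whole rows at once:
#
#     >>> s = get_struct(2, False, 3)
#     >>> s.size
#     6
#     >>> s.pack(1, 2, 3)
#     b'\x01\x00\x02\x00\x03\x00'
#     >>> s.unpack(b'\x01\x00\x02\x00\x03\x00')
#     (1, 2, 3)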
class IntegerTableDocumentType(RawDocumentType):
    """
    Corpus of tabular integer data: each doc contains rows of ints, where each
    row contains the same number of values. This allows a more compact
    representation, which doesn't require converting the ints to strings or
    scanning for line ends, so is quite a bit quicker and results in much
    smaller file sizes. The downside is that the files are not human-readable.

    By default, the ints are stored as 8-byte (long long) values. If you know
    you don't need ints this big, you can choose to use 1, 2 or 4 bytes.
    By default, the ints are unsigned, but they may be signed.

    """
    metadata_defaults = dict(RawDocumentType.metadata_defaults, **{
        "bytes": (
            8,
            "Number of bytes to use to represent each int. Default: 8",
        ),
        "signed": (
            False,
            "Store signed integers. Default: False",
        ),
        "row_length": (
            1,
            "Row length - number of integers in each row. Default: 1",
        ),
    })
    data_point_type_supports_python2 = True
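
    # To illustrate the compactness claim above (illustrative numbers, not
    # part of the module): a row of 4 values stored with bytes=2 always takes
    # 8 bytes, while the same row written as text, e.g. b"1023 5 99 60000\n",
    # takes 16 bytes here and grows with the number of digits:
    #
    #     >>> get_struct(2, False, 4).size
    #     8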
    def reader_init(self, reader):
        super(IntegerTableDocumentType, self).reader_init(reader)
        self.bytes = self.metadata["bytes"]
        self.signed = self.metadata["signed"]
        self.row_length = self.metadata["row_length"]
        self.struct = get_struct(self.bytes, self.signed, self.row_length)

    def writer_init(self, writer):
        super(IntegerTableDocumentType, self).writer_init(writer)
        # Metadata should have been set by this point, using kwargs to override the defaults
        self.bytes = writer.metadata["bytes"]
        self.signed = writer.metadata["signed"]
        self.row_length = writer.metadata["row_length"]
        self.struct = get_struct(self.bytes, self.signed, self.row_length)

    class Document(object):
        keys = ["table"]

        def raw_to_internal(self, raw_data):
            reader = BytesIO(raw_data)
            table = list(self.read_rows(reader))
            return {
                "table": table,
            }

        @property
        def table(self):
            return self.internal_data["table"]

        @cached_property
        def row_size(self):
            return self.data_point_type.struct.size

        def read_rows(self, reader):
            while True:
                # Read data for a single row
                row_string = reader.read(self.row_size)
                if len(row_string) == 0:
                    # Reached end of file
                    break
                try:
                    row = self.data_point_type.struct.unpack(row_string)
                except struct.error as e:
                    if len(row_string) < self.row_size:
                        # Got a partial row at end of file
                        raise IOError("found partial row at end of file: last row has byte length %d, not %d" %
                                      (len(row_string), self.row_size))
                    else:
                        raise IOError("error interpreting row: %s" % e)
                yield row
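
        # The loop above just slices the stream into fixed-size chunks, one
        # per row. A standalone sketch of the same idea (hypothetical values,
        # not part of the module):
        #
        #     >>> s = get_struct(2, False, 3)
        #     >>> data = BytesIO(s.pack(1, 2, 3) + s.pack(4, 5, 6))
        #     >>> s.unpack(data.read(s.size))
        #     (1, 2, 3)
        #     >>> s.unpack(data.read(s.size))
        #     (4, 5, 6)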
        def internal_to_raw(self, internal_data):
            raw_data = BytesIO()
            for row in internal_data["table"]:
                # Should be rows of ints of the correct length
                try:
                    raw_data.write(bytes(self.data_point_type.struct.pack(*row)))
                except struct.error as e:
                    # Instead of checking the rows before encoding, catch any
                    # encoding errors and give helpful messages
                    if len(row) != self.data_point_type.row_length:
                        raise ValueError("tried to write a row of length %d to a table writer with row length %d" %
                                         (len(row), self.data_point_type.row_length))
                    else:
                        raise ValueError("error encoding int row %s using struct format %s: %s" %
                                         (row, self.data_point_type.struct.format, e))
            return raw_data.getvalue()
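
        # Round-trip sketch (hypothetical table, no Pimlico reader/writer
        # machinery): packing each row and concatenating, as internal_to_raw
        # does, then unpacking fixed-size chunks, as read_rows does, recovers
        # the original table:
        #
        #     >>> s = get_struct(1, True, 2)
        #     >>> table = [(-1, 5), (7, -8)]
        #     >>> raw = b"".join(s.pack(*row) for row in table)
        #     >>> [s.unpack(raw[i:i + s.size]) for i in range(0, len(raw), s.size)]
        #     [(-1, 5), (7, -8)]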