# Source code for pimlico.datatypes.corpora.ints

# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

"""
Corpora consisting of lists of ints. These data point types are useful,
for example, for encoding text or other sequence data as integer IDs.
They are designed to be fast to read.

"""
from __future__ import absolute_import

from future import standard_library
standard_library.install_aliases()
from builtins import range
from builtins import object
from builtins import bytes

import struct
from io import StringIO, BytesIO

from pimlico.datatypes.corpora.data_points import RawDocumentType
from pimlico.utils.core import cached_property
from .table import get_struct


class IntegerListsDocumentType(RawDocumentType):
    """
    Corpus of integer list data: each doc contains lists of ints.

    Unlike :class:`~pimlico.datatypes.table.IntegerTableDocumentType`,
    they are not all constrained to have the same length. The downside
    is that the storage format (and I/O speed) isn't quite as good.
    It's still better than just storing ints as strings or JSON objects.

    By default, the ints are stored using 8 bytes each (C long long).
    If you know you don't need ints this big, you can choose 1, 2 or 4
    bytes instead. By default, the ints are unsigned, but they may be
    signed.

    """
    metadata_defaults = dict(RawDocumentType.metadata_defaults, **{
        "bytes": (
            8,
            "Number of bytes to use to represent each int. Default: 8",
        ),
        "signed": (
            False,
            "Stored signed integers. Default: False",
        ),
        "row_length_bytes": (
            2,
            "Number of bytes to use to encode the length of each row. Default: 2. Increase if you "
            "need to store very long lists"
        ),
    })
    data_point_type_supports_python2 = True

    @property
    def bytes(self):
        # Number of bytes used to store each int
        return self.metadata.get("bytes", self.metadata_defaults["bytes"][0])

    @property
    def signed(self):
        # Whether the stored ints are signed
        return self.metadata.get("signed", self.metadata_defaults["signed"][0])

    @property
    def row_length_bytes(self):
        # Number of bytes used for each row's length prefix
        return self.metadata.get("row_length_bytes", self.metadata_defaults["row_length_bytes"][0])

    @property
    def int_size(self):
        # Size in bytes of each encoded int
        return self.struct.size

    @property
    def length_size(self):
        # Size in bytes of each encoded row-length prefix
        return self.length_struct.size

    def writer_init(self, writer):
        super(IntegerListsDocumentType, self).writer_init(writer)
        # Metadata should have been set by this point, using kwargs to override the defaults
        self.metadata["bytes"] = writer.metadata["bytes"]
        self.metadata["signed"] = writer.metadata["signed"]
        self.metadata["row_length_bytes"] = writer.metadata["row_length_bytes"]

    @cached_property
    def struct(self):
        # Pre-compiled struct for packing/unpacking a single int
        return get_struct(self.bytes, self.signed, 1)

    @cached_property
    def length_struct(self):
        # We use a separate struct for the row lengths (always unsigned)
        return get_struct(self.row_length_bytes, False, 1)

    def __getstate__(self):
        # Don't pickle the prepared structs, as they don't pickle nicely
        # They get recreated on demand anyway
        state = dict(self.__dict__)
        if "struct" in state:
            del state["struct"]
        if "length_struct" in state:
            del state["length_struct"]
        return state

    class Document(object):
        keys = ["lists"]

        def raw_to_internal(self, raw_data):
            """Decode the raw byte string into a list of lists of ints."""
            reader = BytesIO(raw_data)
            lists = list(self.read_rows(reader))
            return {
                "lists": lists,
            }

        @property
        def lists(self):
            return self.internal_data["lists"]

        def read_rows(self, reader):
            """
            Generate one list of ints per row read from the binary stream.

            Each row is prefixed by its length, encoded with the row-length
            struct; the row's ints follow, each encoded with the int struct.

            :raises IOError: if the stream ends part-way through a row, or
                the bytes cannot be interpreted by the struct
            """
            unpacker = self.data_point_type.struct
            int_size = self.data_point_type.int_size
            length_unpacker = self.data_point_type.length_struct
            length_size = self.data_point_type.length_struct.size

            def _read_row(length):
                for i in range(length):
                    num_string = reader.read(int_size)
                    if num_string == b"":
                        raise IOError("file ended mid-row")
                    try:
                        yield unpacker.unpack(num_string)[0]
                    except struct.error as e:
                        raise IOError("error interpreting int data: %s" % e)

            while True:
                # First read an int that tells us how long the row is
                row_length_string = reader.read(length_size)
                if row_length_string == b"":
                    # Reached end of file
                    break
                row_length = length_unpacker.unpack(row_length_string)[0]
                # Read the whole row, one int at a time
                yield list(_read_row(row_length))

        def internal_to_raw(self, internal_data):
            """Encode the lists of ints as a raw byte string.

            :raises ValueError: if a value cannot be packed with the
                configured struct (e.g. out of range for the int size)
            """
            raw_data = BytesIO()
            for row in internal_data["lists"]:
                # Should be rows of ints
                try:
                    raw_data.write(bytes(self.data_point_type.length_struct.pack(len(row))))
                    for num in row:
                        raw_data.write(bytes(self.data_point_type.struct.pack(num)))
                except struct.error as e:
                    raise ValueError("error encoding int row %s using struct format %s: %s" %
                                     (row, self.data_point_type.struct.format, e))
            return raw_data.getvalue()
class IntegerListDocumentType(RawDocumentType):
    """
    Corpus of integer data: each doc contains a single sequence of ints.

    Like :class:`IntegerListsDocumentType`, but each document is treated
    as a single list of integers.

    By default, the ints are stored using 8 bytes each (C long long).
    If you know you don't need ints this big, you can choose 1, 2 or 4
    bytes instead. By default, the ints are unsigned, but they may be
    signed.

    """
    metadata_defaults = dict(RawDocumentType.metadata_defaults, **{
        "bytes": (
            8,
            "Number of bytes to use to represent each int. Default: 8",
        ),
        "signed": (
            False,
            "Stored signed integers. Default: False",
        ),
    })
    data_point_type_supports_python2 = True

    def reader_init(self, reader):
        super(IntegerListDocumentType, self).reader_init(reader)
        # Pick up the storage parameters from the previously written metadata
        self.bytes = self.metadata["bytes"]
        self.signed = self.metadata["signed"]
        self.int_size = self.struct.size

    def writer_init(self, writer):
        super(IntegerListDocumentType, self).writer_init(writer)
        # Metadata should have been set by this point, using kwargs to override the defaults
        self.bytes = writer.metadata["bytes"]
        self.signed = writer.metadata["signed"]

    @cached_property
    def struct(self):
        # Pre-compiled struct for packing/unpacking a single int
        return get_struct(self.bytes, self.signed, 1)

    def __getstate__(self):
        # Don't pickle the prepared struct, as it doesn't pickle nicely
        # It gets recreated on demand anyway
        state = dict(self.__dict__)
        if "struct" in state:
            del state["struct"]
        return state

    class Document(object):
        keys = ["list"]

        def raw_to_internal(self, raw_data):
            """Decode the raw byte string into a single list of ints."""
            reader = BytesIO(raw_data)
            lst = list(self.read_rows(reader))
            return {
                "list": lst,
            }

        @property
        def list(self):
            return self.internal_data["list"]

        def read_rows(self, reader):
            """
            Generate the ints of the document, read one at a time from the
            binary stream until it is exhausted.

            :raises IOError: if the bytes cannot be interpreted by the struct
            """
            while True:
                # Read the whole document, one int at a time
                num_string = reader.read(self.data_point_type.int_size)
                if len(num_string) == 0:
                    return
                try:
                    num = self.data_point_type.struct.unpack(num_string)[0]
                except struct.error as e:
                    raise IOError("error interpreting int data: %s" % e)
                yield num

        def internal_to_raw(self, internal_data):
            """Encode the list of ints as a raw byte string.

            :raises ValueError: if a value cannot be packed with the
                configured struct (e.g. out of range for the int size)
            """
            raw_data = BytesIO()
            # Doc should be a list of ints
            for num in internal_data["list"]:
                try:
                    raw_data.write(self.data_point_type.struct.pack(num))
                except struct.error as e:
                    raise ValueError("error encoding int data %s using struct format %s: %s" %
                                     (num, self.data_point_type.struct.format, e))
            return raw_data.getvalue()
class IntegerDocumentType(RawDocumentType):
    """
    Corpus of integer data: each doc contains a single int.

    This may be useful, for example, for storing predicted or gold
    standard class labels for documents.

    By default, the ints are stored using 8 bytes each (C long long).
    If you know you don't need ints this big, you can choose 1, 2 or 4
    bytes instead. By default, the ints are unsigned, but they may be
    signed.

    """
    metadata_defaults = dict(RawDocumentType.metadata_defaults, **{
        "bytes": (
            8,
            "Number of bytes to use to represent each int. Default: 8",
        ),
        "signed": (
            False,
            "Stored signed integers. Default: False",
        ),
    })
    data_point_type_supports_python2 = True

    def reader_init(self, reader):
        super(IntegerDocumentType, self).reader_init(reader)
        # Pick up the storage parameters from the previously written metadata
        self.bytes = self.metadata["bytes"]
        self.signed = self.metadata["signed"]
        self.int_size = self.struct.size

    def writer_init(self, writer):
        super(IntegerDocumentType, self).writer_init(writer)
        # Metadata should have been set by this point, using kwargs to override the defaults
        self.bytes = writer.metadata["bytes"]
        self.signed = writer.metadata["signed"]

    @cached_property
    def struct(self):
        # Pre-compiled struct for packing/unpacking a single int
        return get_struct(self.bytes, self.signed, 1)

    def __getstate__(self):
        # Don't pickle the prepared struct, as it doesn't pickle nicely
        # It gets recreated on demand anyway
        state = dict(self.__dict__)
        if "struct" in state:
            del state["struct"]
        return state

    class Document(object):
        keys = ["val"]

        def raw_to_internal(self, raw_data):
            """Decode the raw byte string, which should hold exactly one int.

            :raises IOError: if the data is not exactly one int long, or
                cannot be interpreted by the struct
            """
            # Read the whole document, which should be a single int
            if len(raw_data) != self.data_point_type.int_size:
                raise IOError("expected {} bytes in single-int doc, got {}".format(
                    self.data_point_type.int_size, len(raw_data)))
            try:
                val = self.data_point_type.struct.unpack(raw_data)[0]
            except struct.error as e:
                raise IOError("error interpreting int data: %s" % e)
            return {
                "val": val,
            }

        @property
        def val(self):
            # The document's single int value
            return self.internal_data["val"]

        # Fix: the accessor was previously (mis)named "list", presumably
        # copied from IntegerListDocumentType. Kept as an alias for
        # backward compatibility with any existing callers.
        @property
        def list(self):
            return self.internal_data["val"]

        def internal_to_raw(self, internal_data):
            """Encode the single int as a raw byte string.

            :raises ValueError: if the value cannot be packed with the
                configured struct (e.g. out of range for the int size)
            """
            # Doc should be a single int
            val = internal_data["val"]
            try:
                return self.data_point_type.struct.pack(val)
            except struct.error as e:
                raise ValueError("error encoding int data %s using struct format %s: %s" %
                                 (val, self.data_point_type.struct.format, e))