Source code for pimlico.datatypes.ints

# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html

import struct
from cStringIO import StringIO

from pimlico.datatypes.documents import RawDocumentType
from pimlico.datatypes.table import get_struct
from pimlico.datatypes.tar import TarredCorpus, TarredCorpusWriter, pass_up_invalid


[docs]class IntegerListsDocumentType(RawDocumentType): def __init__(self, options, metadata): super(IntegerListsDocumentType, self).__init__(options, metadata) self._unpacker = None self.length_unpacker = get_struct(2, False, 1) self.length_size = self.length_unpacker.size self._int_size = None @property def unpacker(self): # Only ready when we've got metadata (data_ready() == True) if self._unpacker is None: # Read the metadata to prepare the reader struct bytes, signed = struct.unpack("B?", self.metadata["struct_format"]) # Compile a struct for unpacking individual ints quickly self._unpacker = get_struct(bytes, signed, 1) self._int_size = self._unpacker.size return self._unpacker
[docs] def process_document(self, data): reader = StringIO(data) return list(self.read_rows(reader))
[docs] def read_rows(self, reader): unpacker = self.unpacker int_size = self._int_size def _read_row(length): for i in range(length): num_string = reader.read(int_size) if num_string == "": raise IOError("file ended mid-row") try: yield unpacker.unpack(num_string)[0] except struct.error, e: raise IOError("error interpreting int data: %s" % e) while True: # First read an int that tells us how long the row is row_length_string = reader.read(self.length_size) if row_length_string == "": # Reached end of file break row_length = self.length_unpacker.unpack(row_length_string)[0] # Read the whole row, one int at a time yield list(_read_row(row_length))
[docs]class IntegerListsDocumentCorpus(TarredCorpus): """ Corpus of integer list data: each doc contains lists of ints. Unlike :class:`~pimlico.datatypes.table.IntegerTableDocumentCorpus`, they are not all constrained to have the same length. The downside is that the storage format (and probably I/O speed) isn't quite as good. It's still better than just storing ints as strings or JSON objects. By default, the ints are stored as C longs, which use 4 bytes. If you know you don't need ints this big, you can choose 1 or 2 bytes, or even 8 (long long). By default, the ints are unsigned, but they may be signed. """ datatype_name = "integer_lists_corpus" data_point_type = IntegerListsDocumentType
[docs]class IntegerListsDocumentCorpusWriter(TarredCorpusWriter): def __init__(self, base_dir, signed=False, bytes=8, **kwargs): # Tell TarredCorpus not to encode/decode text data kwargs["encoding"] = None super(IntegerListsDocumentCorpusWriter, self).__init__(base_dir, **kwargs) self.signed = signed self.bytes = bytes # Prepare a struct for efficiently encoding int rows as bytes self.num_struct = get_struct(bytes, signed, 1) # Prepare another for encoding the row lengths self.length_struct = get_struct(2, False, 1) # Write the metadata to denote the representation format self.metadata["struct_format"] = struct.pack("B?", bytes, signed) @pass_up_invalid def document_to_raw_data(self, doc): raw_data = StringIO() for row in doc: # Should be rows of ints try: raw_data.write(self.length_struct.pack(len(row))) for num in row: raw_data.write(self.num_struct.pack(num)) except struct.error, e: raise ValueError("error encoding int row %s using struct format %s: %s" % (row, self.num_struct.format, e)) return raw_data.getvalue()
[docs]class IntegerListDocumentType(RawDocumentType): """ Like IntegerListsDocumentType, but each document is treated as a single list of integers. """ def __init__(self, options, metadata): super(IntegerListDocumentType, self).__init__(options, metadata) self._unpacker = None @property def unpacker(self): # Only ready when we've got metadata (data_ready() == True) if self._unpacker is None: # Read the metadata to prepare the reader struct bytes, signed = struct.unpack("B?", self.metadata["struct_format"]) # Compile a struct for unpacking individual ints quickly self._unpacker = get_struct(bytes, signed, 1) return self._unpacker @property def int_size(self): return self.unpacker.size
[docs] def process_document(self, data): reader = StringIO(data) return list(self.read_ints(reader))
[docs] def read_ints(self, reader): while True: # Read the whole document, one int at a time num_string = reader.read(self.int_size) if num_string == "": return try: num = self.unpacker.unpack(num_string)[0] except struct.error, e: raise IOError("error interpreting int data: %s" % e) yield num
[docs]class IntegerListDocumentCorpus(TarredCorpus): """ Corpus of integer data: each doc contains a single sequence of ints. By default, the ints are stored as C longs, which use 4 bytes. If you know you don't need ints this big, you can choose 1 or 2 bytes, or even 8 (long long). By default, the ints are unsigned, but they may be signed. """ datatype_name = "integer_list_corpus" data_point_type = IntegerListDocumentType
[docs]class IntegerListDocumentCorpusWriter(TarredCorpusWriter): def __init__(self, base_dir, signed=False, bytes=8, **kwargs): # Tell TarredCorpus not to encode/decode text data kwargs["encoding"] = None super(IntegerListDocumentCorpusWriter, self).__init__(base_dir, **kwargs) self.signed = signed self.bytes = bytes # Prepare a struct for efficiently encoding int rows as bytes self.num_struct = get_struct(bytes, signed, 1) # Write the metadata to denote the representation format self.metadata["struct_format"] = struct.pack("B?", bytes, signed) @pass_up_invalid def document_to_raw_data(self, doc): raw_data = StringIO() # Doc should be a list of ints for num in doc: try: raw_data.write(self.num_struct.pack(num)) except struct.error, e: raise ValueError("error encoding int data %s using struct format %s: %s" % (num, self.num_struct.format, e)) return raw_data.getvalue()