# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

"""
Document types used to represent the datatypes of individual documents in an IterableCorpus or its subtypes.

"""
from builtins import object
from collections import OrderedDict
from traceback import format_exc

from future.utils import with_metaclass, PY3

__all__ = ["DataPointType", "RawDocumentType", "TextDocumentType", "RawTextDocumentType", "DataPointError",
           "InvalidDocument"]


class DataPointTypeMeta(type):
    """
    Metaclass for all data point type classes. Takes care of preparing a
    Document class for every datatype.

    You should never need to do anything with this: it's used by the base datatype,
    and hence by every other datatype.

    """
    def __new__(cls, *args, **kwargs):
        new_cls = super(DataPointTypeMeta, cls).__new__(cls, *args, **kwargs)
        # Replace the nested Document class, if any, which is used to define the actual
        # document class, with the constructed Document class
        new_cls.Document = DataPointTypeMeta._get_document_cls(new_cls)
        return new_cls

    @staticmethod
    def _get_document_cls(cls):
        # Cache document subtyping, so that the same type object is returned from repeated calls on the
        # same document type
        if not hasattr(cls, "__document_type"):
            if len(cls.__bases__) == 0 or cls.__bases__[0] is object:
                # On the base class, we just return the base document
                cls.__document_type = cls.Document
            else:
                parent_doc_cls = cls.__bases__[0].Document
                my_doc_cls = cls.Document
                if my_doc_cls is parent_doc_cls:
                    # Nothing overridden
                    new_dict = {}
                else:
                    new_dict = dict(my_doc_cls.__dict__)

                # Don't inherit the __dict__ and __weakref__ attributes
                # These will be created on the new type as necessary
                if "__dict__" in new_dict:
                    del new_dict["__dict__"]
                if "__weakref__" in new_dict:
                    del new_dict["__weakref__"]
                # Set the document class's __qualname__ so it's properly treated as a nested class of the data point type
                if PY3:
                    new_dict["__qualname__"] = "{}.Document".format(cls.__qualname__)
                new_dict["__module__"] = cls.__module__

                # If no new documentation is provided, we don't want to inherit the
                # superclass' docstring; instead, provide a generic one, so that the reader
                # follows the link to the parent class to see its documentation
                if "__doc__" not in new_dict or new_dict["__doc__"] is None:
                    new_dict["__doc__"] = "Document class for {}".format(cls.__name__)

                # Perform subclassing so that a new Document is created that is a subclass of the parent's document
                cls.__document_type = type("Document", (parent_doc_cls,), new_dict)
        return cls.__document_type
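
# A minimal sketch of what the metaclass gives you: a nested ``Document`` class with no
# explicit base is replaced by a constructed class that inherits from the parent data
# point type's document. The names ``_ExampleType`` and ``my_key`` below are arbitrary,
# used purely for illustration.
def _example_constructed_document_class():
    class _ExampleType(DataPointType):
        class Document(object):
            keys = ["my_key"]

    # The constructed class subclasses the parent type's document...
    assert issubclass(_ExampleType.Document, DataPointType.Document)
    # ...and carries over what the nested class defined
    assert _ExampleType.Document.keys == ["my_key"]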


class DataPointType(with_metaclass(DataPointTypeMeta, object)):
    """
    Base data-point type for iterable corpora. All iterable corpora should have data-point
    types that are subclasses of this.

    Every data point type has a corresponding document class, which can be accessed as
    `MyDataPointType.Document`. When overriding data point types, you can define a nested
    `Document` class, with no base class, to override parts of the document class'
    functionality or add new methods, etc. This will be used to automatically create the
    `Document` class for the data point type.

    Some data-point types may specify some options, using the ``data_point_type_options``
    field. This works in the same way as PimlicoDatatype's ``datatype_options``. Values for
    the options can be specified on initialization as args or kwargs of the data-point type.

    .. note::

       I have now implemented the data-point type options, just like datatype options.
       However, you cannot yet specify these in a config file when loading a stored corpus.
       An additional datatype option should be added to iterable corpora that allows you to
       specify data point type options for when a datatype is being loaded using a config
       file.

    """
    #: List of (name, cls_path) pairs specifying a standard set of formatters that the user
    #: might want to choose from to view a dataset of this type. The user is not restricted
    #: to this set, but can easily choose these by name, instead of specifying a class path
    #: themselves. The first in the list is the default used if no formatter is specified.
    #: Falls back to DefaultFormatter if empty.
    formatters = []

    #: Metadata keys that should be written for this data point type, with default values
    #: and strings documenting the meaning of the parameter. Used by writers for this data
    #: point type. See :class:`~pimlico.datatypes.PimlicoDatatype.Writer`.
    metadata_defaults = {}

    data_point_type_options = OrderedDict()
    """
    Options specified in the same way as module options that control the nature of the
    document type. These are not things to do with the reading of specific datasets, for
    which the dataset's metadata should be used. These are things that have an impact on
    typechecking, such that options on the two checked datatypes are required to match for
    the datatypes to be considered compatible.

    This corresponds exactly to a PimlicoDatatype's datatype_options and is processed in
    the same way. It should always be an ordered dict, so that the options can be specified
    using positional arguments as well as kwargs and config parameters.

    """

    data_point_type_supports_python2 = True
    """
    Most core Pimlico datatypes support use in Python 2 and 3. Datatypes that do should set
    this to True. If it is False, the datatype is assumed to work only in Python 3.

    Python 2 compatibility requires extra work from the programmer. Datatypes should
    generally declare whether or not they provide this support by overriding this
    explicitly.

    Use ``supports_python2()`` to check whether a data-point type instance supports
    Python 2. (There may be reasons for a datatype's instance to override this class-level
    setting.)

    """

    def __init__(self, *args, **kwargs):
        # This is set when the reader is initialized
        self.metadata = {}

        # Kwargs specify (processed) values for named datatype options
        # Check they're all valid options
        for key in kwargs:
            if key not in self.data_point_type_options:
                raise DataPointError("unknown data-point type option '{}' for {}".format(key, self))
        self.options = dict(kwargs)
        # Positional args can also be used to specify options, using the order in which the options are defined
        for key, arg in zip(self.data_point_type_options.keys(), args):
            if key in kwargs:
                raise DataPointError("data-point type option '{}' given by positional arg was also specified "
                                     "by a kwarg".format(key))
            self.options[key] = arg

        # Check any required options have been given
        for opt_name, opt_dict in self.data_point_type_options.items():
            if opt_dict.get("required", False) and opt_name not in self.options:
                raise DataPointError("{} datatype requires option '{}' to be specified".format(self, opt_name))
        # Finally, set default values for any options not given
        for opt_name, opt_dict in self.data_point_type_options.items():
            if opt_name not in self.options:
                self.options[opt_name] = opt_dict.get("default", None)

    def __call__(self, **kwargs):
        """
        Produce a document of this type. Data is specified using kwargs, which should be
        keys listed in the document type's `keys` list.

        If `raw_data` is given, it should be a bytes object. Other kwargs are then ignored
        and the document is instantiated from the raw data alone. Otherwise, the document
        is instantiated from an internal data dictionary containing all of the specified
        keys.

        """
        if "raw_data" in kwargs:
            raw_data = kwargs["raw_data"]
            internal_data = None
        else:
            if set(kwargs.keys()) != set(self.Document.keys):
                # Check that no unknown keys are given for the document's internal representation
                unknown_keys = set(kwargs.keys()) - set(self.Document.keys)
                if unknown_keys:
                    raise DocumentInitializationError("{} got unknown key(s) {} to create a document".format(
                        self.name,
                        ", ".join("'{}'".format(k) for k in sorted(unknown_keys))
                    ))
                # Check that all required keys are given
                missing_keys = set(self.Document.keys) - set(kwargs.keys())
                if missing_keys:
                    raise DocumentInitializationError("{} requires key(s) {} to create a document, only got {}".format(
                        self.name,
                        ", ".join("'{}'".format(k) for k in sorted(missing_keys)),
                        ", ".join("'{}'".format(k) for k in sorted(kwargs.keys()))
                    ))
            raw_data = None
            internal_data = dict(kwargs)
        return self.Document(self, raw_data=raw_data, internal_data=internal_data)

    def supports_python2(self):
        """
        Just returns data_point_type_supports_python2.
        """
        return self.data_point_type_supports_python2

    def __repr__(self):
        return "{}()".format(self.name)

    @property
    def name(self):
        return self.__class__.__name__

    def check_type(self, supplied_type):
        """
        Type checking for an iterable corpus calls this to check that the supplied data
        point type matches the required one (i.e. this instance). By default, the supplied
        type is simply required to be an instance of the required type (or of one of its
        subclasses). This may be overridden to introduce other type checks.

        """
        return isinstance(supplied_type, type(self))

    def is_type_for_doc(self, doc):
        """
        Check whether the given document is of this type, or of a subclass of this one.

        If the object is not a document instance (or, more precisely, doesn't have a
        data_point_type attr), this will always return False.

        """
        if not hasattr(doc, "data_point_type"):
            # Sometimes things other than document instances will turn up here, e.g. when
            # a doc map module's process_document() produces a dict or raw data output.
            # That's fine: we simply return False
            return False
        return self.check_type(doc.data_point_type)

    def reader_init(self, reader):
        """
        Called when a reader is initialized. May be overridden to perform any tasks
        specific to the data point type that need to be done before the reader starts
        producing data points.

        The super `reader_init()` should be called. This takes care of making reader
        metadata available in the `metadata` attribute of the data point type instance.

        """
        self.metadata = reader.metadata

    def writer_init(self, writer):
        """
        Called when a writer is initialized. May be overridden to perform any tasks
        specific to the data point type that should be done before documents start
        getting written.

        The super `writer_init()` should be called. This takes care of updating the
        writer's metadata from anything in the instance's `metadata` attribute, for any
        keys given in the data point type's `metadata_defaults`.

        """
        metadata = self.metadata or {}
        # Don't need to set default values here, as that's handled by the writer
        # Just pass through any metadata values for the data point type's keys
        for key in self.metadata_defaults:
            if key in metadata:
                writer.metadata[key] = metadata[key]

    @classmethod
    def full_class_name(cls):
        """
        The fully qualified name of the class for this data point type, by which it is
        referenced in config files. Used in docs.

        """
        return "%s.%s" % (cls.__module__, cls.__name__)

    class Document(object):
        """
        The abstract superclass of all documents.

        You do not need to subclass or instantiate these yourself: subclasses are created
        automatically to correspond to each document type. You can add functionality to a
        data point type's document by creating a nested `Document` class. This will
        inherit from the parent data point type's document. This happens automatically -
        you don't need to do it yourself and shouldn't inherit from anything:

        .. code-block:: py

           class MyDataPointType(DataPointType):
               class Document:
                   # Override document things here
                   # Add your own methods, properties, etc for getting data from the document

        A data point type's constructed document class is available as
        `MyDataPointType.Document`.

        Each document type should provide a method to convert from raw data (a bytes
        object in Py3, or ``future``'s backport of ``bytes`` in Py2) to the internal
        representation (an arbitrary dictionary) called `raw_to_internal()`, and another
        to convert the other way called `internal_to_raw()`.

        Both forms of the data are available using the properties `raw_data` and
        `internal_data`, and these methods are called as necessary to convert back and
        forth. This is to avoid unnecessary conversions. For example, if the raw data is
        supplied and then only the raw data is ever used (e.g. passing the document
        straight through and writing out to disk), we want to avoid converting back and
        forth.

        A subtype should then supply methods or properties (typically using the
        cached_property decorator) to provide access to different parts of the data. See
        the many built-in document types for examples of doing this.

        You should not generally need to override the `__init__` method. You may, however,
        wish to override `internal_available()` or `raw_available()`. These are called as
        soon as the internal data or raw data, respectively, become available, which may
        be at instantiation or after conversion. This can be useful if there are bits of
        computation that you want to do on the basis of one of these and then store to
        avoid repeated computation.

        """
        #: Specifies the keys that a document has in its internal data.
        #: Subclasses should specify their keys.
        #: The internal data fields corresponding to these can be accessed as attributes
        #: of the document.
        keys = []

        def __init__(self, data_point_type, raw_data=None, internal_data=None, metadata=None):
            self.data_point_type = data_point_type
            if raw_data is None and internal_data is None:
                raise DataPointError("either raw_data or internal_data must be given when instantiating a document")
            if raw_data is not None and internal_data is not None:
                raise DataPointError("only one of raw_data and internal_data may be given when "
                                     "instantiating a document")
            self.metadata = metadata

            self._raw_data = raw_data
            self._internal_data = internal_data

            if self._raw_data is not None:
                self.raw_available()
            if self._internal_data is not None:
                self.internal_available()

        def raw_to_internal(self, raw_data):
            """
            Take a bytes object containing the raw data for a document, read in from
            disk, and produce a dictionary containing all the processed data in the
            document's internal format.

            You will often want to call the super method and replace values or add to
            the dictionary. Whatever you do, make sure that all the internal data that
            the super type provides is also provided here, so that all of its properties
            and methods work.

            """
            raise NotImplementedError(
                "document type '{}' does not implement raw_to_internal()".format(self.data_point_type))

        def internal_to_raw(self, internal_data):
            """
            Take a dictionary containing all the document's data in its internal format
            and produce a bytes object containing all that data, which can be written
            out to disk.

            """
            raise NotImplementedError(
                "document type '{}' does not implement internal_to_raw()".format(self.data_point_type))

        def raw_available(self):
            """
            Called as soon as the raw data becomes available, either at instantiation or
            conversion.

            """
            return

        def internal_available(self):
            """
            Called as soon as the internal data becomes available, either at
            instantiation or conversion.

            """
            return

        def __repr__(self):
            return "{}()".format(self.__class__.__name__)

        @property
        def raw_data(self):
            if self._raw_data is None:
                try:
                    # Raw data not available yet: convert from internal data
                    self._raw_data = self.internal_to_raw(self._internal_data)
                    self.raw_available()
                except Exception as e:
                    # Catch any exceptions and wrap them.
                    # In particular, it's important to catch attribute errors, as these
                    # otherwise lead to __getattr__ being called and give mystifying errors
                    raise DataConversionError(
                        "{} error converting internal to raw data for document type {}: {}. [{}]".format(
                            type(e).__name__, self.data_point_type, e, format_exc(),
                        ))
            return self._raw_data

        @property
        def internal_data(self):
            if self._internal_data is None:
                try:
                    # Internal data not available yet: convert from raw
                    self._internal_data = self.raw_to_internal(self._raw_data)
                    self.internal_available()
                except Exception as e:
                    # Catch any exceptions and wrap them
                    raise DataConversionError(
                        "{} error converting raw to internal data for document type {}: {}. [{}]".format(
                            type(e).__name__, self.data_point_type, e, format_exc(),
                        ))
            return self._internal_data

        def __reduce__(self):
            return (_DocumentPickler(), (
                self.data_point_type,
                self._raw_data if self._raw_data is not None else self._internal_data,
                self._raw_data is None
            ))

        def __getattr__(self, item):
            # Provide the internal data keys defined by the doc type as attributes for easy access
            if item in self.keys:
                return self.internal_data[item]
            else:
                raise AttributeError("{} document has no attribute or data key '{}'".format(
                    self.data_point_type.name, item
                ))
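

# A minimal sketch of how documents are created from a data point type instance, using
# TextDocumentType (defined below in this module). A document can be built either from
# its internal data keys or from raw bytes; conversion between the two happens lazily,
# on first access.
def _example_creating_documents():
    dp_type = TextDocumentType()

    # From internal data: the kwargs must match Document.keys exactly
    doc = dp_type(text=u"Hello world")
    assert doc.text == u"Hello world"        # internal keys are exposed as attributes
    assert doc.raw_data == b"Hello world"    # computed on demand via internal_to_raw()

    # From raw data: raw_to_internal() is called on first access to internal_data
    doc2 = dp_type(raw_data=b"Hello world")
    assert doc2.internal_data == {"text": u"Hello world"}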


class _DocumentPickler(object):
    """
    Our fancy document typing system means pickle has trouble reconstructing document
    objects. We reduce a document instance to its data point type instance, plus either
    the raw data or the internal data (avoiding conversion) and a flag to say which type
    of data it is. Then we can simply reconstruct the document from the datatype's
    __call__ when unpickling.

    """
    def __call__(self, datatype, data, from_internal):
        if from_internal:
            return datatype(**data)
        else:
            return datatype(raw_data=data)
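

# A minimal sketch of what _DocumentPickler achieves: a pickle round-trip rebuilds the
# document through the data point type's __call__, passing along whichever form of the
# data (raw or internal) was already available, without forcing a conversion.
def _example_pickling_documents():
    import pickle
    doc = TextDocumentType()(text=u"Some text")
    restored = pickle.loads(pickle.dumps(doc))
    assert restored.text == u"Some text"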


class InvalidDocument(DataPointType):
    """
    Widely used in Pimlico to represent a document that is empty not because the original
    input document was empty, but because a module along the way had an error processing
    it. Document readers/writers should generally be robust to this and simply pass the
    whole thing through where possible, so that it's always possible to work out, where
    one of these pops up, where the error occurred.

    """
    data_point_type_supports_python2 = True

    class Document(object):
        keys = ["module_name", "error_info"]

        def raw_to_internal(self, raw_data):
            # Raw data is always encoded as utf-8
            raw_data = raw_data.decode("utf-8")

            if not raw_data.startswith(u"***** EMPTY DOCUMENT *****"):
                raise ValueError(u"tried to read empty document text from invalid text: %s" % raw_data)
            text = raw_data.partition("\n")[2]
            module_line, __, text = text.partition("\n\n")
            module_name = module_line.partition(": ")[2]
            error_info = text.partition("\n")[2]
            return {"module_name": module_name, "error_info": error_info}

        def internal_to_raw(self, internal_data):
            # Encode back to utf-8 for the raw data
            return bytes(
                (u"***** EMPTY DOCUMENT *****\nEmpty due to processing error in module: %s\n\n"
                 u"Full error details:\n%s" % (internal_data["module_name"], internal_data["error_info"])
                 ).encode("utf-8"))

        @property
        def module_name(self):
            return self.internal_data["module_name"]

        @property
        def error_info(self):
            return self.internal_data["error_info"]

        def __unicode__(self):
            return self.raw_data.decode("utf-8")

        def __str__(self):
            # In Py3, __str__ must return a unicode string, not bytes
            if PY3:
                return self.raw_data.decode("utf-8")
            else:
                return self.raw_data

        def __repr__(self):
            return "InvalidDocument()"


def invalid_document(module_name, error_info):
    """
    Convenience function to create an invalid document instance.
    """
    return InvalidDocument()(module_name=module_name, error_info=error_info)


def invalid_document_or_raw(data):
    """
    Takes the given raw data, given as a bytes object, and returns it as an
    InvalidDocument object, if it represents an invalid document, or returns the data
    as-is otherwise.

    """
    is_invalid = False
    try:
        if data.startswith(b"***** EMPTY DOCUMENT *****"):
            # This is the raw data for an invalid doc
            is_invalid = True
    except TypeError:
        if not isinstance(data, bytes):
            raise DataConversionError("invalid_document_or_raw() should be given a bytes object, "
                                      "not {}".format(type(data).__name__))
        else:
            raise
    if is_invalid:
        return InvalidDocument()(raw_data=data)
    else:
        return data


# Alias for backwards compatibility.
# The name no longer correctly describes the function
def invalid_document_or_text(module_name, data):
    return invalid_document_or_raw(data)


def is_invalid_doc(doc):
    """
    Check whether the given document is of the invalid document type
    """
    return isinstance(doc, DataPointType.Document) and InvalidDocument().is_type_for_doc(doc)


def is_invalid_doc_raw_data(data):
    try:
        return data.startswith(b"***** EMPTY DOCUMENT *****")
    except Exception:
        if not isinstance(data, bytes):
            raise TypeError("is_invalid_doc_raw_data() should be called on a document's raw data, which should "
                            "be a bytes instance: got {}".format(type(data).__name__))
        else:
            raise
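

# A minimal sketch of how error documents flow through a pipeline: a failed document is
# replaced by an InvalidDocument recording the failing module and the error details, and
# can be recognized again from its raw bytes. The module name "tokenize" is an arbitrary
# example.
def _example_invalid_documents():
    doc = invalid_document("tokenize", "Traceback: something went wrong")
    assert is_invalid_doc(doc)
    assert doc.module_name == "tokenize"

    # Round-trip through the raw storage format
    raw = doc.raw_data
    assert is_invalid_doc_raw_data(raw)
    assert is_invalid_doc(invalid_document_or_raw(raw))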


class RawDocumentType(DataPointType):
    """
    Base document type. All document types for grouped corpora should be subclasses of
    this.

    It may be used itself as well, where documents are just treated as raw data, though
    most of the time it will be appropriate to use subclasses to provide more information
    and processing operations specific to the datatype.

    """
    data_point_type_supports_python2 = True

    class Document(object):
        keys = ["raw_data"]

        def raw_to_internal(self, raw_data):
            # Just include the raw data itself, so it's possible to convert back to raw data later
            return {"raw_data": raw_data.decode("utf-8")}

        def internal_to_raw(self, internal_data):
            return bytes(internal_data["raw_data"].encode("utf-8"))
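

# A minimal sketch of RawDocumentType's behaviour: the raw bytes are kept as-is, with the
# decoded text available under the single internal key "raw_data".
def _example_raw_documents():
    doc = RawDocumentType()(raw_data=b"arbitrary bytes")
    assert doc.internal_data == {"raw_data": u"arbitrary bytes"}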


class TextDocumentType(RawDocumentType):
    """
    Documents that contain text, most often human-readable documents from a textual
    corpus. Most often used as a superclass for other, more specific, document types.

    This type does no special processing, since the internal format is already a unicode
    string, which is fine for raw text. However, it serves to indicate that the document
    represents text (not just any old raw data).

    The property `text` provides the text, which is, for this base type, just the raw
    data. However, subclasses will override this, since their raw data will contain
    information other than the raw text.

    """
    data_point_type_supports_python2 = True
    formatters = [("text", "pimlico.datatypes.corpora.formatters.text.TextDocumentFormatter")]

    class Document(object):
        keys = ["text"]

        def raw_to_internal(self, raw_data):
            return {"text": raw_data.decode("utf-8")}

        def internal_to_raw(self, internal_data):
            return bytes(internal_data["text"].encode("utf-8"))
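

class _TitledTextDocumentType(TextDocumentType):
    """
    A sketch of how a subclass extends the raw/internal conversion. This hypothetical
    type (not part of Pimlico; the name and its storage format, with the title on the
    first line of the raw data, are invented for illustration) keeps providing the
    parent type's "text" key, so everything that works on the parent's documents keeps
    working here.

    """
    class Document(object):
        keys = ["title", "text"]

        def raw_to_internal(self, raw_data):
            # Hypothetical format: first line is the title, the rest is the text
            title, __, text = raw_data.decode("utf-8").partition(u"\n")
            return {"title": title, "text": text}

        def internal_to_raw(self, internal_data):
            return bytes((u"%s\n%s" % (internal_data["title"], internal_data["text"])).encode("utf-8"))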


class RawTextDocumentType(TextDocumentType):
    """
    Subclass of TextDocumentType used to indicate that the text hasn't been processed
    (tokenized, etc). Note that text that has been tokenized, parsed, etc does not use
    subclasses of this type, so they will not be considered compatible if this type is
    used as a requirement.

    """
    data_point_type_supports_python2 = True
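

# A minimal sketch of the type-checking relationship described above: raw text satisfies
# a requirement for plain text documents, but not vice versa, since processed (e.g.
# tokenized) text types do not subclass RawTextDocumentType.
def _example_type_checking():
    assert TextDocumentType().check_type(RawTextDocumentType())
    assert not RawTextDocumentType().check_type(TextDocumentType())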


class DataPointError(Exception):
    pass


class DataConversionError(Exception):
    pass


class DocumentInitializationError(Exception):
    pass