# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

from __future__ import unicode_literals

from builtins import object

from pimlico.cli.browser.tools.formatter import DocumentBrowserFormatter
from pimlico.datatypes.corpora.data_points import TextDocumentType
from pimlico.utils.core import cached_property

__all__ = ["TokenizedDocumentType", "SegmentedLinesDocumentType", "CharacterTokenizedDocumentType"]


class TokenizedDocumentType(TextDocumentType):
"""
Specialized data point type for documents that have had tokenization applied.
It does very little processing - the main reason for its existence is to allow
modules to require that a corpus has been tokenized before it's given as input.
Each document is a list of sentences. Each sentence is a list of words.
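
    A sketch of the raw/internal mapping (hypothetical data; how documents are
    actually read in depends on the pipeline)::

        raw = b"A first sentence .\nA second one ."
        # raw_to_internal(raw) produces:
        # {"sentences": [["A", "first", "sentence", "."],
        #                ["A", "second", "one", "."]]}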
"""
    formatters = [("tokenized_doc", "pimlico.datatypes.corpora.tokenized.TokenizedDocumentFormatter")]
    data_point_type_supports_python2 = True

    class Document(object):
        keys = ["sentences"]

        @cached_property
        def text(self):
            if self._raw_data is not None:
                # The text is just the raw data, decoded, so it's quickest to get it from that
                return self._raw_data.decode("utf-8")
            else:
                return "\n".join(" ".join(sentence) for sentence in self.internal_data["sentences"])

        def raw_to_internal(self, raw_data):
            return {
                "sentences": [sentence.split(" ") for sentence in raw_data.decode("utf-8").split("\n")],
            }

        def internal_to_raw(self, internal_data):
            return bytes("\n".join(" ".join(sentence) for sentence in internal_data["sentences"]).encode("utf-8"))


class TokenizedDocumentFormatter(DocumentBrowserFormatter):
    """
    Format a tokenized document by putting sentences on consecutive lines
    and separating tokens with spaces.
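
    For example, a (hypothetical) document whose sentences are
    ``[["Hello", "world", "!"]]`` is rendered as the single line ``Hello world !``.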
"""
DATATYPE = TokenizedDocumentType()
def format_document(self, doc):
return "\n".join(" ".join(sent) for sent in doc.sentences)


class LemmatizedTokensDocumentType(TokenizedDocumentType):
    """
    Identical to :class:`TokenizedDocumentType`. Separate subclass to allow
    modules to require that their input has been lemmatized (and tokenized).

    """
    data_point_type_supports_python2 = True


class CharacterTokenizedDocumentType(TokenizedDocumentType):
    """
    Simple character-level tokenized corpus. The text isn't stored in any special way,
    but is represented when read internally just as a sequence of characters in each sentence.

    If you need a more sophisticated way to handle character-like (or any other non-word) units within
    each sequence, see :class:`SegmentedLinesDocumentType`.
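
    For example, the raw line ``Hi !`` is read in as the sentence
    ``["H", "i", " ", "!"]`` (a hypothetical illustration of raw_to_internal()
    below).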
"""
data_point_type_supports_python2 = True

    class Document(object):
        @property
        def sentences(self):
            return self.internal_data["sentences"]

        def raw_to_internal(self, raw_data):
            text = raw_data.decode("utf-8")
            return {
                "sentences": [list(sentence) for sentence in text.split("\n")],
                "text": text,
            }

        def internal_to_raw(self, internal_data):
            return bytes("\n".join("".join(sentence) for sentence in internal_data["sentences"]).encode("utf-8"))


class SegmentedLinesDocumentType(TokenizedDocumentType):
    """
    Document consisting of lines, each split into elements, which may be characters, words, or anything else.
    Rather like a tokenized corpus, but without the assumption that the elements (words, in the case of a
    tokenized corpus) do not contain spaces.

    You might use this, for example, if you want to train character-level models on a text corpus, but
    don't use strictly single-character units, perhaps grouping together certain short character sequences.

    Uses the character `/` to separate elements in the raw data.
    If a `/` is found in an element, it is stored as `@slash@`,
    so this string is assumed not to be used in any element (which seems reasonable enough, generally).
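
    For example, the raw line ``th/e/ /ca/t`` represents the elements
    ``["th", "e", " ", "ca", "t"]``, and an element containing a literal ``/``
    round-trips through ``@slash@`` (a hypothetical illustration).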
"""
data_point_type_supports_python2 = True

    class Document(object):
        @property
        def text(self):
            return u"\n".join(u"".join(token for token in sent) for sent in self.internal_data["sentences"])

        @property
        def sentences(self):
            return self.internal_data["sentences"]

        def raw_to_internal(self, raw_data):
            sentences = [
                [el.replace("@slash@", "/") for el in line.split("/")]
                for line in raw_data.decode("utf-8").split("\n")
            ]
            return {
                "sentences": sentences,
                # For producing the "text" attribute, we assume it makes sense to join on the empty string
                "text": "\n".join("".join(line) for line in sentences),
            }

        def internal_to_raw(self, internal_data):
            # Strip any newlines that have crept into elements, since they would break the line format
            return bytes("\n".join(
                "/".join(el.replace("/", "@slash@") for el in line).replace("\n", "")
                for line in internal_data["sentences"]
            ).encode("utf-8"))