# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

from __future__ import unicode_literals

from builtins import object

from pimlico.cli.browser.tools.formatter import DocumentBrowserFormatter
from pimlico.datatypes.corpora.data_points import TextDocumentType
from pimlico.utils.core import cached_property

__all__ = ["TokenizedDocumentType", "SegmentedLinesDocumentType", "CharacterTokenizedDocumentType"]


class TokenizedDocumentType(TextDocumentType):
"""
Specialized data point type for documents that have had tokenization applied.
It does very little processing - the main reason for its existence is to allow
modules to require that a corpus has been tokenized before it's given as input.
Each document is a list of sentences. Each sentence is a list of words.
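
    A sketch of the raw/internal mapping (hypothetical data; how documents are
    actually read in depends on the pipeline)::

        raw = b"A first sentence .\nA second one ."
        # raw_to_internal(raw) produces:
        # {"sentences": [["A", "first", "sentence", "."],
        #                ["A", "second", "one", "."]]}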
"""
    formatters = [("tokenized_doc", "pimlico.datatypes.corpora.tokenized.TokenizedDocumentFormatter")]
    data_point_type_supports_python2 = True

    class Document(object):
        keys = ["sentences"]

        @cached_property
        def text(self):
            if self._raw_data is not None:
                # The text is just the raw data, decoded, so it's quickest to get it from that
                return self._raw_data.decode("utf-8")
            else:
                return "\n".join(" ".join(sentence) for sentence in self.internal_data["sentences"])

        def raw_to_internal(self, raw_data):
            return {
                "sentences": [sentence.split(" ") for sentence in raw_data.decode("utf-8").split("\n")],
            }

        def internal_to_raw(self, internal_data):
            return bytes("\n".join(" ".join(sentence) for sentence in internal_data["sentences"]).encode("utf-8"))


class TokenizedDocumentFormatter(DocumentBrowserFormatter):
    """
    Format a tokenized document by putting sentences on consecutive lines
    and separating tokens with spaces.
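
    For example, a (hypothetical) document whose sentences are
    ``[["Hello", "world", "!"]]`` is rendered as the single line ``Hello world !``.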
"""
DATATYPE = TokenizedDocumentType()
def format_document(self, doc):
return "\n".join(" ".join(sent) for sent in doc.sentences)


class LemmatizedTokensDocumentType(TokenizedDocumentType):
    """
    Identical to :class:`TokenizedDocumentType`. Separate subclass to allow
    modules to require that their input has been lemmatized (and tokenized).

    """
    data_point_type_supports_python2 = True


class CharacterTokenizedDocumentType(TokenizedDocumentType):
    """
    Simple character-level tokenized corpus. The text isn't stored in any special way,
    but is represented when read internally just as a sequence of characters in each sentence.

    If you need a more sophisticated way to handle character-like (or any other non-word) units within
    each sequence, see :class:`SegmentedLinesDocumentType`.
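
    For example, the raw line ``Hi !`` is read in as the sentence
    ``["H", "i", " ", "!"]`` (a hypothetical illustration of raw_to_internal()
    below).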
"""
data_point_type_supports_python2 = True

    class Document(object):
        @property
        def sentences(self):
            return self.internal_data["sentences"]

        def raw_to_internal(self, raw_data):
            text = raw_data.decode("utf-8")
            return {
                "sentences": [list(sentence) for sentence in text.split("\n")],
                "text": text,
            }

        def internal_to_raw(self, internal_data):
            return bytes("\n".join("".join(sentence) for sentence in internal_data["sentences"]).encode("utf-8"))


class SegmentedLinesDocumentType(TokenizedDocumentType):
    """
    Document consisting of lines, each split into elements, which may be characters, words, or anything else.
    Rather like a tokenized corpus, but without the assumption that the elements (words, in the case of a
    tokenized corpus) do not contain spaces.

    You might use this, for example, if you want to train character-level models on a text corpus, but
    don't use strictly single-character units, perhaps grouping together certain short character sequences.

    Uses the character `/` to separate elements in the raw data.
    If a `/` is found in an element, it is stored as `@slash@`,
    so this string is assumed not to be used in any element (which seems reasonable enough, generally).
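
    For example, the raw line ``th/e/ /ca/t`` represents the elements
    ``["th", "e", " ", "ca", "t"]``, and an element containing a literal ``/``
    round-trips through ``@slash@`` (a hypothetical illustration).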
"""
data_point_type_supports_python2 = True

    class Document(object):
        @property
        def text(self):
            return u"\n".join(u"".join(token for token in sent) for sent in self.internal_data["sentences"])

        @property
        def sentences(self):
            return self.internal_data["sentences"]

        def raw_to_internal(self, raw_data):
            sentences = [
                [el.replace("@slash@", "/") for el in line.split("/")]
                for line in raw_data.decode("utf-8").split("\n")
            ]
            return {
                "sentences": sentences,
                # For producing the "text" attribute, we assume it makes sense to join on the empty string
                "text": "\n".join("".join(line) for line in sentences),
            }

        def internal_to_raw(self, internal_data):
            # Strip any newlines that have crept into elements, since they would break the line format
            return bytes("\n".join(
                "/".join(el.replace("/", "@slash@") for el in line).replace("\n", "")
                for line in internal_data["sentences"]
            ).encode("utf-8"))