Source code for pimlico.datatypes.tokenized

# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html
from pimlico.datatypes.documents import RawDocumentType
from pimlico.datatypes.tar import TarredCorpus, TarredCorpusWriter

__all__ = ["TokenizedDocumentType", "TokenizedCorpus", "TokenizedCorpusWriter"]


class TokenizedDocumentType(RawDocumentType):
    """
    Document type for tokenized text. Reads raw documents that have one sentence
    per line, with tokens separated by spaces, and produces a list of sentences,
    each a list of words.
    """
    def process_document(self, doc):
        return [sentence.split(u" ") for sentence in doc.split(u"\n")]
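
# Illustrative sketch (not part of the original source): given the raw text of a
# document, process_document produces a list of sentences, each a list of tokens:
#
#   process_document(u"The cat sat .\nIt purred .")
#   -> [[u"The", u"cat", u"sat", u"."], [u"It", u"purred", u"."]]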


class TokenizedCorpus(TarredCorpus):
    """
    Specialized datatype for a tarred corpus that's had tokenization applied. The
    datatype does very little - the main reason for its existence is to allow
    modules to require that a corpus has been tokenized before it's given as input.

    Each document is a list of sentences. Each sentence is a list of words.
    """
    datatype_name = "tokenized"
    data_point_type = TokenizedDocumentType
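
# Hedged sketch of how a module might require tokenized input. This assumes the
# usual Pimlico convention that a ModuleInfo declares its inputs as
# (name, datatype) pairs; the module and input names here are hypothetical:
#
#   class ModuleInfo(BaseModuleInfo):
#       module_type_name = "example_module"
#       module_inputs = [("corpus", TokenizedCorpus)]
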
class TokenizedCorpusWriter(TarredCorpusWriter):
    """
    Simple writer that takes lists of tokens and outputs them with a sentence
    per line and tokens separated by spaces.
    """
    def document_to_raw_data(self, doc):
        return u"\n".join(u" ".join(sentence) for sentence in doc)
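
# Hedged usage sketch, assuming TarredCorpusWriter's usual interface of being
# used as a context manager with an add_document(archive_name, doc_name, doc)
# method; the directory and document names below are hypothetical:
#
#   with TokenizedCorpusWriter(output_dir) as writer:
#       writer.add_document("archive1", "doc1",
#                           [[u"Hello", u"world", u"!"], [u"Bye", u"!"]])
#
# The raw data stored for the document would then be:
#   u"Hello world !\nBye !"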