Source code for pimlico.datatypes.tokenized

# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html
from pimlico.datatypes.base import InvalidDocument
from pimlico.datatypes.documents import TextDocumentType
from pimlico.datatypes.tar import TarredCorpus, TarredCorpusWriter

__all__ = [
    "TokenizedDocumentType", "TokenizedCorpus", "TokenizedCorpusWriter",
    "SegmentedLinesDocumentType", "SegmentedLinesCorpusWriter",
    "CharacterTokenizedDocumentType", "CharacterTokenizedCorpusWriter"
]


class TokenizedDocumentType(TextDocumentType):
    formatters = [("tokenized_doc", "pimlico.datatypes.formatters.tokenized.TokenizedDocumentFormatter")]

    def process_document(self, doc, as_type=None):
        text = super(TokenizedDocumentType, self).process_document(doc)
        if as_type is not None and as_type is not TokenizedDocumentType:
            # Raw text required
            return text
        return [sentence.split(u" ") for sentence in text.split(u"\n")]


class TokenizedCorpus(TarredCorpus):
    """
    Specialized datatype for a tarred corpus that's had tokenization applied. The datatype
    does very little - the main reason for its existence is to allow modules to require that
    a corpus has been tokenized before it's given as input.

    Each document is a list of sentences. Each sentence is a list of words.

    """
    datatype_name = "tokenized"
    data_point_type = TokenizedDocumentType


class TokenizedCorpusWriter(TarredCorpusWriter):
    """
    Simple writer that takes lists of tokens and outputs them with a sentence per line
    and tokens separated by spaces.

    """
    def document_to_raw_data(self, doc):
        return u"\n".join(u" ".join(sentence) for sentence in doc)
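

# Illustrative sketch, not part of the original module: it shows the stored format that
# TokenizedCorpusWriter.document_to_raw_data() produces and that
# TokenizedDocumentType.process_document() parses back into a list of sentences, each a
# list of tokens. The example document below is invented purely for illustration.
_example_doc = [[u"A", u"tokenized", u"sentence"], [u"And", u"another"]]
_example_raw = u"\n".join(u" ".join(sentence) for sentence in _example_doc)
assert _example_raw == u"A tokenized sentence\nAnd another"
assert [s.split(u" ") for s in _example_raw.split(u"\n")] == _example_doc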


class CharacterTokenizedDocumentType(TokenizedDocumentType):
    """
    Simple character-level tokenized corpus. The text isn't stored in any special way, but is
    represented when read internally just as a sequence of characters in each sentence.

    If you need a more sophisticated way to handle character-type (or any non-word) units within
    each sequence, see `SegmentedLinesDocumentType`.

    """
    formatters = [("char_tokenized_doc", "pimlico.datatypes.formatters.tokenized.CharacterTokenizedDocumentFormatter")]

    def process_document(self, doc, as_type=None):
        if as_type is not None and as_type not in (CharacterTokenizedDocumentType, TokenizedDocumentType):
            # Raw text required
            return super(CharacterTokenizedDocumentType, self).process_document(doc, as_type=as_type)
        return [list(sentence) for sentence in doc.split(u"\n")]


class CharacterTokenizedCorpusWriter(TarredCorpusWriter):
    """
    Simple writer that takes lists of char-tokens and outputs them with a sentence per line.
    Just joins together all the characters to store the sentence, since they can be divided up
    again when read.

    """
    def document_to_raw_data(self, doc):
        return u"\n".join(u"".join(sentence) for sentence in doc)
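

# Illustrative sketch, not part of the original module: character tokens are stored simply by
# concatenating them, one sentence per line, and CharacterTokenizedDocumentType splits each
# line back into a list of single characters on reading. The example value is invented.
_example_char_doc = [[u"a", u"b", u"c"], [u"d", u"e"]]
_example_char_raw = u"\n".join(u"".join(sentence) for sentence in _example_char_doc)
assert _example_char_raw == u"abc\nde"
assert [list(line) for line in _example_char_raw.split(u"\n")] == _example_char_doc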


class SegmentedLinesDocumentType(TokenizedDocumentType):
    """
    Document consisting of lines, each split into elements, which may be characters, words,
    or whatever. Rather like a tokenized corpus, but doesn't make the assumption that the
    elements (words in the case of a tokenized corpus) don't include spaces.

    You might use this, for example, if you want to train character-level models on a text
    corpus, but don't use strictly single-character units, perhaps grouping together certain
    short character sequences.

    Uses the character `/` to separate elements. If a `/` is found in an element, it is stored
    as `@slash@`, so this string is assumed not to be used in any element (which seems
    reasonable enough, generally).

    """
    formatters = [("segmented_lines", "pimlico.datatypes.formatters.tokenized.SegmentedLinesFormatter")]

    def process_document(self, doc, as_type=None):
        if as_type is not None and as_type not in (SegmentedLinesDocumentType, TokenizedDocumentType):
            # Raw text required
            return super(SegmentedLinesDocumentType, self).process_document(doc, as_type=as_type)
        return [
            [el.replace(u"@slash@", u"/") for el in line.split(u"/")]
            for line in TextDocumentType.process_document(self, doc).split(u"\n")
        ]


class SegmentedLinesCorpusWriter(TarredCorpusWriter):
    def document_to_raw_data(self, doc):
        if isinstance(doc, InvalidDocument):
            return doc
        else:
            return u"\n".join(
                u"/".join(el.replace(u"/", u"@slash@") for el in line).replace(u"\n", u"")
                for line in doc
            )
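

# Illustrative sketch, not part of the original module: elements within a line are joined
# with "/", and any literal "/" inside an element is escaped as "@slash@" before writing,
# then unescaped again when SegmentedLinesDocumentType reads the document back. The example
# value is invented.
_example_seg_doc = [[u"th", u"e ", u"cat"], [u"a/b", u"c"]]
_example_seg_raw = u"\n".join(
    u"/".join(el.replace(u"/", u"@slash@") for el in line) for line in _example_seg_doc
)
assert _example_seg_raw == u"th/e /cat\na@slash@b/c"
assert [
    [el.replace(u"@slash@", u"/") for el in line.split(u"/")]
    for line in _example_seg_raw.split(u"\n")
] == _example_seg_doc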