Source code for pimlico.datatypes.formatters.tokenized

from pimlico.cli.browser.formatter import DocumentBrowserFormatter
from pimlico.datatypes.tokenized import TokenizedDocumentType, CharacterTokenizedDocumentType, \
    SegmentedLinesDocumentType


class TokenizedDocumentFormatter(DocumentBrowserFormatter):
    """
    Browser formatter for documents of ``TokenizedDocumentType``.

    Each item of the document is rendered on its own line, with the
    elements of that item separated by single spaces.

    """
    DATATYPE = TokenizedDocumentType

    def __init__(self, corpus, raw_data=False):
        """
        :param corpus: passed straight through to the
            ``DocumentBrowserFormatter`` constructor
        :param raw_data: if true, ``format_document`` only converts the
            document to a unicode string instead of joining its tokens
        """
        super(TokenizedDocumentFormatter, self).__init__(corpus)
        self.raw_data = raw_data

    def format_document(self, doc):
        """
        Produce the unicode text shown for a single document.

        :param doc: the document to display; unless ``raw_data`` is set, it
            is iterated over and each item is joined as a sequence of strings
        :return: unicode string
        """
        if self.raw_data:
            # Raw mode: no token handling at all, just make sure the result
            # is a string
            return unicode(doc)

        # One output line per item in the document, words separated by spaces
        rendered_lines = [u" ".join(sentence) for sentence in doc]
        return u"\n".join(rendered_lines)
class CharacterTokenizedDocumentFormatter(DocumentBrowserFormatter):
    """
    Browser formatter for documents of ``CharacterTokenizedDocumentType``.

    Each item of the document is rendered on its own line, with the
    elements of that item concatenated directly (no separator).

    """
    DATATYPE = CharacterTokenizedDocumentType

    def __init__(self, corpus, raw_data=False):
        """
        :param corpus: passed straight through to the
            ``DocumentBrowserFormatter`` constructor
        :param raw_data: if true, ``format_document`` only converts the
            document to a unicode string instead of joining its characters
        """
        super(CharacterTokenizedDocumentFormatter, self).__init__(corpus)
        self.raw_data = raw_data

    def format_document(self, doc):
        """
        Produce the unicode text shown for a single document.

        :param doc: the document to display; unless ``raw_data`` is set, it
            is iterated over and each item is joined as a sequence of strings
        :return: unicode string
        """
        if self.raw_data:
            # Raw mode: no character handling at all, just make sure the
            # result is a string
            return unicode(doc)

        # One output line per item in the document, characters glued
        # back together without any separator
        rendered_lines = [u"".join(symbols) for symbols in doc]
        return u"\n".join(rendered_lines)
class SegmentedLinesFormatter(DocumentBrowserFormatter):
    """
    Browser formatter for documents of ``SegmentedLinesDocumentType``.

    Each item of the document is rendered on its own line, with the
    elements of that item separated by single spaces.

    """
    DATATYPE = SegmentedLinesDocumentType

    def format_document(self, doc):
        """
        Produce the unicode text shown for a single document.

        :param doc: the document to display; it is iterated over and each
            item is joined as a sequence of strings
        :return: unicode string
        """
        # Use a unicode separator for the inner join too, so that it matches
        # the outer join and does not rely on implicit str -> unicode coercion
        return u"\n".join(u" ".join(line) for line in doc)