# Source code for pimlico.datatypes.jsondoc

# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html

import json

from pimlico.datatypes.documents import RawDocumentType
from pimlico.datatypes.tar import TarredCorpus, TarredCorpusWriter, pass_up_invalid


__all__ = ["JsonDocumentCorpus", "JsonDocumentCorpusWriter"]


class JsonDocumentType(RawDocumentType):
    """
    Document type whose raw data is JSON text. Processing a document
    decodes that text with the standard json package.

    """
    def process_document(self, doc):
        # Decode the raw JSON text into the corresponding Python object
        parsed = json.loads(doc)
        return parsed


class JsonDocumentCorpus(TarredCorpus):
    """
    Very simple document corpus in which each document is a JSON object.

    """
    # Name under which this datatype is identified
    datatype_name = "json"
    # Documents are decoded from JSON text by JsonDocumentType.process_document()
    data_point_type = JsonDocumentType
class JsonDocumentCorpusWriter(TarredCorpusWriter):
    """
    If readable=True, JSON text output will be nicely formatted so that it's
    human-readable. Otherwise, it will be formatted to take up less space.

    """
    def __init__(self, base_dir, readable=False, **kwargs):
        """
        :param base_dir: passed straight through to TarredCorpusWriter
        :param readable: if True, output indented JSON; otherwise, output the
            most compact form (no whitespace after separators)
        :param kwargs: any further keyword arguments are forwarded unchanged
            to TarredCorpusWriter
        """
        super(JsonDocumentCorpusWriter, self).__init__(base_dir, **kwargs)
        self.readable = readable

    # NOTE(review): pass_up_invalid comes from pimlico.datatypes.tar; presumably it
    # lets invalid-document markers through without JSON-encoding them -- confirm there
    @pass_up_invalid
    def document_to_raw_data(self, doc):
        """
        Serialize a document to the JSON text that gets stored in the corpus.

        :param doc: a JSON object or other object serializable by the json package
        :return: JSON string, indented by 4 spaces if self.readable, otherwise
            with "," and ":" separators and no extra whitespace
        """
        # Data should be a JSON object or other object serializable by the json package
        if self.readable:
            return json.dumps(doc, indent=4)
        else:
            # More compact representation, not very readable
            return json.dumps(doc, indent=None, separators=(",", ":"))