# Source code for pimlico.datatypes.corpora.json

# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

from __future__ import absolute_import

import json
from builtins import object

from pimlico.datatypes.corpora.data_points import RawDocumentType

__all__ = ["JsonDocumentType"]


class JsonDocumentType(RawDocumentType):
    """
    Very simple document corpus in which each document is a JSON object.

    The raw form of a document is UTF-8 encoded JSON text. The internal
    form is a dictionary with a single key, ``"data"``, holding the parsed
    JSON value.

    """
    # (name, dotted path) pair naming the formatter class for this type.
    # NOTE(review): presumably resolved lazily by the framework -- confirm.
    formatters = [("json", "pimlico.datatypes.corpora.formatters.json.JsonFormatter")]
    # NOTE(review): the flag name suggests this marks the type as usable
    # under Python 2 -- confirm against RawDocumentType.
    data_point_type_supports_python2 = True

    class Document(object):
        """
        A single JSON document, whose parsed content is stored under the
        ``"data"`` key of its internal data.

        """
        keys = ["data"]

        def raw_to_internal(self, raw_data):
            """
            Decode ``raw_data`` as UTF-8, parse it as JSON and wrap the
            result in the internal dictionary.

            :param raw_data: UTF-8 encoded JSON text (bytes)
            :return: ``{"data": <parsed JSON value>}``
            """
            return {"data": json.loads(raw_data.decode("utf-8"))}

        def internal_to_raw(self, internal_data):
            """
            Serialize ``internal_data["data"]`` to JSON text and encode it
            as UTF-8.

            :param internal_data: dictionary with a ``"data"`` entry that
                is JSON-serializable
            :return: UTF-8 encoded JSON text (bytes)
            """
            return json.dumps(internal_data["data"]).encode("utf-8")