Source code for pimlico.datatypes.corpora.parse.trees

# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

"""
Datatypes for storing parse trees from constituency parsers.

.. note::

   Parse trees are temporary implementations that don't actually parse the data, but just split it into
   sentences. That is, they store the raw output from the OpenNLP parser. In future, this should be
   replaced by a generic tree structure storage.

"""
from pimlico.datatypes.corpora.data_points import RawDocumentType

__all__ = ["OpenNLPTreeStringsDocumentType"]


class OpenNLPTreeStringsDocumentType(RawDocumentType):
    """
    Document type holding parse trees as unparsed strings.

    The attribute ``trees`` provides a list of strings representing each of the trees in the document,
    usually one per sentence.

    In the raw (stored) form, the tree strings are UTF-8 encoded and separated from one
    another by a blank line. An empty raw document corresponds to an empty list of trees.

    .. todo::

       In future, this should be replaced by a doc type that reads in the parse trees and returns
       a tree data structure. For now, you need to load and process the tree strings yourself.

    """
    data_point_type_supports_python2 = True

    class Document(object):
        # The single internal-data key used by the conversion methods below
        keys = ["trees"]

        def raw_to_internal(self, raw_data):
            """
            Decode raw UTF-8 bytes and split them into individual tree strings.

            :param raw_data: UTF-8 encoded bytes, with trees separated by a blank line
            :return: dictionary with a single key ``trees``, mapping to a list of tree strings
            """
            text = raw_data.decode("utf-8")
            if not text:
                # Splitting an empty string on a separator gives [""], which would
                # look like a document containing one (empty) tree. Return no
                # trees instead, so that an empty list survives a round trip
                # through internal_to_raw() and back
                return {"trees": []}
            return {"trees": text.split("\n\n")}

        def internal_to_raw(self, internal_data):
            """
            Join the tree strings with blank lines and encode the result as UTF-8.

            :param internal_data: dictionary whose ``trees`` entry is a list of tree strings
            :return: UTF-8 encoded bytes, with trees separated by a blank line
            """
            return "\n\n".join(internal_data["trees"]).encode("utf-8")