Source code for pimlico.datatypes.parse.dependency

# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html

import json

from pimlico.datatypes.jsondoc import JsonDocumentCorpus, JsonDocumentCorpusWriter
from pimlico.datatypes.tar import pass_up_invalid
from pimlico.datatypes.word_annotations import WordAnnotationCorpus, WordAnnotationCorpusWriter


__all__ = [
    "StanfordDependencyParseCorpus", "StanfordDependencyParseCorpusWriter",
    "CoNLLDependencyParseCorpus", "CoNLLDependencyParseCorpusWriter",
    "CoNLLDependencyParseInputCorpus", "CoNLLDependencyParseInputCorpusWriter",
]


class StanfordDependencyParseCorpus(JsonDocumentCorpus):
    datatype_name = "stanford_dependency_parses"

    def process_document(self, data):
        data = super(StanfordDependencyParseCorpus, self).process_document(data)
        if data.strip():
            # Read in the dep parse trees as JSON and return a dep parse data structure
            return [StanfordDependencyParse.from_json(sentence_json) for sentence_json in data]
        else:
            return []

class StanfordDependencyParseCorpusWriter(JsonDocumentCorpusWriter):
    @pass_up_invalid
    def document_to_raw_data(self, doc):
        # Data should be a list of StanfordDependencyParses, one for each sentence
        return super(StanfordDependencyParseCorpusWriter, self).document_to_raw_data(
            [parse.to_json_list() for parse in doc]
        )

class StanfordDependencyParse(object):
    def __init__(self, dependencies):
        self.dependencies = dependencies

    @staticmethod
    def from_json(json):
        """
        Read in from JSON, as received from the Stanford CoreNLP server output. Input should
        be parsed JSON.
        """
        return StanfordDependencyParse([StanfordDependency.from_json(dep_json) for dep_json in json])

    @staticmethod
    def from_json_string(data):
        return StanfordDependencyParse.from_json(json.loads(data))

    def to_json_list(self):
        return [dep.to_json_dict() for dep in self.dependencies]


class StanfordDependency(object):
    def __init__(self, dep, dependent_index, governor_index, dependent_gloss, governor_gloss):
        self.dep = dep
        self.dependent_index = dependent_index
        self.governor_index = governor_index
        self.dependent_gloss = dependent_gloss
        self.governor_gloss = governor_gloss

    @staticmethod
    def from_json(json):
        return StanfordDependency(
            json["dep"], json["dependent"], json["governor"],
            json["dependentGloss"], json["governorGloss"]
        )

    def to_json_dict(self):
        return {
            "dep": self.dep,
            "dependent": self.dependent_index,
            "dependentGloss": self.dependent_gloss,
            "governor": self.governor_index,
            "governorGloss": self.governor_gloss,
        }


def _usnone(field, typ=lambda x: x):
    # CoNLL marks an unused field with an underscore: convert "_" to None,
    # otherwise apply the given type conversion
    return None if field == "_" else typ(field)


def _noneus(field):
    # Inverse of _usnone: convert None back to "_" for output
    return "_" if field is None else unicode(field)

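As a rough illustration of the JSON structure these classes expect, here is a hand-written dependency in the shape of the CoreNLP server's dependency output. The values are invented for illustration, and the exact server output varies by CoreNLP version:

# Sketch only: one dict per dependency, as from_json() expects
dep_json = {
    "dep": "nsubj",
    "governor": 2, "governorGloss": "barks",
    "dependent": 1, "dependentGloss": "dog",
}
parse = StanfordDependencyParse.from_json([dep_json])
assert parse.to_json_list() == [dep_json]  # the representation round-trips
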
class CoNLLDependencyParseCorpus(WordAnnotationCorpus):
    """
    10-field CoNLL dependency parse format (conllx) -- i.e. the format used after parsing.

    Fields are:
        id (int), word form, lemma, coarse POS, POS, features, head (int), dep relation,
        phead (int), pdeprel

    The last two are usually not used.

    """
    datatype_name = "conll_dependency_parses"

    def process_document(self, data):
        data = super(CoNLLDependencyParseCorpus, self).process_document(data)
        return [
            [
                {
                    "id": int(token["id"]),
                    "word": token["word"],
                    "lemma": _usnone(token["lemma"]),
                    "cpostag": token["cpostag"],
                    "postag": token["postag"],
                    "feats": _usnone(token["feats"]),
                    "head": _usnone(token["head"], int),
                    "deprel": _usnone(token["deprel"]),
                    "phead": _usnone(token["phead"], int),
                    "pdeprel": _usnone(token["pdeprel"]),
                } for token in sentence
            ] for sentence in data
        ]

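For example, a token whose optional columns are unused comes in from the annotation reader with underscores and comes out of process_document() with Nones. The raw token below is hypothetical, assuming the superclass delivers each token as a dict of string fields:

raw_token = {
    "id": "1", "word": "dog", "lemma": "_", "cpostag": "NOUN", "postag": "NN",
    "feats": "_", "head": "2", "deprel": "nsubj", "phead": "_", "pdeprel": "_",
}
# After conversion: {"id": 1, "word": "dog", "lemma": None, "cpostag": "NOUN",
#                    "postag": "NN", "feats": None, "head": 2, "deprel": "nsubj",
#                    "phead": None, "pdeprel": None}
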
class CoNLLDependencyParseCorpusWriter(WordAnnotationCorpusWriter):
    def __init__(self, base_dir, **kwargs):
        super(CoNLLDependencyParseCorpusWriter, self).__init__(
            # Blank line between sentences
            u"\n\n",
            # Single linebreak between words
            u"\n",
            # Tab-separated fields
            u"{id}\t{word}\t{lemma}\t{cpostag}\t{postag}\t{feats}\t{head}\t{deprel}\t{phead}\t{pdeprel}",
            # No linebreaks or tabs in words
            u"\n\t",
            base_dir, **kwargs
        )

    @pass_up_invalid
    def document_to_raw_data(self, doc):
        """
        Data should be a list of sentences. Each sentence is a list of tokens. Each token is
        a list of columns (fields).
        """
        return super(CoNLLDependencyParseCorpusWriter, self).document_to_raw_data(
            u"\n\n".join(
                u"\n".join(
                    # Replace Nones with underscores
                    u"%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % tuple(map(_noneus, token))
                    for token in sentence
                ) for sentence in doc
            )
        )

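A minimal sketch of the document structure this writer serialises: each token is an ordered list of the ten fields, with None for unused columns. The sentence here is invented for illustration:

doc = [
    # One sentence: a list of 10-field tokens
    [
        [1, u"dog", None, u"NOUN", u"NN", None, 2, u"nsubj", None, None],
        [2, u"barks", None, u"VERB", u"VBZ", None, 0, u"root", None, None],
    ],
]
# Each token becomes a tab-separated line, e.g.
# u"1\tdog\t_\tNOUN\tNN\t_\t2\tnsubj\t_\t_"
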
class CoNLLDependencyParseInputCorpus(WordAnnotationCorpus):
    """
    The version of the CoNLL format (conllx) that only has the first 6 columns, i.e. no
    dependency parse yet annotated.

    """
    datatype_name = "conll_dependency_parse_inputs"

    def process_document(self, data):
        data = super(CoNLLDependencyParseInputCorpus, self).process_document(data)
        return [
            [
                {
                    "id": int(token["id"]),
                    "word": token["word"],
                    "lemma": _usnone(token["lemma"]),
                    "cpostag": token["cpostag"],
                    "postag": token["postag"],
                    "feats": _usnone(token["feats"]),
                } for token in sentence
            ] for sentence in data
        ]

class CoNLLDependencyParseInputCorpusWriter(WordAnnotationCorpusWriter):
    def __init__(self, base_dir, **kwargs):
        super(CoNLLDependencyParseInputCorpusWriter, self).__init__(
            # Blank line between sentences
            u"\n\n",
            # Single linebreak between words
            u"\n",
            # Tab-separated fields
            u"{id}\t{word}\t{lemma}\t{cpostag}\t{postag}\t{feats}",
            # No linebreaks or tabs in words
            u"\n\t",
            base_dir, **kwargs
        )

    @pass_up_invalid
    def document_to_raw_data(self, doc):
        """
        Data should be a list of sentences. Each sentence is a list of tokens. Each token is
        a list of columns (fields).
        """
        return super(CoNLLDependencyParseInputCorpusWriter, self).document_to_raw_data(
            u"\n\n".join(
                u"\n".join(
                    # Replace Nones with underscores
                    u"%s\t%s\t%s\t%s\t%s\t%s" % tuple(map(_noneus, token))
                    for token in sentence
                ) for sentence in doc
            )
        )
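The input variant works the same way, with only the six pre-parse columns per token (again, an invented example):

doc = [
    [
        [1, u"dog", None, u"NOUN", u"NN", None],
        [2, u"barks", None, u"VERB", u"VBZ", None],
    ],
]
# Serialises as u"1\tdog\t_\tNOUN\tNN\t_\n2\tbarks\t_\tVERB\tVBZ\t_"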