# Source code for pimlico.datatypes.coref.corenlp

# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html

"""
Datatypes for coreference resolution output. These are based on Stanford CoreNLP's coref output, so they include
all of the information that CoreNLP provides.

"""
import json

from pimlico.datatypes.jsondoc import JsonDocumentType, JsonDocumentCorpus
from pimlico.datatypes.tar import TarredCorpus, TarredCorpusWriter, pass_up_invalid


class CorefDocumentType(JsonDocumentType):
    """
    Document type for coreference resolution output: each document is a list of
    :class:`Entity` objects, built from the data produced by the JSON document type.

    """
    def process_document(self, doc):
        """
        Convert a document into a list of :class:`Entity` objects.

        The document is first handed to the superclass' ``process_document()``. Its result is
        treated as a mapping from entity IDs to lists of mention dictionaries; every mention
        dictionary is converted with :meth:`Mention.from_json`.

        :param doc: document data, in whatever form the superclass' ``process_document()`` accepts
        :return: list of :class:`Entity` instances, one per item of the mapping
        """
        data = super(CorefDocumentType, self).process_document(doc)
        return [Entity(eid, [Mention.from_json(m) for m in mentions]) for (eid, mentions) in data.items()]


class CorefCorpus(JsonDocumentCorpus):
    """
    Corpus of coreference resolution output, whose documents are handled by
    :class:`CorefDocumentType`.

    """
    datatype_name = "corenlp_coref"
    data_point_type = CorefDocumentType


class CorefCorpusWriter(TarredCorpusWriter):
    """
    Corpus writer that serializes each document's entities as a JSON string.

    """
    # NOTE(review): pass_up_invalid is defined in pimlico.datatypes.tar; presumably it lets
    # invalid-document markers through without serializing them -- confirm against its definition.
    @pass_up_invalid
    def document_to_raw_data(self, doc):
        """
        Serialize a document to JSON.

        :param doc: iterable of entities, each with an ``id`` and a ``mentions`` attribute
            (e.g. :class:`Entity` instances)
        :return: JSON string encoding a mapping from each entity's ID to the list of its
            mentions' JSON dictionaries (as produced by ``to_json_dict()``)
        """
        return json.dumps({
            entity.id: [m.to_json_dict() for m in entity.mentions] for entity in doc
        })


class Entity(object):
    """
    A single entity in a document's coreference output: an identifier together with the
    mentions that refer to the entity.

    :param id: identifier of the entity
    :param mentions: the entity's mentions (e.g. a list of :class:`Mention` instances)
    """
    def __init__(self, id, mentions):
        self.id = id
        self.mentions = mentions


class Mention(object):
    """
    A single mention of an entity, holding the fields of one mention in the JSON coref output.

    The first six fields are required. The remaining ones are optional and default to ``None``,
    in which case they are left out of the dictionary built by :meth:`to_json_dict`.

    :param id: mention identifier (JSON key ``id``)
    :param sentence_num: JSON key ``sentNum``
    :param start_index: JSON key ``startIndex``
    :param end_index: JSON key ``endIndex``
    :param text: JSON key ``text``
    :param type: JSON key ``type``
    :param position: optional, JSON key ``position``
    :param animacy: optional, JSON key ``animacy``
    :param is_representative_mention: optional, JSON key ``isRepresentativeMention``
    :param number: optional, JSON key ``number``
    :param gender: optional, JSON key ``gender``
    """
    def __init__(self, id, sentence_num, start_index, end_index, text, type,
                 position=None, animacy=None, is_representative_mention=None, number=None, gender=None):
        self.id = id
        self.sentence_num = sentence_num
        self.start_index = start_index
        self.end_index = end_index
        self.text = text
        self.type = type
        self.position = position
        self.animacy = animacy
        self.is_representative_mention = is_representative_mention
        self.number = number
        self.gender = gender

    @staticmethod
    def from_json(json):
        """
        Build a :class:`Mention` from a mention dictionary.

        Note that the parameter name shadows the ``json`` module inside this method; it is kept
        for backward compatibility with callers passing it by keyword.

        :param json: dictionary with the keys ``id``, ``sentNum``, ``startIndex``, ``endIndex``,
            ``text`` and ``type``, and optionally ``position``, ``animacy``,
            ``isRepresentativeMention``, ``number`` and ``gender``
        :return: new Mention; optional keys missing from the dictionary become ``None``
        :raises KeyError: if one of the required keys is missing
        """
        return Mention(
            json["id"], json["sentNum"], json["startIndex"], json["endIndex"], json["text"], json["type"],
            position=json.get("position"), animacy=json.get("animacy"),
            is_representative_mention=json.get("isRepresentativeMention"),
            number=json.get("number"), gender=json.get("gender")
        )

    def to_json_dict(self):
        """
        Build the dictionary representation of the mention, the inverse of :meth:`from_json`.

        :return: dictionary containing the required fields, plus every optional field whose
            value is not ``None``
        """
        data = {
            "id": self.id, "sentNum": self.sentence_num, "startIndex": self.start_index, "endIndex": self.end_index,
            "text": self.text, "type": self.type
        }
        optional_fields = (
            ("position", self.position),
            ("animacy", self.animacy),
            ("isRepresentativeMention", self.is_representative_mention),
            ("number", self.number),
            ("gender", self.gender),
        )
        # Test against None explicitly, so that falsy values such as False or 0 are still written out
        data.update((key, value) for (key, value) in optional_fields if value is not None)
        return data