Source code for pimlico.datatypes.coref.corenlp

# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html

"""
Datatypes for coreference resolution output. Based on Stanford CoreNLP's coref output, so includes all the information
provided by that.

"""
import json

from pimlico.datatypes.tar import TarredCorpus, TarredCorpusWriter, pass_up_invalid


class CorefCorpus(TarredCorpus):
    """
    Tarred corpus whose documents are coreference resolution output.

    Each document's raw data is a JSON object mapping an entity ID to the list
    of JSON mention dictionaries belonging to that entity.

    """
    datatype_name = "corenlp_coref"

    def process_document(self, data):
        """
        Parse the raw JSON text of one document into a list of entities.

        :param data: raw document text: a JSON object of entity ID -> list of mention dicts
        :return: list of :class:`Entity`, one per key of the JSON object; an empty
            list if the data is empty or only whitespace
        """
        # Blank documents carry no coreference information at all
        if not data.strip():
            return []

        entities = []
        # Entity IDs are JSON object keys, so they always come back as strings
        for entity_id, raw_mentions in json.loads(data).items():
            parsed_mentions = [Mention.from_json(raw) for raw in raw_mentions]
            entities.append(Entity(entity_id, parsed_mentions))
        return entities
class CorefCorpusWriter(TarredCorpusWriter):
    """
    Writer that serializes documents (iterables of entities) to the JSON
    format read back by the corresponding corpus datatype.

    """
    # NOTE(review): pass_up_invalid comes from pimlico.datatypes.tar; presumably it lets
    # invalid-document markers through without trying to serialize them -- confirm.
    @pass_up_invalid
    def document_to_raw_data(self, doc):
        """
        Serialize one document to a JSON string.

        :param doc: iterable of entity objects, each with an ``id`` and a
            ``mentions`` list whose items provide ``to_json_dict()``
        :return: JSON text of an object mapping entity ID -> list of mention dicts
        """
        mentions_by_entity = {}
        for entity in doc:
            # A repeated entity ID overwrites the earlier entry, as with dict()
            mentions_by_entity[entity.id] = [mention.to_json_dict() for mention in entity.mentions]
        return json.dumps(mentions_by_entity)
class Entity(object):
    """
    A coreference entity: an identifier together with the mentions that
    refer to it.

    :param id: identifier of the entity
    :param mentions: list of the entity's mentions

    """
    def __init__(self, id, mentions):
        # The parameter name ``id`` shadows the builtin, but is part of the
        # public constructor signature, so it is kept as is
        self.id = id
        self.mentions = mentions

    def __repr__(self):
        # Readable representation for debugging and logging, instead of the
        # default "<Entity object at 0x...>"
        return "%s(%r, %r)" % (type(self).__name__, self.id, self.mentions)
class Mention(object):
    """
    A single mention of an entity in a document.

    The first six fields are required; the remaining ones are optional and
    default to ``None``. :meth:`from_json` and :meth:`to_json_dict` convert
    from and to the JSON dictionary form, which uses the keys ``id``,
    ``sentNum``, ``startIndex``, ``endIndex``, ``text``, ``type`` and,
    optionally, ``position``, ``animacy``, ``isRepresentativeMention``,
    ``number`` and ``gender``.

    """
    # (JSON key, attribute name) for every optional field, in the order in
    # which they are written out by to_json_dict()
    _OPTIONAL_FIELDS = (
        ("position", "position"),
        ("animacy", "animacy"),
        ("isRepresentativeMention", "is_representative_mention"),
        ("number", "number"),
        ("gender", "gender"),
    )

    def __init__(self, id, sentence_num, start_index, end_index, text, type, position=None, animacy=None,
                 is_representative_mention=None, number=None, gender=None):
        # ``id`` and ``type`` shadow builtins, but are part of the public
        # constructor signature, so the names are kept
        self.id = id
        self.sentence_num = sentence_num
        self.start_index = start_index
        self.end_index = end_index
        self.text = text
        self.type = type
        self.position = position
        self.animacy = animacy
        self.is_representative_mention = is_representative_mention
        self.number = number
        self.gender = gender

    def __repr__(self):
        # Show only the required fields to keep the representation compact
        return "%s(id=%r, sentence_num=%r, start_index=%r, end_index=%r, text=%r, type=%r)" % (
            type(self).__name__, self.id, self.sentence_num, self.start_index, self.end_index,
            self.text, self.type,
        )

    @classmethod
    def from_json(cls, json):
        """
        Build a mention from its JSON dictionary form.

        A classmethod, so that calling it on a subclass produces an instance
        of that subclass rather than always a plain ``Mention``.

        :param json: dict with the required keys ``id``, ``sentNum``, ``startIndex``,
            ``endIndex``, ``text`` and ``type``; optional keys that are missing
            result in ``None`` attributes. (The parameter name shadows the ``json``
            module inside this method; it is kept for backward compatibility.)
        :return: new instance of the class this is called on
        :raises KeyError: if any of the required keys is missing
        """
        return cls(
            json["id"], json["sentNum"], json["startIndex"], json["endIndex"], json["text"], json["type"],
            position=json.get("position"),
            animacy=json.get("animacy"),
            is_representative_mention=json.get("isRepresentativeMention"),
            number=json.get("number"),
            gender=json.get("gender"),
        )

    def to_json_dict(self):
        """
        Convert the mention to the dictionary form accepted by :meth:`from_json`.

        :return: dict containing all required fields, plus each optional field
            whose value is not ``None``
        """
        data = {
            "id": self.id,
            "sentNum": self.sentence_num,
            "startIndex": self.start_index,
            "endIndex": self.end_index,
            "text": self.text,
            "type": self.type,
        }
        for key, attr_name in self._OPTIONAL_FIELDS:
            value = getattr(self, attr_name)
            # Compare to None explicitly: falsy values like False or 0 must be kept
            if value is not None:
                data[key] = value
        return data