# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html
"""
Datatypes for coreference resolution output. Based on Stanford CoreNLP's coref output, so includes all the information
provided by that.
"""
import json
from pimlico.datatypes.tar import TarredCorpus, TarredCorpusWriter, pass_up_invalid
class CorefCorpus(TarredCorpus):
    """
    Corpus datatype for coreference resolution output.

    Each document's raw data is a JSON object that maps an entity id to the
    list of JSON-encoded mentions of that entity.

    """
    datatype_name = "corenlp_coref"

    def process_document(self, data):
        """
        Decode one document's raw JSON data into a list of entities.

        :param data: raw document text, either a JSON object mapping entity
            ids to lists of mention dicts, or an empty/whitespace-only string
        :return: list of :class:`Entity`, one per key of the JSON object;
            an empty list if ``data`` contains nothing but whitespace
        """
        # A blank document stands for "no entities": don't hand it to the JSON parser
        if not data.strip():
            return []

        entities = []
        # JSON object keys are always strings, so the ids come back as strings
        for entity_id, mention_dicts in json.loads(data).items():
            mentions = [Mention.from_json(mention_dict) for mention_dict in mention_dicts]
            entities.append(Entity(entity_id, mentions))
        return entities
class CorefCorpusWriter(TarredCorpusWriter):
    """
    Writer for coreference corpora, storing each document as a JSON object
    that maps entity ids to lists of JSON-encoded mentions.

    """
    # NOTE(review): pass_up_invalid is defined in pimlico.datatypes.tar; presumably it lets
    # invalid-document markers through without calling this method -- confirm there
    @pass_up_invalid
    def document_to_raw_data(self, doc):
        """
        Encode a document as a JSON string.

        :param doc: iterable of entities, each with an ``id`` attribute and a
            ``mentions`` attribute holding objects that provide ``to_json_dict()``
        :return: JSON string mapping each entity's id to the list of its
            mentions' JSON dicts
        """
        return json.dumps({
            entity.id: [mention.to_json_dict() for mention in entity.mentions]
            for entity in doc
        })
class Entity(object):
    """
    A coreference entity: an identifier together with the mentions that refer to it.

    """
    def __init__(self, id, mentions):
        """
        :param id: identifier of the entity
        :param mentions: the mentions belonging to this entity
        """
        self.id = id
        self.mentions = mentions
class Mention(object):
    """
    A single mention of an entity.

    Six fields are required; the remaining five are optional and default to
    ``None``. Optional fields that are ``None`` are left out of the JSON
    representation altogether.

    """
    def __init__(self, id, sentence_num, start_index, end_index, text, type,
                 position=None, animacy=None, is_representative_mention=None, number=None, gender=None):
        """
        :param id: identifier of the mention (JSON key ``id``)
        :param sentence_num: JSON key ``sentNum``
        :param start_index: JSON key ``startIndex``
        :param end_index: JSON key ``endIndex``
        :param text: JSON key ``text``
        :param type: JSON key ``type``
        :param position: optional, JSON key ``position``
        :param animacy: optional, JSON key ``animacy``
        :param is_representative_mention: optional, JSON key ``isRepresentativeMention``
        :param number: optional, JSON key ``number``
        :param gender: optional, JSON key ``gender``
        """
        self.id = id
        self.sentence_num = sentence_num
        self.start_index = start_index
        self.end_index = end_index
        self.text = text
        self.type = type
        self.position = position
        self.animacy = animacy
        self.is_representative_mention = is_representative_mention
        self.number = number
        self.gender = gender

    @staticmethod
    def from_json(json):
        """
        Build a mention from its decoded JSON dictionary.

        Note that the parameter is called ``json`` and so hides the ``json``
        module within this method; the module is not needed here.

        :param json: dict with the required keys ``id``, ``sentNum``,
            ``startIndex``, ``endIndex``, ``text`` and ``type``, and optionally
            ``position``, ``animacy``, ``isRepresentativeMention``, ``number``
            and ``gender``
        :return: new :class:`Mention`; absent optional keys become ``None``
        :raises KeyError: if any of the required keys is missing
        """
        return Mention(
            json["id"], json["sentNum"], json["startIndex"], json["endIndex"], json["text"], json["type"],
            position=json.get("position"),
            animacy=json.get("animacy"),
            is_representative_mention=json.get("isRepresentativeMention"),
            number=json.get("number"),
            gender=json.get("gender"),
        )

    def to_json_dict(self):
        """
        Produce the JSON-serializable dictionary for this mention, using the
        same keys that :meth:`from_json` reads.

        :return: dict that always holds the six required fields, plus each
            optional field whose value is not ``None``
        """
        data = {
            "id": self.id, "sentNum": self.sentence_num, "startIndex": self.start_index, "endIndex": self.end_index,
            "text": self.text, "type": self.type
        }
        optional_fields = (
            ("position", self.position),
            ("animacy", self.animacy),
            ("isRepresentativeMention", self.is_representative_mention),
            ("number", self.number),
            ("gender", self.gender),
        )
        for key, value in optional_fields:
            # Test against None explicitly so that falsy values like False or 0 are still written
            if value is not None:
                data[key] = value
        return data