# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html
"""
Datatypes for coreference resolution output. Based on OpenNLP's coref output, so includes all the information
provided by that.
This is a slightly different set of information to CoreNLP. Currently, there's no way to convert between the
two datatypes, but in future it will be easy to provide an adapter that carries across the information common
to the two (which for most purposes will be sufficient).
"""
from pimlico.datatypes.jsondoc import JsonDocumentCorpus, JsonDocumentCorpusWriter, JsonDocumentType
from pimlico.datatypes.tar import pass_up_invalid
from pimlico.utils.linguistic import strip_punctuation, ENGLISH_PRONOUNS
from pimlico.utils.strings import truncate
class CorefDocumentType(JsonDocumentType):
    """
    Document type whose processed form is a list of :class:`Entity` objects.

    """
    def process_document(self, doc):
        """
        Pass ``doc`` through the parent class' ``process_document``, build an :class:`Entity` from
        every item of the result and return the entities as a list ordered by ``id``.

        """
        json_entities = super(CorefDocumentType, self).process_document(doc)
        entities = [Entity.from_json(json_entity) for json_entity in json_entities]
        # In-place sort is stable, so entities sharing an id keep their input order
        entities.sort(key=lambda entity: entity.id)
        return entities
class CorefCorpus(JsonDocumentCorpus):
    """
    JSON document corpus of coreference resolution output. Its data points are of type
    :class:`CorefDocumentType`.

    """
    # Type used for the individual documents of this corpus
    data_point_type = CorefDocumentType
    # Identifier string for this datatype
    datatype_name = "opennlp_coref"
class CorefCorpusWriter(JsonDocumentCorpusWriter):
    """
    JSON corpus writer whose documents are iterables of objects providing ``to_json_dict()``
    (e.g. :class:`Entity`).

    """
    @pass_up_invalid
    def document_to_raw_data(self, doc):
        """
        Turn every entity of ``doc`` into its JSON dictionary and hand the resulting list on to
        the parent class' ``document_to_raw_data``.

        """
        entity_dicts = [entity.to_json_dict() for entity in doc]
        return super(CorefCorpusWriter, self).document_to_raw_data(entity_dicts)
class Entity(object):
    """
    A coreference entity: an identifier, the list of :class:`Mention` objects that refer to the
    entity and optional entity-level attributes (category, gender, number and the probabilities
    attached to gender and number).

    """
    def __init__(self, id, mentions, category=None, gender=None, gender_prob=None, number=None, number_prob=None):
        self.category = category
        self.gender = gender
        self.gender_prob = gender_prob
        self.number = number
        self.number_prob = number_prob
        self.id = id
        self.mentions = mentions

    def get_head_word(self, pronouns=ENGLISH_PRONOUNS):
        """
        Retrieve a head word from the entity's mentions if possible. Returns None if no suitable head
        word can be found: e.g., if all mentions are pronouns.

        Pronouns are filtered out using :data:`pimlico.utils.linguistic.ENGLISH_PRONOUNS` by default. You can
        override this with the `pronouns` kwargs. If `pronouns=None`, no filtering is done.

        Mentions that have no head span (`head_start_index` or `head_end_index` is None) cannot
        contribute a head word and are skipped.

        """
        entity_head_words = set()
        # Gather a head word, if possible, from each mention
        for mention in self.mentions:
            if mention.head_start_index is None or mention.head_end_index is None:
                # The head span is optional on a Mention: without it there is nothing to slice out,
                # so treat this like any other mention we can't get a head word from
                continue
            # Head indices are on the same scale as start_index, so shift them to be relative to the text
            offset = mention.start_index
            mention_head = mention.text[mention.head_start_index - offset:mention.head_end_index - offset].lower()
            # Process the head phrase a bit
            # Remove punctuation
            mention_head = strip_punctuation(mention_head)
            # Get rid of words that won't help us: pronouns
            head_words = mention_head.split()
            if pronouns is not None:
                head_words = [w for w in head_words if w not in pronouns]
            # Don't use any 1-letter words
            head_words = [w for w in head_words if len(w) > 1]
            # If there are no words left, we can't get a headword from this mention
            # If there are multiple (a minority of cases), use the rightmost, which usually is the headword
            if head_words:
                entity_head_words.add(head_words[-1])

        if not entity_head_words:
            return None
        # If we've ended up with multiple possible head words (minority, but not uncommon), we've no way to choose
        # Take the lexicographic first, just to be consistent
        return min(entity_head_words)

    def to_json_dict(self):
        """
        Return the entity as a JSON-serializable dictionary. All entity-level keys are always
        present (with value None where unset); mentions are converted with their own
        ``to_json_dict()``.

        """
        return {
            "id": self.id,
            "category": self.category,
            "gender": self.gender,
            "genderProb": self.gender_prob,
            "number": self.number,
            "numberProb": self.number_prob,
            "mentions": [m.to_json_dict() for m in self.mentions],
        }

    @staticmethod
    def from_json(json):
        """
        Build an entity from a dictionary of the form produced by :meth:`to_json_dict`.
        ``"id"`` and ``"mentions"`` are required; the other keys default to None if missing.

        """
        return Entity(
            json["id"],
            [Mention.from_json(m) for m in json["mentions"]],
            json.get("category", None), json.get("gender", None), json.get("genderProb", None),
            json.get("number", None), json.get("numberProb", None)
        )

    @staticmethod
    def from_java_object(obj):
        """
        Build an entity from a Java-side object exposing ``getId()``, ``getMentions()``,
        ``getCategory()``, ``getGender()``, ``getGenderProbability()``, ``getNumber()`` and
        ``getNumberProbability()``. Gender and number are stored as the result of ``toString()``.

        """
        return Entity(
            obj.getId(),
            [Mention.from_java_object(m) for m in obj.getMentions()],
            category=obj.getCategory(), gender=obj.getGender().toString(), gender_prob=obj.getGenderProbability(),
            number=obj.getNumber().toString(), number_prob=obj.getNumberProbability()
        )

    def __unicode__(self):
        # Short summary: the entity's id plus its mentions joined by "/", passed through truncate()
        # both per mention and as a whole
        return u"Entity-%s(%s)" % (self.id,
                                   truncate(u"/".join(truncate(unicode(m), 30).strip() for m in self.mentions), 30))

    def __repr__(self):
        # ASCII-only version of the unicode summary, dropping any non-ASCII characters
        return unicode(self).encode("ascii", "ignore")
class Mention(object):
    """
    A single mention of an entity: the sentence number, start and end indices and text of the
    mention, plus optional gender/number values with their probabilities, an optional head span
    and an optional name type.

    """
    def __init__(self, sentence_num, start_index, end_index, text,
                 gender=None, gender_prob=None, number=None, number_prob=None,
                 head_start_index=None, head_end_index=None, name_type=None):
        # Required: where the mention is and what it says
        self.sentence_num = sentence_num
        self.start_index = start_index
        self.end_index = end_index
        self.text = text
        # Optional attributes, None where not available
        self.gender = gender
        self.gender_prob = gender_prob
        self.number = number
        self.number_prob = number_prob
        self.head_start_index = head_start_index
        self.head_end_index = head_end_index
        self.name_type = name_type

    @staticmethod
    def from_json(json):
        """
        Build a mention from a dictionary of the form produced by :meth:`to_json_dict`.
        ``"sentNum"``, ``"startIndex"``, ``"endIndex"`` and ``"text"`` are required; every other
        key defaults to None if missing.

        """
        return Mention(
            sentence_num=json["sentNum"],
            start_index=json["startIndex"],
            end_index=json["endIndex"],
            text=json["text"],
            gender=json.get("gender"),
            gender_prob=json.get("genderProb"),
            number=json.get("number"),
            number_prob=json.get("numberProb"),
            head_start_index=json.get("headStartIndex"),
            head_end_index=json.get("headEndIndex"),
            name_type=json.get("nameType"),
        )

    def to_json_dict(self):
        """
        Return the mention as a JSON-serializable dictionary. The four required fields are always
        included; optional fields are left out altogether when their value is None.

        """
        json_dict = {
            "sentNum": self.sentence_num,
            "startIndex": self.start_index,
            "endIndex": self.end_index,
            "text": self.text,
        }
        optional_fields = [
            ("gender", self.gender),
            ("genderProb", self.gender_prob),
            ("number", self.number),
            ("numberProb", self.number_prob),
            ("headStartIndex", self.head_start_index),
            ("headEndIndex", self.head_end_index),
            ("nameType", self.name_type),
        ]
        for key, value in optional_fields:
            if value is not None:
                json_dict[key] = value
        return json_dict

    @staticmethod
    def from_java_object(obj):
        """
        Build a mention from a Java-side object exposing ``getSentenceNumber()``,
        ``getIndexSpan()``, ``toText()``, ``getGender()``, ``getGenderProb()``, ``getNumber()``,
        ``getNumberProb()``, ``getHeadSpan()`` and ``getNameType()``. Gender and number are stored
        as the result of ``toString()``.

        """
        return Mention(
            sentence_num=obj.getSentenceNumber(),
            start_index=obj.getIndexSpan().getStart(),
            end_index=obj.getIndexSpan().getEnd(),
            text=obj.toText(),
            gender=obj.getGender().toString(),
            gender_prob=obj.getGenderProb(),
            number=obj.getNumber().toString(),
            number_prob=obj.getNumberProb(),
            head_start_index=obj.getHeadSpan().getStart(),
            head_end_index=obj.getHeadSpan().getEnd(),
            name_type=obj.getNameType(),
        )

    def __unicode__(self):
        # A mention is displayed simply as its text
        return unicode(self.text)