Source code for pimlico.datatypes.caevo

# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html

from __future__ import absolute_import

from pimlico.datatypes.documents import RawDocumentType
from pimlico.datatypes.tar import TarredCorpus
from xml.etree import ElementTree as ET


# Public API of this module: only the corpus datatype is exported by a star-import
__all__ = ["CaevoCorpus"]


# XML namespace of the elements read in this module, in ElementTree's "{uri}" prefix form
TAG_PREFIX = "{http://chambers.com/corpusinfo}"


def _tag(name):
    """
    Return the element name ``name`` with the namespace prefix ``TAG_PREFIX`` prepended,
    so that it can be used in ElementTree ``find()``/``findall()`` lookups.

    """
    local_name = "%s" % (name,)
    return TAG_PREFIX + local_name


class CaevoDocument(object):
    """
    One parsed document: its name, the list of entries read from ``<entry>``
    elements and the list of links read from ``<tlink>`` elements.

    """
    def __init__(self, name, entries, tlinks):
        self.name = name
        self.entries = entries
        self.tlinks = tlinks

    def __unicode__(self):
        # Entries are rendered with a two-space indent under their heading
        entries_text = u"\n".join(entry.__unicode__(indent=2) for entry in self.entries)
        tlinks_text = u"\n".join(unicode(tlink) for tlink in self.tlinks)
        return u"Doc: %s\nEntries:\n%s\nTLinks:\n%s" % (self.name, entries_text, tlinks_text)

    @staticmethod
    def from_raw_data(raw_data):
        """
        Parse the XML text ``raw_data`` and build a :class:`CaevoDocument` from it.

        The text is encoded as UTF-8 before being handed to the XML parser. The root
        element must have a ``name`` attribute (``KeyError`` otherwise).

        """
        ET.register_namespace("", "")
        root = ET.fromstring(raw_data.encode("utf-8"))
        # The document name is an attribute of the root element
        doc_name = root.attrib["name"]
        # Sentences come from the <entry> elements
        entries = [CaevoEntry.from_element(entry_el) for entry_el in root.findall(_tag("entry"))]

        # Index events (by eiid) and timexes (by tid), as these are the IDs the tlinks refer to.
        # Timexes are added after all events, so a timex wins if an ID is used by both
        referents = {}
        for entry in entries:
            for event in entry.events:
                referents[event.eiid] = event
        for entry in entries:
            for timex in entry.timexes:
                referents[timex.tid] = timex

        # Links come from the <tlink> elements and are resolved against the index
        tlinks = [TLink.from_element(tlink_el, referents) for tlink_el in root.findall(_tag("tlink"))]
        return CaevoDocument(doc_name, entries, tlinks)


class CaevoDocumentType(RawDocumentType):
    """
    Document type whose processing step turns a raw document into a
    :class:`CaevoDocument`.

    """
    def process_document(self, doc):
        parsed = CaevoDocument.from_raw_data(doc)
        return parsed


class CaevoCorpus(TarredCorpus):
    """
    Datatype for Caevo output. The output is stored exactly as it comes out from Caevo, in an XML format.
    This datatype reads in that XML and provides easy access to its components.

    Since we simply store the XML that comes from Caevo, there's no corresponding corpus writer. The data is
    output using a :class:`pimlico.datatypes.tar.TarredCorpusWriter`.

    """
    # Each document of the corpus is processed as this document type
    data_point_type = CaevoDocumentType
class CaevoEntry(object):
    """
    One ``<entry>`` element: a sentence with its tokens, optional parse and
    dependency parse texts, and the events and timexes read from it.

    """
    def __init__(self, sid, filename, sentence, tokens, parse=None, deps=None, events=None, timexes=None):
        self.sentence = sentence
        self.tokens = tokens
        self.parse = parse
        self.deps = deps
        self.events = events
        self.timexes = timexes
        self.filename = filename
        self.sid = sid

    @staticmethod
    def from_element(el):
        """
        Extract from XML element

        Reads the ``sid`` and ``file`` attributes and the ``sentence``, ``tokens``,
        ``events`` and ``timexes`` children of ``el``. The ``parse`` and ``deps``
        children are optional: the corresponding values are None if they're missing.

        """
        sentence = el.find(_tag("sentence")).text
        tokens = [_token_to_triple(token.text) for token in el.find(_tag("tokens")).findall(_tag("t"))]
        # Get parse text if there is one
        parse_el = el.find(_tag("parse"))
        parse = parse_el.text if parse_el is not None else None
        # Get dep parse text if there is one
        deps_el = el.find(_tag("deps"))
        dep_parse = deps_el.text if deps_el is not None else None
        # Get any events
        events = [CaevoEvent.from_element(e) for e in el.find(_tag("events")).findall(_tag("event"))]
        # Get any timexes
        timexes = [CaevoTimex.from_element(t) for t in el.find(_tag("timexes")).findall(_tag("timex"))]
        return CaevoEntry(int(el.attrib["sid"]), el.attrib["file"], sentence, tokens,
                          parse, dep_parse, events, timexes)

    def __unicode__(self, indent=0):
        """
        Multi-line description of the entry, every line prefixed with ``indent`` spaces.

        """
        # deps is None when the entry had no <deps> element, and events/timexes default
        # to None in the constructor: treat all of these as empty instead of failing
        dep_lines = self.deps.splitlines() if self.deps is not None else []
        events = self.events if self.events is not None else []
        timexes = self.timexes if self.timexes is not None else []

        lines = [
            u"Entry %d" % self.sid,
            u" Sentence: %s" % self.sentence,
            # Wrapped in a 1-tuple so that formatting also works if tokens is a tuple
            u" Tokens: %s" % (self.tokens,),
            u" Parse: %s" % self.parse,
            u" Dep parse:",
        ]
        lines.extend(u" %s" % dep_line for dep_line in dep_lines)
        lines.append(u" Events:")
        lines.extend(u" %s" % event for event in events)
        lines.append(u" Timexes:")
        lines.extend(u" %s" % timex for timex in timexes)

        padding = u" " * indent
        return u"\n".join(u"%s%s" % (padding, line) for line in lines)


def _token_to_triple(token_text):
    """
    Split up a token into before text, token text and after text, as it's encoded in Caevo output.

    The three parts are each quoted and separated by single spaces, so
    ``'"pre" "tok" "post"'`` gives ``("pre", "tok", "post")``.

    """
    # Remove outer quotes and any spaces
    token_text = token_text.strip()[1:-1]
    # Text up to first quote-space-quote is the before text
    before_text, __, token_text = token_text.partition('" "')
    # Text up to next quote-space-quote is the actual token text
    # Remaining text is after text
    token, __, after_text = token_text.partition('" "')
    return before_text, token, after_text


class CaevoEvent(object):
    """
    An event read from an ``<event>`` element. All values are kept as the strings
    found in the XML attributes, except ``offset``, which is converted to an int.

    """
    def __init__(self, id, eiid, offset, string, tense, aspect, cls, polarity, modality, happen,
                 lower_bound_duration, upper_bound_duration):
        self.id = id
        self.eiid = eiid
        self.offset = offset
        self.string = string
        self.tense = tense
        self.aspect = aspect
        self.cls = cls
        self.polarity = polarity
        self.modality = modality
        self.happen = happen
        self.lower_bound_duration = lower_bound_duration
        self.upper_bound_duration = upper_bound_duration

    @staticmethod
    def from_element(el):
        """
        Build from XML element. Every attribute read here must be present on the
        element (``KeyError`` otherwise).

        """
        d = el.attrib
        return CaevoEvent(
            d["id"], d["eiid"], int(d["offset"]), d["string"], d["tense"], d["aspect"], d["class"],
            d["polarity"], d["modality"], d["happen"], d["lowerBoundDuration"], d["upperBoundDuration"]
        )

    def __unicode__(self):
        return u"Event(id={self.id}, eiid={self.eiid}, '{self.string}')".format(self=self)

    def __repr__(self):
        # ASCII-only byte string, with non-ASCII characters replaced
        return unicode(self).encode("ascii", "replace")


class CaevoTimex(object):
    """
    A time expression read from a ``<timex>`` element. All values are kept as the strings
    found in the XML attributes, except ``offset`` and ``length``, which are converted to ints.

    """
    def __init__(self, tid, text, offset, length, type, value, temporal_function):
        self.tid = tid
        self.text = text
        self.offset = offset
        self.length = length
        self.type = type
        self.value = value
        self.temporal_function = temporal_function

    @staticmethod
    def from_element(el):
        """
        Build from XML element. Every attribute read here must be present on the
        element (``KeyError`` otherwise).

        """
        d = el.attrib
        return CaevoTimex(
            d["tid"], d["text"], int(d["offset"]), int(d["length"]), d["type"], d["value"],
            d["temporalFunction"]
        )

    def __unicode__(self):
        return u"Timex(tid={self.tid}, '{self.text}')".format(self=self)

    def __repr__(self):
        # Same form as CaevoEvent.__repr__ and TLink.__repr__, so that timexes are
        # also readable when a list of them is displayed
        return unicode(self).encode("ascii", "replace")


class TLink(object):
    """
    A link read from a ``<tlink>`` element. ``event1`` and ``event2`` are the IDs given
    in the XML; ``event1_obj`` and ``event2_obj`` are the objects those IDs refer to, if
    they have been looked up.

    """
    def __init__(self, event1, event2, relation, closed, origin, type, event1_obj=None, event2_obj=None):
        self.event2_obj = event2_obj
        self.event1_obj = event1_obj
        self.event1 = event1
        self.event2 = event2
        self.relation = relation
        self.closed = closed
        self.origin = origin
        self.type = type

    @staticmethod
    def from_element(el, event_dict):
        """
        Build from XML element

        ``event_dict`` maps the IDs used in the ``event1`` and ``event2`` attributes to
        objects. Both IDs must be in it (``KeyError`` otherwise).

        """
        d = el.attrib
        return TLink(
            d["event1"], d["event2"], d["relation"],
            # Stored as a bool: true only for the exact attribute value "true"
            d["closed"] == "true",
            d["origin"], d["type"],
            event1_obj=event_dict[d["event1"]],
            event2_obj=event_dict[d["event2"]],
        )

    def __unicode__(self):
        return u"TLink(e1={self.event1}, e2={self.event2}, {self.relation})".format(self=self)

    def __repr__(self):
        # ASCII-only byte string, with non-ASCII characters replaced
        return unicode(self).encode("ascii", "replace")