# Source code for pimlico.datatypes.vrt

from pimlico.cli.browser.formatter import DocumentBrowserFormatter
from pimlico.datatypes.documents import DataPointType
from pimlico.utils.core import cached_property


class VRTWord(object):
    """
    A single token together with all of its annotations.

    As an example of the positional attributes (columns) a token may carry,
    the Korp docs list:

        word form, the number of the token within the sentence, lemma, lemma
        with compound boundaries marked, part of speech, morphological
        analysis, dependency head number and dependency relation

    These columns are not fixed, however: different files may have different
    numbers of attributes with different meanings, and that information is
    not included in the data file itself.

    :param word: the first column of the token's line (the word form)
    :param attributes: every remaining column, kept in order as a tuple
    """

    def __init__(self, word, *attributes):
        # The meaning of each extra column is unknown here, so the columns
        # are stored positionally rather than under names
        self.word, self.attributes = word, attributes
class VRTText(object):
    """
    Contains a single VRT text (i.e. document).

    Note that VRT's structures are not hierarchical: they can be overlapping. See
    `VRT docs <https://www.kielipankki.fi/development/korp/corpus-input-format/#VRT_file_format>`_.

    We don't currently process structural attributes. This can easily be added
    later if necessary.

    :param words: list of the text's words, in order
    :param paragraph_ranges: list of ``(start, end)`` index pairs into ``words``,
        one per paragraph. Defaults to a new empty list
    :param sentence_ranges: list of ``(start, end)`` index pairs into ``words``,
        one per sentence. Defaults to a new empty list
    :param opening_tag: the raw first line of the text, or None
    """

    def __init__(self, words, paragraph_ranges=None, sentence_ranges=None, opening_tag=None):
        self.words = words
        # Create the default lists per instance: a literal ``[]`` default would be a
        # single list object shared by every VRTText built without explicit ranges
        self.paragraph_ranges = [] if paragraph_ranges is None else paragraph_ranges
        self.sentence_ranges = [] if sentence_ranges is None else sentence_ranges
        self.opening_tag = opening_tag

    @staticmethod
    def from_string(data):
        """
        Parse the raw string form of one VRT text.

        The first line is kept as the opening tag and the last line is discarded
        (both are expected to be "text" tags). Of the lines in between, paragraph
        and sentence tags are turned into ranges of word indices and every other
        non-blank line is split on tabs to make a :class:`VRTWord`.

        :param data: raw VRT data for a single text
        :return: the parsed :class:`VRTText`; an empty one if ``data`` has no lines
        """
        lines = data.splitlines()
        if not lines:
            # Nothing at all to read, not even an opening tag
            return VRTText([])

        # First and last lines should be "text" tags
        opening_tag = lines[0]

        # Get just the content lines, and indices of the tags
        words = []
        paragraph_ranges = []
        sentence_ranges = []
        par_open = sent_open = None
        for line in lines[1:-1]:
            # There shouldn't be any blank lines, but skip them if they arise
            if not line.strip():
                continue

            if line.startswith(u"<paragraph"):
                par_open = len(words)
            elif line.startswith(u"</paragraph>"):
                # If we didn't get a par opener, we can't close it, but fail quietly and skip it
                if par_open is not None:
                    paragraph_ranges.append((par_open, len(words)))
                par_open = None
            elif line.startswith(u"<sentence"):
                sent_open = len(words)
            elif line.startswith(u"</sentence>"):
                if sent_open is not None:
                    sentence_ranges.append((sent_open, len(words)))
                # Forget the opener, as for paragraphs, so that a stray extra closing
                # tag is skipped instead of recording a second range from the old start
                sent_open = None
            else:
                # Just a normal word line
                # Don't complain about missing fields
                # (splitlines() has already removed the line ending)
                words.append(VRTWord(*line.split(u"\t")))
        return VRTText(words, paragraph_ranges, sentence_ranges, opening_tag)

    @cached_property
    def paragraphs(self):
        """List of paragraphs, each the list of words covered by one paragraph range."""
        return [self.words[start:end] for (start, end) in self.paragraph_ranges]

    @cached_property
    def sentences(self):
        """List of sentences, each the list of words covered by one sentence range."""
        return [self.words[start:end] for (start, end) in self.sentence_ranges]

    @cached_property
    def word_strings(self):
        """List of the ``word`` attribute of every word, without the other annotations."""
        return [w.word for w in self.words]
class VRTDocumentType(DataPointType):
    """
    Document type for annotation text documents read in from VRT files
    (`VeRticalized Text, as used by Korp:
    <https://www.kielipankki.fi/development/korp/corpus-input-format/#VRT_file_format>`_).

    """
    # (name, import path) pairs naming the formatters offered for this type
    formatters = [("vrt", "pimlico.datatypes.vrt.VRTFormatter")]

    def process_document(self, doc):
        """
        Turn the raw data of one document into its parsed form.

        :param doc: raw VRT data for a single text
        :return: the result of :meth:`VRTText.from_string` on ``doc``
        """
        parsed_text = VRTText.from_string(doc)
        return parsed_text
class VRTFormatter(DocumentBrowserFormatter):
    """
    Formatter that renders a VRT document as plain text: one word per line, with
    the word and its attributes separated by tabs.

    """
    DATATYPE = VRTDocumentType

    def format_document(self, doc):
        """
        Build the text representation of a document.

        :param doc: document whose ``words`` each have ``word`` and ``attributes``
        :return: unicode string with one line per word
        """
        def _format_word(word):
            # Only add the separator when there is something to put after it: a word
            # with no attributes would otherwise be written with a trailing tab,
            # which reads back as an extra, empty column
            if word.attributes:
                return u"{}\t{}".format(word.word, u"\t".join(word.attributes))
            return u"{}".format(word.word)

        return u"\n".join(_format_word(word) for word in doc.words)