# Source code for pimlico.datatypes.vrt

from pimlico.cli.browser.formatter import DocumentBrowserFormatter
from pimlico.datatypes.documents import DataPointType
from pimlico.utils.core import cached_property


class VRTWord(object):
    """
    Word with all its annotations.

    The Korp docs give the following example list of positional attributes (columns):

       word form, the number of the token within the sentence, lemma, lemma with compound boundaries marked,
       part of speech, morphological analysis, dependency head number and dependency relation

    However, they are not fixed and different files may have different numbers of attributes with different
    meanings. This information is not included in the data file.

    :param word: the first value given for the token; stored as ``word``
    :param attributes: any further values for the token, kept in the order given and
        stored as a (possibly empty) tuple in ``attributes``

    """
    def __init__(self, word, *attributes):
        self.word = word
        # Collected varargs: always a tuple, empty when only the word itself was supplied
        self.attributes = attributes


class VRTText(object):
    """
    Contains a single VRT text (i.e. document).

    Note that VRT's structures are not hierarchical: they can be overlapping.
    See `VRT docs <https://www.kielipankki.fi/development/korp/corpus-input-format/#VRT_file_format>`_.

    We don't currently process structural attributes. This can easily be added later if necessary.

    :param words: list of the words of the text, in order
    :param paragraph_ranges: list of ``(start, end)`` index pairs into ``words``, one per
        paragraph (``end`` exclusive). Defaults to a new empty list
    :param sentence_ranges: list of ``(start, end)`` index pairs into ``words``, one per
        sentence (``end`` exclusive). Defaults to a new empty list
    :param opening_tag: the first line of the source document, if known

    """
    def __init__(self, words, paragraph_ranges=None, sentence_ranges=None, opening_tag=None):
        self.words = words
        # Build a fresh list per instance: a literal [] default would be shared by every
        # instance constructed without an explicit value
        self.paragraph_ranges = [] if paragraph_ranges is None else paragraph_ranges
        self.sentence_ranges = [] if sentence_ranges is None else sentence_ranges
        self.opening_tag = opening_tag

    @staticmethod
    def from_string(data):
        """
        Parse the raw text of a single VRT document.

        The first line is kept as the opening tag and the last line is dropped without being
        inspected: they are expected to be the opening and closing "text" tags. Of the lines in
        between, paragraph and sentence tags are turned into word-index ranges, blank lines are
        skipped and every other line is treated as a tab-separated word line.

        A closing tag with no matching opener is silently ignored.

        :param data: full text of one VRT document
        :return: the parsed :class:`VRTText`. Empty input yields a text with no words, no
            ranges and ``opening_tag=None``
        """
        lines = data.splitlines()
        # First and last lines should be "text" tags. Empty input has no lines at all
        opening_tag = lines[0] if lines else None

        # Get just the content lines, and indices of the tags
        words = []
        paragraph_ranges = []
        sentence_ranges = []

        # Index of the first word of the currently open paragraph/sentence, if any
        par_open = sent_open = None
        for line in lines[1:-1]:
            # There shouldn't be any blank lines, but skip them if they arise
            if not line.strip():
                continue
            if line.startswith(u"<paragraph"):
                par_open = len(words)
            elif line.startswith(u"</paragraph>"):
                # If we didn't get a par opener, we can't close it, but fail quietly and skip it
                if par_open is not None:
                    paragraph_ranges.append((par_open, len(words)))
                    par_open = None
            elif line.startswith(u"<sentence"):
                sent_open = len(words)
            elif line.startswith(u"</sentence>"):
                # Same policy as for paragraphs: ignore a closer with no opener
                if sent_open is not None:
                    sentence_ranges.append((sent_open, len(words)))
                    # Forget the opener so that a stray second closer can't reuse it
                    sent_open = None
            else:
                # Just a normal word line
                # Don't complain about missing fields
                # splitlines() has already removed the line ending
                words.append(VRTWord(*line.split(u"\t")))
        return VRTText(words, paragraph_ranges, sentence_ranges, opening_tag)

    @cached_property
    def paragraphs(self):
        """One list of words per entry in ``paragraph_ranges``."""
        return [self.words[start:end] for (start, end) in self.paragraph_ranges]

    @cached_property
    def sentences(self):
        """One list of words per entry in ``sentence_ranges``."""
        return [self.words[start:end] for (start, end) in self.sentence_ranges]

    @cached_property
    def word_strings(self):
        """The ``word`` attribute of every word in the text, in order."""
        return [w.word for w in self.words]


class VRTDocumentType(DataPointType):
    """
    Document type for annotation text documents read in from VRT files
    (`VeRticalized Text, as used by Korp:
    <https://www.kielipankki.fi/development/korp/corpus-input-format/#VRT_file_format>`_).

    """
    # (short name, dotted path to formatter class) pairs.
    # NOTE(review): presumably consumed by the document browser via DataPointType -- confirm
    formatters = [("vrt", "pimlico.datatypes.vrt.VRTFormatter")]

    def process_document(self, doc):
        """
        Parse the raw data of one document.

        :param doc: raw text of a single VRT document
        :return: the :class:`VRTText` produced by ``VRTText.from_string(doc)``
        """
        return VRTText.from_string(doc)


class VRTFormatter(DocumentBrowserFormatter):
    """
    Formatter that renders a VRT document as one tab-separated line per word.

    """
    DATATYPE = VRTDocumentType

    def format_document(self, doc):
        """
        Render the words of a document as text.

        Each word becomes one line: the word followed by its attributes, joined by tabs.
        Structural tags and the paragraph/sentence ranges are not included.

        :param doc: object with a ``words`` sequence whose items have ``word`` and ``attributes``
        :return: the formatted lines joined by newlines
        """
        # Join the word and its attributes as a single sequence so that a word with no
        # attributes is not followed by a dangling tab
        return u"\n".join(
            u"\t".join((word.word,) + tuple(word.attributes)) for word in doc.words
        )