# Source code for pimlico.datatypes.word2vec

import os
from pimlico.cli.shell.base import ShellCommand, ShellError

from pimlico.core.dependencies.python import PythonPackageOnPip
from pimlico.core.modules.options import str_to_bool
from pimlico.datatypes.base import PimlicoDatatype, PimlicoDatatypeWriter

__all__ = ["Word2VecModel", "Word2VecModelWriter"]


class NearestNeighboursCommand(ShellCommand):
    """
    Shell command that prints the nearest neighbours of a query in the loaded model.

    Each argument is a query word. Arguments beginning with ``-`` are treated as
    negative words (the leading ``-`` is stripped); all other arguments are
    positive words. Words missing from the model's vocabulary are reported and
    dropped from the query.

    """
    commands = ["neighbours", "nn"]
    help_text = "Print the nearest neighbours of the given word by cosine similarity in the vector space. You may " \
                "specify multiple words and include negative words by prefixing '-'"

    def execute(self, shell, *args, **kwargs):
        """
        Query the model for the words most similar to the given positive/negative words.

        :param shell: the running shell; its ``data`` attribute must provide ``load_model()``
        :param args: query words, negative ones prefixed with ``-``
        :param kwargs: unused
        :raises ShellError: if no words are given, or none of them is in the vocabulary

        """
        if not args:
            raise ShellError("specify at least one word")
        model = shell.data.load_model()
        vocab = model.vocab

        positive_words = [arg for arg in args if not arg.startswith("-")]
        negative_words = [arg[1:] for arg in args if arg.startswith("-")]

        # Warn about every OOV word (positives first, then negatives) and keep only known words
        for w in positive_words + negative_words:
            if w not in vocab:
                # Single-argument print(...) is valid, and prints the same text, on Python 2 and 3
                print("WARNING: %s not in vocabulary, leaving out" % w)
        positive_words = [w for w in positive_words if w in vocab]
        negative_words = [w for w in negative_words if w in vocab]

        if not positive_words and not negative_words:
            raise ShellError("no non-OOV query terms")

        similar = model.most_similar(positive=positive_words, negative=negative_words)

        for word, score in similar:
            print("%s  (%.3f)" % (word, score))


class VectorCommand(ShellCommand):
    """
    Shell command that prints the vector stored in the loaded model for a single word.

    Only the first argument is used. Passing ``norm=T`` sends the vector through
    gensim's ``unitvec()`` before it is printed.

    """
    commands = ["vector", "vec"]
    help_text = "Output (some of) the values of a word vector. Use norm=T to apply Euclidean normalization"

    def execute(self, shell, *args, **kwargs):
        """
        Look up and print the vector for ``args[0]``.

        :param shell: the running shell; its ``data`` attribute must provide ``load_model()``
        :param args: the word to look up (further arguments are ignored)
        :param kwargs: ``norm`` -- value parsed with ``str_to_bool``; if true, the vector is normalized
        :raises ShellError: if no word is given or the word is not in the model

        """
        # Fail with a readable shell error rather than an IndexError on args[0]
        if not args:
            raise ShellError("specify a word")
        word = args[0]

        from gensim.matutils import unitvec

        model = shell.data.load_model()
        try:
            vec = model[word]
        except KeyError:
            # Report OOV words the same way the similarity command does
            raise ShellError("word '%s' is not in the vocabulary" % word)

        norm = str_to_bool(kwargs.pop("norm", False))
        if norm:
            vec = unitvec(vec)

        # Single-argument print(...) is valid, and prints the same text, on Python 2 and 3
        print(vec)


class SimilarityCommand(ShellCommand):
    """
    Shell command that prints the similarity between two words, as computed by
    the loaded model's ``similarity()`` method.

    """
    commands = ["similarity", "sim"]
    help_text = "Output the similarity of two words by cosine in the vector space"

    def execute(self, shell, *args, **kwargs):
        """
        Print the model's similarity score for the first two arguments.

        :param shell: the running shell; its ``data`` attribute must provide ``load_model()``
        :param args: the two words to compare (further arguments are ignored)
        :param kwargs: unused
        :raises ShellError: if fewer than two words are given, or either is not in the vocabulary

        """
        # Without this check, a missing second word only surfaces as an IndexError below
        if len(args) < 2:
            raise ShellError("specify two words")
        first, second = args[0], args[1]

        model = shell.data.load_model()
        for word in (first, second):
            if word not in model.vocab:
                raise ShellError("word '%s' is not in the vocabulary" % word)
        # Single-argument print(...) is valid, and prints the same text, on Python 2 and 3
        print(model.similarity(first, second))


class Word2VecModel(PimlicoDatatype):
    """
    Datatype for a word2vec model stored as ``vectors.bin`` in the datatype's data
    directory, in binary word2vec format.

    The model is loaded through gensim on first use and cached on the instance.
    The shell commands listed in ``shell_commands`` operate on the loaded model.

    """
    shell_commands = [NearestNeighboursCommand(), VectorCommand(), SimilarityCommand()]

    def __init__(self, base_dir, pipeline, **kwargs):
        """
        :param base_dir: passed through to ``PimlicoDatatype``
        :param pipeline: passed through to ``PimlicoDatatype``
        :param kwargs: passed through to ``PimlicoDatatype``
        """
        super(Word2VecModel, self).__init__(base_dir, pipeline, **kwargs)
        # Cache for the gensim model, filled in by load_model()
        self._model = None
        # Old models don't have this set, so default to False
        self.verb_only = self.metadata.get("verb_only", False)

    def data_ready(self):
        """
        The data is ready once the parent class says so and ``vectors.bin`` exists
        in the data directory. The file check is skipped if the parent check fails.

        """
        return super(Word2VecModel, self).data_ready() and os.path.exists(os.path.join(self.data_dir, "vectors.bin"))

    def load_model(self):
        """
        Load the model from ``vectors.bin`` with gensim, or return the one already loaded.

        :return: the object returned by gensim's ``load_word2vec_format``

        """
        if self._model is None:
            # Imported here so gensim is only needed when a model is actually loaded
            from gensim.models.word2vec import Word2Vec
            # NOTE(review): later gensim releases provide this loader on KeyedVectors instead;
            # confirm which gensim version this project is pinned to
            self._model = Word2Vec.load_word2vec_format(os.path.join(self.data_dir, "vectors.bin"), binary=True)
        return self._model

    @property
    def model(self):
        """Property shortcut for :meth:`load_model`."""
        return self.load_model()

    def get_software_dependencies(self):
        """
        Add gensim, as a pip-installable package, to the parent class's dependencies.

        """
        # Depend on Gensim, which can be installed using Pip
        return super(Word2VecModel, self).get_software_dependencies() + [
            PythonPackageOnPip("gensim")
        ]


class Word2VecModelWriter(PimlicoDatatypeWriter):
    """
    Writer for word2vec models, intended to be used as a context manager.

    Assign a trained model to ``word2vec_model`` inside the ``with`` block. On exit,
    if a model has been assigned, it is saved as ``vectors.bin`` in the data
    directory using the model's ``save_word2vec_format(..., binary=True)``.

    """
    def __init__(self, base_dir, verb_only=False, **kwargs):
        """
        :param base_dir: passed through to ``PimlicoDatatypeWriter``
        :param verb_only: value recorded in the metadata under the key ``"verb_only"``
        :param kwargs: passed through to ``PimlicoDatatypeWriter``
        """
        super(Word2VecModelWriter, self).__init__(base_dir, **kwargs)
        # Set this to the model to be stored before leaving the with block
        self.word2vec_model = None
        self.metadata["verb_only"] = verb_only

    def __enter__(self):
        # Returns the writer itself; the parent's __enter__ is not called here
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Run the parent's exit handling, then write the model to disk if one was set.

        The model is written whether or not an exception was raised inside the
        ``with`` block. Nothing is returned, so this method does not itself
        suppress such an exception.

        """
        super(Word2VecModelWriter, self).__exit__(exc_type, exc_val, exc_tb)
        if self.word2vec_model is not None:
            self.word2vec_model.save_word2vec_format(os.path.join(self.data_dir, "vectors.bin"), binary=True)