Source code for pimlico.datatypes.word2vec

import os
from pimlico.cli.shell.base import ShellCommand, ShellError

from pimlico.core.dependencies.python import PythonPackageOnPip
from pimlico.core.modules.options import str_to_bool
from pimlico.datatypes.base import PimlicoDatatype, PimlicoDatatypeWriter

# Names exported by ``from pimlico.datatypes.word2vec import *``. The shell command
# classes defined in this module are not listed here.
__all__ = ["Word2VecModel", "Word2VecModelWriter"]


class NearestNeighboursCommand(ShellCommand):
    """
    Shell command that prints the words the loaded model considers most similar
    to a query made up of positive and negative words.

    """
    commands = ["neighbours", "nn"]
    help_text = "Print the nearest neighbours of the given word by cosine similarity in the vector space. You may " \
                "specify multiple words and include negative words by prefixing '-'"

    def execute(self, shell, *args, **kwargs):
        """
        Run the query given by ``args`` against the model loaded from ``shell.data``.

        Arguments starting with "-" are negative words (the prefix is stripped);
        all other arguments are positive words. Words missing from the model's
        vocabulary are reported with a warning and dropped from the query.

        :param shell: shell whose ``data`` attribute provides ``load_model()``
        :param args: query words
        :raises ShellError: if no arguments are given, or if none of the query
            words is in the vocabulary
        """
        if not args:
            raise ShellError("specify at least one word")
        model = shell.data.load_model()

        # Split the arguments into positive terms and "-"-prefixed negative terms
        positive_words = []
        negative_words = []
        for arg in args:
            if arg.startswith("-"):
                negative_words.append(arg[1:])
            else:
                positive_words.append(arg)

        # Warn about and drop OOVs: all positive words are handled before any
        # negative word, so the warnings come out positives-first
        known_positive = []
        known_negative = []
        for candidates, known in ((positive_words, known_positive), (negative_words, known_negative)):
            for w in candidates:
                if w in model.vocab:
                    known.append(w)
                else:
                    # Single-argument print(...) behaves the same as the print statement
                    print("WARNING: %s not in vocabulary, leaving out" % w)

        if not (known_positive or known_negative):
            raise ShellError("no non-OOV query terms")

        for word, score in model.most_similar(positive=known_positive, negative=known_negative):
            print("%s  (%.3f)" % (word, score))


class VectorCommand(ShellCommand):
    """
    Shell command that prints the vector the loaded model holds for a word.

    """
    commands = ["vector", "vec"]
    help_text = "Output (some of) the values of a word vector. Use norm=T to apply Euclidean normalization"

    def execute(self, shell, *args, **kwargs):
        """
        Print the vector for the word given as the first argument.

        :param shell: shell whose ``data`` attribute provides ``load_model()``
        :param args: the first argument is the word to look up; any further
            arguments are ignored
        :param kwargs: ``norm`` is passed through ``str_to_bool``; when it
            comes out true, the vector is passed through gensim's ``unitvec``
            before printing
        :raises ShellError: if no word is given or the word is not in the
            model's vocabulary
        """
        # Report a missing or unknown word as a shell error, in line with the
        # other commands in this module, instead of letting an IndexError or
        # KeyError escape from the lookup
        if len(args) == 0:
            raise ShellError("specify a word")
        word = args[0]

        from gensim.matutils import unitvec

        model = shell.data.load_model()
        if word not in model.vocab:
            raise ShellError("word '%s' is not in the vocabulary" % word)
        vec = model[word]

        norm = str_to_bool(kwargs.pop("norm", False))
        if norm:
            vec = unitvec(vec)

        # Single-argument print(...) behaves the same as the print statement
        print(vec)


class SimilarityCommand(ShellCommand):
    """
    Shell command that prints the model's similarity score for a pair of words.

    """
    commands = ["similarity", "sim"]
    help_text = "Output the similarity of two words by cosine in the vector space"

    def execute(self, shell, *args, **kwargs):
        """
        Print ``model.similarity`` for the first two arguments.

        :param shell: shell whose ``data`` attribute provides ``load_model()``
        :param args: the first two arguments are the words to compare; any
            further arguments are ignored
        :raises ShellError: if fewer than two words are given, or if either
            word is not in the model's vocabulary
        """
        # Without this check a single argument passes the vocabulary test below
        # and the args[1] lookup then fails with an IndexError
        if len(args) < 2:
            raise ShellError("specify two words")
        first, second = args[:2]

        model = shell.data.load_model()
        for word in (first, second):
            if word not in model.vocab:
                raise ShellError("word '%s' is not in the vocabulary" % word)
        # Single-argument print(...) behaves the same as the print statement
        print(model.similarity(first, second))


class Word2VecModel(PimlicoDatatype):
    """
    Datatype whose data is a file ``vectors.bin`` in the data directory, read
    with gensim's binary word2vec-format loader.

    The model is loaded on first use and then kept on the instance.

    """
    shell_commands = [NearestNeighboursCommand(), VectorCommand(), SimilarityCommand()]

    def __init__(self, base_dir, pipeline, **kwargs):
        super(Word2VecModel, self).__init__(base_dir, pipeline, **kwargs)
        # Filled in by load_model() the first time the model is requested
        self._model = None
        # Old models don't have this set, so default to False
        self.verb_only = self.metadata.get("verb_only", False)

    def data_ready(self):
        """
        Report whether the data can be read: the base class check must pass and
        ``vectors.bin`` must exist in the data directory.
        """
        base_ready = super(Word2VecModel, self).data_ready()
        if not base_ready:
            # Hand back the base class's own (falsy) result unchanged
            return base_ready
        vectors_path = os.path.join(self.data_dir, "vectors.bin")
        return os.path.exists(vectors_path)

    def load_model(self):
        """
        Return the gensim model, reading it from ``vectors.bin`` on the first
        call and reusing the same object on later calls.
        """
        if self._model is not None:
            return self._model

        # Imported here so that gensim is only needed once a model is actually loaded
        from gensim.models.word2vec import Word2Vec
        vectors_path = os.path.join(self.data_dir, "vectors.bin")
        self._model = Word2Vec.load_word2vec_format(vectors_path, binary=True)
        return self._model

    @property
    def model(self):
        """The loaded model; equivalent to calling :meth:`load_model`."""
        return self.load_model()

    def get_software_dependencies(self):
        """
        Return the base class's dependencies followed by the gensim pip package.
        """
        inherited = super(Word2VecModel, self).get_software_dependencies()
        # Depend on Gensim, which can be installed using Pip
        return inherited + [PythonPackageOnPip("gensim")]


class Word2VecModelWriter(PimlicoDatatypeWriter):
    """
    Writer that saves a model to ``vectors.bin`` in the data directory, in
    binary word2vec format, when the context manager exits.

    Set ``word2vec_model`` to the model to be stored before leaving the
    ``with`` block; if it is still ``None`` on exit, no vectors file is written.

    """
    def __init__(self, base_dir, verb_only=False, **kwargs):
        super(Word2VecModelWriter, self).__init__(base_dir, **kwargs)
        # The model to save; expected to be assigned by the code using the writer
        self.word2vec_model = None
        # Recorded in the metadata under the key that Word2VecModel reads back
        self.metadata["verb_only"] = verb_only

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Let the base writer do its own exit handling before the vectors are written
        super(Word2VecModelWriter, self).__exit__(exc_type, exc_val, exc_tb)

        model = self.word2vec_model
        if model is None:
            return
        vectors_path = os.path.join(self.data_dir, "vectors.bin")
        model.save_word2vec_format(vectors_path, binary=True)