# Source code for pimlico.datatypes.gensim

# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

from __future__ import absolute_import, print_function

from builtins import object
import os

from pimlico.utils.core import cached_property

from pimlico.core.dependencies.python import gensim_dependency
from pimlico.datatypes import PimlicoDatatype


# Public names exported by a star-import of this module.
# NOTE(review): GensimLdaSeqModel is defined in this module but is not listed
# here -- confirm whether leaving it out of the public API is intentional.
__all__ = ["GensimLdaModel", "TopicsTopWords"]


class GensimLdaModel(PimlicoDatatype):
    """
    Storage of trained Gensim LDA models.

    Depends on Gensim (and thereby also on Python 3), since we use Gensim to
    store and load the models.

    """
    datatype_name = "lda_model"
    datatype_supports_python2 = False

    def get_software_dependencies(self):
        """
        Extend the dependencies reported by the superclass with Gensim.

        :return: the superclass's dependency list with ``gensim_dependency`` appended
        """
        return super(GensimLdaModel, self).get_software_dependencies() + [gensim_dependency]

    def run_browser(self, reader, opts):
        """
        Browse the LDA model simply by printing out all its topics.

        Each topic is shown with its 10 top words, as formatted by Gensim's
        ``show_topics()``.

        :param reader: reader for this datatype; its ``load_model()`` is called
        :param opts: not used by this browser
        """
        model = reader.load_model()
        print("Showing all {} trained LDA topics:".format(model.num_topics))
        for topic, topic_repr in model.show_topics(num_topics=-1, num_words=10):
            # Print the text itself. This datatype is Python 3 only, and there
            # printing the result of .encode("utf-8") would output a bytes
            # repr (b'#0: ...') instead of the readable topic description.
            print(u"#{}: {}".format(topic, topic_repr))

    class Reader(object):
        def load_model(self):
            """
            Load the model from the ``model`` file under ``self.data_dir``.

            :return: the model loaded by ``LdaModel.load()``
            """
            # Local import: Gensim is only imported once a model is actually loaded
            from gensim.models.ldamodel import LdaModel
            return LdaModel.load(os.path.join(self.data_dir, "model"))

    class Writer(object):
        required_tasks = ["model"]

        def write_model(self, model):
            """
            Save the model to the ``model`` file under ``self.data_dir`` and
            report the ``model`` task as complete.

            :param model: object providing ``save(path)``, e.g. a trained Gensim LDA model
            """
            model.save(os.path.join(self.data_dir, "model"))
            self.task_complete("model")
class GensimLdaSeqModel(PimlicoDatatype):
    """
    A trained LDA-seq model - i.e. Dynamic Topic Model (DTM).

    As well as the Gensim model, it also stores the list of slice labels, so
    that we can easily look up the appropriate time slice for a document
    paired with its slice name. These could be, for example, years or months.

    """
    datatype_name = "ldaseq_model"

    def get_software_dependencies(self):
        """
        Extend the dependencies reported by the superclass with Gensim.

        :return: the superclass's dependency list with ``gensim_dependency`` appended
        """
        return super(GensimLdaSeqModel, self).get_software_dependencies() + [gensim_dependency]

    def run_browser(self, reader, opts):
        """
        Browse the DTM model simply by printing out all its topics.

        For every time slice that has a stored label, prints the slice index
        together with its label, followed by the 6 top terms of each topic
        in that slice with their weights.

        :param reader: reader for this datatype; its ``load_model()`` and
            ``load_labels()`` are called
        :param opts: not used by this browser
        """
        model = reader.load_model()
        print("Showing all {} trained DTM topics:".format(model.num_topics))
        slice_labels = reader.load_labels()
        # zip() stops at the shorter of the two sequences, so only slices that
        # have both an index in the model and a stored label are shown
        for slice_num, label in zip(range(len(model.time_slice)), slice_labels):
            # Show the stored label too, so the slice is identifiable by name
            # and not just by its position
            print("Time slice {} ({})".format(slice_num, label))
            for topic, topic_repr in enumerate(model.print_topics(time=slice_num, top_terms=6)):
                weighted_words = ", ".join(
                    "{} ({:.3f})".format(word, prob) for (word, prob) in topic_repr
                )
                print(u"#{}: {}".format(topic, weighted_words))

    class Reader(object):
        def load_model(self):
            """
            Load the model from the ``model`` file under ``self.data_dir``.

            :return: the model loaded by ``LdaSeqModel.load()``
            """
            # Local import: Gensim is only imported once a model is actually loaded
            from gensim.models.ldaseqmodel import LdaSeqModel
            return LdaSeqModel.load(os.path.join(self.data_dir, "model"))

        def load_labels(self):
            """
            Read the slice labels from ``slice_labels.txt`` under ``self.data_dir``.

            :return: list of labels, one per line of the file
            """
            with open(os.path.join(self.data_dir, "slice_labels.txt"), "r") as f:
                return f.read().splitlines()

    class Writer(object):
        # Both the model and the slice labels are listed as required tasks
        required_tasks = ["model", "slice_labels"]

        def write_model(self, model):
            """
            Save the model to the ``model`` file under ``self.data_dir`` and
            report the ``model`` task as complete.

            :param model: object providing ``save(path)``, e.g. a trained LdaSeqModel
            """
            model.save(os.path.join(self.data_dir, "model"))
            self.task_complete("model")

        def write_labels(self, labels):
            """
            Write the slice labels, one per line, to ``slice_labels.txt`` under
            ``self.data_dir`` and report the ``slice_labels`` task as complete.

            :param labels: iterable of strings (they are joined with newlines)
            """
            with open(os.path.join(self.data_dir, "slice_labels.txt"), "w") as f:
                f.write("\n".join(labels))
            self.task_complete("slice_labels")
class TopicsTopWords(PimlicoDatatype):
    """
    Stores a list of the top words for each topic of a topic model. For some
    evaluations (like coherence), this is all the information that is needed
    about a model. This datatype can be extracted from various topic model
    types, so that they can all be evaluated using the same evaluation modules.

    The data is stored in ``topics.tsv``: one topic per line, with the
    topic's words separated by tabs.

    """
    datatype_name = "topics_top_words"

    class Reader:
        class Setup(object):
            def get_required_paths(self):
                """
                :return: list containing the single required path, ``topics.tsv``
                """
                return ["topics.tsv"]

        @cached_property
        def topics_words(self):
            """
            List of topics read from ``topics.tsv`` under ``self.data_dir``,
            where each topic is a list of words.
            """
            with open(os.path.join(self.data_dir, "topics.tsv"), "r") as f:
                data = f.read()
            if not data:
                # The writer produces an empty file for an empty topic list.
                # Splitting "" would instead yield one bogus topic: [[""]]
                return []
            # An empty line is a topic with no words: "".split("\t") would
            # give [""] (one empty word), so map it to an empty list instead
            return [
                line.split("\t") if line else [] for line in data.split("\n")
            ]

        def __getitem__(self, item):
            """Return the word list of the topic at index (or slice) ``item``."""
            return self.topics_words[item]

        def __len__(self):
            """Return the number of topics."""
            return len(self.topics_words)

        @property
        def num_topics(self):
            """Number of topics; same as ``len(reader)``."""
            return len(self)

        def __iter__(self):
            """Iterate over the topics' word lists, in file order."""
            return iter(self.topics_words)

    class Writer(object):
        required_tasks = ["topics.tsv"]

        def write_topics_words(self, topics_words):
            """
            Write the topics to ``topics.tsv`` under ``self.data_dir``, one
            tab-separated line per topic, and report the ``topics.tsv`` task
            as complete.

            :param topics_words: list of topic, where each topic is a list of words,
                with the top weighted word first
            """
            with open(os.path.join(self.data_dir, "topics.tsv"), "w") as f:
                f.write("\n".join("\t".join(words) for words in topics_words))
            self.task_complete("topics.tsv")

    def run_browser(self, reader, opts):
        """
        Print each topic's index followed by its comma-separated words.

        :param reader: reader for this datatype; it is iterated over to get the topics
        :param opts: not used by this browser
        """
        for topic_num, words in enumerate(reader):
            print("Topic {}: {}".format(topic_num, ", ".join(words)))