Source code for pimlico.datatypes.dictionary

# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html

"""
This module implements the concept of Dictionary -- a mapping between words and
their integer ids.

The implementation is based on Gensim, because Gensim is wonderful and there's no need to reinvent the wheel.
We don't use Gensim's data structure directly, because it's unnecessary to depend on the whole of Gensim just
for one data structure.

"""
import os
from collections import defaultdict
import itertools
from itertools import izip
import cPickle as pickle
from operator import itemgetter

from pimlico.datatypes.base import PimlicoDatatype, PimlicoDatatypeWriter


__all__ = ["Dictionary", "DictionaryWriter"]


class Dictionary(PimlicoDatatype):
    """
    Dictionary encapsulates the mapping between normalized words and their integer ids.

    """
    datatype_name = "dictionary"

    def __init__(self, base_dir, pipeline, **kwargs):
        super(Dictionary, self).__init__(base_dir, pipeline, **kwargs)

    def get_data(self):
        with open(os.path.join(self.data_dir, "dictionary"), "r") as f:
            return pickle.load(f)

    def data_ready(self):
        return super(Dictionary, self).data_ready() and os.path.exists(os.path.join(self.data_dir, "dictionary"))

    def get_detailed_status(self):
        data = self.get_data()
        sorted_ids = list(reversed(sorted(data.dfs.items(), key=itemgetter(1))))
        if len(sorted_ids) <= 8:
            term_list = u", ".join(u"'%s' (%d)" % (data.id2token[i], cnt) for (i, cnt) in sorted_ids)
        else:
            top_ids = sorted_ids[:4]
            bottom_ids = sorted_ids[-4:]
            term_list = u"%s, ..., %s" % (
                u", ".join(u"'%s' (%d)" % (data.id2token[i], cnt) for (i, cnt) in top_ids),
                u", ".join(u"'%s' (%d)" % (data.id2token[i], cnt) for (i, cnt) in bottom_ids)
            )
        return super(Dictionary, self).get_detailed_status() + [
            # Add a wee sample of the items in the dictionary
            "Terms: %s" % term_list.encode("utf8"),
            "Vocab size: %d" % len(data)
        ]
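

# Example (an illustrative sketch, not part of the original module): reading the
# stored data from a Dictionary instance and converting a new document to
# bag-of-words form without modifying the dictionary. `dict_datatype` stands in
# for a Dictionary obtained from a pipeline; only methods defined in this module
# are used.
#
#   if dict_datatype.data_ready():
#       data = dict_datatype.get_data()
#       bow = data.doc2bow(["some", "tokenized", "document"])  # read-only: allow_update=False
#       print "Vocab size: %d, doc as BOW: %s" % (len(data), bow)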


class DictionaryWriter(PimlicoDatatypeWriter):
    def __init__(self, base_dir):
        super(DictionaryWriter, self).__init__(base_dir)
        self.data = DictionaryData()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        super(DictionaryWriter, self).__exit__(exc_type, exc_val, exc_tb)
        with open(os.path.join(self.data_dir, "dictionary"), "w") as f:
            pickle.dump(self.data, f, -1)

    def add_documents(self, documents, prune_at=2000000):
        self.data.add_documents(documents, prune_at=prune_at)

    def filter(self, threshold=None, no_above=None, limit=None):
        if threshold is None:
            threshold = 0
        if no_above is None:
            no_above = 1.
        return self.data.filter_extremes(no_below=threshold, no_above=no_above, keep_n=limit)
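

# Example (an illustrative sketch, not part of the original module): building a
# dictionary from tokenized documents with DictionaryWriter. `output_dir` and
# the token lists are made up; in a real pipeline module the base dir is
# provided by Pimlico.
#
#   docs = [["the", "cat", "sat"], ["the", "dog", "ran"]]
#   with DictionaryWriter(output_dir) as writer:
#       writer.add_documents(docs)
#       # Drop terms in fewer than 2 documents or in more than 50% of documents
#       removed = writer.filter(threshold=2, no_above=0.5)
#   # On exit, the DictionaryData is pickled to a file called "dictionary" in
#   # the writer's data dir (see __exit__ above)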


class DictionaryData(object):
    """
    Dictionary encapsulates the mapping between normalized words and their integer ids.

    This is taken almost directly from Gensim.

    TODO: Provide a mapping to Gensim's actual Dictionary type for modules that use Gensim.

    """
    def __init__(self):
        self.token2id = {}    # token -> tokenId
        self._id2token = {}   # reverse mapping for token2id; only formed on request, to save memory
        self.dfs = {}         # document frequencies: tokenId -> in how many documents this token appeared

        self.num_docs = 0     # number of documents processed
        self.num_pos = 0      # total number of corpus positions
        self.num_nnz = 0      # total number of non-zeroes in the BOW matrix

    def __getitem__(self, tokenid):
        return self.id2token[tokenid]  # will throw for non-existent ids

    def __iter__(self):
        return iter(self.keys())

    @property
    def id2token(self):
        # Backwards compat with old pickled objects
        if not hasattr(self, "_id2token") or len(self._id2token) != len(self.token2id):
            self.refresh_id2token()
        return self._id2token

    def keys(self):
        """Return a list of all token ids."""
        return list(self.token2id.values())

    def __len__(self):
        """
        Return the number of token->id mappings in the dictionary.
        """
        return len(self.token2id)

    def __str__(self):
        some_keys = list(itertools.islice(self.token2id.iterkeys(), 5))
        return "Dictionary(%i unique tokens: %s%s)" % (len(self), some_keys, '...' if len(self) > 5 else '')

    def refresh_id2token(self):
        self._id2token = dict((id, token) for (token, id) in self.token2id.iteritems())

    def add_term(self, term):
        """
        Add a term to the dictionary, without any occurrence count. Note that if you run
        threshold-based filters after adding a term like this, it will get removed.

        """
        if term not in self.token2id:
            new_id = len(self.token2id)
            self.token2id[term] = new_id
            self.dfs.setdefault(new_id, 0)
            return new_id
        else:
            return self.token2id[term]

    def add_documents(self, documents, prune_at=2000000):
        """
        Update dictionary from a collection of documents. Each document is a list
        of tokens = **tokenized and normalized** strings (either utf8 or unicode).

        This is a convenience wrapper for calling `doc2bow` on each document
        with `allow_update=True`, which also prunes infrequent words, keeping the
        total number of unique words <= `prune_at`. This is to save memory on very
        large inputs. To disable this pruning, set `prune_at=None`.

        """
        for docno, document in enumerate(documents):
            # Run a regular check for pruning, once every 10k docs
            if docno % 10000 == 0 and prune_at is not None and len(self) > prune_at:
                self.filter_extremes(no_below=0, no_above=1.0, keep_n=prune_at)
            # Update dictionary with the document
            self.doc2bow(document, allow_update=True)  # ignore the result, here we only care about updating token ids

    def doc2bow(self, document, allow_update=False, return_missing=False):
        """
        Convert `document` (a list of words) into the bag-of-words format = list of
        `(token_id, token_count)` 2-tuples. Each word is assumed to be a **tokenized and normalized**
        string (either unicode or utf8-encoded). No further preprocessing is done on the words in
        `document`; apply tokenization, stemming etc. before calling this method.

        If `allow_update` is set, then also update dictionary in the process: create ids for new words.
        At the same time, update document frequencies -- for each word appearing in this document,
        increase its document frequency (`self.dfs`) by one.

        If `allow_update` is **not** set, this function is `const`, aka read-only.
""" if isinstance(document, basestring): raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string") # Construct (word, frequency) mapping. counter = defaultdict(int) for w in document: counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1 token2id = self.token2id if allow_update or return_missing: missing = dict((w, freq) for w, freq in counter.iteritems() if w not in token2id) if allow_update: for w in missing: # new id = number of ids made so far; # NOTE this assumes there are no gaps in the id sequence! token2id[w] = len(token2id) result = dict((token2id[w], freq) for w, freq in counter.iteritems() if w in token2id) if allow_update: self.num_docs += 1 self.num_pos += sum(counter.itervalues()) self.num_nnz += len(result) # increase document count for each unique token that appeared in the document dfs = self.dfs for tokenid in result.iterkeys(): dfs[tokenid] = dfs.get(tokenid, 0) + 1 # return tokenids, in ascending id order result = sorted(result.iteritems()) if return_missing: return result, missing else: return result def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): """ Filter out tokens that appear in 1. fewer than `no_below` documents (absolute number) or 2. more than `no_above` documents (fraction of total corpus size, *not* absolute number). 3. after (1) and (2), keep only the first `keep_n` most frequent tokens (or keep all if `None`). After the pruning, shrink resulting gaps in word ids. **Note**: Due to the gap shrinking, the same word may have a different word id before and after the call to this function! """ no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold # determine which tokens to keep good_ids = (v for v in self.token2id.itervalues() if no_below <= self.dfs.get(v, 0) <= no_above_abs) good_ids = sorted(good_ids, key=self.dfs.get, reverse=True) if keep_n is not None: good_ids = good_ids[:keep_n] # Keep a record of what items we remove, along with their counts removed = [(token, id, self.dfs[id]) for (token, id) in self.token2id.iteritems() if id not in good_ids] # do the actual filtering, then rebuild dictionary to remove gaps in ids self.filter_tokens(good_ids=good_ids) return removed def filter_tokens(self, bad_ids=None, good_ids=None): """ Remove the selected `bad_ids` tokens from all dictionary mappings, or, keep selected `good_ids` in the mapping and remove the rest. `bad_ids` and `good_ids` are collections of word ids to be removed. """ if bad_ids is not None: bad_ids = set(bad_ids) self.token2id = dict((token, tokenid) for token, tokenid in self.token2id.iteritems() if tokenid not in bad_ids) self.dfs = dict((tokenid, freq) for tokenid, freq in self.dfs.iteritems() if tokenid not in bad_ids) if good_ids is not None: good_ids = set(good_ids) self.token2id = dict((token, tokenid) for token, tokenid in self.token2id.iteritems() if tokenid in good_ids) self.dfs = dict((tokenid, freq) for tokenid, freq in self.dfs.iteritems() if tokenid in good_ids) self.compactify() def compactify(self): """ Assign new word ids to all words. This is done to make the ids more compact, e.g. after some tokens have been removed via :func:`filter_tokens` and there are gaps in the id series. Calling this method will remove the gaps. 
""" # build mapping from old id -> new id idmap = dict(izip(self.token2id.itervalues(), xrange(len(self.token2id)))) # reassign mappings to new ids self.token2id = dict((token, idmap[tokenid]) for token, tokenid in self.token2id.iteritems()) self._id2token = {} self.dfs = dict((idmap[tokenid], freq) for tokenid, freq in self.dfs.iteritems())