# Source code for pimlico.utils.probability

from itertools import islice
import random
import itertools
from pimlico.utils.core import infinite_cycle


def limited_shuffle(iterable, buffer_size):
    """
    Some algorithms require the order of data to be randomized. An obvious solution is to put it all in
    a list and shuffle, but if you don't want to load it all into memory that's not an option. This method
    iterates over the data, keeping a buffer and choosing at random from the buffer what to put next.
    It's less shuffled than the simpler solution, but limits the amount of memory used at any one time
    to the buffer size.

    Every item of the input is yielded exactly once. Once the input is exhausted, whatever is still held
    in the buffer is yielded in a random order.

    :param iterable: any iterable (list, generator, iterator, ...); it is consumed lazily
    :param buffer_size: number of items to hold in the buffer. If this is 0 or negative, no buffering is
        possible and the items are yielded in their original order
    :return: generator over the items of `iterable`

    """
    # Accept any iterable, not just objects with a Python 2-style next() method
    source = iter(iterable)

    if buffer_size <= 0:
        # Nothing can be buffered, so there is nothing to choose between: pass items straight through
        for item in source:
            yield item
        return

    # Pre-fill the buffer. islice just stops early if the input has fewer items than the buffer holds
    buffer = list(islice(source, buffer_size))

    for next_val in source:
        # Pick a random item from the buffer to remove
        index = random.randint(0, len(buffer) - 1)
        yield buffer.pop(index)
        # Add the new value to the buffer to replace the old one
        buffer.append(next_val)

    # No more to take in, just return the rest in a random order. This must happen however the input
    # ran out (during the pre-fill or during the main loop), otherwise the buffered items are lost
    random.shuffle(buffer)
    for item in buffer:
        yield item


def sequential_document_sample(corpus, start=None, shuffle=None, sample_rate=None):
    """
    Wrapper around a :class:`pimlico.datatypes.tar.TarredCorpus` to draw infinite samples of documents
    from the corpus, by iterating over the corpus (looping infinitely), yielding documents at random.
    If `sample_rate` is given, it should be a float between 0 and 1, specifying the rough proportion of
    documents to sample. A lower value spreads out the documents more on average. It is passed to
    the corpus' `archive_iter()` as `subsample`.

    Optionally, the samples are shuffled within a limited scope. Set `shuffle` to the size of this scope (higher
    will shuffle more, but need to buffer more samples in memory).
    Otherwise (`shuffle=None` or `shuffle=0`), they will appear in the order they were in the original corpus.

    If `start` is given, that number of documents will be skipped before drawing any samples. Set `start=0` to
    start at the beginning of the corpus. By default (`start=None`) a random point in the corpus will be skipped
    to before beginning. Choosing the random point uses `len(corpus)`, so an empty corpus results in a
    `ValueError` from `random.randint`.

    :return: iterator over whatever `corpus.archive_iter()` yields

    """
    if start is None:
        # Choose a random point in the dataset to start at
        start = random.randint(0, len(corpus)-1)
    # Start by reading the corpus from the start point onwards, then cycle forever
    # NOTE(review): infinite_cycle() is given a single archive_iter() result. If that is a one-shot
    #  iterator and infinite_cycle() simply re-iterates its argument, only the first full pass will
    #  yield anything -- confirm against the implementations of archive_iter() and infinite_cycle()
    doc_iter = itertools.chain(
        # Jump into the corpus the first time round
        corpus.archive_iter(skip=start, subsample=sample_rate),
        # Then loop over the corpus infinitely
        infinite_cycle(corpus.archive_iter(subsample=sample_rate))
    )
    if shuffle is not None and shuffle > 0:
        # Shuffle the data points (a bit) as we go
        # A scope of 0 is documented as "no shuffling", so it must not reach the shuffler at all
        doc_iter = iter(limited_shuffle(doc_iter, shuffle))
    return doc_iter


def sequential_sample(iterable, start=0, shuffle=None, sample_rate=None):
    """
    Draw infinite samples from an iterable, by iterating over it (looping infinitely), yielding items at random.
    If `sample_rate` is given, it should be a float between 0 and 1, specifying the rough proportion of
    documents to sample. A lower value spreads out the documents more on average.

    Optionally, the samples are shuffled within a limited scope. Set `shuffle` to the size of this scope (higher
    will shuffle more, but need to buffer more samples in memory).
    Otherwise (`shuffle=None` or `shuffle=0`), they will appear in the order they were in the original corpus.

    If `start` is given, that number of documents will be skipped before drawing any samples. Set `start=0` to
    start at the beginning of the corpus. Note that setting this to a high number can result in a slow start-up,
    if iterating over the items is slow. The skipping is applied after subsampling, so `start` counts
    items that survived the subsampling.

    .. note::

       If you're sampling documents from a `TarredCorpus`, it's better to use :func:`sequential_document_sample`,
       since it makes use of `TarredCorpus`'s built-in features to do the skipping and sampling more efficiently.

    :return: iterator over items of `iterable`

    """
    # Cycle forever
    doc_iter = infinite_cycle(iterable)
    if sample_rate is not None and sample_rate != 1.:
        # Subsample to space out the items that get included
        doc_iter = subsample(doc_iter, sample_rate)
    if start > 0:
        # Skip some items at the start
        doc_iter = islice(doc_iter, start, None)
    if shuffle is not None and shuffle > 0:
        # Shuffle the data points (a bit) as we go
        # A scope of 0 is documented as "no shuffling", so it must not reach the shuffler at all
        doc_iter = iter(limited_shuffle(doc_iter, shuffle))
    return doc_iter


def subsample(iterable, sample_rate):
    """
    Subsample the given iterable at a given rate, between 0 and 1.

    Each item is kept or dropped independently, so the proportion of items kept is only
    approximately `sample_rate`. The items that are kept stay in their original order.
    A rate of 0 yields nothing and a rate of 1 (or higher) yields every item.

    :param iterable: items to sample from; consumed lazily
    :param sample_rate: probability of keeping each individual item
    :return: generator over the items that were kept

    """
    for item in iterable:
        # random.random() is drawn from [0, 1), so the comparison has to be strict: with "<=" a draw
        # of exactly 0.0 would let an item through even when sample_rate is 0
        if random.random() < sample_rate:
            yield item