Source code for pimlico.core.modules.map.singleproc

# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html

"""
Sometimes the simple multiprocessing-based approach to map module parallelization just isn't suitable.
This module provides an equivalent set of implementations and convenience functions that don't use
multiprocessing, but conform to the pool-based execution pattern by creating a single-thread pool.

"""
import threading
from Queue import Queue, Empty
from traceback import format_exc

from pimlico.core.modules.map import DocumentMapProcessMixin, ProcessOutput, DocumentProcessorPool, \
    DocumentMapModuleExecutor, WorkerStartupError
from pimlico.datatypes.base import InvalidDocument
from pimlico.utils.pipes import qget


class SingleThreadMapWorker(threading.Thread, DocumentMapProcessMixin):
    def __init__(self, input_queue, output_queue, exception_queue, executor):
        threading.Thread.__init__(self)
        DocumentMapProcessMixin.__init__(self, input_queue, output_queue, exception_queue)
        self.executor = executor
        self.info = executor.info
        self.daemon = True
        self.stopped = threading.Event()
        self.initialized = threading.Event()
        self.no_more_inputs = threading.Event()

        self.start()
    def notify_no_more_inputs(self):
        self.no_more_inputs.set()
    def run(self):
        try:
            # Run any startup routine that the subclass has defined
            self.set_up()
            # Notify waiting processes that we've finished initialization
            self.initialized.set()
            input_buffer = []
            try:
                while not self.stopped.is_set():
                    try:
                        # Timeout and go round the loop again to check whether we're supposed to have stopped
                        archive, filename, docs = qget(self.input_queue, timeout=0.05)
                    except Empty:
                        # Don't worry if the queue is empty: just keep waiting for more until we're shut down
                        pass
                    else:
                        input_buffer.append(tuple([archive, filename] + docs))
                        if len(input_buffer) >= self.docs_per_batch or self.no_more_inputs.is_set():
                            results = self.process_documents(input_buffer)
                            for input_tuple, result in zip(input_buffer, results):
                                self.output_queue.put(ProcessOutput(input_tuple[0], input_tuple[1], result))
                            input_buffer = []
            finally:
                self.tear_down()
        except Exception as e:
            # If there's any uncaught exception, make it available to the main process
            self.exception_queue.put_nowait(e)
        finally:
            # Even if there was an error, set initialized so that the main process can wait on it
            self.initialized.set()
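# A minimal sketch, not part of the original module, of what a concrete worker might
# look like. It assumes, as single_process_executor_factory below does, that
# DocumentMapProcessMixin's default process_documents() dispatches each buffered input
# tuple to process_document(). The class and its behaviour are purely illustrative:
#
#     class UppercaseMapWorker(SingleThreadMapWorker):
#         def set_up(self):
#             # Called at the start of run(), before any documents arrive
#             self.docs_seen = 0
#
#         def process_document(self, archive, filename, doc):
#             self.docs_seen += 1
#             return doc.upper()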
class SingleThreadMapPool(DocumentProcessorPool):
    """
    A base implementation of document map parallelization using a single thread.

    """
    THREAD_TYPE = None

    def __init__(self, executor):
        super(SingleThreadMapPool, self).__init__(1)
        self.executor = executor

        self.worker = self.start_worker()
        # Wait until the worker has completed its initialization
        self.worker.initialized.wait()
        # Check whether the worker had an error during initialization
        try:
            e = self.worker.exception_queue.get_nowait()
        except Empty:
            # No error
            pass
        else:
            raise WorkerStartupError("error in worker thread: %s" % e, cause=e)
    def start_worker(self):
        return self.THREAD_TYPE(self.input_queue, self.output_queue, self.exception_queue, self.executor)
    @staticmethod
    def create_queue(maxsize=None):
        return Queue(maxsize)
    def shutdown(self):
        # Tell the thread to stop
        self.worker.stopped.set()
        # Wait until it's stopped
        while self.worker.is_alive():
            # Need to clear the output queue, or else the join hangs
            while not self.output_queue.empty():
                self.output_queue.get_nowait()
            self.worker.join(0.1)
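# To use the pool, a subclass (again a hypothetical sketch) only needs to plug a worker
# class into THREAD_TYPE; start_worker() above instantiates it with the shared queues:
#
#     class UppercaseMapPool(SingleThreadMapPool):
#         THREAD_TYPE = UppercaseMapWorker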
class MultiprocessingMapModuleExecutor(DocumentMapModuleExecutor):
    POOL_TYPE = None

    def create_pool(self, processes):
        # The pool is always single-threaded, so the requested number of processes is ignored
        return self.POOL_TYPE(self)

    def postprocess(self, error=False):
        self.pool.shutdown()
def single_process_executor_factory(process_document_fn, preprocess_fn=None, postprocess_fn=None):
    """
    Factory function for creating an executor that uses the single-process implementations of
    document-map pools and workers. This is an easy way to implement a non-parallelized executor.

    process_document_fn should be a function that takes the following arguments:

    - the executor instance (allowing access to things set during setup)
    - archive name
    - document name
    - the rest of the args are the document itself, from each of the input corpora

    If preprocess_fn is given, it is called once before execution begins, with the executor
    as an argument.

    If postprocess_fn is given, it is called at the end of execution, including on the way out
    after an error, with the executor as an argument and a kwarg *error* which is True if
    execution failed.

    """
    # Define a worker thread type
    class FactoryMadeMapThread(SingleThreadMapWorker):
        def process_document(self, archive, filename, *docs):
            return process_document_fn(self.executor, archive, filename, *docs)

    # Define a pool type to use this worker thread type
    class FactoryMadeMapPool(SingleThreadMapPool):
        THREAD_TYPE = FactoryMadeMapThread

    # Finally, define an executor type (subclass of DocumentMapModuleExecutor) that creates a pool of the right sort
    class ModuleExecutor(MultiprocessingMapModuleExecutor):
        POOL_TYPE = FactoryMadeMapPool

        def preprocess(self):
            super(ModuleExecutor, self).preprocess()
            if preprocess_fn is not None:
                preprocess_fn(self)

        def postprocess(self, error=False):
            super(ModuleExecutor, self).postprocess(error=error)
            if postprocess_fn is not None:
                postprocess_fn(self, error=error)

    return ModuleExecutor
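# Illustrative usage sketch, under the assumption that a module exports its executor in
# the usual Pimlico fashion. The function below is hypothetical; it receives the executor
# instance, the archive name, the document name and one further argument per input
# corpus, as described in the docstring above:
#
#     def process_document(executor, archive, filename, doc):
#         # E.g. whitespace-tokenize the document and count its tokens
#         return len(doc.split())
#
#     ModuleExecutor = single_process_executor_factory(process_document)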