# This file is part of Pimlico
# Copyright (C) 2016 Mark Granroth-Wilding
# Licensed under the GNU GPL v3.0 - http://www.gnu.org/licenses/gpl-3.0.en.html
"""
Document map modules can in general be easily parallelized using multiprocessing. This module provides
implementations of a pool and base worker processes that use multiprocessing, making it dead easy to
implement a parallelized module, simply by defining what should be done on each document.
In particular, use :func:`.multiprocessing_executor_factory` wherever possible.
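
For example, a minimal sketch of the intended usage (the document type and the processing done here are
placeholders, not part of this module)::

    def process_document(worker, archive_name, doc_name, doc):
        # Called in a worker process for each document in the input corpus
        return doc.lower()

    ModuleExecutor = multiprocessing_executor_factory(process_document)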
"""
from __future__ import absolute_import
import multiprocessing
from Queue import Empty
from traceback import format_exc
from pimlico.core.modules.map import ProcessOutput, DocumentProcessorPool, DocumentMapProcessMixin, \
    DocumentMapModuleExecutor, WorkerStartupError, WorkerShutdownError
from pimlico.core.modules.map.threaded import ThreadingMapThread
from pimlico.utils.pipes import qget


class MultiprocessingMapProcess(multiprocessing.Process, DocumentMapProcessMixin):
    """
    A base implementation of document map parallelization using multiprocessing. Note that not all document
    map modules will want to use this: e.g. if you call a background service that provides parallelization
    itself (like the CoreNLP module) there's no need for multiprocessing in the Python code.
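
    For illustration, a direct subclass might look something like this (``load_model`` and the document
    handling are placeholders)::

        class MyMapProcess(MultiprocessingMapProcess):
            def set_up(self):
                # Runs in the worker process before any documents are handled
                self.model = load_model()

            def process_document(self, archive, filename, *docs):
                return self.model.process(docs[0])

    Such a worker type can be passed to :func:`.multiprocessing_executor_factory` in place of a
    document-processing function.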
"""
    def __init__(self, input_queue, output_queue, exception_queue, executor, docs_per_batch=1):
        multiprocessing.Process.__init__(self)
        DocumentMapProcessMixin.__init__(self, input_queue, output_queue, exception_queue,
                                         docs_per_batch=docs_per_batch)
        self.executor = executor
        self.info = executor.info
        self.daemon = True
        self.stopped = multiprocessing.Event()
        self.initialized = multiprocessing.Event()
        self.no_more_inputs = multiprocessing.Event()

        self.start()

    def run(self):
        try:
            # Run any startup routine that the subclass has defined
            self.set_up()
            # Notify waiting processes that we've finished initialization
            self.initialized.set()

            input_buffer = []
            try:
                while not self.stopped.is_set():
                    try:
                        # Timeout and go round the loop again to check whether we're supposed to have stopped
                        archive, filename, docs = qget(self.input_queue, timeout=0.05)
                    except Empty:
                        # Don't worry if the queue is empty: just keep waiting for more until we're shut down
                        pass
                    else:
                        # Buffer input documents, so that we can process multiple at once if requested
                        input_buffer.append(tuple([archive, filename] + docs))
                        if len(input_buffer) >= self.docs_per_batch or self.no_more_inputs.is_set():
                            results = self.process_documents(input_buffer)
                            for input_tuple, result in zip(input_buffer, results):
                                self.output_queue.put(ProcessOutput(input_tuple[0], input_tuple[1], result))
                            input_buffer = []
            finally:
                try:
                    self.tear_down()
                except Exception as e:
                    self.exception_queue.put(WorkerShutdownError("error in tear_down() call", cause=e), block=True)
        except Exception as e:
            # If there's any uncaught exception, make it available to the main process
            # Include the formatted stack trace, since we can't get this later from the exception outside this process
            e.traceback = format_exc()
            self.exception_queue.put(e, block=True)
        finally:
            # Even if there was an error, set initialized so that the main process can wait on it
            self.initialized.set()


class MultiprocessingMapPool(DocumentProcessorPool):
    """
    A base implementation of document map parallelization using multiprocessing.
    """
    PROCESS_TYPE = None
    # Can specify an alternative implementation of the process type when we only need a single process
    SINGLE_PROCESS_TYPE = None

    def __init__(self, executor, processes):
        super(MultiprocessingMapPool, self).__init__(processes)
        self.executor = executor
        self.workers = [self.start_worker() for i in range(processes)]
        # Wait until all of the workers have completed their initialization
        for worker in self.workers:
            worker.initialized.wait()
            # Check whether the worker had an error during initialization
            try:
                e = self.exception_queue.get_nowait()
            except Empty:
                # No error
                pass
            else:
                if hasattr(e, "traceback"):
                    debugging_info = e.traceback
                else:
                    debugging_info = None
                raise WorkerStartupError("error starting up worker process: %s" % e, cause=e,
                                         debugging_info=debugging_info)

    def start_worker(self):
        if self.processes == 1 and self.SINGLE_PROCESS_TYPE is not None:
            return self.SINGLE_PROCESS_TYPE(self.input_queue, self.output_queue, self.exception_queue, self.executor)
        else:
            return self.PROCESS_TYPE(self.input_queue, self.output_queue, self.exception_queue, self.executor)

    @staticmethod
    def create_queue(maxsize=None):
        return multiprocessing.Queue(maxsize)

    def shutdown(self):
        # Tell all the worker processes to stop
        for worker in self.workers:
            worker.stopped.set()
        # Now wait until all processes have shut down
        still_alive = []
        for worker in self.workers:
            worker.join(5.)
            if worker.is_alive():
                # Extreme case: 5 seconds without the worker closing down
                still_alive.append(worker)
        for worker in still_alive:
            # Output something so we know that it's the worker shutdown that's holding things up
            self.executor.log.warn("Worker process %s taking a long time to shut down: you may need to forcibly kill "
                                   "everything" % worker)
            worker.join()
        # Clear the output queue now, as something might have appeared there since we told the processes to shut down
        while not self.output_queue.empty():
            self.output_queue.get_nowait()
        # Also need to clear the exception queue
        errors = []
        while not self.exception_queue.empty():
            error = self.exception_queue.get_nowait()
            if error is not None:
                errors.append(error)
        if errors:
            # Hopefully, this shouldn't happen, as errors should already have been handled further up
            error = errors[0]
            if hasattr(error, "debugging_info"):
                # We've already attached debugging info at some lower level: just use it
                debugging = error.debugging_info
            elif hasattr(error, "traceback"):
                debugging = error.traceback
            else:
                debugging = None
            if len(errors) > 1:
                extra_mess = " (%d further unhandled errors received from worker processes)" % (len(errors)-1)
            else:
                extra_mess = ""
            self.executor.log.error("error in worker process received while shutting down pool: %s%s" %
                                    (error, extra_mess), cause=error, debugging_info=debugging)


class MultiprocessingMapModuleExecutor(DocumentMapModuleExecutor):
    POOL_TYPE = None

    def create_pool(self, processes):
        return self.POOL_TYPE(self, processes)

    def postprocess(self, error=False):
        self.pool.shutdown()


def multiprocessing_executor_factory(process_document_fn, preprocess_fn=None, postprocess_fn=None,
                                     worker_set_up_fn=None, worker_tear_down_fn=None, batch_docs=None,
                                     multiprocessing_single_process=False):
    """
    Factory function for creating an executor that uses the multiprocessing-based implementations of document-map
    pools and worker processes. This is an easy way to implement a parallelizable executor, which is suitable for
    a large number of module types.

    process_document_fn should be a function that takes the following arguments (unless `batch_docs` is given),
    as in the sketch below:

    - the worker process instance (allowing access to things set during setup)
    - archive name
    - document name
    - the rest of the args are the document itself, from each of the input corpora
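
    For example, with two input corpora, a conforming function might look like this (the processing itself is a
    placeholder)::

        def process_document(worker, archive_name, doc_name, doc_a, doc_b):
            # combine_somehow stands in for whatever per-document processing the module does
            return combine_somehow(doc_a, doc_b)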

    If preprocess_fn is given, it is called from the main process once before execution begins, with the executor
    as an argument.

    If postprocess_fn is given, it is called from the main process at the end of execution, including on the way
    out after an error, with the executor as an argument and a kwarg *error* which is True if execution failed.

    If worker_set_up_fn is given, it is called within each worker before execution begins, with the worker process
    instance as an argument. Likewise, worker_tear_down_fn is called from within the worker process before it exits.
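
    For example, a sketch of wiring in per-worker set-up and tear-down (``load_tagger`` is a placeholder for
    whatever resource the workers need)::

        def set_up(worker):
            # Runs once in each worker process, before it receives any documents
            worker.tagger = load_tagger()

        def tear_down(worker):
            worker.tagger = None

        def process_document(worker, archive_name, doc_name, doc):
            return worker.tagger.tag(doc)

        ModuleExecutor = multiprocessing_executor_factory(
            process_document, worker_set_up_fn=set_up, worker_tear_down_fn=tear_down
        )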

    Alternatively, you can supply a worker type, a subclass of :class:`.MultiprocessingMapProcess`, as the first
    argument. If you do this, worker_set_up_fn and worker_tear_down_fn will be ignored.

    If `batch_docs` is not None, `process_document_fn` is treated differently. Instead of supplying the
    `process_document()` of the worker, it supplies a `process_documents()`. The second argument is a list of tuples,
    each of which is assumed to be the args to `process_document()` for a single document. In this case,
    `docs_per_batch` is set on the worker processes, so that the given number of docs are collected from the input
    and passed into `process_documents()` at once.
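
    A sketch of the batched form (the per-document processing is again a placeholder)::

        def process_documents(worker, input_tuples):
            # Each tuple is (archive_name, doc_name, doc, ...); return one result per tuple, in the same order
            return [process_somehow(tup[2]) for tup in input_tuples]

        ModuleExecutor = multiprocessing_executor_factory(process_documents, batch_docs=10)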

    By default, if only a single process is needed, the threaded implementation of a map process is used instead
    of multiprocessing. If for some reason this doesn't work in your case, specify
    `multiprocessing_single_process=True` and a multiprocessing process will be used even when only one is created.
    """
    if isinstance(process_document_fn, type):
        if not issubclass(process_document_fn, MultiprocessingMapProcess):
            raise TypeError("called multiprocessing_executor_factory with a worker type that's not a subclass of "
                            "MultiprocessingMapProcess: got %s" % process_document_fn.__name__)
        worker_type = process_document_fn
    else:
        # Define a worker process type
        class FactoryMadeMapProcess(MultiprocessingMapProcess):
            def __init__(self, input_queue, output_queue, exception_queue, executor):
                super(FactoryMadeMapProcess, self).__init__(input_queue, output_queue, exception_queue, executor,
                                                            docs_per_batch=batch_docs or 1)

            def set_up(self):
                if worker_set_up_fn is not None:
                    worker_set_up_fn(self)

            def tear_down(self):
                if worker_tear_down_fn is not None:
                    worker_tear_down_fn(self)

        if batch_docs is not None:
            FactoryMadeMapProcess.process_documents = process_document_fn
        else:
            FactoryMadeMapProcess.process_document = process_document_fn
        worker_type = FactoryMadeMapProcess

    if multiprocessing_single_process:
        # Don't define a special single-process case
        single_worker_type = None
    else:
        # Also define a different worker thread type for use when we only need a single process
        class FactoryMadeMapSingleProcess(ThreadingMapThread):
            def process_document(self, archive, filename, *docs):
                return process_document_fn(self, archive, filename, *docs)

            def set_up(self):
                if worker_set_up_fn is not None:
                    worker_set_up_fn(self)

            def tear_down(self):
                if worker_tear_down_fn is not None:
                    worker_tear_down_fn(self)

        if batch_docs is not None:
            FactoryMadeMapSingleProcess.process_documents = process_document_fn
        else:
            FactoryMadeMapSingleProcess.process_document = process_document_fn
        single_worker_type = FactoryMadeMapSingleProcess

    # Define a pool type to use this worker process type
    class FactoryMadeMapPool(MultiprocessingMapPool):
        PROCESS_TYPE = worker_type
        SINGLE_PROCESS_TYPE = single_worker_type

    # Finally, define an executor type (subclass of DocumentMapModuleExecutor) that creates a pool of the right sort
    class ModuleExecutor(MultiprocessingMapModuleExecutor):
        POOL_TYPE = FactoryMadeMapPool

        def preprocess(self):
            super(ModuleExecutor, self).preprocess()
            if preprocess_fn is not None:
                preprocess_fn(self)

        def postprocess(self, error=False):
            super(ModuleExecutor, self).postprocess(error=error)
            if postprocess_fn is not None:
                postprocess_fn(self, error=error)

    return ModuleExecutor