# Source code for pimlico.cli.jupyter

# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

"""
A command to start a Jupyter notebook for a given pipeline, providing access
to its modules and their outputs.

"""
import json
import os
import sys

from pimlico.core.dependencies.licenses import BSD

from pimlico import PIMLICO_ROOT
from pimlico.cli.subcommands import PimlicoCLISubcommand
from pimlico.core.dependencies.python import PythonPackageOnPip


class JupyterCmd(PimlicoCLISubcommand):
    """
    Creates and runs a Jupyter notebook for the loaded pipeline. The pipeline is
    made easily available within the notebook, providing a way to load the modules
    and get their outputs.

    This is a useful way to explore the data or analyses coming out of your modules.
    Once a module has been run, you can load it from a notebook and manipulate,
    explore, visualize, etc. the results.

    A new directory is automatically created in your project root to contain
    the pipeline's notebooks. (You can override the location of this using
    ``--notebook-dir``). An example notebook is created there, to show you how
    to load the pipeline.

    From within a notebook, load a pipeline like so:

    .. code-block:: py

       from pimlico import get_jupyter_pipeline
       pipeline = get_jupyter_pipeline()

    Now you can access the modules of the pipeline through this pipeline object:

    .. code-block:: py

       mod = pipeline["my_module"]

    And get data from its outputs (provided the module's been run):

    .. code-block:: py

       print(mod.status)
       output = mod.get_output("output_name")

    """
    command_name = "jupyter"
    command_help = "Create and start a new Jupyter notebook for the pipeline"

    def add_arguments(self, parser):
        """
        Add the ``--notebook-dir`` option to the subcommand's argument parser.

        """
        parser.add_argument("--notebook-dir", action="store",
                            help="Use a custom directory as the notebook directory. By default, a directory will be "
                                 "created according to: <pimlico_root>/../notebooks/<pipeline_name>/")

    def run_command(self, pipeline, opts):
        """
        Start a Jupyter notebook server for the given pipeline.

        Installs Jupyter first if the dependency is not available, chooses the
        notebook directory (``opts.notebook_dir`` if given, otherwise
        ``<pimlico_root>/../notebooks/<pipeline_name>``) and, if that directory
        does not exist yet, creates it and writes an ``example.ipynb`` into it.
        The path of the pipeline's config file is exported in the
        ``JUPYTER_PIPELINE`` environment variable before Jupyter is started.

        This method does not return: it replaces ``sys.argv`` and exits the
        process with the return value of Jupyter's ``main()``.

        """
        if not jupyter_dependency.available(pipeline.local_config):
            print("Jupyter not currently installed in local environment: installing")
            jupyter_dependency.install(pipeline.local_config)
        # Now Jupyter should be installed, so we can import the main function for running notebooks.
        # The import is deferred until here because the package may only just have been installed.
        # NOTE(review): newer major versions of the notebook package appear to have moved this
        #  entry point (notebook.app) -- confirm against the version that gets installed
        from notebook.notebookapp import main
        print("Jupyter installed and successfully imported")

        # Set up a directory that will be used as the notebook directory for this pipeline
        if opts.notebook_dir is not None:
            notebook_dir = opts.notebook_dir
            print("Using custom notebook directory: {}".format(notebook_dir))
        else:
            notebook_dir = os.path.abspath(os.path.join(PIMLICO_ROOT, "..", "notebooks", pipeline.name))
            print("Using notebook directory for pipeline {}: {}".format(pipeline.name, notebook_dir))

        # Create the directory tree if necessary. The example notebook is only written
        # when the directory is newly created, so an existing directory is left untouched
        if not os.path.exists(notebook_dir):
            print("Creating notebook dir {}".format(notebook_dir))
            os.makedirs(notebook_dir)
            # Create an example source file that loads the pipeline
            if len(pipeline.modules) == 0:
                # Can't give an example module name, as there aren't any modules
                example_module_name = "module_name"
                example_output = "output_name"
            else:
                # Use the last module listed in the pipeline as the example
                example_module_name = pipeline.modules[-1]
                example_mod = pipeline[example_module_name]
                if len(example_mod.available_outputs) == 0:
                    # Can't give example output name, as the module doesn't have any outputs
                    example_output = "output_name"
                else:
                    example_output = example_mod.available_outputs[0]
            example_code = EXAMPLE_CODE.format(
                example_module_name=example_module_name,
                example_output_name=example_output
            )
            print("Adding example notebook")
            with open(os.path.join(notebook_dir, "example.ipynb"), "w") as f:
                f.write(make_notebook(example_code))

        # Make the currently loaded pipeline available from within Jupyter notebooks via an environment var
        os.environ["JUPYTER_PIPELINE"] = os.path.abspath(pipeline.filename)

        print("Running Jupyter...")
        print("------------------")
        # Fill in the pipeline name: previously the '{}' placeholder was printed literally
        print("From within a notebook, you can access the loaded '{}' pipeline by:".format(pipeline.name))
        print("  from pimlico import get_jupyter_pipeline")
        print("  pipeline = get_jupyter_pipeline()")
        print()
        # Jupyter's main() reads its options from sys.argv, so replace our own arguments with its
        sys.argv = [sys.argv[0], "--notebook-dir", notebook_dir]
        sys.exit(main())


# Pip-installable dependency on Jupyter: checked, and installed if missing,
# when the jupyter command is run
jupyter_dependency = PythonPackageOnPip(
    "jupyter",
    homepage_url="https://jupyter.org/",
    license=BSD,
)


# Template for the code of the example notebook that is written to a newly
# created notebook directory. It is filled in using str.format(), with the
# fields ``example_module_name`` and ``example_output_name``.
# make_notebook() splits the code on blank lines, so each paragraph below ends
# up in a notebook cell of its own.
EXAMPLE_CODE = """\
# This is an example of how to load your pipeline from a notebook
from pimlico import get_jupyter_pipeline

pipeline = get_jupyter_pipeline()

# Now you can access the modules of the pipeline through this pipeline object
mod = pipeline["{example_module_name}"]

# And get data from its outputs (provided the module's been run)
print(mod.status)

output = mod.get_output("{example_output_name}")
"""


def make_notebook(code_text):
    """
    Build the JSON text of a minimal Jupyter notebook containing the given code.

    The code is divided up wherever a blank line occurs (two consecutive
    newline characters) and each piece is put in a code cell of its own, with
    no outputs and no execution count.

    :param code_text: source code to put in the notebook
    :return: the notebook (nbformat 4, minor version 2) as a JSON string,
        indented by 4 spaces
    """
    cells = [
        {
            "cell_type": "code",
            "execution_count": None,
            "metadata": {},
            "outputs": [],
            # The whole chunk is stored as a single string in the source list
            "source": [chunk],
        }
        for chunk in code_text.split("\n\n")
    ]
    notebook = {
        "cells": cells,
        "metadata": {},
        "nbformat": 4,
        "nbformat_minor": 2,
    }
    return json.dumps(notebook, indent=4)