Source code for pimlico.utils.docs.modulegen

# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

"""
Tool to generate Pimlico module docs. Based on Sphinx's apidoc tool.

It is assumed that this script will be run using Python 3. Although it has
a basic Python 2 compatibility, it's not really intended for Python 2 use.
Modules that are marked as still awaiting update to the new datatypes system
will now not be imported at all, since they are typically not Python 3
compatible (due to their use of ``old_datatypes``, which has not been
updated to Python 3).

"""
from __future__ import print_function

import argparse
import codecs
import inspect
import os
import sys
import warnings
from builtins import str
from collections import OrderedDict
from importlib import import_module
from pkgutil import iter_modules

from sphinx import __version__
from .rest import format_heading

from pimlico import install_core_dependencies
from pimlico.core.modules.base import BaseModuleInfo
from pimlico.core.modules.options import format_option_type, str_to_bool, \
    json_string, json_dict
from pimlico.datatypes import PimlicoDatatype, MultipleInputs, DynamicOutputDatatype, DynamicInputDatatypeRequirement, \
    IterableCorpus
from pimlico.datatypes.corpora import DataPointType
from pimlico.utils.docs import trim_docstring
from pimlico.utils.docs.rest import make_table


provide_further_outputs_base_doc = BaseModuleInfo.provide_further_outputs.__doc__
build_output_groups_base_doc = BaseModuleInfo.build_output_groups.__doc__


[docs]def generate_docs_for_pymod(module, output_dir, test_refs={}, example_refs={}): """ Generate RST docs for Pimlico modules on a given Python path and output to a directory. """ module_name = module.__name__ # Look at all this module's submodules submodules = dict((modname, (importer, is_package)) for (importer, modname, is_package) in iter_modules(module.__path__, prefix="%s." % module_name)) # If not a Pimlico module, recurse into subpackages to find modules # Even if it is a Pimlico module, there could be other modules in the subpackages all_generated = [] all_pimlico_modules = [] all_children = [] for modname, (importer, is_package) in sorted(submodules.items()): if is_package: # Import the module (package) so we can recurse on it submod = importer.find_module(modname).load_module(modname) is_pim_submod, sub_pim_mods = generate_docs_for_pymod(submod, output_dir, test_refs=test_refs, example_refs=example_refs) all_generated.append(submod.__name__) all_pimlico_modules.extend(sub_pim_mods) if is_pim_submod: all_children.append(submod.__name__) is_pimlico_module = False if "%s.info" % module_name in submodules and not submodules["%s.info" % module_name][1]: # This looks like a Pimlico module # Try building module docs for this one # If there were submodules, they should be included in the module doc in a TOC info = generate_docs_for_pimlico_mod(module_name, output_dir, all_generated, test_refs=test_refs, example_refs=example_refs) if info is not None: is_pimlico_module = True all_pimlico_modules.append(module_name) elif all_generated: # This was just a package, not a Pimlico module, but it included Pimlico modules # Generate a contents page for the submodules # If the submodule has a docstring, it goes onto the index page if module.__doc__ is not None and module.__doc__.strip("\n "): # By convention, the first line is used as a title module_title, __, module_doc = module.__doc__.lstrip("\n ").partition("\n") else: module_title = "Package %s" % module_name module_doc = "" # Generate an index for this submodule generate_contents_page(all_generated, output_dir, module_name, module_title, module_doc) # If no Pimlico modules were found anywhere in this package, don't generate anything return is_pimlico_module, all_pimlico_modules
[docs]def generate_docs_for_pimlico_mod(module_path, output_dir, submodules=[], test_refs={}, example_refs={}): print("Building docs for %s" % module_path) filename = os.path.join(output_dir, "%s.rst" % module_path) # First import the python module pymod = import_module(module_path) # Check whether we've been instructed to skip this module if hasattr(pymod, "SKIP_MODULE_DOCS") and pymod.SKIP_MODULE_DOCS: return # If a module is marked as awaiting update to the new datatypes, don't # even try importing it, as this usually won't work in Python 3 if hasattr(pymod, "AWAITING_UPDATE") and pymod.AWAITING_UPDATE: warnings.warn("Module {} is still waiting to be updated to the new datatypes system".format(module_path)) with codecs.open(filename, "w", "utf8") as output_file: module_title = module_path.rpartition(".")[2] module_title = "!! {}".format(module_title) # Make a page heading output_file.write(format_heading(0, module_title)) # Add a directive to mark this as the documentation for the py module that defines the Pimlico module output_file.write(".. py:module:: %s\n\n" % module_path) output_file.write(".. note::\n\n This module has not yet been updated to the new " "datatype system, so cannot be used yet. Soon it will be updated.\n\n") return # Import the info pymodule so we can get the ModuleInfo class info = import_module("%s.info" % module_path) # We know this exists try: ModuleInfo = info.ModuleInfo except AttributeError: # If there's no ModuleInfo, it's not a valid Pimlico module # Warn, since it looks like it's supposed to be one warnings.warn("Module %s has no ModuleInfo in its info.py" % module_path) return # Collect key information from the module info key_info = [ ["Path", module_path], ["Executable", "yes" if ModuleInfo.module_executable else "no"], ] + ModuleInfo.get_key_info_table() # Try using the module's readable name as the document title module_title = ModuleInfo.module_readable_name if module_title is None or module_title == "": # No readable name given: make one out of the internal name module_title = ModuleInfo.module_type_name module_title = module_title[0].capitalize() + module_title[1:] module_title = module_title.replace("_", " ") input_table = [ [input_name, input_datatype_list(input_types, context=module_path)] for input_name, input_types in ModuleInfo.module_inputs ] output_table = [ [output_name, output_datatype_text(output_types, context=module_path)] for output_name, output_types in ModuleInfo.module_outputs ] optional_output_table = [ [output_name, output_datatype_text(output_types, context=module_path)] for output_name, output_types in ModuleInfo.module_optional_outputs ] further_outputs_doc = ModuleInfo.provide_further_outputs.__doc__ if further_outputs_doc is None or further_outputs_doc == provide_further_outputs_base_doc: # Docstring hasn't been overridden: don't use this further_outputs_doc = "" else: further_outputs_doc = inspect.cleandoc(further_outputs_doc) # Output groups are simply defined as string output_groups_table = [ [group_name, ", ".join(output_names)] for (group_name, output_names) in ModuleInfo.module_output_groups ] build_output_groups_doc = ModuleInfo.build_output_groups.__doc__ if build_output_groups_doc is None or build_output_groups_doc == build_output_groups_base_doc: build_output_groups_doc = "" else: build_output_groups_doc = inspect.cleandoc(build_output_groups_doc) info_doc = info.__doc__ if info_doc is not None: info_doc = inspect.cleandoc(info_doc) module_info_doc = ModuleInfo.__doc__ if module_info_doc is not None: module_info_doc = inspect.cleandoc(module_info_doc) additional_paras = [] if ModuleInfo.is_input(): additional_paras.append("This is an input module. It takes no pipeline inputs and is used to read in data") if ModuleInfo.is_filter(): additional_paras.append( "This is a filter module. It is not executable, so won't appear in a pipeline's list of modules that can " "be run. It produces its output for the next module on the fly when the next module needs it." ) # Check whether this module works in Python 2 if not ModuleInfo.supports_python2(): # It doesn't: include a warning in the docs additional_paras.append("*This module does not support Python 2, so can only be used when Pimlico " "is being run under Python 3*") # Put together the options table options_table = [ [ option_name, ("(required) " if d.get("required", False) else "") + d.get("help", ""), format_option_type(d.get("type", str)), ] for (option_name, d) in _sort_options(ModuleInfo.module_options) ] # Try generating some example config for how this module can be used try: example_config_short = generate_example_config(ModuleInfo, input_table, module_path, minimal=True) except Exception as e: warnings.warn("Error generating example config for {}: {}. Not including example".format(module_path, e)) example_config_short = None try: example_config_long = generate_example_config(ModuleInfo, input_table, module_path, minimal=False) except Exception as e: warnings.warn("Error generating example config for {}: {}. Not including example".format(module_path, e)) example_config_long = None with codecs.open(filename, "w", "utf8") as output_file: # Make a page heading output_file.write(format_heading(0, module_title)) # Add a directive to mark this as the documentation for the py module that defines the Pimlico module output_file.write(".. py:module:: %s\n\n" % module_path) # Output a summary table of key information output_file.write("%s\n" % make_table(key_info)) # Insert text from docstrings if info_doc is not None: output_file.write(trim_docstring(info_doc) + "\n\n") if module_info_doc is not None: output_file.write(trim_docstring(module_info_doc) + "\n\n") output_file.write("\n") output_file.write("".join("%s\n\n" % para for para in additional_paras)) # Output a table of inputs output_file.write(format_heading(1, "Inputs")) if input_table: output_file.write("%s\n" % make_table(input_table, header=["Name", "Type(s)"])) else: output_file.write("No inputs\n\n") # Table of outputs output_file.write(format_heading(1, "Outputs")) if output_table: output_file.write("%s\n" % make_table(output_table, header=["Name", "Type(s)"])) elif optional_output_table: output_file.write("No non-optional outputs\n\n") else: output_file.write("No outputs\n\n") if optional_output_table: output_file.write("\n" + format_heading(2, "Optional")) output_file.write("%s\n" % make_table(optional_output_table, header=["Name", "Type(s)"])) if further_outputs_doc: output_file.write("\n" + format_heading(2, "Further conditional outputs")) output_file.write("\n{}\n".format(further_outputs_doc)) # Show output groups if there are any if len(output_groups_table) or len(build_output_groups_doc): output_file.write("\n" + format_heading(1, "Output groups")) output_file.write("The module defines some named output groups, which can be used to refer to collections " "of outputs at once, as multiple inputs to another module or alternative inputs.\n\n") if len(output_groups_table): output_file.write("{}\n".format(make_table(output_groups_table, header=["Group name", "Outputs"]))) if len(build_output_groups_doc): output_file.write("\n{}\n".format(build_output_groups_doc)) # Table of options if options_table: output_file.write("\n" + format_heading(1, "Options")) output_file.write("%s\n" % make_table(options_table, header=["Name", "Description", "Type"])) # Example config if example_config_short is not None or example_config_long is not None: output_file.write(format_heading(1, "Example config")) if example_config_short is not None: output_file.write("This is an example of how this module can be used in a pipeline config file.\n\n") output_file.write(".. code-block:: ini\n \n{}\n\n".format(indent(3, example_config_short))) if example_config_long is not None: # Only show long example if it's longer than short if example_config_short is None or len(example_config_short) < len(example_config_long): output_file.write("This example usage includes more options.\n\n") output_file.write(".. code-block:: ini\n \n{}\n\n".format(indent(3, example_config_long))) # See whether this module is used in any example config files example_configs = example_refs.get(module_path, []) if len(example_configs): output_file.write(format_heading(1, "Example pipelines")) output_file.write("This module is used by the following :ref:`example pipelines <example-pipelines>`. " "They are examples of how the module can be used together with " "other modules in a larger pipeline.\n\n") output_file.write("\n".join(" * :ref:`{}`".format(ref_name) for ref_name in example_configs)) output_file.write("\n\n") # See whether this module is used in any test config files test_configs = test_refs.get(module_path, []) if len(test_configs): output_file.write(format_heading(1, "Test pipelines")) output_file.write("This module is used by the following :ref:`test pipelines <test-pipelines>`. " "They are a further source of examples of the module's usage.\n\n") output_file.write("\n".join(" * :ref:`{}`".format(ref_name) for ref_name in test_configs)) output_file.write("\n\n") if submodules: # Generate a TOC for the nested modules output_file.write(format_heading(1, "Submodules")) output_file.write(".. toctree::\n :titlesonly:\n\n ") output_file.write("\n ".join(submodules)) output_file.write("\n") return ModuleInfo
[docs]def input_datatype_list(types, context=None, no_warn=False): if type(types) is tuple: # This is a list of types return " or ".join(input_datatype_text(t, context=context, no_warn=no_warn) for t in types) else: # Just a single type return input_datatype_text(types, context=context, no_warn=no_warn)
[docs]def input_datatype_text(datatype, context=None, no_warn=False): if isinstance(datatype, PimlicoDatatype): # Standard behaviour for normal datatypes return datatype_to_link(datatype) elif isinstance(datatype, MultipleInputs): # Multiple inputs, but the datatype is known: call this function to format the common type return ":class:`list <pimlico.datatypes.base.MultipleInputs>` of %s" % \ input_datatype_text(datatype.datatype_requirements) elif isinstance(datatype, DynamicInputDatatypeRequirement): if datatype.datatype_doc_info is not None: # Dynamic input type that gives us a name to use return datatype.datatype_doc_info else: # Dynamic datatype requirement with no custom string return ":class:`%s <%s.%s>`" % (type(datatype).__name__, type(datatype).__module__, type(datatype).__name__) else: if not no_warn: warnings.warn("Invalid input type specification {} (not datatype, multiple input, or dynamic " "requirement): {}".format("in {}".format(context) if context else "", type(datatype))) return "**invalid input type specification**"
[docs]def output_datatype_text(datatype, context=None, no_warn=False): if isinstance(datatype, DynamicOutputDatatype): # Use the datatype name given by the dynamic datatype and link to the class base_datatype = datatype.get_base_datatype() if base_datatype is not None: datatype_class_name = base_datatype.datatype_full_class_name() else: # Just link to the dynamic datatype class datatype_class_name = "%s.%s" % (type(datatype).__module__, type(datatype).__name__) datatype_name = datatype.datatype_name or type(datatype).__name__ return ":class:`%s <%s>`" % (datatype_name, datatype_class_name) elif isinstance(datatype, PimlicoDatatype): return datatype_to_link(datatype) else: if not no_warn: warnings.warn("Invalid output type {} (not datatype or dynamic output type): {}".format( "in {}".format(context) if context else "", type(datatype)) ) return "**invalid output type specification**"
[docs]def generate_contents_page(modules, output_dir, index_name, title, content): with open(os.path.join(output_dir, "%s.rst" % index_name), "w") as index_file: index_file.write("""\ {title} .. py:module:: {index_name} {content} .. toctree:: :maxdepth: 2 :titlesonly: {list} """.format( title=format_heading(0, title), content=content, list="\n ".join(modules), index_name=index_name, ))
[docs]def generate_example_config(info, input_types, module_path, minimal=False): """ Generate a string containing an example of how to configure the given module in a pipeline config file. Where possible, uses default values for options, or values appropriate to the type, and dummy input names. """ input_lines = "".join( "input_{}=module_a.some_output{}\n".format( name, ",module_b.some_output,..." if isinstance(dtype, MultipleInputs) else "" ) for (name, dtype) in input_types ) # Generate example values for all the options options = [] for opt_name, opt_dict in _sort_options(info.module_options): # If producing minimal version, only include required options if not minimal or opt_dict.get("required", False): opt_val = None # If the opt dict includes an explicit "example", use that if "example" in opt_dict: # If example is explicitly given as None, skip this option in the long example if opt_dict["example"] is None: continue opt_val = opt_dict["example"] else: # If the option has a default value, we should use that opt_default = opt_dict.get("default", None) if opt_default is not None: # Whether we can simply use the default value depends on the type, as it's given as a processed value opt_val = _val_to_config(opt_default) if opt_val is None: # Not managed to get anything from the default value # Try looking at the option type otype = opt_dict.get("type", None) if otype is not None: opt_val = _opt_type_to_config(otype) if opt_val is None: # If nothing else works, just put something there so we see that the option can be set opt_val = "value" options.append("{}={}".format(opt_name, opt_val)) return """\ [my_{}_module] type={} {}{} """.format( info.module_type_name, module_path, input_lines, "\n".join(options) )
def _val_to_config(val): if isinstance(val, str): # This is easy: we can just use it return val # Some types can simply be converted to strings to get a good example elif type(val) is int: return str(val) elif type(val) is float: return "{:.2f}".format(val) elif type(val) is bool: return "T" if val else "F" elif type(val) in (list, tuple): # Presumably a comma-separated list return ",".join(_val_to_config(v) for v in val) else: return None def _opt_type_to_config(otype): if hasattr(otype, "_opt_type_example"): return otype._opt_type_example if type(otype) is type: if issubclass(otype, str): # Just a string, anything can go here return "text" elif issubclass(otype, int): return "0" elif issubclass(otype, float): return "0.1" elif issubclass(otype, bool): # Most often, str_to_bool is used instead of this # but if bool has been used, work with that return "1" elif otype is str_to_bool: return "T" elif otype is json_string: return '{"key1":"value"}' elif otype is json_dict: return '"key1": "value", "key2": 2' elif hasattr(otype, "list_item_type"): # Comma-separated list list_val = _opt_type_to_config(otype.list_item_type) if list_val is None: # We don't know how to generate an example of this list type list_val = "value" return "{},{},...".format(list_val, list_val) else: return None def _sort_options(options_dict): """ Ensure consistent ordering of options in table and examples. """ # Ensure a consistent ordering of the options in the table if isinstance(options_dict, OrderedDict): # Ordering is consistent, because options have been specified with an OrderedDict return options_dict.items() else: # Sort alphabetically, so we have a consistent ordering. Otherwise it's random return sorted(options_dict.items())
[docs]def indent(spaces, text): return "\n".join("{}{}".format(" "*spaces, line) for line in text.splitlines())
if __name__ == "__main__": parser = argparse.ArgumentParser(description="Generate module documentation RST files from core Pimlico modules, " "or your own Pimlico modules") parser.add_argument("output_dir", help="Where to put the .rst files") parser.add_argument("--path", default="pimlico.modules", help="Base Python module path to generate docs for. Defaults to generating docs for core " "modules from the Pimlico distribution. Use this to generate module docs for your own " "modules") parser.add_argument("--test-refs", help="Path to module ref list from test config file doc generator. If given, a list of " "config files will be included with each module that is used in one or more") parser.add_argument("--example-refs", help="Path to module ref list from example config file doc generator. If given, a list of " "config files will be included with each module that is used in one or more") opts = parser.parse_args() output_dir = os.path.abspath(opts.output_dir) # Install basic Pimlico requirements install_core_dependencies() print("Sphinx %s" % __version__) print("Pimlico module doc generator") try: base_mod = import_module(opts.path) except ImportError as e: print("Could not import base module %s: %s" % (opts.path, e)) print("Did you add your own modules to the pythonpath? (Current paths: %s)" % \ u", ".join(sys.path).encode("ascii", "ignore")) print("Cannot generate docs") sys.exit(1) print("Generating docs for %s (including all submodules)" % opts.path) print("Outputting module docs to %s" % output_dir) if not os.path.isdir(output_dir): os.makedirs(output_dir) test_refs = {} if opts.test_refs is not None: if not os.path.exists(opts.test_refs): warnings.warn("Test pipeline module reference file {} does not exist".format(opts.test_refs)) # Load test pipeline module refs file with open(opts.test_refs, "r") as f: test_refs_data = f.read() # Use the file to build a dictionary of referenced modules for easy lookup when building docs for line in test_refs_data.splitlines(): ref_name, __, modules = line.partition("\t") for module in modules.split(","): test_refs.setdefault(module.strip(), []).append(ref_name) example_refs = {} if opts.example_refs is not None: if not os.path.exists(opts.example_refs): warnings.warn("Example pipeline module reference file {} does not exist".format(opts.example_refs)) # Load example pipeline module refs file with open(opts.example_refs, "r") as f: example_refs_data = f.read() # Use the file to build a dictionary of referenced modules for easy lookup when building docs for line in example_refs_data.splitlines(): ref_name, __, modules = line.partition("\t") for module in modules.split(","): example_refs.setdefault(module.strip(), []).append(ref_name) generate_docs_for_pymod(base_mod, output_dir, test_refs=test_refs, example_refs=example_refs)