Source code for pimlico.cli.newmodule

# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

from __future__ import print_function
from builtins import input

import os
from textwrap import wrap

from pimlico import PROJECT_ROOT
from pimlico.cli.subcommands import PimlicoCLISubcommand


class NewModuleCmd(PimlicoCLISubcommand):
    """
    Interactive command that generates skeleton code for a new module type.

    The user is asked for the module's Python path, its category (generic or
    document map), a readable name and any number of module options. From the
    answers an ``info.py`` and an ``execute.py`` are written into a newly
    created package directory, and an example config section is printed.

    """
    command_name = "newmodule"
    command_help = "Interactive tool to create a new module type, generating a skeleton for the module's code. " \
                   "Currently only works for certain module types. May be extended in future to help with " \
                   "creating a broader range of sorts of modules"
    command_desc = "Create a new module type"

    def run_command(self, pipeline, opts):
        """
        Run the interactive dialogue and write the new module's skeleton files.

        :param pipeline: loaded pipeline. Only ``pipeline.pipeline_config["python_path"]``
            is read, to decide where the new code should live
        :param opts: parsed command-line options (not used by this command)
        """
        code_root = self._choose_code_root(pipeline)
        print("New code will live in %s" % code_root)
        # Make sure the root dir exists
        if not os.path.exists(code_root):
            os.makedirs(code_root)

        module_path = ask("Enter name for new module (full python path, e.g. 'mypackage.modules.mymodule'): ")
        path_parts = module_path.split(".")
        if not all(path_parts):
            # Covers a blank answer as well as things like "a..b" or a trailing dot, which
            # would otherwise map onto an existing directory or create oddly named ones
            print("Invalid module path '%s': every dot-separated part must be non-empty" % module_path)
            return
        module_root_dir = os.path.join(code_root, *path_parts)
        print("Module code will be created in %s" % module_root_dir)
        if os.path.exists(module_root_dir):
            print("Module directory already exists: not creating any code, so we don't overwrite existing code")
            return
        # Create Python directories as necessary to create the new module's directory
        self._create_package_dirs(code_root, path_parts)

        imports = []

        # Work out what category of module we're creating
        print("\nSelect a category of module to create:")
        print(" 1. Generic")
        print(" 2. Document map module")
        # In future, you probably want to add, e.g. filter modules, multistage modules, ...
        module_category = self._ask_choice("Category: ", 2)

        # Ask some questions that apply to all module categories
        module_type_name = path_parts[-1]
        module_readable_name = ask("Enter readable name (short, e.g. 'Number multiplier'): ")

        print("\nCreate module options")
        print("=====================")
        module_options = []
        option_egs = []
        while True:
            option_name = ask("Option name (blank to stop creating options): ")
            if " " in option_name:
                print("Option name cannot include spaces")
                continue
            elif len(option_name) == 0:
                break

            # Ask questions to guide the user through defining the module option.
            # Each entry is (key, python code, trailing comment or None)
            option_def = []
            print("Choose one of the standard option types, or edit the ")
            print("generated code afterwards to use a different one")
            print(" 1. string")
            print(" 2. integer")
            print(" 3. float")
            print(" 4. boolean")
            print(" 5. choice from list of possible values")
            print(" 6. comma-separated list")
            option_type_choice = self._ask_choice("Option type: ", 6)

            if option_type_choice == 1:
                # String is the default, so we don't need to specify a type
                option_eg = "something"
            else:
                type_spec = self._option_type_spec(option_type_choice, option_name, imports)
                if type_spec is None:
                    # The type could not be defined: start again with a new option
                    continue
                option_type, option_eg, type_note = type_spec
                option_def.append(("type", option_type, type_note))

            print("Describe the option, so the module's users understand what it does")
            option_help = ask("Description: ")
            if len(option_help):
                option_def.append(("help", self._format_help(option_help), None))

            print("You can give the option a default value.")
            print("Specify as Python code (i.e. quote it if it's a string).")
            print("Leave blank to use None as the default")
            option_default = ask("Default value: ")
            if option_default:
                option_def.append(("default", option_default, None))

            option_egs.append((option_name, option_eg))
            # Put together the option definition as a dictionary. Any comment goes after
            # the entry's comma, so that it can't swallow the comma
            option_lines = [
                '            "%s": %s,%s' % (key, code, "  # %s" % note if note else "")
                for (key, code, note) in option_def
            ]
            module_options.append('        "%s": {\n%s\n        },' % (option_name, "\n".join(option_lines)))
            print()

        template_data = {
            "module_type_name": module_type_name,
            # The name is placed inside a double-quoted literal in the generated code
            "module_readable_name": self._escape_for_literal(module_readable_name),
            "module_options": "\n".join(module_options),
        }

        if module_category == 1:
            imports.append("from pimlico.core.modules.base import BaseModuleInfo")
            template = GENERIC_TEMPLATE
        else:
            imports.append("from pimlico.core.modules.map import DocumentMapModuleInfo")
            template = DOC_MAP_TEMPLATE

        # Several options may have asked for the same import: only output each once
        template_data["imports"] = "\n".join(self._unique(imports))
        # Render the template
        info_code = template.format(**template_data)
        # Output it to the info file
        info_path = os.path.join(module_root_dir, "info.py")
        with open(info_path, "w") as f:
            f.write(info_code)

        #######
        # Now create the executor
        exec_template_data = {}
        if module_category == 1:
            # This is a simple template
            exec_template = GENERIC_EXEC_TEMPLATE
        else:
            exec_template = DOC_MAP_EXEC_TEMPLATE
            exec_imports = []

            print("Several types of document map module are available:")
            print(" 1. Multiprocessing: documents may be processed in parallel (using multiprocessing), by specifying ")
            print(" --processes at runtime")
            print(" 2. Threaded: similar, but parallelization is implemented using Python's threading package. This ")
            print(" will not take advantage of multiple processors or system-level parallelism, but is useful, for ")
            print(" example, if your process function calls background processes to do the legwork")
            print(" 3. Single-process: do not parallelize, even if --processes is set at runtime. Use where you know ")
            print(" that things will go wrong if documents are processed in parallel")
            map_type_choice = self._ask_choice("Choose a map module type: ", 3)

            if map_type_choice == 1:
                exec_imports.append("from pimlico.core.modules.map.multiproc import multiprocessing_executor_factory")
                exec_template_data["factory"] = "multiprocessing_executor_factory"
            elif map_type_choice == 2:
                exec_imports.append("from pimlico.core.modules.map.threaded import threading_executor_factory")
                exec_template_data["factory"] = "threading_executor_factory"
            else:
                exec_imports.append("from pimlico.core.modules.map.singleproc import single_process_executor_factory")
                exec_template_data["factory"] = "single_process_executor_factory"

            exec_template_data["imports"] = "\n".join(exec_imports)

        # Render the template
        exec_code = exec_template.format(**exec_template_data)
        # Output it to the execute file
        exec_path = os.path.join(module_root_dir, "execute.py")
        with open(exec_path, "w") as f:
            f.write(exec_code)

        # Prepare an example of the config code
        config_eg = CONFIG_TEMPLATE.format(
            module_path=module_path,
            option_egs="\n".join("%s=%s" % (name, eg) for (name, eg) in option_egs)
        )

        print("\nModule created")
        print("==============")
        print(" 1. Edit the module metadata in %s" % info_path)
        print(" 2. Write the module's execution code in %s" % exec_path)
        print(" 3. Use the module in your pipeline with something like this:")
        print()
        print(config_eg)

    @staticmethod
    def _choose_code_root(pipeline):
        """
        Work out the directory under which the new module's code is created.

        We assume that the first location given in the pipeline's python_path variable
        is where the project's main custom code lives. If no location is given, the
        user is asked for one.

        :return: absolute path of the code root
        """
        raw_paths = pipeline.pipeline_config["python_path"] or ""
        # Splitting an empty string gives [""], not []: drop blank entries, so that
        # an unset python_path is really detected instead of resolving to the cwd
        python_paths = [path for path in raw_paths.split(":") if path.strip()]
        if python_paths:
            return os.path.abspath(python_paths[0])

        print("Could not determine a location for creating the new module code, since the pipeline does not "
              "specify a 'python_path' variable")
        path = ask("Enter a base path for your custom Pimlico code (may be relative to project root): ")
        if os.path.isabs(path):
            return path
        return os.path.abspath(os.path.join(PROJECT_ROOT, path))

    @staticmethod
    def _create_package_dirs(code_root, path_parts):
        """
        Create a directory containing an ``__init__.py`` for every package in
        ``path_parts`` below ``code_root``, leaving existing ones untouched.
        """
        package_dir = code_root
        for part in path_parts:
            package_dir = os.path.join(package_dir, part)
            if not os.path.exists(package_dir):
                os.mkdir(package_dir)
            init_path = os.path.join(package_dir, "__init__.py")
            if not os.path.exists(init_path):
                # Touch the file to create it
                with open(init_path, "w"):
                    pass

    @staticmethod
    def _ask_choice(prompt, num_choices):
        """
        Ask for a number between 1 and ``num_choices``, repeating the question
        until a valid number is given.

        :return: the chosen number as an int
        """
        while True:
            answer = ask(prompt)
            try:
                choice = int(answer)
            except ValueError:
                choice = None
            if choice is not None and 1 <= choice <= num_choices:
                return choice
            print("Please enter a number between 1 and %d" % num_choices)

    @staticmethod
    def _unique(items):
        """Return the items with duplicates removed, keeping first occurrences in order."""
        seen = set()
        unique_items = []
        for item in items:
            if item not in seen:
                seen.add(item)
                unique_items.append(item)
        return unique_items

    @staticmethod
    def _escape_for_literal(text):
        """Escape text for use inside a double-quoted string literal in generated code."""
        # Backslashes first, so the ones added for the quotes aren't doubled
        return text.replace("\\", "\\\\").replace('"', '\\"')

    @staticmethod
    def _format_help(text):
        """
        Turn an option's help text into the source code of a string literal.

        Long texts are split over several adjacent literals, one per line, so that
        the generated code isn't messy.
        """
        if len(text) <= 95:
            return '"%s"' % NewModuleCmd._escape_for_literal(text)
        # Only break at whitespace, so a single space restores each break exactly
        pieces = wrap(text, width=95, break_long_words=False, break_on_hyphens=False)
        # wrap() drops the whitespace at each break: without putting it back, the
        # adjacent literals would be concatenated with the words run together
        pieces = ["%s " % piece for piece in pieces[:-1]] + pieces[-1:]
        # Escape after wrapping, so that an escape sequence is never split up
        pieces = [NewModuleCmd._escape_for_literal(piece) for piece in pieces]
        return '"%s"' % '"\n                    "'.join(pieces)

    @staticmethod
    def _option_type_spec(option_type_choice, option_name, imports):
        """
        Build the type definition for an option of one of the non-string types.

        :param option_type_choice: number of the chosen option type (2-6)
        :param option_name: name of the option being defined
        :param imports: list of import lines for the generated info file, extended
            in place with whatever the type needs
        :return: tuple (type code, example config value, comment for the generated
            line or None), or None if the type could not be defined
        """
        if option_type_choice == 2:
            return "int", "10", None
        elif option_type_choice == 3:
            return "float", "1.5", None
        elif option_type_choice == 4:
            imports.append("from pimlico.core.modules.options import str_to_bool")
            return "str_to_bool", "T", None
        elif option_type_choice == 5:
            print("Specifying values (strings, unquoted) to choose from:")
            values = []
            while True:
                next_value = ask("Next value (blank to stop): ", strip_space=False)
                if len(next_value):
                    values.append(next_value)
                else:
                    break
            if not values:
                # Nothing to choose from, and no value to use as an example
                print("No values given to choose from: not creating this option")
                return None
            imports.append("from pimlico.core.modules.options import choose_from_list")
            quoted = ", ".join('"%s"' % NewModuleCmd._escape_for_literal(value) for value in values)
            option_type = 'choose_from_list([%s], name="%s")' % (quoted, option_name)
            # The config example takes the bare value, not the quoted Python literal
            return option_type, values[0], None
        elif option_type_choice == 6:
            print("What type of values are in the list? (Customize afterwards if you need other types)")
            print(" 1. string")
            print(" 2. integer")
            print(" 3. float")
            print(" 4. other")
            option_list_type_choice = NewModuleCmd._ask_choice("Type: ", 4)
            if option_list_type_choice == 1:
                imports.append("from pimlico.core.modules.options import comma_separated_strings")
                return "comma_separated_strings", "x,y,z", None
            imports.append("from pimlico.core.modules.options import comma_separated_list")
            if option_list_type_choice == 2:
                return "comma_separated_list(item_type=int)", "1,2,3", None
            elif option_list_type_choice == 3:
                return "comma_separated_list(item_type=float)", "1.2,5.4,3.9", None
            else:
                # Deliberately not valid Python until the user fills in the type
                return "comma_separated_list(item_type=???)", "x,y,z", "TODO Put your type in here"
        else:
            print("Unknown type")
            return None
def ask(prompt, strip_space=True):
    """
    Put a question to the user on the terminal and return the answer.

    A blank line is printed before the prompt and another after the answer
    has been read.

    :param prompt: text shown to the user
    :param strip_space: if True (default), spaces as well as newlines are
        stripped from both ends of the answer; otherwise only newlines
    :return: the (stripped) answer
    """
    if strip_space:
        unwanted = "\n "
    else:
        unwanted = "\n"

    print()
    answer = input(" %s" % prompt)
    answer = answer.strip(unwanted)
    print()
    return answer
# Templates for the generated files. All of them are rendered with str.format(),
# so literal braces in the generated code are written doubled ({{ and }}).
# NOTE(review): line breaks and indentation inside the templates were restored from
# a whitespace-collapsed listing -- confirm against the canonical file.

# info.py of a generic module (category 1 in NewModuleCmd.run_command).
# Placeholders: {module_type_name}, {module_readable_name}, {imports}, {module_options}
GENERIC_TEMPLATE = """\
\"\"\"
.. todo::

   Document module {module_type_name}

\"\"\"
{imports}


class ModuleInfo(BaseModuleInfo):
    module_type_name = "{module_type_name}"
    module_readable_name = "{module_readable_name}"
    module_inputs = [
        # TODO Define module inputs here as:
        # ("input_name", InputTypeClass()),
    ]
    module_outputs = [
        # TODO Define module outputs here as:
        # ("output_name", OutputTypeClass()),
    ]
    module_options = {{
{module_options}
    }}

"""

# info.py of a document map module (category 2 in NewModuleCmd.run_command).
# Placeholders: {module_type_name}, {module_readable_name}, {imports}, {module_options}
DOC_MAP_TEMPLATE = """\
\"\"\"
.. todo::

   Document module {module_type_name}

\"\"\"
{imports}


class ModuleInfo(DocumentMapModuleInfo):
    module_type_name = "{module_type_name}"
    module_readable_name = "{module_readable_name}"
    module_inputs = [
        # TODO Define module inputs here as:
        # ("input_name", InputTypeClass()),
        # At least one should be a sub-type of IterableCorpus
    ]
    module_outputs = [
        # TODO Define module outputs here as:
        # ("output_name", OutputTypeClass()),
        # At least one should be a sub-type of IterableCorpus
    ]
    module_options = {{
{module_options}
    }}

    def get_writer(self, output_name, output_dir, append=False):
        # TODO Return an appropriate writer instance for each output
        raise NotImplementedError("writer creation not implemented for {module_type_name}")

"""

# execute.py of a generic module. Has no placeholders: run_command renders it
# with an empty set of template data
GENERIC_EXEC_TEMPLATE = """\
from pimlico.core.modules.base import BaseModuleExecutor


class ModuleExecutor(BaseModuleExecutor):
    def execute(self):
        # TODO Write execution code here
        # The metadata, options, etc are available through the module info instance in self.info
        pass

"""

# execute.py of a document map module.
# Placeholders: {imports} (import line for the chosen executor factory) and
# {factory} (name of that factory function)
DOC_MAP_EXEC_TEMPLATE = """\
from pimlico.core.modules.map import skip_invalid
{imports}


# Remove skip_invalid if you want to process invalid documents, rather than just pass them through
@skip_invalid
def process_document(worker, archive_name, doc_name, doc):
    # TODO Define the actual processing that the module does on each doc
    # Access anything initialized per worker via the worker
    # Access anything initialized just once (preprocess_fn) via worker.executor
    # Access the module info instance (for options, etc) in worker.info
    return # TODO Return result to be passed to the output writer


# You might also want to specify preprocess_fn, postprocess_fn, worker_set_up_fn, worker_tear_down_fn
ModuleExecutor = {factory}(process_document)
"""

# Example config section, printed once the module has been created.
# Placeholders: {module_path} (full Python path of the new module) and
# {option_egs} (one "name=example value" line per option defined)
# NOTE(review): whether "input_NAME=..." sits on its own line or on the comment
# line above could not be told from the collapsed listing -- confirm.
CONFIG_TEMPLATE = """\
[my_instance_name]
type={module_path}
# Once you've defined your module inputs, specify where each comes from with:
input_NAME=module_name.output_name
{option_egs}
"""