Source code for pimlico

"""
The Pimlico Processing Toolkit (PIpelined Modular LInguistic COrpus processing) is a toolkit for building pipelines
made up of linguistic processing tasks to run on large datasets (corpora). It provides wrappers around many
existing, widely used NLP (Natural Language Processing) tools.

"""
from __future__ import print_function
import os
import sys
import subprocess

# Core dependencies will be checked when Pimlico is run and installed if necessary.
# However, future is needed right away, before we can even start importing
# the code to install the core deps, since that code needs to be Py2-3 compatible
try:
    # Only used as an availability probe here: nothing in this block uses the module itself
    import future
except ImportError:
    print("Future library is not installed: installing now")
    # Call pip to install future
    # Running pip as a module of sys.executable targets the interpreter that is running this code.
    # check_call raises CalledProcessError if pip exits with a non-zero status, which is
    # not caught here, so a failed install aborts the import of this package
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'future'])
    # Reload the environment, so we see the newly installed package(s)
    import site
    # Find a reload() function: its location differs between Python versions
    try:
        # importlib is preferred over imp
        from importlib import reload
    except ImportError:
        try:
            # In early Python3 versions, use imp
            from imp import reload
        except ImportError:
            # In Python2 it's a builtin, so try just skipping
            pass
    # Re-run the site module's initialization
    # NOTE(review): future itself is not re-imported here after installation; presumably later
    # imports elsewhere pick it up -- confirm
    reload(site)

# Absolute path three directory levels above the directory that contains this file
PIMLICO_ROOT = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", ".."))

# Fetch current version number from PIMLICO_ROOT/admin/release.txt
# This runs at import time, so a missing or unreadable file makes importing this package fail
with open(os.path.join(PIMLICO_ROOT, "admin", "release.txt"), "r") as releases_file:
    # All lines of the file, with surrounding whitespace removed
    _lines = [r.strip() for r in releases_file.read().splitlines()]
    # Keep only the lines that start with "v" and drop that leading character,
    # e.g. a line "v1.2.3" gives "1.2.3"
    releases = [r[1:] for r in _lines if r.startswith("v")]
# The last listed version is the current, bleeding-edge version number
# This file used to contain all release numbers, but we now get them from git tags
# The only information given in the file now is the current version
# Note: if the file contains no line starting with "v", this raises IndexError
__version__ = releases[-1]

# Standard locations, all derived from PIMLICO_ROOT.
# These are only path strings: nothing here checks that the directories exist or creates them

# Parent directory of PIMLICO_ROOT
PROJECT_ROOT = os.path.abspath(os.path.join(PIMLICO_ROOT, ".."))

# PIMLICO_ROOT/lib and its java subdirectory
LIB_DIR = os.path.join(PIMLICO_ROOT, "lib")
JAVA_LIB_DIR = os.path.join(LIB_DIR, "java")
# PIMLICO_ROOT/build/jar
JAVA_BUILD_JAR_DIR = os.path.join(PIMLICO_ROOT, "build", "jar")
# PIMLICO_ROOT/models
MODEL_DIR = os.path.join(PIMLICO_ROOT, "models")
# PIMLICO_ROOT/log
LOG_DIR = os.path.join(PIMLICO_ROOT, "log")
# Unlike the other directories, output lives under PROJECT_ROOT, i.e. outside PIMLICO_ROOT
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "output")
# PIMLICO_ROOT/examples
EXAMPLES_DIR = os.path.join(PIMLICO_ROOT, "examples")
# PIMLICO_ROOT/test/data and PIMLICO_ROOT/test/storage
TEST_DATA_DIR = os.path.join(PIMLICO_ROOT, "test", "data")
TEST_STORAGE_DIR = os.path.join(PIMLICO_ROOT, "test", "storage")
# Root URL for generating links to Pimlico source code in the docs
REPO_SOURCE_HTML_ROOT = "https://github.com/markgw/pimlico/blob/master/"


def install_core_dependencies():
    """
    Check that Pimlico's core dependencies are available and try to install any
    that are not.

    If some core dependencies are unavailable, a message listing them is printed
    to stderr and an installation attempt is made. If anything is still
    uninstalled afterwards, an error is printed to stderr and the process exits
    with status 1 (``SystemExit``).

    Afterwards, ``coloredlogs`` is installed (if needed) and activated on a
    best-effort basis: any exception raised while doing so is reported with a
    printed message and otherwise ignored.

    """
    # Imported here rather than at module level: this code can only be loaded once
    # the bootstrap at the top of the module has made sure future is installed
    from pimlico.core.dependencies.base import check_and_install
    from pimlico.core.dependencies.core import CORE_PIMLICO_DEPENDENCIES, coloredlogs_dependency

    # Always check that core dependencies are satisfied before running anything
    # Core dependencies are not allowed to depend on the local config, as we can't get to it at this point
    # We just pass in an empty dictionary
    unavailable = [dep for dep in CORE_PIMLICO_DEPENDENCIES if not dep.available({})]
    if len(unavailable):
        print("Some core Pimlico dependencies are not available: %s\n" %
              ", ".join(dep.name for dep in unavailable), file=sys.stderr)
        # The full list of core dependencies is passed in, not just the unavailable ones
        uninstalled = check_and_install(CORE_PIMLICO_DEPENDENCIES, {})
        if len(uninstalled):
            print("Unable to install all core dependencies: exiting", file=sys.stderr)
            sys.exit(1)

    # Special procedure for coloredlogs
    # This is nice to have, but if we can't install it or load it, it's not a problem
    try:
        if not coloredlogs_dependency.available({}):
            print("Installing coloredlogs")
            coloredlogs_dependency.install({})
        # Load coloredlogs and start using it for all logging formatters
        import coloredlogs
        coloredlogs.install()
    except Exception as e:
        # Deliberately broad: failing to get coloured logs must never stop Pimlico from running
        print("Error installing and loading colorlogs. Logs will not be coloured. {}".format(e))
def get_jupyter_pipeline():
    """
    Special function to get access to a currently loaded pipeline from a
    Jupyter notebook.

    :return: whatever ``pimlico.utils.jupyter.get_pipeline()`` returns

    """
    # Imported lazily, so that importing this package does not load the Jupyter utilities
    from pimlico.utils.jupyter import get_pipeline
    return get_pipeline()