# Source code for pimlico.cli.fixlength

# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

from __future__ import print_function
import copy

from pimlico.cli.recover import count_docs
from pimlico.cli.subcommands import PimlicoCLISubcommand
from pimlico.core.modules.base import satisfies_typecheck
from pimlico.datatypes import GroupedCorpus, PimlicoDatatype
from pimlico.datatypes.base import DataNotReadyError, _metadata_path
from pimlico.datatypes.corpora.data_points import RawDocumentType
from pimlico.utils.pimarc import PimarcReader
from pimlico.utils.progress import get_open_progress_bar


class FixLengthCmd(PimlicoCLISubcommand):
    """
    Under some circumstances (e.g. some unpredictable combinations of failures
    and restarts), an output corpus can end up with an incorrect length in its
    metadata. This command counts up the documents in the corpus and corrects
    the stored length if it's wrong.

    Only module outputs that pass the typecheck against a grouped corpus of
    raw documents are considered.

    """
    command_name = "fixlength"
    command_help = "Check the length of written outputs and fix it if it's wrong"

    def add_arguments(self, parser):
        """
        Register this subcommand's arguments on ``parser``: the module to
        check, an optional list of output names and the ``--dry`` flag.

        """
        parser.add_argument("module", help="The name (or number) of the module to check")
        parser.add_argument("outputs", nargs="*", help="Names of module outputs to check. By default, checks all")
        parser.add_argument("--dry", action="store_true", help="Dry run: check the lengths, but don't write anything")

    def run_command(self, pipeline, opts):
        """
        Check the stored length of the selected outputs of a module and
        correct the metadata where it is wrong.

        For each output, the reported length is first compared with the count
        taken from the Pimarc indices. That comparison is informational only:
        its result is printed, but never written anywhere. The documents are
        then counted with ``count_docs`` and, if that count differs from the
        reported length, the ``"length"`` entry of the output's metadata is
        rewritten, unless ``opts.dry`` is set.

        :param pipeline: pipeline from which the module is looked up using
            ``opts.module``
        :param opts: parsed options, providing ``module``, ``outputs`` and ``dry``
        :raises ValueError: if a requested output is not an output of the
            module, or is not a grouped corpus
        :raises DataNotReadyError: if an output cannot be read

        """
        dry = opts.dry
        module_name = opts.module
        module = pipeline[module_name]

        # Get the outputs that are grouped corpora
        grouped_outputs = [
            name for name in module.output_names
            if satisfies_typecheck(module.get_output_datatype(name)[1], GroupedCorpus(RawDocumentType()))
        ]
        if opts.outputs:
            # Some specific output names have been given: validate all of
            # them before starting any (potentially slow) counting
            for output_name in opts.outputs:
                if output_name not in module.output_names:
                    raise ValueError("unknown output '{}' for module '{}'".format(output_name, module_name))
                if output_name not in grouped_outputs:
                    raise ValueError("output '{}' is not a grouped corpus".format(output_name))
            outputs = opts.outputs
        else:
            # Check all grouped corpus outputs
            outputs = grouped_outputs

        print("Checking outputs: {}".format(", ".join(outputs)))
        for output_name in outputs:
            print("\n### Checking output '{}'".format(output_name))
            try:
                output = module.get_output(output_name)
            except DataNotReadyError as e:
                print("Could not read output '{}': cannot check written documents".format(output_name))
                raise DataNotReadyError("could not read output '{}': {}".format(output_name, e))

            # Read the reported length once and compare both counts against it
            reported_length = len(output)
            print("Reported length: {:,d}".format(reported_length))

            print("Counting using pimarc indices...")
            num_docs_in_indices = count_pimarcs(output)
            if num_docs_in_indices == reported_length:
                print("Reported length matches Pimarc indices")
            else:
                print("Stored length does not match count from Pimarc indices")
                print("Length of indices: {:,d}".format(num_docs_in_indices))

            print("Counting actual length. This could take some time...")
            # Use the function from the recover command to count the docs:
            # the second element of its result is taken as the number of docs
            num_docs = count_docs(output, last_buffer_size=0)[1]
            if num_docs == reported_length:
                print("Reported length is correct")
            else:
                print("Stored length does not match number of docs")
                print("Actual length: {:,d}".format(num_docs))
                if dry:
                    print("DRY: Not correcting metadata")
                else:
                    metadata_path = _metadata_path(output.base_dir)
                    print("Correcting metadata in {}".format(metadata_path))
                    # Work on a copy, so the output's own metadata object is left untouched
                    metadata = copy.deepcopy(output.metadata)
                    metadata["length"] = num_docs
                    # Use standard method to write out the corrected metadata
                    PimlicoDatatype.Writer._write_metadata(metadata_path, metadata)


def count_pimarcs(output):
    """
    Add up the lengths reported by a ``PimarcReader`` for each of the
    archives listed in ``output.archive_filenames``.

    The lengths come from the pimarcs' indices, so the result could be wrong
    if something went wrong while the archives were being written.

    :param output: object providing ``archive_filenames``
    :return: sum of the lengths of all the archives

    """
    # Report the running total as we go, so we know something's happening
    progress = get_open_progress_bar("Counting")
    doc_count = 0
    for path in output.archive_filenames:
        doc_count += len(PimarcReader(path))
        progress.update(doc_count)
    return doc_count