Source code for pimlico.datatypes.files

import os

from pimlico.datatypes.base import PimlicoDatatype, IterableCorpus, PimlicoDatatypeWriter, InvalidDocument
from pimlico.datatypes.documents import RawTextDocumentType


class File(PimlicoDatatype):
    """
    Datatype representing exactly one file, exposed to users through its path.

    The class is abstract: it does not know where the file lives. Subclasses must override
    ``absolute_path`` to supply (e.g. by storing it somewhere) the location of the file.

    """
    datatype_name = "file"

    def data_ready(self):
        """
        Report whether the data is ready to be read.

        The superclass check must pass and, in addition, the file that ``absolute_path`` points to
        must exist.

        :return: True if the superclass reports ready and the file exists, False otherwise
        """
        if not super(File, self).data_ready():
            return False
        try:
            # Evaluating the property is inside the try: a subclass may raise an IOError while
            # working out the path, which is treated the same as the file not being there
            file_exists = os.path.exists(self.absolute_path)
        except IOError:
            return False
        return file_exists

    @property
    def absolute_path(self):
        """
        Path to the file. Not implemented here: subclasses must override this.
        """
        raise NotImplementedError


def NamedFile(name):
    """
    Datatype factory: build a `File`-like datatype class bound to one fixed filename.

    The generated class does not store a path. Instead, its ``absolute_path`` is computed each time
    by joining the datatype's ``data_dir`` with ``name``.

    :param name: name of the file, relative to the data dir
    :return: datatype class
    """
    class _NamedFile(File):
        filename = name
        datatype_name = "named_file"

        @classmethod
        def datatype_full_class_name(cls):
            # Point at the factory function rather than at this locally-defined class
            return ":func:`~pimlico.datatypes.files.NamedFile`"

        @property
        def absolute_path(self):
            # The file always sits directly inside the datatype's data dir
            return os.path.join(self.data_dir, name)

    # Present the generated class under the factory's own name
    generated = _NamedFile
    generated.__name__ = 'NamedFile'
    return generated


class NamedFileWriter(PimlicoDatatypeWriter):
    """
    Datatype writer whose output is a single named file inside the writer's data dir.

    :param base_dir: base directory, passed straight through to ``PimlicoDatatypeWriter``
    :param filename: name of the file to write, relative to ``data_dir``
    :param args: any further positional arguments, forwarded to ``PimlicoDatatypeWriter``
    :param kwargs: any keyword arguments, forwarded to ``PimlicoDatatypeWriter``
    """
    def __init__(self, base_dir, filename, *args, **kwargs):
        # Forward both positional and keyword extras. Previously only a var-positional
        # parameter (misleadingly named "kwargs") was accepted, so passing any keyword
        # argument intended for the base writer raised a TypeError.
        super(NamedFileWriter, self).__init__(base_dir, *args, **kwargs)
        self.filename = filename

    @property
    def absolute_path(self):
        """
        Full path of the output file: ``filename`` joined onto ``data_dir``.
        """
        return os.path.join(self.data_dir, self.filename)

    def write_data(self, data):
        """
        Write the given string data to the appropriate output file, replacing any
        existing content.

        :param data: string to write to the file
        """
        with open(self.absolute_path, "w") as f:
            f.write(data)


class RawTextDirectory(IterableCorpus):
    """
    Basic datatype for reading in all the files in a directory and its subdirectories as raw text documents.

    Generally, this may be appropriate to use as the input datatype at the start of a pipeline. You'll then
    want to pass it through a tarred corpus filter to get it into a suitable form for input to other modules.

    Documents are named by their path relative to the input directory and are yielded in the
    order in which ``os.walk`` visits them.

    """
    datatype_name = "raw_text_directory"
    input_module_options = {
        "path": {
            "help": "Full path to the directory containing the files",
            "required": True,
        },
        "encoding": {
            "help": "Encoding used to store the text. Should be given as an encoding name known to Python. By "
                    "default, assumed to be 'utf8'",
            "default": "utf8",
        },
        "encoding_errors": {
            "help": "What to do in the case of invalid characters in the input while decoding (e.g. illegal utf-8 "
                    "chars). Select 'strict' (default), 'ignore', 'replace'. See Python's str.decode() for details",
            "default": "strict",
        },
    }
    data_point_type = RawTextDocumentType
    requires_data_preparation = True

    def prepare_data(self, output_dir, log):
        """
        Count the files under the input directory and store the count as the ``length`` metadata.

        :param output_dir: directory handed to ``PimlicoDatatypeWriter`` for the metadata
        :param log: logger used to report progress
        """
        log.info("Counting files in input directory")
        # Walk over the entire subdirectory structure at the given path
        num_docs = sum(1 for __ in self.walk())

        with PimlicoDatatypeWriter(output_dir) as datatype:
            datatype.metadata["length"] = num_docs

    def walk(self):
        """
        Generate the full path of every file found under the ``path`` option, recursing into
        subdirectories.
        """
        base_path = self.options["path"]
        for base_dir, subdirs, filenames in os.walk(base_path):
            for filename in filenames:
                yield os.path.join(base_dir, filename)

    def filter_document(self, doc):
        """
        Each document is passed through this filter before being yielded.
        Default implementation does nothing, but this makes it easy to implement custom postprocessing
        by overriding.
        """
        return doc

    def __iter__(self):
        """
        Iterate over ``(doc_name, document)`` pairs, one per file under the input directory.

        Each file's content is decoded using the ``encoding`` and ``encoding_errors`` options.
        With ``encoding_errors`` set to ``'strict'``, a file that cannot be decoded raises a
        ``UnicodeDecodeError``.
        """
        base_path = self.options["path"]
        encoding = self.options["encoding"]
        errors = self.options["encoding_errors"]

        for file_path in self.walk():
            # Read the raw bytes: the content is decoded explicitly below, so the file must not
            # be opened in text mode (which may alter the bytes on some platforms and, on
            # Python 3, would return an already-decoded str that has no decode() method).
            # The file is closed again before yielding, so that no handle is held open while
            # the consumer is working on the document.
            with open(file_path, "rb") as f:
                raw_bytes = f.read()
            data = raw_bytes.decode(encoding, errors=errors)
            # Use the file's path within the base directory as its doc name
            rel_path = os.path.relpath(file_path, base_path)
            # Apply datatype-specific processing of the data
            document = self.process_document_data_with_datatype(data)
            # Allow subclasses to apply filters to the data. Invalid documents are passed on
            # untouched and no filtering is done when raw data was requested
            if not isinstance(document, InvalidDocument) and not self.raw_data:
                document = self.filter_document(document)
            yield rel_path, document

    def get_required_paths(self):
        """
        The only required path is the input directory given by the ``path`` option.
        """
        return [self.options["path"]]