Source code for pimlico.datatypes.files

import os

from pimlico.datatypes.base import PimlicoDatatype, IterableCorpus, PimlicoDatatypeWriter, InvalidDocument
from pimlico.datatypes.documents import RawTextDocumentType


class File(PimlicoDatatype):
    """
    Simple datatype that supplies a single file, providing the path to it.

    This is an abstract class: subclasses need to provide a way of getting to
    (e.g. storing) the filename in question, which they do by overriding
    the ``absolute_path`` property.

    """
    datatype_name = "file"

    @property
    def absolute_path(self):
        """
        Path to the file this datatype represents.

        Not implemented here: concrete subclasses must override it.

        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError

    def data_ready(self):
        """
        Check whether the file is available.

        The data is ready only if the superclass reports it ready and the
        path given by ``absolute_path`` exists on disk.

        :return: True if the file can be used, False otherwise
        """
        # The generic readiness check from the parent class has to pass first
        if not super(File, self).data_ready():
            return False

        try:
            # Computing the path is kept inside the try block: subclasses may
            # raise an IOError while working out where the file lives, which
            # is treated as the data not being ready
            file_exists = os.path.exists(self.absolute_path)
        except IOError:
            return False
        return file_exists
def NamedFile(name):
    """
    Datatype factory that produces something like a `File` datatype, pointing
    to a single file, but doesn't store its path: it just refers to a
    particular file, with a fixed name, in the data dir.

    :param name: name of the file
    :return: datatype class

    """
    class _NamedFile(File):
        # Every class built by this factory shares the same datatype name;
        # the individual classes differ only in the filename they point to
        filename = name
        datatype_name = "named_file"

        @classmethod
        def datatype_full_class_name(cls):
            # Refer to the factory function rather than this local class,
            # using a Sphinx-style cross reference
            return ":func:`~pimlico.datatypes.files.NamedFile`"

        @property
        def absolute_path(self):
            # The file is looked up directly inside the datatype's data dir,
            # using the name captured from the factory call
            data_dir = self.data_dir
            return os.path.join(data_dir, name)

    # Present the generated class under the factory's public name
    _NamedFile.__name__ = 'NamedFile'
    return _NamedFile
class NamedFileWriter(PimlicoDatatypeWriter):
    """
    Writer for a single named file, stored under the writer's data dir.

    :param base_dir: passed straight through to ``PimlicoDatatypeWriter``
    :param filename: name of the file to write, relative to the data dir
    :param args: any further positional arguments for the base writer
    :param kwargs: any further keyword arguments for the base writer

    """
    def __init__(self, base_dir, filename, *args, **kwargs):
        # The signature used to collect only extra positional arguments (in a
        # parameter misleadingly called ``kwargs``), so keyword arguments for
        # the base writer were rejected with a TypeError. Both kinds are now
        # forwarded; extra positional arguments still work as before.
        super(NamedFileWriter, self).__init__(base_dir, *args, **kwargs)
        self.filename = filename

    @property
    def absolute_path(self):
        """
        Full path of the output file: ``filename`` joined onto ``data_dir``.

        NOTE(review): ``data_dir`` is not set in this class; presumably it is
        provided by ``PimlicoDatatypeWriter`` -- confirm.
        """
        return os.path.join(self.data_dir, self.filename)

    def write_data(self, data):
        """
        Write the given string data to the appropriate output file

        :param data: string to write; any existing file content is replaced,
            since the file is opened in ``"w"`` mode
        """
        with open(self.absolute_path, "w") as f:
            f.write(data)
class RawTextDirectory(IterableCorpus):
    """
    Basic datatype for reading in all the files in a directory and its
    subdirectories as raw text documents.

    Generally, this may be appropriate to use as the input datatype at the
    start of a pipeline. You'll then want to pass it through a tarred corpus
    filter to get it into a suitable form for input to other modules.

    """
    datatype_name = "raw_text_directory"
    input_module_options = {
        "path": {
            "help": "Full path to the directory containing the files",
            "required": True,
        },
        "encoding": {
            "help": "Encoding used to store the text. Should be given as an encoding name known to Python. By "
                    "default, assumed to be 'utf8'",
            "default": "utf8",
        },
        "encoding_errors": {
            "help": "What to do in the case of invalid characters in the input while decoding (e.g. illegal utf-8 "
                    "chars). Select 'strict' (default), 'ignore', 'replace'. See Python's str.decode() for details",
            "default": "strict",
        },
    }
    data_point_type = RawTextDocumentType
    requires_data_preparation = True

    def prepare_data(self, output_dir, log):
        """
        Count the files in the input directory and store the count as the
        ``length`` entry of the metadata written to ``output_dir``.

        :param output_dir: directory handed to ``PimlicoDatatypeWriter``
        :param log: logger used to report progress
        """
        log.info("Counting files in input directory")
        # Walk over the entire subdirectory structure at the given path
        num_docs = sum(1 for __ in self.walk())

        with PimlicoDatatypeWriter(output_dir) as datatype:
            datatype.metadata["length"] = num_docs

    def walk(self):
        """
        Iterate over the paths of all files found under the ``path`` option,
        descending into subdirectories.

        :return: generator of file paths, each joined onto the directory it
            was found in
        """
        base_path = self.options["path"]
        for base_dir, subdirs, filenames in os.walk(base_path):
            for filename in filenames:
                yield os.path.join(base_dir, filename)

    def filter_document(self, doc):
        """
        Each document is passed through this filter before being yielded.
        Default implementation does nothing, but this makes it easy to
        implement custom postprocessing by overriding.

        :param doc: document produced by the datatype-specific processing
        :return: the document to yield in its place
        """
        return doc

    def __iter__(self):
        """
        Read every file under the input directory and yield it as a document.

        :return: generator of ``(doc_name, document)`` pairs, where the doc
            name is the file's path relative to the ``path`` option
        """
        base_path = self.options["path"]
        encoding = self.options["encoding"]
        errors = self.options["encoding_errors"]

        for file_path in self.walk():
            # Read the raw bytes: the decoding is done explicitly below, so
            # the file must not be opened in text mode. Text mode may alter
            # the bytes before they are decoded (e.g. newline translation on
            # some platforms) and, on Python 3, returns an already-decoded
            # str, which has no decode() method.
            with open(file_path, "rb") as f:
                raw_bytes = f.read()
            # The file is closed at this point, so no handle is held open
            # while the consumer of the generator works on the document

            # Use the file's path within the base directory as its doc name
            rel_path = os.path.relpath(file_path, base_path)
            data = raw_bytes.decode(encoding, errors=errors)
            # Apply datatype-specific processing of the data
            document = self.process_document_data_with_datatype(data)
            # Allow subclasses to apply filters to the data
            if not isinstance(document, InvalidDocument) and not self.raw_data:
                document = self.filter_document(document)
            yield rel_path, document

    def get_required_paths(self):
        """
        :return: list containing just the input directory given by the
            ``path`` option
        """
        return [self.options["path"]]