# Source code for pimlico.utils.pimarc.tar

# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

"""
Wrapper around tar reader, to provide the same interface as Pimarc.

This means we can deprecate the use of tar files, but keep backwards compatibility
for a time, whilst moving over to direct use of Pimarc objects.

"""
from builtins import bytes
from future.utils import PY2

import os
import shutil
import tarfile
from tempfile import mkdtemp

from itertools import islice

from pimlico.utils.pimarc.reader import StartAfterFilenameNotFound


class PimarcTarBackend(object):
    """
    Read-only wrapper around an uncompressed tar archive, exposing the
    reading interface used for Pimarc archives: context-manager use, iteration
    over ``(metadata, data)`` pairs, random access by filename and ``len()``.

    The archive is not opened on construction: call :meth:`open` or use the
    object as a context manager before reading from it.

    :param archive_filename: path to the tar file to read
    """
    def __init__(self, archive_filename):
        self.archive_filename = archive_filename
        # Set by open(); remains None until then
        self.archive_file = None
        self.closed = False

    def open(self):
        """
        Open the tar archive for reading (uncompressed: mode ``"r:"``).

        :return: the underlying :class:`tarfile.TarFile`
        """
        self.archive_file = tarfile.open(self.archive_filename, mode="r:")
        # Allow a backend to be reopened after it has been closed
        self.closed = False
        return self.archive_file

    def close(self):
        """
        Close the underlying tar file. Safe to call if the archive was never opened.

        """
        if self.archive_file is not None:
            self.archive_file.close()
        self.closed = True

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def _get_metadata(self, filename):
        """
        Metadata is always the same with the tar backend: just a dictionary containing
        the key `name` with the filename.

        """
        return {"name": filename}

    @staticmethod
    def _decode_name(name):
        """
        Return a tar member's name as text.

        Byte-string names (as produced by the tar module on Python 2) are decoded
        as UTF-8; names that are already text are returned unchanged.

        """
        if isinstance(name, bytes):
            return name.decode("utf-8")
        return name

    def _rewind(self):
        """
        Move the underlying file object back to the start of the archive, so that
        iteration over the members begins from the first one.

        """
        self.archive_file.fileobj.seek(0)

    def _read_member(self, member):
        """
        Read a member's raw data straight from the archive, without extracting
        anything to disk.

        :param member: member name or ``TarInfo`` object
        :return: the member's data, or ``None`` if the member has no data to read
            (e.g. it is a directory)
        """
        member_file = self.archive_file.extractfile(member)
        if member_file is None:
            return None
        try:
            return member_file.read()
        finally:
            member_file.close()

    def __getitem__(self, item):
        """
        Random access into the archive. Load a named file's data and metadata.

        This is a very bad thing to do (at least to do many times) with tar files. That's
        why I've replaced tar files with Pimarc. However, for backwards compatibility,
        we do this in the inefficient way the tar allows.

        :param item: name of the file in the archive
        :return: pair of ``(metadata, raw data)``
        :raises KeyError: if there is no such file in the archive, or the named
            member is not something that has data (e.g. a directory)
        """
        raw_data = self._read_member(item)
        if raw_data is None:
            raise KeyError("'{}' is not a regular file in the tar archive".format(item))
        return self._get_metadata(item), raw_data

    def iter_filenames(self):
        """
        Just iterate over the filenames (decoded if necessary). Used to create metadata,
        check for file existence, etc.

        Not as fast as with Pimarc, as we need to pass over the whole archive file to
        read all the names.

        """
        # Make sure we're at the start of the file
        self._rewind()
        for tarinfo in self.archive_file:
            yield self._decode_name(tarinfo.name)

    def iter_metadata(self):
        """
        Iterate over all files in the archive, yielding just the metadata, skipping
        over the data.

        """
        for filename in self.iter_filenames():
            yield self._get_metadata(filename)

    def iter_files(self, skip=None, start_after=None):
        """
        Iterate over files, together with their JSON metadata, which includes their name (as "name").

        Data is read directly out of the archive, never extracted to disk, so member
        names containing ``..`` or absolute paths cannot touch the filesystem and
        the permissions stored in the archive do not affect reading. Members that
        carry no data (e.g. directory entries) are skipped.

        :param start_after: skips all files before that with the given name, which is
            expected to be in the archive
        :param skip: skips over the first portion of the archive, until this number of documents have
            been seen. Ignored if start_after is given.
        :return: generator of ``(metadata, raw data)`` pairs
        :raises StartAfterFilenameNotFound: once the archive is exhausted, if
            `start_after` was given but no file of that name was encountered
        """
        # Make sure we're at the start of the file
        self._rewind()

        started = True
        tarinfo_iter = self.archive_file
        if start_after is not None:
            # Don't start until we encounter the filename
            started = False
        elif skip is not None:
            # Skip over the given number of files
            tarinfo_iter = islice(self.archive_file, skip, None)

        for tarinfo in tarinfo_iter:
            filename = self._decode_name(tarinfo.name)

            if not started:
                # Check whether this is the file we're to start after
                if filename == start_after:
                    # Skip this file, but start on the next one
                    started = True
                continue

            raw_data = self._read_member(tarinfo)
            if raw_data is None:
                # Not a file with contents (e.g. a directory): nothing to yield
                continue

            # Wrap in bytes
            # In Py2, this converts the string to a bytes backport
            # In Py3, this is a no-op
            yield self._get_metadata(filename), bytes(raw_data)

        # Catch the case where the filename requested as a starting point wasn't found
        if not started and start_after is not None:
            raise StartAfterFilenameNotFound("filename '{}' not found in the tar archive".format(start_after))

    def __iter__(self):
        return self.iter_files()

    def __len__(self):
        """
        Number of members in the archive. Requires a pass over the whole archive.

        """
        self._rewind()
        return sum(1 for _ in self.archive_file)