# Source code for pimlico.utils.pimarc.reader

# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

import json

from builtins import super, bytes

from .utils import _read_var_length_data, _skip_var_length_data
from .index import PimarcIndex


class PimarcReader(object):
    """
    The Pimlico Archive format: read-only archive.

    Opens the archive file (which must have the extension ``.prc``) in binary
    mode and loads its index from the companion file whose name is the archive
    filename with ``i`` appended.

    The reader can be used as a context manager, in which case the archive
    file is closed on exit. Iterating over the reader is the same as calling
    :meth:`iter_files` and ``len(reader)`` is the number of entries in the index.

    :param archive_filename: path to the ``.prc`` archive file
    """
    def __init__(self, archive_filename):
        self.archive_filename = archive_filename
        if not archive_filename.endswith(".prc"):
            raise IOError("pimarc files should have the extension '.prc'")
        self.index_filename = "{}i".format(archive_filename)

        self.archive_file = open(self.archive_filename, mode="rb")
        try:
            self.index = PimarcIndex.load(self.index_filename)
        except Exception:
            # Don't leak the open archive handle if the index can't be loaded
            self.archive_file.close()
            raise
        self.closed = False

    def close(self):
        """
        Close the archive file and drop the reference to the index.

        """
        self.archive_file.close()
        # Allow garbage collection of the index
        self.index = None
        self.closed = True

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __getitem__(self, item):
        """
        Random access into the archive. Load a named file's data and metadata.

        :param item: name of the file, as stored in the index
        :return: tuple (metadata, file data)
        """
        # Look up the filename in the index and get pointers to its metadata and data
        metadata_start, _data_start = self.index[item]
        # There's some redundancy in this case: after reading the metadata we're
        # presumably at the start of the data, so the data pointer isn't needed
        metadata, data = read_doc_from_pimarc_file(self.archive_file, metadata_start)
        # Wrap in bytes, for consistency with iter_files()
        # In Py2, this converts the string to a bytes backport
        # In Py3, this is a no-op
        return metadata, bytes(data)

    def read_file(self, filename):
        """ Load a file. Same as `reader[filename]` """
        return self[filename]

    def iter_filenames(self):
        """
        Iterate over just the filenames in the archive, without further metadata or file data.
        Fast for Pimarc, as the index is fully loaded into memory.

        """
        return iter(self.index.keys())

    def _read_metadata(self):
        """
        Assuming the file is currently at the start of a metadata block, read
        that block and wrap it in a :class:`PimarcFileMetadata`, which parses
        it lazily.

        An ``EOFError`` raised by the underlying read is not caught here: callers
        treat it as the normal end of the archive.

        """
        # Read the metadata
        return PimarcFileMetadata(_read_var_length_data(self.archive_file))

    def _skip_block(self):
        """
        Assuming the file is currently at the start of a metadata block or a file block,
        read how long it is and skip over it.

        """
        _skip_var_length_data(self.archive_file)

    def iter_metadata(self):
        """
        Iterate over all files in the archive, yielding just the metadata, skipping
        over the data.

        """
        # Make sure we're at the start of the file
        self.archive_file.seek(0)
        while True:
            # Try reading the metadata of the next file
            try:
                metadata = self._read_metadata()
            except EOFError:
                # At this point, it's normal to get an EOF: we've just got to the end neatly
                break
            # This should be followed by the file's data, which we skip over, since we don't need it
            self._skip_block()
            yield metadata

    def iter_files(self, skip=None, start_after=None):
        """
        Iterate over files, together with their JSON metadata, which includes their name (as "name").

        If `skip` is larger than the number of files in the archive, nothing is yielded.

        :param start_after: skips all files before that with the given name, which is
            expected to be in the archive
        :param skip: skips over the first portion of the archive, until this number of documents have
            been seen. Ignored if start_after is given.
        :return: generator of tuples (metadata, file data)
        :raises StartAfterFilenameNotFound: if `start_after` is given but is not in the index
        """
        if start_after is not None:
            # Look up this filename in the index
            if start_after not in self.index:
                raise StartAfterFilenameNotFound("filename '{}' not found in the Pimarc archive".format(start_after))
            # Get the start byte of the file's data
            start_after_start_byte = self.index.get_data_start_byte(start_after)
            # Seek to this byte, then skip over the data, so we're at the start of the next file's metadata
            self.archive_file.seek(start_after_start_byte)
            self._skip_block()
            # Don't skip any more files
            started = True
        else:
            # Make sure we're at the start of the file
            self.archive_file.seek(0)

            if skip is not None and skip < 1:
                skip = None
            # Don't wait to start if skip is not given
            started = skip is None

        skipped = 0
        while True:
            if not started:
                # Skip this file's metadata
                try:
                    self._skip_block()
                except EOFError:
                    # Ran out of files before skipping enough: there's nothing to yield
                    # NOTE(review): assumes _skip_var_length_data signals the end of the
                    # file with EOFError, as _read_var_length_data does - confirm
                    return
                # And the file's data
                # If there's an EOF here, something's wrong with the file
                self._skip_block()

                skipped += 1
                if skipped >= skip:
                    # Skipped enough files: start reading at the next one
                    started = True
            else:
                # Try reading the metadata of the next file
                try:
                    metadata = self._read_metadata()
                except EOFError:
                    # At this point, it's normal to get an EOF: we've just got to the end neatly
                    break
                # This should be followed by the file's data immediately
                # Read it in
                # If there's an EOF here, something's wrong with the file
                data = _read_var_length_data(self.archive_file)

                # Wrap in bytes
                # In Py2, this converts the string to a bytes backport
                # In Py3, this is a no-op
                data = bytes(data)

                yield metadata, data

    def __iter__(self):
        return self.iter_files()

    def __len__(self):
        return len(self.index)


def read_doc_from_pimarc(archive_filename, metadata_start_byte):
    """
    Open an archive, read one file's metadata and data starting at a known
    byte offset, and close the archive again.

    Handy when the offset is already known and loading the archive's whole
    index would be wasteful.

    :param archive_filename: path to archive file
    :param metadata_start_byte: byte from which metadata starts
    :return: tuple (metadata, raw file data)
    """
    with open(archive_filename, mode="rb") as opened_archive:
        doc = read_doc_from_pimarc_file(opened_archive, metadata_start_byte)
    return doc


def read_doc_from_pimarc_file(archive_file, metadata_start_byte):
    """
    Variant of `read_doc_from_pimarc` that works on an archive file that
    the caller has already opened.

    :param archive_file: file-like object
    :param metadata_start_byte: byte from which metadata starts
    :return: tuple (metadata, raw file data)
    """
    # Position the file at the beginning of the metadata block
    archive_file.seek(metadata_start_byte)
    # First block: the raw metadata, wrapped so it can be decoded on demand
    raw_metadata = _read_var_length_data(archive_file)
    metadata = PimarcFileMetadata(raw_metadata)
    # The data block is assumed to follow directly, so just carry on reading
    # from the current position
    return metadata, _read_var_length_data(archive_file)


def metadata_decode_decorator(fn):
    """
    Wrap a method so that ``self.decode()`` is called before the method runs.

    Used to make the ``dict`` methods of :class:`PimarcFileMetadata` trigger
    lazy decoding of the raw metadata.

    :param fn: function taking the object as its first argument
    :return: function that decodes the object and then delegates to `fn`
    """
    def _new_fn(self, *args, **kwargs):
        self.decode()
        return fn(self, *args, **kwargs)
    # Keep the wrapped method's name and docs, so help() and tracebacks stay readable
    _new_fn.__name__ = getattr(fn, "__name__", _new_fn.__name__)
    _new_fn.__doc__ = getattr(fn, "__doc__", None)
    return _new_fn


class PimarcFileMetadata(dict):
    """
    Simple wrapper around the JSON-encoded metadata associated with a file in a
    Pimarc archive. When the metadata is loaded, the raw bytes data is wrapped in
    an instance of PimarcFileMetadata, so that it can be easily decoded when
    needed, but avoiding decoding all metadata, which might not ever be needed.

    You can simply use the object as if it is a dict and it will decode the JSON
    data the first time you try accessing it: item access, ``in``, ``len()``,
    iteration, comparison and the usual ``dict`` methods all decode first.
    You can also call `dict(obj)` to get a plain dict instead (call
    :meth:`decode` explicitly beforehand if you need to be sure the data has
    been decoded, whatever the Python implementation).

    :param raw_data: bytes containing the UTF-8, JSON-encoded metadata
    """
    def __init__(self, raw_data):
        super().__init__()
        self.raw_data = raw_data
        self._decoded = False

    def decode(self):
        """
        Decode the raw data as UTF-8, parse it as JSON and load the result into
        this dict. Does nothing if the data has already been decoded.

        """
        if not self._decoded:
            # Decode the metadata and parse as JSON
            parsed = json.loads(self.raw_data.decode("utf-8"))
            # Call dict.update directly: this class's own update() decodes
            # first, so using it here would recurse
            dict.update(self, parsed)
            self._decoded = True

    # Every way of looking at or changing the contents decodes the raw data first
    __getitem__ = metadata_decode_decorator(dict.__getitem__)
    __setitem__ = metadata_decode_decorator(dict.__setitem__)
    __delitem__ = metadata_decode_decorator(dict.__delitem__)
    __contains__ = metadata_decode_decorator(dict.__contains__)
    # Overriding __iter__ also stops dict(obj) from copying the (possibly still
    # empty) underlying storage directly on recent CPython versions
    __iter__ = metadata_decode_decorator(dict.__iter__)
    __len__ = metadata_decode_decorator(dict.__len__)
    __eq__ = metadata_decode_decorator(dict.__eq__)
    __ne__ = metadata_decode_decorator(dict.__ne__)
    keys = metadata_decode_decorator(dict.keys)
    values = metadata_decode_decorator(dict.values)
    items = metadata_decode_decorator(dict.items)
    get = metadata_decode_decorator(dict.get)
    pop = metadata_decode_decorator(dict.pop)
    setdefault = metadata_decode_decorator(dict.setdefault)
    # Decode before updating, so that a later decode can't overwrite the new values
    update = metadata_decode_decorator(dict.update)


class StartAfterFilenameNotFound(KeyError):
    """
    Raised when iteration is asked to start after a filename that is not in
    the archive's index.

    """