# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html
import json
from builtins import super, bytes
from .utils import _read_var_length_data, _skip_var_length_data
from .index import PimarcIndex
class PimarcReader(object):
    """
    The Pimlico Archive format: read-only archive.

    Opens the archive file for binary reading and loads its index from the
    file whose name is the archive's name with ``i`` appended. The reader can
    be used as a context manager, which closes it on exit.

    The archive is read through a single shared file handle, so random access
    (``reader[name]``) while an iteration from :meth:`iter_files` is in
    progress moves the position that the iteration continues from.

    """
    def __init__(self, archive_filename):
        """
        :param archive_filename: path to the archive file, which must have
            the extension ``.prc``
        :raises IOError: if the filename does not end in ``.prc``
        """
        self.archive_filename = archive_filename
        if not archive_filename.endswith(".prc"):
            raise IOError("pimarc files should have the extension '.prc'")
        self.index_filename = "{}i".format(archive_filename)

        self.archive_file = open(self.archive_filename, mode="rb")
        try:
            self.index = PimarcIndex.load(self.index_filename)
        except BaseException:
            # Loading the index failed: don't leave the archive handle open,
            # since the caller never gets a reader object to close
            self.archive_file.close()
            raise
        self.closed = False

    def close(self):
        """
        Close the archive file and drop the reference to the index.
        """
        self.archive_file.close()
        # Allow garbage collection of the index
        self.index = None
        self.closed = True

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __getitem__(self, item):
        """
        Random access into the archive. Load a named file's data and metadata.

        :param item: name of the file, as used in the index
        :return: tuple (metadata, file data)
        """
        # The index gives pointers to both the metadata and the data. Only the
        # first is needed: once the metadata has been read, the file position
        # is assumed to be at the start of the data, so we just keep reading
        metadata_start, _ = self.index[item]
        return read_doc_from_pimarc_file(self.archive_file, metadata_start)

    def read_file(self, filename):
        """ Load a file. Same as `reader[filename]` """
        return self[filename]

    def iter_filenames(self):
        """
        Iterate over just the filenames in the archive, without further metadata or file data.
        Fast for Pimarc, as the index is fully loaded into memory.

        """
        return iter(self.index.keys())

    def _read_metadata(self):
        """
        Assuming the file is currently at the start of a metadata block, read and
        parse that metadata.

        """
        raw_metadata = _read_var_length_data(self.archive_file)
        return PimarcFileMetadata(raw_metadata)

    def _skip_block(self):
        """
        Assuming the file is currently at the start of a metadata block or a file block,
        read how long it is and skip over it.

        """
        _skip_var_length_data(self.archive_file)

    def iter_files(self, skip=None, start_after=None):
        """
        Iterate over files, together with their JSON metadata, which includes their name (as "name").

        This is a generator, so nothing is read (and no error is raised) until
        the first item is requested.

        :param start_after: skips all files before that with the given name, which is
            expected to be in the archive
        :param skip: skips over the first portion of the archive, until this number of documents have
            been seen. Ignored if start_after is given. Values below 1 mean no skipping.
        :return: iterator over tuples (metadata, file data)
        :raises StartAfterFilenameNotFound: if `start_after` is given, but is not in the index
        """
        # Phase 1: position the file at the metadata of the first file to yield
        if start_after is not None:
            if start_after not in self.index:
                raise StartAfterFilenameNotFound(
                    "filename '{}' not found in the Pimarc archive".format(start_after)
                )
            # Seek to the start of the named file's data, then skip over the
            # data, so we're at the start of the next file's metadata
            self.archive_file.seek(self.index.get_data_start_byte(start_after))
            self._skip_block()
        else:
            # Make sure we're at the start of the file
            self.archive_file.seek(0)
            if skip is not None and skip >= 1:
                skipped = 0
                while skipped < skip:
                    # Each file is a metadata block followed by a data block
                    # NOTE(review): hitting the end of the archive while
                    #  skipping is not handled here: whatever _skip_block
                    #  raises in that case propagates to the caller
                    self._skip_block()
                    self._skip_block()
                    skipped += 1

        # Phase 2: read (metadata, data) pairs until the end of the archive
        while True:
            try:
                metadata = self._read_metadata()
            except EOFError:
                # At this point, it's normal to get an EOF: we've just got to the end neatly
                break
            # The metadata should be followed immediately by the file's data.
            # If there's an EOF here, something's wrong with the file, so it
            # is deliberately not caught
            data = _read_var_length_data(self.archive_file)
            # Wrap in bytes
            # In Py2, this converts the string to a bytes backport
            # In Py3, this is a no-op
            yield metadata, bytes(data)

    def __iter__(self):
        return self.iter_files()

    def __len__(self):
        return len(self.index)
def read_doc_from_pimarc(archive_filename, metadata_start_byte):
    """
    Read a single file's metadata and file data from a given start point in the
    archive. This can be useful if you know the start point and don't want to
    read in the whole index for an archive.

    The archive is opened for binary reading just for this call and is closed
    again before returning, whether or not reading succeeds.

    :param archive_filename: path to archive file
    :param metadata_start_byte: byte from which metadata starts
    :return: tuple (metadata, raw file data)
    """
    with open(archive_filename, mode="rb") as archive_file:
        return read_doc_from_pimarc_file(archive_file, metadata_start_byte)
def read_doc_from_pimarc_file(archive_file, metadata_start_byte):
    """
    Same as `read_doc_from_pimarc`, but operates on an already-opened
    archive file.

    The file's position is moved: on return it is just after the data that
    was read.

    :param archive_file: file-like object
    :param metadata_start_byte: byte from which metadata starts
    :return: tuple (metadata, raw file data)
    """
    # Jump to the start of the metadata
    archive_file.seek(metadata_start_byte)
    # Read the metadata
    metadata = PimarcFileMetadata(_read_var_length_data(archive_file))
    # We're now presumably at the start of the data
    # Assume that this is the case and continue reading from where we stopped
    data = _read_var_length_data(archive_file)
    # Wrap in bytes, as is done when iterating over a whole archive
    # In Py2, this converts the string to a bytes backport
    # In Py3, this is a no-op
    return metadata, bytes(data)
class StartAfterFilenameNotFound(KeyError):
    """
    Raised when iterating over an archive's files with a `start_after`
    filename that is not in the archive's index.

    Subclasses `KeyError`, so it can be caught as a failed lookup.

    """