Source code for pimlico.utils.pimarc.writer

# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

import json
import os

from future.utils import raise_from

from pimlico.utils.pimarc.index import DuplicateFilename
from .utils import _write_var_length_data
from .index import PimarcIndexAppender


class PimarcWriter(object):
    """
    The Pimlico Archive format: writing new archives or appending existing ones.

    An archive is made up of two files: the archive itself, at ``archive_filename``,
    and its index, stored at the same path with an ``i`` appended. Both are opened
    when the writer is created and stay open until :meth:`close` is called.

    The writer can be used as a context manager, in which case it is closed
    automatically when the ``with`` block exits.

    """
    def __init__(self, archive_filename, mode="w"):
        """
        :param archive_filename: path to the archive file. The index path is derived
            from it by appending ``i``
        :param mode: ``"a"`` to append to an existing archive. Any other value is
            treated as write mode, which removes any existing archive and index
            and starts afresh
        :raises IOError: in append mode, if the archive or its index does not exist
        """
        self.archive_filename = archive_filename
        self.index_filename = "{}i".format(archive_filename)
        self.append = mode == "a"

        if self.append:
            # Check the old archive already exists
            if not os.path.exists(archive_filename):
                raise IOError("cannot append to non-existent archive: {}".format(archive_filename))
            if not os.path.exists(self.index_filename):
                raise IOError("cannot append to archive: index file doesn't exist: {}".format(self.index_filename))
        else:
            # Remove any existing files (archive and index) so we start from scratch
            self.delete(archive_filename)

        self.archive_file = open(self.archive_filename, mode="ab" if self.append else "wb")
        try:
            self.index = PimarcIndexAppender(self.index_filename, mode="a" if self.append else "w")
        except BaseException:
            # Don't leave the archive file handle open if the index can't be opened
            self.archive_file.close()
            raise

    @staticmethod
    def delete(archive_filename):
        """
        Delete all files associated with the given archive. At the moment, this
        is just the archive file itself and the associated index.

        Files that do not exist are silently skipped.

        """
        index_filename = "{}i".format(archive_filename)
        for filename in (archive_filename, index_filename):
            if os.path.exists(filename):
                os.remove(filename)

    def close(self):
        """
        Close the archive file and the index.

        The index is closed even if closing the archive file raises an exception.

        """
        try:
            self.archive_file.close()
        finally:
            self.index.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Always close; returning None lets any exception from the block propagate
        self.close()

    def write_file(self, data, name=None, metadata=None):
        """
        Append a write to the end of the archive. The metadata should be a dictionary
        that can be encoded as JSON (which is how it will be stored). The data should
        be a bytes object.

        If you want to write text files, you should encode the text as UTF-8 to get a
        bytes object and write that.

        Setting `name=X` is simply a shorthand for setting `metadata["name"]=X`.
        Either `name` or a metadata dict including the `name` key is required.
        If both are given, `name` takes precedence. The dictionary passed in as
        `metadata` is never modified: a copy is stored.

        :param data: file content to store
        :param name: name of the file within the archive
        :param metadata: JSON-serializable dictionary of metadata
        :raises MetadataError: if no name is available or the metadata cannot be
            encoded as JSON
        :raises DuplicateFilename: if the index already contains the filename

        """
        # Work on a shallow copy, so that adding the name doesn't alter the caller's dict
        metadata = {} if metadata is None else dict(metadata)

        if name is not None:
            metadata["name"] = name
        elif "name" not in metadata:
            # The file's name should always be in the metadata as "name"
            raise MetadataError("metadata should include 'name' key")
        filename = metadata["name"]

        # Check before we write anything that the filename isn't already used
        if filename in self.index:
            raise DuplicateFilename(filename)

        # Check where we're up to in the file
        # This tells us where the metadata starts, which will be stored in the index
        metadata_start = self.archive_file.tell()

        # Encode the metadata as utf-8 JSON
        try:
            metadata_data = json.dumps(metadata).encode("utf-8")
        except Exception as e:
            raise_from(MetadataError("problem encoding metadata as JSON"), e)

        try:
            # Write it to the file, including its length
            _write_var_length_data(self.archive_file, metadata_data)
            # Check where we're up to in the file
            # This tells us where the file data starts, which will be stored in the index
            data_start = self.archive_file.tell()
            # Write out the data, including its length
            _write_var_length_data(self.archive_file, data)
            # Add the file to the index
            self.index.append(filename, metadata_start, data_start)
        except BaseException:
            # Deliberately catches everything, including KeyboardInterrupt:
            # if anything goes wrong during writing or it's cancelled by an interrupt,
            # truncate the partial data that we've just written, so we don't leave the file
            # in a messed up state
            self.archive_file.truncate(metadata_start)
            self.archive_file.seek(metadata_start)
            # Re-raise the exception for handling further up
            raise

    def flush(self):
        """
        Flush the archive's data out to disk, archive and index.

        """
        # First call flush(), which only pushes Python's buffer out to the OS
        self.archive_file.flush()
        # Then we also need to force the system to write it to disk
        os.fsync(self.archive_file.fileno())
        # The index is responsible for flushing its own file
        self.index.flush()
class MetadataError(Exception):
    """
    Raised when the metadata supplied for a file in an archive is unusable: it
    lacks the required ``name`` key or cannot be encoded as JSON.

    """