# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html
"""
Documents consisting of strings.

.. seealso::

   :class:`~pimlico.datatypes.corpora.TextDocumentType` and
   :class:`~pimlico.datatypes.corpora.RawTextDocumentType`: basic text
   (i.e. unicode string) document types for normal textual documents.

"""
from pimlico.datatypes.corpora.data_points import RawDocumentType
# Public API of this module: only the label document type is exported.
__all__ = ["LabelDocumentType"]
class LabelDocumentType(RawDocumentType):
    """
    Simple document type for storing a short label associated with a document.

    Identical to :class:`~pimlico.datatypes.corpora.TextDocumentType`,
    but distinguished for typechecking, so that only corpora designed to be
    used as short labels can be used as input where a label corpus is required.

    The string label is stored in the ``label`` attribute.

    """
    class Document(object):
        """
        Document holding a single text label.

        The internal representation is a dictionary with one key, ``label``,
        whose value is the label as a text string. The raw (stored)
        representation is that same string encoded as UTF-8 bytes.

        """
        # The internal data has exactly one field; the converters below
        # read and write it under this same key.
        keys = ["label"]

        def internal_to_raw(self, internal_data):
            """
            Convert the internal dictionary into raw bytes.

            :param internal_data: dictionary containing the text label
                under the ``"label"`` key
            :return: the label encoded as UTF-8 bytes
            :raises KeyError: if ``internal_data`` has no ``"label"`` entry

            """
            # str.encode() already yields a bytes object, so no further
            # bytes() wrapping is needed.
            return internal_data["label"].encode("utf-8")

        def raw_to_internal(self, raw_data):
            """
            Convert raw bytes into the internal dictionary.

            :param raw_data: the label as UTF-8 encoded bytes
            :return: dictionary with the decoded label under the ``"label"`` key
            :raises UnicodeDecodeError: if ``raw_data`` is not valid UTF-8

            """
            return {"label": raw_data.decode("utf-8")}