# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html
from builtins import object
from pimlico.datatypes import NamedFileCollection
from pimlico.datatypes.base import DatatypeWriteError
from pimlico.utils.core import cached_property
class ScoredRealFeatureSets(NamedFileCollection):
    """
    Sets of features, where each feature has an associated real number value,
    and each set (i.e. data point) has a score.

    This is suitable as training data for a multidimensional regression.

    Stores a dictionary of feature types and uses integer IDs to refer to them
    in the data storage.

    The collection consists of two files:

    - ``feature_types.list``: one feature type name per line, utf-8 encoded.
      A feature type's ID is its 0-based position in this list.
    - ``data.csv``: one sample per line, space separated (not comma
      separated, despite the extension): the score, followed by one
      ``<feature id>:<value>`` item per feature.

    .. todo::

       Add unit test for ScoredRealFeatureSets

    """
    datatype_name = "scored_real_feature_sets"
    datatype_supports_python2 = True

    def __init__(self, *args, **kwargs):
        # The collection always consists of exactly these two files
        super(ScoredRealFeatureSets, self).__init__(["feature_types.list", "data.csv"], *args, **kwargs)

    def browse_file(self, reader, filename):
        """
        Produce a human-readable rendering of one of the collection's files.

        For ``data.csv``, feature IDs are replaced by their feature names and
        each sample is shown on its own line as
        ``<score>: <name> (<value>), <name> (<value>), ...``, with values
        rounded to two decimal places. Any other file is rendered by the
        superclass.

        :param reader: reader for this datatype, providing ``feature_types``
            and ``read_file()``
        :param filename: name of the file within the collection to render
        :return: for ``data.csv``, the rendering as utf-8 encoded bytes;
            otherwise whatever the superclass' ``browse_file()`` returns
        """
        if filename != "data.csv":
            # The superclass' rendering has to be returned, otherwise the
            # caller gets None for every file other than data.csv
            return super(ScoredRealFeatureSets, self).browse_file(reader, filename)

        # Show feature names instead of IDs
        feature_names = reader.feature_types
        data = reader.read_file(filename)
        # Reader.feature_types decodes the output of read_file(), so it yields
        # bytes: decode here too so the text processing below works on
        # Python 3, where bytes and text cannot be mixed
        if isinstance(data, bytes):
            data = data.decode("utf8")

        rendered = []
        for line in data.splitlines():
            row = line.split()
            if not row:
                # Blank line: no sample to show
                continue
            features = []
            for item in row[1:]:
                feature_id, __, value = item.partition(":")
                features.append(
                    u"{} ({:.2f})".format(feature_names[int(feature_id)], float(value))
                )
            # The first column is the sample's score, shown as it is stored
            rendered.append(u"{}: {}".format(row[0], u", ".join(features)))
        return u"\n".join(rendered).encode("utf8")

    # NOTE(review): Reader and Writer name `object` as their base, but call
    # inherited helpers (get_absolute_path, read_file, write_file,
    # task_complete) and super().__enter__/__exit__. Presumably the framework
    # rebases nested Reader/Writer classes onto those of the parent datatype
    # -- confirm.
    class Reader(object):
        def __iter__(self):
            """
            Iterate over samples as ``(features, score)`` pairs, where
            ``features`` is a dict mapping feature names to float values.
            """
            for features, score in self.iter_ids():
                # Translate feature IDs into names
                yield dict((self.feature_types[f], v) for (f, v) in features.items()), score

        def read_samples(self):
            """
            Read all samples in from the data file.

            Note that `__iter__()` iterates over the file without loading everything
            into memory, which may be preferable if dealing with big datasets.

            :return: list of ``(features, score)`` pairs, as produced by `__iter__()`
            """
            return list(self)

        def iter_ids(self):
            """
            Iterate over the raw ID data from the data file, without translating feature
            type IDs into feature names.

            Yields ``(features, score)`` pairs, where ``features`` is a dict
            mapping integer feature IDs to float values and ``score`` is a
            float. Blank lines in the file are skipped.
            """
            with open(self.get_absolute_path("data.csv"), "r") as f:
                for line in f:
                    # split() with no argument discards the trailing newline
                    # and any other surrounding whitespace
                    values = line.split()
                    if not values:
                        # Blank line: there is no score to read
                        continue
                    # The first value is the score
                    score = float(values[0])
                    # The rest are feature id -> value mappings
                    feature_id_vals = [val.split(":") for val in values[1:]]
                    features = dict((int(f_id), float(v)) for (f_id, v) in feature_id_vals)
                    yield features, score

        @cached_property
        def feature_types(self):
            """
            List of feature type names read from ``feature_types.list``.
            A feature's ID is its index in this list.
            """
            data = self.read_file("feature_types.list")
            return data.decode("utf8").splitlines()

        @cached_property
        def num_samples(self):
            """
            Number of samples in the data file, i.e. the number of non-blank
            lines, matching what `iter_ids()` yields.
            """
            with open(self.get_absolute_path("data.csv"), "r") as f:
                return sum(1 for line in f if line.strip())

        def __len__(self):
            return self.num_samples

    class Writer(object):
        def set_feature_types(self, feature_types):
            """
            Explicitly set the list of feature types that will be written out.
            All feature types given will be included, plus possibly others that are
            used in the written samples, which will be added to the set.

            This can be useful if you want your feature vocabulary to include
            the whole of a given set, even if some feature types are never
            used in the data. It can also be useful to ensure particular IDs
            are used for particular feature types, if you care about that.

            The list of used feature types is initialized in `__enter__()`,
            so this should be called inside the writer's ``with`` block.

            :param feature_types: iterable of feature type names; a name's
                position becomes its ID
            :raises DatatypeWriteError: if the feature type list is already
                non-empty, through written samples or an earlier call
            """
            if self._used_feature_types:
                # The feature set has already started being constructed
                raise DatatypeWriteError("tried to set feature type list explicitly after it's already started "
                                         "being constructed implicitly (or has previously been set explicitly)")
            self._used_feature_types = list(feature_types)

        def write_samples(self, samples):
            """
            Writes a list of samples, each given as a (features, score) pair.
            See `write_sample()`

            :param samples: iterable of ``(features, score)`` pairs
            """
            for (fs, score) in samples:
                self.write_sample(fs, score)

        def write_sample(self, features, score):
            """
            Write out a single sample to the end of the data file.
            Features should be given by name in a dictionary mapping the feature type
            to its value.

            Both the score and the feature values are written with ``{:f}``
            formatting, i.e. in fixed-point notation with six decimal places.

            :param features: dict(feature name -> feature value)
            :param score: score associated with this data point
            """
            # Map feature names to IDs, adding new feature types as necessary
            self._used_feature_types.extend(f for f in features.keys() if f not in self._used_feature_types)
            features_by_id = dict(
                (self._used_feature_types.index(f), val) for (f, val) in features.items()
            )
            # Write out a line containing the score and each of the feature id -> value mappings
            self.data_file.write("{:f} {}\n".format(
                score,
                " ".join(u"{:d}:{:f}".format(f, v) for (f, v) in features_by_id.items())
            ))

        def __enter__(self):
            super(ScoredRealFeatureSets.Writer, self).__enter__()
            # Open the data file for writing: we'll write one data point per line
            self.data_file = open(self.get_absolute_path("data.csv"), "w")
            # Keep track of used feature types to output the dictionary at the end
            self._used_feature_types = []
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            # The data file is complete once the with block ends
            self.data_file.close()
            self.task_complete("write_data.csv")
            # Write out the feature type dictionary: one name per line, so
            # that a line's index is the feature type's ID
            self.write_file("feature_types.list", u"\n".join(self._used_feature_types).encode("utf8"))
            super(ScoredRealFeatureSets.Writer, self).__exit__(exc_type, exc_val, exc_tb)