Source code for pimlico.datatypes.features

# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

from builtins import object
from pimlico.datatypes import NamedFileCollection
from pimlico.datatypes.base import DatatypeWriteError
from pimlico.utils.core import cached_property


class ScoredRealFeatureSets(NamedFileCollection):
    """
    Sets of features, where each feature has an associated real number value, and
    each set (i.e. data point) has a score.

    This is suitable as training data for a multidimensional regression.

    Stores a dictionary of feature types and uses integer IDs to refer to them
    in the data storage. Two files are used:

    - ``feature_types.list``: one feature type name per line, utf-8 encoded. The
      (0-based) line number of a name is the ID used for it in the data file.
    - ``data.csv``: one data point per line. Each line consists of
      whitespace-separated fields: first the score, then any number of
      ``<feature id>:<value>`` pairs.

    .. todo::

       Add unit test for ScoredRealFeatureSets

    """
    datatype_name = "scored_real_feature_sets"
    datatype_supports_python2 = True

    def __init__(self, *args, **kwargs):
        # The collection always consists of the same two files
        super(ScoredRealFeatureSets, self).__init__(["feature_types.list", "data.csv"], *args, **kwargs)

    def browse_file(self, reader, filename):
        """
        Produce the text shown for one of the collection's files.

        For ``data.csv``, feature IDs are replaced by the corresponding feature
        names and values are shown to two decimal places, giving one line per
        data point of the form ``<score>: <name> (<value>), ...``. Any other
        file is handled by the superclass.

        :param reader: reader for this datatype, used to load the file's data
            and the feature type list
        :param filename: name of the file within the collection
        :return: utf-8 encoded text for ``data.csv``; otherwise whatever the
            superclass implementation returns
        """
        if filename != "data.csv":
            # The result must be returned, otherwise every other file shows up as None
            return super(ScoredRealFeatureSets, self).browse_file(reader, filename)

        # Show feature names instead of IDs
        feature_names = reader.feature_types
        data = reader.read_file(filename)
        if isinstance(data, bytes):
            # The raw file data comes back as bytes: decode it so that the text
            # processing below also works on Python 3
            data = data.decode("utf8")

        formatted_rows = []
        for row in (line.split() for line in data.splitlines()):
            formatted_features = []
            for item in row[1:]:
                feature_id, __, value = item.partition(":")
                formatted_features.append(
                    u"{} ({:.2f})".format(feature_names[int(feature_id)], float(value))
                )
            # The first field of each row is the data point's score
            formatted_rows.append(u"{}: {}".format(row[0], u", ".join(formatted_features)))
        return u"\n".join(formatted_rows).encode("utf8")

    class Reader(object):
        """
        Reader for scored feature sets.

        Iterating over the reader yields ``(features, score)`` pairs, where
        ``features`` is a dictionary mapping feature names to float values.

        """
        def __iter__(self):
            feature_types = self.feature_types
            for features, score in self.iter_ids():
                # Translate feature IDs into names
                yield {feature_types[f_id]: val for (f_id, val) in features.items()}, score

        def read_samples(self):
            """
            Read all samples in from the data file. Note that `__iter__()` iterates
            over the file without loading everything into memory, which may be
            preferable if dealing with big datasets.

            :return: list of (features, score) pairs, with features keyed by name
            """
            return list(self)

        def iter_ids(self):
            """
            Iterate over the raw ID data from the data file, without translating
            feature type IDs into feature names.

            Yields ``(features, score)`` pairs, where ``features`` maps integer
            feature IDs to float values and ``score`` is a float.

            """
            with open(self.get_absolute_path("data.csv"), "r") as data_file:
                for line in data_file:
                    # split() with no arguments also discards the trailing newline
                    values = line.split()
                    # The first value is the score
                    score = float(values[0])
                    # The rest are feature id -> value mappings
                    id_val_pairs = (val.split(":") for val in values[1:])
                    features = {int(f_id): float(f_val) for (f_id, f_val) in id_val_pairs}
                    yield features, score

        @cached_property
        def feature_types(self):
            """
            List of feature type names, read from ``feature_types.list``. A
            feature's ID in the data file is its index in this list.

            """
            data = self.read_file("feature_types.list")
            return data.decode("utf8").splitlines()

        @cached_property
        def num_samples(self):
            """
            Number of data points, computed by counting the lines of the data file.

            """
            # Count the lines in the data file
            with open(self.get_absolute_path("data.csv"), "r") as data_file:
                return sum(1 for __ in data_file)

        def __len__(self):
            return self.num_samples

    class Writer(object):
        """
        Writer for scored feature sets, to be used as a context manager.

        Samples are appended to ``data.csv`` one at a time as they are written.
        The list of feature types is accumulated while writing and stored in
        ``feature_types.list`` when the context is exited.

        """
        def set_feature_types(self, feature_types):
            """
            Explicitly set the list of feature types that will be written out. All
            feature types given will be included, plus possibly others that are used
            in the written samples, which will be added to the set.

            This can be useful if you want your feature vocabulary to include the
            whole of a given set, even if some feature types are never used in the
            data. It can also be useful to ensure particular IDs are used for
            particular feature types, if you care about that.

            :param feature_types: iterable of feature type names; a name's position
                gives its ID
            :raises DatatypeWriteError: if the feature type list is already
                non-empty, either because samples have been written or because a
                list has previously been set
            """
            if self._used_feature_types:
                # The feature set has already started being constructed
                raise DatatypeWriteError("tried to set feature type list explicitly after it's already started "
                                         "being constructed implicitly (or has previously been set explicitly)")
            self._used_feature_types = list(feature_types)
            # Rebuild the name -> ID index. If a name is given more than once, its
            # first position is used as its ID, as list.index() would give
            feature_type_ids = {}
            for feature_id, feature_type in enumerate(self._used_feature_types):
                feature_type_ids.setdefault(feature_type, feature_id)
            self._feature_type_ids = feature_type_ids

        def write_samples(self, samples):
            """
            Writes a list of samples, each given as a (features, score) pair.
            See `write_sample()`

            :param samples: iterable of (features, score) pairs
            """
            for (fs, score) in samples:
                self.write_sample(fs, score)

        def write_sample(self, features, score):
            """
            Write out a single sample to the end of the data file. Features should
            be given by name in a dictionary mapping the feature type to its value.

            :param features: dict(feature name -> feature value)
            :param score: score associated with this data point
            """
            feature_types = self._used_feature_types
            feature_type_ids = self._feature_type_ids
            # Map feature names to IDs, adding new feature types as necessary.
            # The dictionary index avoids scanning the full list of feature
            # types for every feature of every sample
            for feature_type in features:
                if feature_type not in feature_type_ids:
                    feature_type_ids[feature_type] = len(feature_types)
                    feature_types.append(feature_type)
            features_by_id = {feature_type_ids[f]: val for (f, val) in features.items()}

            # Write out a line containing the score and each of the feature id -> value mappings
            self.data_file.write("{:f} {}\n".format(
                score,
                " ".join(u"{:d}:{:f}".format(f_id, val) for (f_id, val) in features_by_id.items())
            ))

        def __enter__(self):
            super(ScoredRealFeatureSets.Writer, self).__enter__()
            # Open the data file for writing: we'll write one data point per line
            self.data_file = open(self.get_absolute_path("data.csv"), "w")
            # Keep track of used feature types to output the dictionary at the end
            self._used_feature_types = []
            # Index from feature type name to ID (position in the list above)
            self._feature_type_ids = {}
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            self.data_file.close()
            # NOTE(review): presumably registers the data file as written with the base writer - confirm
            self.task_complete("write_data.csv")
            # Write out the feature type dictionary
            self.write_file("feature_types.list", u"\n".join(self._used_feature_types).encode("utf8"))
            super(ScoredRealFeatureSets.Writer, self).__exit__(exc_type, exc_val, exc_tb)