diff utils/scalar_series/series.py @ 186:d364a130b221
Added the base code for scalar_series. Changes to stacked_dae: fixed a problem with the input_divider (it was preventing an optimization), and added use of the series. Also, in case I hadn't already committed it, I removed the pretraining-reuse mechanism: it was complicated (error prone) and it produced jobs that were far too long.
| author | fsavard |
|---|---|
| date | Mon, 01 Mar 2010 11:45:25 -0500 |
| parents | |
| children | |
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/scalar_series/series.py	Mon Mar 01 11:45:25 2010 -0500
@@ -0,0 +1,311 @@
+#!/usr/bin/python
+# coding: utf-8
+
+from __future__ import with_statement
+
+import sys
+import os
+import os.path
+import array
+
+# for BasicStatsSeries
+import numpy
+
+# To access .value if necessary
+import theano.tensor.sharedvar
+
+'''
+* TODO: add xy series
+* TODO: add graph() for base and accumulator
+* TODO: flush_every for BaseStatsSeries
+* TODO: warn when Mux append() is called with a nonexisting name
+* SeriesContainers are also series, albeit with more complex elements appended
+* Each series has a "name" which corresponds in some way to the directory or file in which it's saved
+'''
+
+# Simple class to append numbers and flush them to a file once in a while
+class BaseSeries():
+    # for types, see http://docs.python.org/library/array.html
+    def __init__(self, name, directory, type='f', flush_every=1):
+        self.type = type
+        self.flush_every = flush_every
+
+        if not name or not directory:
+            raise Exception("name and directory must be provided (strings)")
+
+        self.directory = directory
+        self.name = name
+
+        if name and directory:
+            self.filepath = os.path.join(directory, name)
+
+        self._array = array.array(type)
+        # stores the length not stored in file, waiting to be flushed
+        self._buffered = 0
+
+    def append(self, newitem):
+        self._array.append(newitem)
+
+        self._buffered += 1
+        if self._buffered >= self.flush_every:
+            self.flush()
+
+    def append_list(self, items):
+        self._array.fromlist(items)
+        self._buffered += len(items)
+        if self._buffered >= self.flush_every:
+            self.flush()
+
+    def flush(self):
+        if self._buffered == 0:
+            return
+        with open(self.filepath, "wb") as f:
+            s = self._array[-self._buffered:].tostring()
+            f.write(s)
+
+    def tolist(self):
+        return self._array.tolist()
+
+    def load_from_file(self):
+        if not self.filepath:
+            raise Exception("No name/directory provided")
+
+        self._array = array.array(self.type)
+        self._buffered = 0
+
+        statinfo = os.stat(self.filepath)
+        size = statinfo.st_size
+        num_items = size / self._array.itemsize
+
+        with open(self.filepath, "rb") as f:
+            self._array.fromfile(f, num_items)
+
+class AccumulatorSeries(BaseSeries):
+    '''
+    reduce_every: group (sum or mean) the last "reduce_every" items whenever we have enough
+        and create a new item added to the real, saved array
+        (if elements remain at the end, less than "reduce_every", they'll be discarded on program close)
+    flush_every: this is for items of the real, saved array, not in terms of number of calls to "append"
+    '''
+    def __init__(self, reduce_every,
+                    name, directory, flush_every=1,
+                    mean=False):
+        BaseSeries.__init__(self, name=name, directory=directory, type='f', flush_every=flush_every)
+        self.reduce_every = reduce_every
+        self._accumulator = 0.0
+        self._num_accumulated = 0
+        self.use_mean = mean
+
+    @classmethod
+    def series_constructor(cls, reduce_every, mean=False):
+        def cstr(name, directory, flush_every=1):
+            return cls(reduce_every=reduce_every, mean=mean, name=name, directory=directory, flush_every=flush_every)
+        return cstr
+
+    def append(self, item):
+        self._accumulator += item
+        self._num_accumulated += 1
+        if self._num_accumulated >= self.reduce_every:
+            n = self._accumulator
+            if self.use_mean:
+                n = n / self.reduce_every
+            BaseSeries.append(self, n)
+
+            self._num_accumulated = 0
+            self._accumulator = 0.0
+
+    def append_list(self, items):
+        for i in items:
+            self.append(i)
+
+class SeriesContainer():
+    def __init__(self, parent_directory, name,
+                    series_constructor=BaseSeries):
+        self.parent_directory = parent_directory
+        self.name = name
+
+        if not parent_directory or not name:
+            raise Exception("parent_directory and name must be provided (strings)")
+
+        self.directory_path = os.path.join(parent_directory, name)
+
+        self.series_constructor = series_constructor
+
+        # attempt to create directory for series
+        if not os.path.isdir(self.directory_path):
+            os.mkdir(self.directory_path)
+
+    def graph(self):
+        pass
+
+class BasicStatsSeries(SeriesContainer):
+    def __init__(self, parent_directory, name, series_constructor=BaseSeries,
+                    mean=True, minmax=True, std=True):
+        SeriesContainer.__init__(self, parent_directory=parent_directory, name=name, series_constructor=series_constructor)
+
+        self.save_mean = mean
+        self.save_minmax = minmax
+        self.save_std = std
+
+        self.create_series()
+
+    @classmethod
+    def series_constructor(cls, mean=True, minmax=True, std=True):
+        def cstr(name, directory, flush_every=1):
+            return cls(name=name, parent_directory=directory,
+                    mean=mean, minmax=minmax, std=std)
+        return cstr
+
+
+    def create_series(self):
+        if self.save_mean:
+            self.means = self.series_constructor(name="mean", directory=self.directory_path)
+
+        if self.save_minmax:
+            self.mins = self.series_constructor(name="min", directory=self.directory_path)
+            self.maxes = self.series_constructor(name="max", directory=self.directory_path)
+
+        if self.save_std:
+            self.stds = self.series_constructor(name="std", directory=self.directory_path)
+
+    def append(self, array):
+        # TODO: shouldn't this be the job of the caller? (at least ParamsArraySeries)
+        if isinstance(array, theano.tensor.sharedvar.TensorSharedVariable):
+            array = array.value
+
+        if self.save_mean:
+            n = numpy.mean(array)
+            self.means.append(n)
+        if self.save_minmax:
+            n = numpy.min(array)
+            self.mins.append(n)
+            n = numpy.max(array)
+            self.maxes.append(n)
+        if self.save_std:
+            n = numpy.std(array)
+            self.stds.append(n)
+
+    def load_from_file(self):
+        self.load_from_directory()
+
+    def load_from_directory(self):
+        if self.save_mean:
+            self.means.load_from_file()
+
+        if self.save_minmax:
+            self.mins.load_from_file()
+            self.maxes.load_from_file()
+
+        if self.save_std:
+            self.stds.load_from_file()
+
+    def graph(self, xes=None):
+        import pylab
+
+        if self.save_minmax:
+            mn = numpy.array(self.mins.tolist())
+            mx = numpy.array(self.maxes.tolist())
+            if self.save_mean:
+                y = numpy.array(self.means.tolist())
+            else:
+                y = (mn+mx) / 2
+
+            above_y = mx - y
+            below_y = y - mn
+
+            if not xes:
+                xes = numpy.arange(len(y))
+
+            pylab.errorbar(x=xes, y=y, yerr=[below_y, above_y])
+
+        elif self.save_mean:
+            y = numpy.array(self.means.tolist())
+            if not xes:
+                xes = numpy.arange(len(y))
+
+            pylab.plot(x=xes, y=y)
+
+
+class SeriesMultiplexer():
+    def __init__(self):
+        self._series_dict = {}
+        self._warned_for = {}
+
+    def append(self, series_name, item):
+        # if we don't have the series, just don't do anything
+        if self._series_dict.has_key(series_name):
+            s = self._series_dict[series_name]
+            s.append(item)
+        elif not self._warned_for.has_key(series_name):
+            print "WARNING: SeriesMultiplexer called with unknown name ", series_name
+            self._warned_for[series_name] = 1
+
+    def append_list(self, series_name, items):
+        if self._series_dict.has_key(series_name):
+            s = self._series_dict[series_name]
+            s.append_list(items)
+        elif not self._warned_for.has_key(series_name):
+            print "WARNING: SeriesMultiplexer called with unknown name ", series_name
+            self._warned_for[series_name] = 1
+
+    def add_series(self, series):
+        if self._series_dict.has_key(series.name):
+            raise Exception("A series with such a name already exists")
+        self._series_dict[series.name] = series
+
+class SeriesList():
+    def __init__(self, num_elements, name, directory, series_constructor=BaseSeries):
+        self._subseries = [None] * num_elements
+        self.name = name
+
+        for i in range(num_elements):
+            newname = name + "." + str(i)
+            self._subseries[i] = series_constructor(name=newname, directory=directory)
+
+    def load_from_files(self):
+        self.load_from_file()
+
+    def load_from_file(self):
+        for s in self._subseries:
+            s.load_from_file()
+
+    # no "append_list", this would get confusing
+    def append(self, list_of_items):
+        if len(list_of_items) != len(self._subseries):
+            raise Exception("bad number of items, expected " + str(len(self._subseries)) + ", got " + str(len(list_of_items)))
+        for i in range(len(list_of_items)):
+            self._subseries[i].append(list_of_items[i])
+
+
+# Just a shortcut
+class ParamsArrayStats(SeriesList):
+    def __init__(self, num_params_arrays, name, directory):
+        cstr = BasicStatsSeries.series_constructor()
+
+        SeriesList.__init__(self, num_elements=num_params_arrays,
+                        name=name, directory=directory,
+                        series_constructor=cstr)
+
+# ------------------------
+# Utilities to work with the series files from the command line
+
+# "dumpf"
+def dump_floats_file(filepath):
+    print "Floats dump of ", filepath
+    with open(filepath, "rb") as f:
+        s = os.stat(filepath)
+        size = s.st_size
+        num = size / 4
+        a = array.array('f')
+        a.fromfile(f, num)
+        print a.tolist()
+
+if __name__ == '__main__':
+    args = sys.argv[1:]
+
+    if len(args) == 2 and args[0] == "dumpf":
+        file = args[1]
+        dump_floats_file(file)
+    else:
+        print "Bad arguments"
+
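Not part of the changeset: below is a minimal usage sketch of how the classes added above might be wired together, assuming the module is importable as `utils.scalar_series.series` and that Theano is installed (the module imports `theano.tensor.sharedvar` at the top, and is written for Python 2). The output directory and series names are hypothetical.

```python
# -*- coding: utf-8 -*-
# Usage sketch only; paths and series names are made up for illustration.
import os
import numpy

from utils.scalar_series.series import (AccumulatorSeries, SeriesMultiplexer,
                                        ParamsArrayStats)

outdir = "/tmp/series_demo"          # hypothetical output directory
if not os.path.isdir(outdir):
    os.makedirs(outdir)

mux = SeriesMultiplexer()

# Average every 10 appended costs into one stored value; each stored value is
# flushed to disk right away (flush_every=1 by default).
mux.add_series(AccumulatorSeries(reduce_every=10, mean=True,
                                 name="training_error", directory=outdir))

# Track mean/min/max/std of three parameter arrays (e.g. weights of 3 layers);
# one subdirectory per array is created under outdir.
params_stats = ParamsArrayStats(3, name="params", directory=outdir)

for minibatch_index in range(100):
    cost = 1.0 / (minibatch_index + 1)   # stand-in for a real training cost
    mux.append("training_error", cost)

# ParamsArrayStats.append expects one array per tracked parameter array.
params_stats.append([numpy.random.randn(5, 5) for i in range(3)])
```

The command-line helper at the bottom of the module can then be used to inspect a stored float series, e.g. `python series.py dumpf /tmp/series_demo/training_error`.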