# HG changeset patch
# User fsavard
# Date 1267461925 18000
# Node ID d364a130b221ae2b518417afde80807ccc4b6c07
# Parent b9ea8e2d071a515a3d4901510e26a75f1154b8ed
Added the base code for scalar_series. Changes to stacked_dae: fixed a problem with the input_divider (it was preventing an optimization), and added use of the series. Also, in case I hadn't already committed this: I removed the pretraining-reuse mechanism; it was complicated (error prone) and it created jobs that ran far too long.

diff -r b9ea8e2d071a -r d364a130b221 deep/__init__.py
diff -r b9ea8e2d071a -r d364a130b221 deep/stacked_dae/nist_sda.py
--- a/deep/stacked_dae/nist_sda.py Fri Feb 26 17:45:52 2010 -0500
+++ b/deep/stacked_dae/nist_sda.py Mon Mar 01 11:45:25 2010 -0500
@@ -32,7 +32,7 @@
 except ImportError:
     print "Could not import Series"
-TEST_CONFIG = True
+TEST_CONFIG = False
 NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
@@ -46,7 +46,7 @@
 MAX_FINETUNING_EPOCHS = 2
 REDUCE_EVERY = 10
-EXPERIMENT_PATH = "ift6266.scripts.stacked_dae.nist_sda.jobman_entrypoint"
+EXPERIMENT_PATH = "ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint"
 JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001],
     'pretraining_epochs_per_layer': [10,20],
diff -r b9ea8e2d071a -r d364a130b221 deep/stacked_dae/sgd_optimization.py
--- a/deep/stacked_dae/sgd_optimization.py Fri Feb 26 17:45:52 2010 -0500
+++ b/deep/stacked_dae/sgd_optimization.py Mon Mar 01 11:45:25 2010 -0500
@@ -33,7 +33,7 @@
 self.hp = hyperparameters
 self.n_ins = n_ins
 self.n_outs = n_outs
-self.input_divider = numpy.asarray(input_divider, dtype=theano.config.floatX)
+self.input_divider = input_divider
 if not series_mux:
     series_mux = DummyMux()
@@ -117,14 +117,15 @@
 # create a function to compute the mistakes that are made by the model
 # on the validation set, or testing set
+shared_divider = theano.shared(numpy.asarray(self.input_divider, dtype=theano.config.floatX))
 test_model = theano.function([index], self.classifier.errors,
     givens = {
-      self.classifier.x: self.test_set_x[index*minibatch_size:(index+1)*minibatch_size] / self.input_divider,
+      self.classifier.x: self.test_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider,
       self.classifier.y: self.test_set_y[index*minibatch_size:(index+1)*minibatch_size]})
 validate_model = theano.function([index], self.classifier.errors,
     givens = {
-      self.classifier.x: self.valid_set_x[index*minibatch_size:(index+1)*minibatch_size] / self.input_divider,
+      self.classifier.x: self.valid_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider,
       self.classifier.y: self.valid_set_y[index*minibatch_size:(index+1)*minibatch_size]})
@@ -161,6 +162,7 @@
 validation_losses = [validate_model(i) for i in xrange(self.n_valid_batches)]
 this_validation_loss = numpy.mean(validation_losses)
+self.series_mux.append("validation_error", this_validation_loss)
 print('epoch %i, minibatch %i/%i, validation error %f %%' % \
     (epoch, minibatch_index+1, self.n_train_batches, \
     this_validation_loss*100.))
@@ -181,6 +183,7 @@
 # test it on the test set
 test_losses = [test_model(i) for i in xrange(self.n_test_batches)]
 test_score = numpy.mean(test_losses)
+self.series_mux.append("test_error", test_score)
 print((' epoch %i, minibatch %i/%i, test error of best '
     'model %f %%') % (epoch, minibatch_index+1, self.n_train_batches,
@@ -188,7 +191,7 @@
 sys.stdout.flush()
-self.series_mux.append("params", self.classifier.params)
+self.series_mux.append("params", self.classifier.all_params)
 if patience <= iter :
     done_looping = True
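The input_divider change above is the fix mentioned in the commit message: dividing by a plain numpy array inside `givens` bakes a host-side constant into the compiled graph, whereas a theano.shared value can live next to the dataset (e.g. on the GPU) and lets Theano optimize the division together with the rest of the graph. A minimal sketch of the pattern, with illustrative names (not code from this repository):

import numpy
import theano
import theano.tensor as T

x = T.matrix('x')
index = T.lscalar('index')
data = theano.shared(numpy.random.rand(100, 32).astype(theano.config.floatX))

# Presumed problematic form: a raw numpy constant copied into the graph.
raw_divider = numpy.asarray(255.0, dtype=theano.config.floatX)

# Fixed form, as in the hunk above: keep the divider in a shared variable.
shared_divider = theano.shared(numpy.asarray(255.0, dtype=theano.config.floatX))

f = theano.function([index], T.sum(x),
        givens={x: data[index*10:(index+1)*10] / shared_divider})
print f(0)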
diff -r b9ea8e2d071a -r d364a130b221 deep/stacked_dae/stacked_dae.py
--- a/deep/stacked_dae/stacked_dae.py Fri Feb 26 17:45:52 2010 -0500
+++ b/deep/stacked_dae/stacked_dae.py Mon Mar 01 11:45:25 2010 -0500
@@ -152,9 +152,12 @@
 self.layers = []
 self.pretrain_functions = []
 self.params = []
+# MODIF: added this so we also get the b_primes
+# (not used for finetuning... still using ".params")
+self.all_params = []
 self.n_layers = len(hidden_layers_sizes)
-self.input_divider = numpy.asarray(input_divider, dtype=theano.config.floatX)
+self.shared_divider = theano.shared(numpy.asarray(input_divider, dtype=theano.config.floatX))
 if len(hidden_layers_sizes) < 1 :
     raise Exception('You must have at least one hidden layer')
@@ -196,6 +199,8 @@
     corruption_level = corruption_levels[0],\
     input = layer_input, \
     shared_W = layer.W, shared_b = layer.b)
+
+self.all_params += dA_layer.params
 # Construct a function that trains this dA
 # compute gradients of layer parameters
@@ -209,7 +214,7 @@
 update_fn = theano.function([index], dA_layer.cost, \
     updates = updates,
     givens = {
-      self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.input_divider})
+      self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider})
 # collect this function into a list
 self.pretrain_functions += [update_fn]
@@ -220,6 +225,7 @@
     n_in = hidden_layers_sizes[-1], n_out = n_outs)
 self.params += self.logLayer.params
+self.all_params += self.logLayer.params
 # construct a function that implements one step of finetunining
 # compute the cost, defined as the negative log likelihood
@@ -234,7 +240,7 @@
 self.finetune = theano.function([index], cost,
     updates = updates,
     givens = {
-      self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.input_divider,
+      self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider,
       self.y : train_set_y[index*batch_size:(index+1)*batch_size]} )
 # symbolic variable that points to the number of errors made on the
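sgd_optimization.py falls back to a DummyMux() when no series_mux is passed in, so the append() calls in these hunks are no-ops unless the caller builds a real multiplexer. A hedged sketch of what that wiring could look like with the new module; the import path, directory, and parameter count are assumptions on my part, not code from this changeset:

import os
from ift6266.utils.scalar_series import AccumulatorSeries, SeriesMultiplexer, ParamsArrayStats

basedir = "./series_out"  # illustrative; BaseSeries only opens files inside it
if not os.path.isdir(basedir):
    os.mkdir(basedir)

mux = SeriesMultiplexer()

# One stored point per measurement (reduce_every=1 passes values straight through).
mux.add_series(AccumulatorSeries(reduce_every=1, name="validation_error", directory=basedir))
mux.add_series(AccumulatorSeries(reduce_every=1, name="test_error", directory=basedir))

# "params" receives classifier.all_params (shared variables, which
# BasicStatsSeries unwraps via .value); num_params_arrays must match
# len(all_params) -- e.g. 11 if each of 3 dA layers presumably contributes
# W, b and b_prime, plus the logistic layer's W and b.
mux.add_series(ParamsArrayStats(num_params_arrays=11, name="params", directory=basedir))

# The optimizer can then call, e.g.:
# self.series_mux.append("validation_error", this_validation_loss)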
diff -r b9ea8e2d071a -r d364a130b221 utils/scalar_series/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/scalar_series/__init__.py Mon Mar 01 11:45:25 2010 -0500
@@ -0,0 +1,2 @@
+from series import BaseSeries, AccumulatorSeries, SeriesContainer, BasicStatsSeries, SeriesMultiplexer, SeriesList, ParamsArrayStats
+
diff -r b9ea8e2d071a -r d364a130b221 utils/scalar_series/series.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/scalar_series/series.py Mon Mar 01 11:45:25 2010 -0500
@@ -0,0 +1,311 @@
+#!/usr/bin/python
+# coding: utf-8
+
+from __future__ import with_statement
+
+import sys
+import os
+import os.path
+import array
+
+# for BasicStatsSeries
+import numpy
+
+# To access .value if necessary
+import theano.tensor.sharedvar
+
+'''
+* TODO: add xy series
+* TODO: add graph() for base and accumulator
+* TODO: flush_every for BasicStatsSeries
+* TODO: warn when Mux append() is called with a nonexistent name
+* SeriesContainers are also series, albeit with more complex elements appended
+* Each series has a "name" which corresponds in some way to the directory or file in which it's saved
+'''
+
+# Simple class to append numbers and flush them to a file once in a while
+class BaseSeries():
+    # for types, see http://docs.python.org/library/array.html
+    def __init__(self, name, directory, type='f', flush_every=1):
+        self.type = type
+        self.flush_every = flush_every
+
+        if not name or not directory:
+            raise Exception("name and directory must be provided (strings)")
+
+        self.directory = directory
+        self.name = name
+
+        self.filepath = os.path.join(directory, name)
+
+        self._array = array.array(type)
+        # number of items not yet stored in the file, waiting to be flushed
+        self._buffered = 0
+
+    def append(self, newitem):
+        self._array.append(newitem)
+
+        self._buffered += 1
+        if self._buffered >= self.flush_every:
+            self.flush()
+
+    def append_list(self, items):
+        self._array.fromlist(items)
+        self._buffered += len(items)
+        if self._buffered >= self.flush_every:
+            self.flush()
+
+    def flush(self):
+        if self._buffered == 0:
+            return
+        # append only the unflushed tail, then reset the buffer count
+        # (truncating with "wb" would lose items loaded via load_from_file)
+        with open(self.filepath, "ab") as f:
+            s = self._array[-self._buffered:].tostring()
+            f.write(s)
+        self._buffered = 0
+
+    def tolist(self):
+        return self._array.tolist()
+
+    def load_from_file(self):
+        if not self.filepath:
+            raise Exception("No name/directory provided")
+
+        self._array = array.array(self.type)
+        self._buffered = 0
+
+        statinfo = os.stat(self.filepath)
+        size = statinfo.st_size
+        num_items = size / self._array.itemsize
+
+        with open(self.filepath, "rb") as f:
+            self._array.fromfile(f, num_items)
+
+class AccumulatorSeries(BaseSeries):
+    '''
+    reduce_every: group (sum or mean) the last "reduce_every" items whenever enough
+        have accumulated, and append the result to the real, saved array
+        (if fewer than "reduce_every" elements remain at the end, they are
+        discarded on program close)
+    flush_every: counted in items of the real, saved array, not in number of
+        calls to "append"
+    '''
+    def __init__(self, reduce_every,
+                 name, directory, flush_every=1,
+                 mean=False):
+        BaseSeries.__init__(self, name=name, directory=directory, type='f', flush_every=flush_every)
+        self.reduce_every = reduce_every
+        self._accumulator = 0.0
+        self._num_accumulated = 0
+        self.use_mean = mean
+
+    @classmethod
+    def series_constructor(cls, reduce_every, mean=False):
+        def cstr(name, directory, flush_every=1):
+            return cls(reduce_every=reduce_every, mean=mean, name=name, directory=directory, flush_every=flush_every)
+        return cstr
+
+    def append(self, item):
+        self._accumulator += item
+        self._num_accumulated += 1
+        if self._num_accumulated >= self.reduce_every:
+            n = self._accumulator
+            if self.use_mean:
+                n = n / self.reduce_every
+            BaseSeries.append(self, n)
+
+            self._num_accumulated = 0
+            self._accumulator = 0.0
+
+    def append_list(self, items):
+        for i in items:
+            self.append(i)
+
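A quick usage sketch for the two classes above. The directory is illustrative and must already exist, since BaseSeries only opens files inside it; the import assumes the script runs next to series.py:

import os
from series import BaseSeries, AccumulatorSeries

demo_dir = "/tmp/series_demo"  # illustrative path
if not os.path.isdir(demo_dir):
    os.mkdir(demo_dir)

s = BaseSeries(name="train_cost", directory=demo_dir, flush_every=5)
s.append(0.5)
s.append_list([0.4, 0.3])
s.flush()  # force the remaining buffered items to disk

# One stored point per 10 appended values (their mean):
acc = AccumulatorSeries(reduce_every=10, mean=True,
                        name="minibatch_cost", directory=demo_dir)
for i in range(25):
    acc.append(float(i))
print acc.tolist()  # [4.5, 14.5] -- the last 5 values are still accumulating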
+class SeriesContainer():
+    def __init__(self, parent_directory, name,
+                 series_constructor=BaseSeries):
+        self.parent_directory = parent_directory
+        self.name = name
+
+        if not parent_directory or not name:
+            raise Exception("parent_directory and name must be provided (strings)")
+
+        self.directory_path = os.path.join(parent_directory, name)
+
+        self.series_constructor = series_constructor
+
+        # attempt to create directory for series
+        if not os.path.isdir(self.directory_path):
+            os.mkdir(self.directory_path)
+
+    def graph(self):
+        pass
+
+class BasicStatsSeries(SeriesContainer):
+    def __init__(self, parent_directory, name, series_constructor=BaseSeries,
+                 mean=True, minmax=True, std=True):
+        SeriesContainer.__init__(self, parent_directory=parent_directory, name=name, series_constructor=series_constructor)
+
+        self.save_mean = mean
+        self.save_minmax = minmax
+        self.save_std = std
+
+        self.create_series()
+
+    @classmethod
+    def series_constructor(cls, mean=True, minmax=True, std=True):
+        def cstr(name, directory, flush_every=1):
+            return cls(name=name, parent_directory=directory,
+                       mean=mean, minmax=minmax, std=std)
+        return cstr
+
+    def create_series(self):
+        if self.save_mean:
+            self.means = self.series_constructor(name="mean", directory=self.directory_path)
+
+        if self.save_minmax:
+            self.mins = self.series_constructor(name="min", directory=self.directory_path)
+            self.maxes = self.series_constructor(name="max", directory=self.directory_path)
+
+        if self.save_std:
+            self.stds = self.series_constructor(name="std", directory=self.directory_path)
+
+    def append(self, array):
+        # TODO: shouldn't this be the job of the caller? (at least ParamsArrayStats)
+        if isinstance(array, theano.tensor.sharedvar.TensorSharedVariable):
+            array = array.value
+
+        if self.save_mean:
+            n = numpy.mean(array)
+            self.means.append(n)
+        if self.save_minmax:
+            n = numpy.min(array)
+            self.mins.append(n)
+            n = numpy.max(array)
+            self.maxes.append(n)
+        if self.save_std:
+            n = numpy.std(array)
+            self.stds.append(n)
+
+    def load_from_file(self):
+        self.load_from_directory()
+
+    def load_from_directory(self):
+        if self.save_mean:
+            self.means.load_from_file()
+
+        if self.save_minmax:
+            self.mins.load_from_file()
+            self.maxes.load_from_file()
+
+        if self.save_std:
+            self.stds.load_from_file()
+
+    def graph(self, xes=None):
+        import pylab
+
+        if self.save_minmax:
+            mn = numpy.array(self.mins.tolist())
+            mx = numpy.array(self.maxes.tolist())
+            if self.save_mean:
+                y = numpy.array(self.means.tolist())
+            else:
+                y = (mn+mx) / 2
+
+            above_y = mx - y
+            below_y = y - mn
+
+            # "xes is None" rather than "not xes": a numpy array has no
+            # unambiguous truth value
+            if xes is None:
+                xes = numpy.arange(len(y))
+
+            pylab.errorbar(x=xes, y=y, yerr=[below_y, above_y])
+
+        elif self.save_mean:
+            y = numpy.array(self.means.tolist())
+            if xes is None:
+                xes = numpy.arange(len(y))
+
+            # pylab.plot takes the data positionally (it has no x=/y= kwargs)
+            pylab.plot(xes, y)
+
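To make the container concrete, a small sketch (illustrative names and paths; the parent directory must exist, the container creates its own subdirectory with one file per statistic):

import os
import numpy
from series import BasicStatsSeries

demo_dir = "/tmp/series_demo"  # illustrative path
if not os.path.isdir(demo_dir):
    os.mkdir(demo_dir)

bss = BasicStatsSeries(parent_directory=demo_dir, name="W_stats")
bss.append(numpy.arange(6).reshape((2, 3)))  # mean 2.5, min 0.0, max 5.0
print bss.means.tolist(), bss.mins.tolist(), bss.maxes.tolist()

# A fresh container over the same directory can read everything back:
bss2 = BasicStatsSeries(parent_directory=demo_dir, name="W_stats")
bss2.load_from_directory()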
+class SeriesMultiplexer():
+    def __init__(self):
+        self._series_dict = {}
+        self._warned_for = {}
+
+    def append(self, series_name, item):
+        # if we don't have the series, just don't do anything
+        if self._series_dict.has_key(series_name):
+            s = self._series_dict[series_name]
+            s.append(item)
+        elif not self._warned_for.has_key(series_name):
+            print "WARNING: SeriesMultiplexer called with unknown name ", series_name
+            self._warned_for[series_name] = 1
+
+    def append_list(self, series_name, items):
+        if self._series_dict.has_key(series_name):
+            s = self._series_dict[series_name]
+            s.append_list(items)
+        elif not self._warned_for.has_key(series_name):
+            print "WARNING: SeriesMultiplexer called with unknown name ", series_name
+            self._warned_for[series_name] = 1
+
+    def add_series(self, series):
+        if self._series_dict.has_key(series.name):
+            raise Exception("A series with such a name already exists")
+        self._series_dict[series.name] = series
+
+class SeriesList():
+    def __init__(self, num_elements, name, directory, series_constructor=BaseSeries):
+        self._subseries = [None] * num_elements
+        self.name = name
+
+        for i in range(num_elements):
+            newname = name + "." + str(i)
+            self._subseries[i] = series_constructor(name=newname, directory=directory)
+
+    def load_from_files(self):
+        self.load_from_file()
+
+    def load_from_file(self):
+        for s in self._subseries:
+            s.load_from_file()
+
+    # no "append_list", this would get confusing
+    def append(self, list_of_items):
+        if len(list_of_items) != len(self._subseries):
+            raise Exception("bad number of items, expected " + str(len(self._subseries)) + ", got " + str(len(list_of_items)))
+        for i in range(len(list_of_items)):
+            self._subseries[i].append(list_of_items[i])
+
+# Just a shortcut
+class ParamsArrayStats(SeriesList):
+    def __init__(self, num_params_arrays, name, directory):
+        cstr = BasicStatsSeries.series_constructor()
+
+        SeriesList.__init__(self, num_elements=num_params_arrays,
+                            name=name, directory=directory,
+                            series_constructor=cstr)
+
+# ------------------------
+# Utilities to work with the series files from the command line
+
+# "dumpf"
+def dump_floats_file(filepath):
+    print "Floats dump of ", filepath
+    with open(filepath, "rb") as f:
+        s = os.stat(filepath)
+        size = s.st_size
+        num = size / 4
+        a = array.array('f')
+        a.fromfile(f, num)
+    print a.tolist()
+
+if __name__ == '__main__':
+    args = sys.argv[1:]
+
+    if len(args) == 2 and args[0] == "dumpf":
+        file = args[1]
+        dump_floats_file(file)
+    else:
+        print "Bad arguments"
+
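The __main__ block above gives a tiny command-line inspector for 'f'-typed series files (it assumes 4-byte single-precision items). Assuming a float series was saved under /tmp/series_demo/train_cost (an illustrative path), it would be invoked as:

python utils/scalar_series/series.py dumpf /tmp/series_demo/train_cost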
diff -r b9ea8e2d071a -r d364a130b221 utils/scalar_series/test_series.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/scalar_series/test_series.py Mon Mar 01 11:45:25 2010 -0500
@@ -0,0 +1,197 @@
+#!/usr/bin/python
+# coding: utf-8
+
+import sys
+import tempfile
+import os.path
+import os
+
+import numpy
+
+from series import BaseSeries, AccumulatorSeries, SeriesContainer, BasicStatsSeries, SeriesMultiplexer, SeriesList, ParamsArrayStats
+
+BASEDIR = tempfile.mkdtemp()
+
+def tempname():
+    file = tempfile.NamedTemporaryFile(dir=BASEDIR)
+    filepath = file.name
+    return os.path.split(filepath)
+
+def tempdir():
+    # split the fresh temporary directory into (parent, name)
+    # (taking dirname() first, as before, discarded the unique component and
+    # made every test share BASEDIR itself)
+    return os.path.split(tempfile.mkdtemp(dir=BASEDIR))
+
+def tempseries(type='f', flush_every=1):
+    dir, filename = tempname()
+
+    s = BaseSeries(name=filename, directory=dir, type=type, flush_every=flush_every)
+
+    return s
+
+def test_Series_storeload():
+    s = tempseries()
+
+    s.append(12.0)
+    s.append_list([13.0,14.0,15.0])
+
+    s2 = BaseSeries(name=s.name, directory=s.directory, flush_every=15)
+    # also test that elements stored before load_from_file (and before a flush)
+    # are deleted (or the array is restarted from scratch... both work)
+    s2.append(10.0)
+    s2.append_list([30.0,40.0])
+    s2.load_from_file()
+
+    assert s2.tolist() == [12.0,13.0,14.0,15.0]
+
+def test_AccumulatorSeries_mean():
+    dir, filename = tempname()
+
+    s = AccumulatorSeries(reduce_every=15, mean=True, name=filename, directory=dir)
+
+    for i in range(50):
+        s.append(i)
+
+    assert s.tolist() == [7.0,22.0,37.0]
+
+def test_BasicStatsSeries_commoncase():
+    a1 = numpy.arange(25).reshape((5,5))
+    a2 = numpy.arange(40).reshape((8,5))
+
+    parent_dir, dir = tempdir()
+
+    bss = BasicStatsSeries(parent_directory=parent_dir, name=dir)
+
+    bss.append(a1)
+    bss.append(a2)
+
+    assert bss.means.tolist() == [12.0, 19.5]
+    assert bss.mins.tolist() == [0.0, 0.0]
+    assert bss.maxes.tolist() == [24.0, 39.0]
+    # abs() is needed; without it a too-small std would pass the check
+    assert abs(bss.stds.tolist()[0] - 7.211102) < 1e-3
+    assert abs(bss.stds.tolist()[1] - 11.54339) < 1e-3
+
+    # try to reload
+
+    bss2 = BasicStatsSeries(parent_directory=parent_dir, name=dir)
+    bss2.load_from_directory()
+
+    assert bss2.means.tolist() == [12.0, 19.5]
+    assert bss2.mins.tolist() == [0.0, 0.0]
+    assert bss2.maxes.tolist() == [24.0, 39.0]
+    assert abs(bss2.stds.tolist()[0] - 7.211102) < 1e-3
+    assert abs(bss2.stds.tolist()[1] - 11.54339) < 1e-3
+
+def test_BasicStatsSeries_reload():
+    a1 = numpy.arange(25).reshape((5,5))
+    a2 = numpy.arange(40).reshape((8,5))
+
+    parent_dir, dir = tempdir()
+
+    bss = BasicStatsSeries(parent_directory=parent_dir, name=dir)
+
+    bss.append(a1)
+    bss.append(a2)
+
+    # try to reload
+
+    bss2 = BasicStatsSeries(parent_directory=parent_dir, name=dir)
+    bss2.load_from_directory()
+
+    assert bss2.means.tolist() == [12.0, 19.5]
+    assert bss2.mins.tolist() == [0.0, 0.0]
+    assert bss2.maxes.tolist() == [24.0, 39.0]
+    assert abs(bss2.stds.tolist()[0] - 7.211102) < 1e-3
+    assert abs(bss2.stds.tolist()[1] - 11.54339) < 1e-3
+
+def test_BasicStatsSeries_withaccumulator():
+    a1 = numpy.arange(25).reshape((5,5))
+    a2 = numpy.arange(40).reshape((8,5))
+    a3 = numpy.arange(20).reshape((4,5))
+    a4 = numpy.arange(48).reshape((6,8))
+
+    parent_dir, dir = tempdir()
+
+    sc = AccumulatorSeries.series_constructor(reduce_every=2, mean=False)
+
+    bss = BasicStatsSeries(parent_directory=parent_dir, name=dir, series_constructor=sc)
+
+    bss.append(a1)
+    bss.append(a2)
+    bss.append(a3)
+    bss.append(a4)
+
+    assert bss.means.tolist() == [31.5, 33.0]
+
+def test_SeriesList_withbasicstats():
+    dir = tempfile.mkdtemp(dir=BASEDIR)
+
+    bscstr = BasicStatsSeries.series_constructor()
+
+    slist = SeriesList(num_elements=5, name="foo", directory=dir, series_constructor=bscstr)
+
+    for i in range(10): # 10 elements in each subseries
+        curlist = []
+        for j in range(5): # 5 = num_elements, i.e. the number of lists to append to
+            dist = numpy.arange(i*j, i*j+10)
+            curlist.append(dist)
+        slist.append(curlist)
+
+    slist2 = SeriesList(num_elements=5, name="foo", directory=dir, series_constructor=bscstr)
+
+    slist2.load_from_files()
+
+    l1 = slist2._subseries[0].means.tolist()
+    l2 = slist2._subseries[4].means.tolist()
+
+    print l1
+    print l2
+
+    assert l1 == [4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5]
+    assert l2 == [4.5, 8.5, 12.5, 16.5, 20.5, 24.5, 28.5, 32.5, 36.5, 40.5]
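The expected lists follow from numpy.arange(a, a+10).mean() == a + 4.5: subseries j records i*j + 4.5 for i = 0..9, so subseries 0 stays constant at 4.5 and subseries 4 steps by 4. A one-line check of the second assertion:

print [i*4 + 4.5 for i in range(10)]  # [4.5, 8.5, ..., 40.5], matching l2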
+# same test as above, just with the shortcut
+def test_ParamsArrayStats_reload():
+    dir = tempfile.mkdtemp(dir=BASEDIR)
+
+    slist = ParamsArrayStats(5, name="foo", directory=dir)
+
+    for i in range(10): # 10 elements in each subseries
+        curlist = []
+        for j in range(5): # 5 = num_params_arrays, i.e. the number of lists to append to
+            dist = numpy.arange(i*j, i*j+10)
+            curlist.append(dist)
+        slist.append(curlist)
+
+    slist2 = ParamsArrayStats(5, name="foo", directory=dir)
+
+    slist2.load_from_files()
+
+    l1 = slist2._subseries[0].means.tolist()
+    l2 = slist2._subseries[4].means.tolist()
+
+    print l1
+    print l2
+
+    assert l1 == [4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5]
+    assert l2 == [4.5, 8.5, 12.5, 16.5, 20.5, 24.5, 28.5, 32.5, 36.5, 40.5]
+
+def manual_BasicStatsSeries_graph():
+    parent_dir, dir = tempdir()
+
+    bss = BasicStatsSeries(parent_directory=parent_dir, name=dir)
+
+    for i in range(50):
+        # start at i*5+1 to avoid a divide-by-zero on the very first element
+        bss.append(1.0/numpy.arange(i*5+1, i*5+6))
+
+    bss.graph()
+
+#if __name__ == '__main__':
+#    import pylab
+#    manual_BasicStatsSeries_graph()
+#    pylab.show()
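The functions above follow the usual test_* naming convention, so they can presumably be collected by a runner such as nose (an assumption on my part; nothing in this changeset names a test runner):

nosetests utils/scalar_series/test_series.py

For the interactive graph, uncomment the __main__ block (it needs matplotlib/pylab) and run the file directly:

python utils/scalar_series/test_series.py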