changeset 186:d364a130b221

Added the base code for scalar_series. Changes to stacked_dae: fixed a problem with the input_divider (it was preventing an optimization) and added use of the series. Also, unless I already committed this earlier, I removed the pretraining-reuse scheme: it was complicated (error prone) and created jobs that ran far too long.
author fsavard
date Mon, 01 Mar 2010 11:45:25 -0500
parents b9ea8e2d071a
children c03692aa6158
files deep/__init__.py deep/stacked_dae/nist_sda.py deep/stacked_dae/sgd_optimization.py deep/stacked_dae/stacked_dae.py utils/scalar_series/__init__.py utils/scalar_series/series.py utils/scalar_series/test_series.py
diffstat 6 files changed, 528 insertions(+), 9 deletions(-)
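A note on the input_divider change in sgd_optimization.py and stacked_dae.py below: the old code kept the divider as a plain numpy array and divided by it inside every givens substitution, while the new code wraps it in a theano shared variable (the commit message only says the old form prevented an optimization, without naming which). A minimal sketch of the pattern, with illustrative names and data, assuming numpy and Theano are installed:

import numpy
import theano
import theano.tensor as T

x = T.matrix('x')
index = T.lscalar('index')
data = theano.shared(numpy.ones((20, 5), dtype=theano.config.floatX))

# before: a raw numpy constant, re-inserted into each compiled graph
divider = numpy.asarray(255., dtype=theano.config.floatX)
# after: a single shared variable, reused by every theano.function
shared_divider = theano.shared(numpy.asarray(255., dtype=theano.config.floatX))

f = theano.function([index], T.sum(x),
        givens={x: data[index * 5:(index + 1) * 5] / shared_divider})
print(f(0))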
--- a/deep/stacked_dae/nist_sda.py	Fri Feb 26 17:45:52 2010 -0500
+++ b/deep/stacked_dae/nist_sda.py	Mon Mar 01 11:45:25 2010 -0500
@@ -32,7 +32,7 @@
 except ImportError:
     print "Could not import Series"
 
-TEST_CONFIG = True
+TEST_CONFIG = False
 
 NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
 
@@ -46,7 +46,7 @@
     MAX_FINETUNING_EPOCHS = 2
     REDUCE_EVERY = 10
 
-EXPERIMENT_PATH = "ift6266.scripts.stacked_dae.nist_sda.jobman_entrypoint"
+EXPERIMENT_PATH = "ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint"
 
 JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001],
         'pretraining_epochs_per_layer': [10,20],
--- a/deep/stacked_dae/sgd_optimization.py	Fri Feb 26 17:45:52 2010 -0500
+++ b/deep/stacked_dae/sgd_optimization.py	Mon Mar 01 11:45:25 2010 -0500
@@ -33,7 +33,7 @@
         self.hp = hyperparameters
         self.n_ins = n_ins
         self.n_outs = n_outs
-        self.input_divider = numpy.asarray(input_divider, dtype=theano.config.floatX)
+        self.input_divider = input_divider
    
         if not series_mux:
             series_mux = DummyMux()
@@ -117,14 +117,15 @@
 
         # create a function to compute the mistakes that are made by the model
         # on the validation set, or testing set
+        shared_divider = theano.shared(numpy.asarray(self.input_divider, dtype=theano.config.floatX))
         test_model = theano.function([index], self.classifier.errors,
                  givens = {
-                   self.classifier.x: self.test_set_x[index*minibatch_size:(index+1)*minibatch_size] / self.input_divider,
+                   self.classifier.x: self.test_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider,
                    self.classifier.y: self.test_set_y[index*minibatch_size:(index+1)*minibatch_size]})
 
         validate_model = theano.function([index], self.classifier.errors,
                 givens = {
-                   self.classifier.x: self.valid_set_x[index*minibatch_size:(index+1)*minibatch_size] / self.input_divider,
+                   self.classifier.x: self.valid_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider,
                    self.classifier.y: self.valid_set_y[index*minibatch_size:(index+1)*minibatch_size]})
 
 
@@ -161,6 +162,7 @@
                     
                     validation_losses = [validate_model(i) for i in xrange(self.n_valid_batches)]
                     this_validation_loss = numpy.mean(validation_losses)
+                    self.series_mux.append("validation_error", this_validation_loss)
                     print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                            (epoch, minibatch_index+1, self.n_train_batches, \
                             this_validation_loss*100.))
@@ -181,6 +183,7 @@
                         # test it on the test set
                         test_losses = [test_model(i) for i in xrange(self.n_test_batches)]
                         test_score = numpy.mean(test_losses)
+                        self.series_mux.append("test_error", test_score)
                         print(('     epoch %i, minibatch %i/%i, test error of best '
                               'model %f %%') % 
                                      (epoch, minibatch_index+1, self.n_train_batches,
@@ -188,7 +191,7 @@
 
                     sys.stdout.flush()
 
-            self.series_mux.append("params", self.classifier.params)
+            self.series_mux.append("params", self.classifier.all_params)
 
             if patience <= iter :
                 done_looping = True
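The series_mux used above is only appended to in this file. Below is a minimal, runnable sketch of how a SeriesMultiplexer from utils/scalar_series might be wired up; the directory and reduce_every values are illustrative, but the series names match the append() calls added in this diff:

import tempfile
from utils.scalar_series import SeriesMultiplexer, AccumulatorSeries

directory = tempfile.mkdtemp()
mux = SeriesMultiplexer()
# the names registered here must match the names passed to mux.append()
mux.add_series(AccumulatorSeries(reduce_every=1, name="validation_error",
                                 directory=directory))
mux.add_series(AccumulatorSeries(reduce_every=1, name="test_error",
                                 directory=directory))

mux.append("validation_error", 0.42)   # recorded and flushed to disk
mux.append("not_registered", 1.0)      # warns once, then stays silent

The multiplexer would then be passed as the series_mux constructor argument; when none is given, the optimizer falls back to a DummyMux.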
--- a/deep/stacked_dae/stacked_dae.py	Fri Feb 26 17:45:52 2010 -0500
+++ b/deep/stacked_dae/stacked_dae.py	Mon Mar 01 11:45:25 2010 -0500
@@ -152,9 +152,12 @@
         self.layers             = []
         self.pretrain_functions = []
         self.params             = []
+        # MODIF: added this so we also get the b_primes
+        # (not used for finetuning... still using ".params")
+        self.all_params         = []
         self.n_layers           = len(hidden_layers_sizes)
 
-        self.input_divider = numpy.asarray(input_divider, dtype=theano.config.floatX)
+        self.shared_divider = theano.shared(numpy.asarray(input_divider, dtype=theano.config.floatX))
 
         if len(hidden_layers_sizes) < 1 :
             raise Exception('You must have at least one hidden layer')
@@ -196,6 +199,8 @@
                           corruption_level = corruption_levels[0],\
                           input = layer_input, \
                           shared_W = layer.W, shared_b = layer.b)
+
+            self.all_params += dA_layer.params
         
             # Construct a function that trains this dA
             # compute gradients of layer parameters
@@ -209,7 +214,7 @@
             update_fn = theano.function([index], dA_layer.cost, \
                   updates = updates,
                   givens = { 
-                     self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.input_divider})
+                     self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider})
             # collect this function into a list
             self.pretrain_functions += [update_fn]
 
@@ -220,6 +225,7 @@
                          n_in = hidden_layers_sizes[-1], n_out = n_outs)
 
         self.params += self.logLayer.params
+        self.all_params += self.logLayer.params
         # construct a function that implements one step of finetunining
 
         # compute the cost, defined as the negative log likelihood 
@@ -234,7 +240,7 @@
         self.finetune = theano.function([index], cost, 
                 updates = updates,
                 givens = {
-                  self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.input_divider,
+                  self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider,
                   self.y : train_set_y[index*batch_size:(index+1)*batch_size]} )
 
         # symbolic variable that points to the number of errors made on the
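The new all_params list above exists so that the "params" series appended in sgd_optimization.py also captures the b_prime vectors of the dA layers. A hedged sketch of how that series could be backed by the ParamsArrayStats shortcut defined in utils/scalar_series below; fake_params merely stands in for classifier.all_params, and the array sizes are arbitrary:

import tempfile
import numpy
from utils.scalar_series import ParamsArrayStats

directory = tempfile.mkdtemp()
fake_params = [numpy.arange(6.).reshape((2, 3)) for i in range(4)]

stats = ParamsArrayStats(len(fake_params), name="params", directory=directory)
stats.append(fake_params)   # records mean/min/max/std of each array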
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/scalar_series/__init__.py	Mon Mar 01 11:45:25 2010 -0500
@@ -0,0 +1,2 @@
+from series import BaseSeries, AccumulatorSeries, SeriesContainer, BasicStatsSeries, SeriesMultiplexer, SeriesList, ParamsArrayStats
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/scalar_series/series.py	Mon Mar 01 11:45:25 2010 -0500
@@ -0,0 +1,311 @@
+#!/usr/bin/python
+# coding: utf-8
+
+from __future__ import with_statement
+
+import sys
+import os
+import os.path
+import array
+
+# for BasicStatsSeries
+import numpy
+
+# To access .value if necessary
+import theano.tensor.sharedvar
+
+'''
+* TODO: add xy series
+* TODO: add graph() for base and accumulator
+* TODO: flush_every for BaseStatsSeries
+* TODO: warn when Mux append() is called with a nonexisting name
+* SeriesContainers are also series, albeit with more complex elements appended
+* Each series has a "name" which corresponds in some way to the directory or file in which it's saved
+'''
+
+# Simple class to append numbers and flush them to a file once in a while
+class BaseSeries():
+    # for types, see http://docs.python.org/library/array.html
+    def __init__(self, name, directory, type='f', flush_every=1):
+        self.type = type
+        self.flush_every = flush_every
+
+        if not name or not directory:
+            raise Exception("name and directory must be provided (strings)")
+
+        self.directory = directory
+        self.name = name
+
+        if name and directory:
+            self.filepath = os.path.join(directory, name)
+
+        self._array = array.array(type)
+        # number of appended items not yet flushed to file
+        self._buffered = 0
+
+    def append(self, newitem):
+        self._array.append(newitem)
+
+        self._buffered += 1
+        if self._buffered >= self.flush_every:
+            self.flush()
+
+    def append_list(self, items):
+        self._array.fromlist(items)
+        self._buffered += len(items)
+        if self._buffered >= self.flush_every:
+            self.flush()
+
+    def flush(self):
+        if self._buffered == 0:
+            return
+        # append only the not-yet-flushed tail and reset the count, so that
+        # appending after a load_from_file() doesn't clobber what's already
+        # on disk
+        with open(self.filepath, "ab") as f:
+            s = self._array[-self._buffered:].tostring()
+            f.write(s)
+        self._buffered = 0
+
+    def tolist(self):
+        return self._array.tolist()
+
+    def load_from_file(self):
+        if not self.filepath:
+            raise Exception("No name/directory provided")
+
+        self._array = array.array(self.type)
+        self._buffered = 0
+
+        statinfo = os.stat(self.filepath)
+        size = statinfo.st_size
+        num_items = size / self._array.itemsize
+
+        with open(self.filepath, "rb") as f:
+            self._array.fromfile(f, num_items)
+
+class AccumulatorSeries(BaseSeries):
+    '''
+    reduce_every: group (sum or mean) the last "reduce_every" items whenever enough
+                have accumulated, and append the result to the real, saved array
+                (any leftover items, fewer than "reduce_every", are discarded when the program ends)
+    flush_every: counted in items of the real, saved array, not in number of calls to "append"
+    '''
+    def __init__(self, reduce_every,
+                    name, directory, flush_every=1,
+                    mean=False):
+        BaseSeries.__init__(self, name=name, directory=directory, type='f', flush_every=flush_every)
+        self.reduce_every = reduce_every
+        self._accumulator = 0.0
+        self._num_accumulated = 0
+        self.use_mean = mean
+
+    @classmethod
+    def series_constructor(cls, reduce_every, mean=False):
+        def cstr(name, directory, flush_every=1):
+            return cls(reduce_every=reduce_every, mean=mean, name=name, directory=directory, flush_every=flush_every)
+        return cstr
+
+    def append(self, item):
+        self._accumulator += item
+        self._num_accumulated += 1
+        if self._num_accumulated >= self.reduce_every:
+            n = self._accumulator
+            if self.use_mean:
+                n = n / self.reduce_every
+            BaseSeries.append(self, n)
+
+            self._num_accumulated = 0
+            self._accumulator = 0.0
+
+    def append_list(self, items):
+        for i in items:
+            self.append(i)
+
+class SeriesContainer():
+    def __init__(self, parent_directory, name,
+                    series_constructor=BaseSeries):
+        self.parent_directory = parent_directory
+        self.name = name
+
+        if not parent_directory or not name:
+            raise Exception("parent_directory and name must be provided (strings)")
+
+        self.directory_path = os.path.join(parent_directory, name)
+
+        self.series_constructor = series_constructor
+
+        # attempt to create directory for series
+        if not os.path.isdir(self.directory_path):
+            os.mkdir(self.directory_path)
+
+    def graph(self):
+        pass
+
+class BasicStatsSeries(SeriesContainer):
+    def __init__(self, parent_directory, name, series_constructor=BaseSeries,
+            mean=True, minmax=True, std=True):
+        SeriesContainer.__init__(self, parent_directory=parent_directory, name=name, series_constructor=series_constructor)
+
+        self.save_mean = mean
+        self.save_minmax = minmax
+        self.save_std = std
+
+        self.create_series()
+
+    @classmethod
+    def series_constructor(cls, mean=True, minmax=True, std=True):
+        def cstr(name, directory, flush_every=1):
+            return cls(name=name, parent_directory=directory,
+                        mean=mean, minmax=minmax, std=std)
+        return cstr
+
+
+    def create_series(self):
+        if self.save_mean:
+            self.means = self.series_constructor(name="mean", directory=self.directory_path)
+
+        if self.save_minmax:
+            self.mins = self.series_constructor(name="min", directory=self.directory_path)
+            self.maxes = self.series_constructor(name="max", directory=self.directory_path)
+
+        if self.save_std:
+            self.stds = self.series_constructor(name="std", directory=self.directory_path)
+
+    def append(self, array):
+        # TODO: shouldn't this be the job of the caller? (at least ParamsArrayStats)
+        if isinstance(array, theano.tensor.sharedvar.TensorSharedVariable):
+            array = array.value
+
+        if self.save_mean:
+            n = numpy.mean(array)
+            self.means.append(n)
+        if self.save_minmax:
+            n = numpy.min(array)
+            self.mins.append(n)
+            n = numpy.max(array)
+            self.maxes.append(n)
+        if self.save_std:
+            n = numpy.std(array)
+            self.stds.append(n)
+
+    def load_from_file(self):
+        self.load_from_directory()
+
+    def load_from_directory(self):
+        if self.save_mean:
+            self.means.load_from_file()
+
+        if self.save_minmax:
+            self.mins.load_from_file()
+            self.maxes.load_from_file()
+
+        if self.save_std:
+            self.stds.load_from_file()
+
+    def graph(self, xes=None):
+        import pylab
+
+        if self.save_minmax:
+            mn = numpy.array(self.mins.tolist())
+            mx = numpy.array(self.maxes.tolist())
+            if self.save_mean:
+                y = numpy.array(self.means.tolist())
+            else:
+                y = (mn+mx) / 2
+
+            above_y = mx - y
+            below_y = y - mn
+
+            if xes is None:
+                xes = numpy.arange(len(y))
+
+            pylab.errorbar(x=xes, y=y, yerr=[below_y, above_y])
+
+        elif self.save_mean:
+            y = numpy.array(self.means.tolist())
+            if xes is None:
+                xes = numpy.arange(len(y))
+
+            pylab.plot(xes, y)
+
+
+class SeriesMultiplexer():
+    def __init__(self):
+        self._series_dict = {}
+        self._warned_for = {}
+
+    def append(self, series_name, item):
+        # if we don't have the series, just don't do anything
+        if self._series_dict.has_key(series_name):
+            s = self._series_dict[series_name]
+            s.append(item)
+        elif not self._warned_for.has_key(series_name):
+            print "WARNING: SeriesMultiplexer called with unknown name ", series_name
+            self._warned_for[series_name] = 1
+
+    def append_list(self, series_name, items):
+        if self._series_dict.has_key(series_name):
+            s = self._series_dict[series_name]
+            s.append_list(items)
+        elif not self._warned_for.has_key(series_name):
+            print "WARNING: SeriesMultiplexer called with unknown name ", series_name
+            self._warned_for[series_name] = 1
+
+    def add_series(self, series):
+        if self._series_dict.has_key(series.name):
+            raise Exception("A series with such a name already exists")
+        self._series_dict[series.name] = series
+
+class SeriesList():
+    def __init__(self, num_elements, name, directory, series_constructor=BaseSeries):
+        self._subseries = [None] * num_elements
+        self.name = name
+
+        for i in range(num_elements):
+            newname = name + "." + str(i)
+            self._subseries[i] = series_constructor(name=newname, directory=directory)
+
+    def load_from_files(self):
+        self.load_from_file()
+
+    def load_from_file(self):
+        for s in self._subseries:
+            s.load_from_file()
+
+    # no "append_list", this would get confusing
+    def append(self, list_of_items):
+        if len(list_of_items) != len(self._subseries):
+            raise Exception("bad number of items, expected " + str(len(self._subseries)) + ", got " + str(len(list_of_items)))
+        for i in range(len(list_of_items)):
+            self._subseries[i].append(list_of_items[i])
+
+
+# Just a shortcut
+class ParamsArrayStats(SeriesList):
+    def __init__(self, num_params_arrays, name, directory):
+        cstr = BasicStatsSeries.series_constructor()
+
+        SeriesList.__init__(self, num_elements=num_params_arrays,
+                                name=name, directory=directory,
+                                series_constructor=cstr)
+
+# ------------------------
+# Utilities to work with the series files from the command line
+
+# "dumpf"
+def dump_floats_file(filepath):
+    print "Floats dump of ", filepath
+    with open(filepath, "rb") as f:
+        s = os.stat(filepath)
+        size = s.st_size
+        num = size / 4
+        a = array.array('f')
+        a.fromfile(f, num)
+        print a.tolist()
+
+if __name__ == '__main__':
+    args = sys.argv[1:]
+
+    if len(args) == 2 and args[0] == "dumpf":
+        file = args[1]
+        dump_floats_file(file)
+    else:
+        print "Bad arguments"
+
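To read a stored series back later, construct a matching BaseSeries and call load_from_file(); a short, runnable sketch (the series name is illustrative):

import tempfile
from utils.scalar_series import BaseSeries

directory = tempfile.mkdtemp()
s = BaseSeries(name="validation_error", directory=directory)
s.append_list([0.5, 0.25, 0.125])   # flushed to directory/validation_error

s2 = BaseSeries(name="validation_error", directory=directory)
s2.load_from_file()
print(s2.tolist())                  # [0.5, 0.25, 0.125]

The same bytes can be inspected from the shell through the "dumpf" entry point above: python series.py dumpf <path-to-series-file>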
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/scalar_series/test_series.py	Mon Mar 01 11:45:25 2010 -0500
@@ -0,0 +1,197 @@
+#!/usr/bin/python
+# coding: utf-8
+
+import sys
+import tempfile
+import os.path
+import os
+
+import numpy
+
+from series import BaseSeries, AccumulatorSeries, SeriesContainer, BasicStatsSeries, SeriesMultiplexer, SeriesList, ParamsArrayStats
+
+
+BASEDIR = tempfile.mkdtemp()
+
+def tempname():
+    # the NamedTemporaryFile is deleted as soon as it goes out of scope;
+    # we only keep its unique path, split into (directory, filename)
+    file = tempfile.NamedTemporaryFile(dir=BASEDIR)
+    filepath = file.name
+    return os.path.split(filepath)
+
+def tempdir():
+    # create a fresh directory per test and split its path into
+    # (parent, name) for SeriesContainer-style constructors
+    wholepath = tempfile.mkdtemp(dir=BASEDIR)
+    return os.path.split(wholepath)
+
+def tempseries(type='f', flush_every=1):
+    dir, filename = tempname()
+
+    s = BaseSeries(name=filename, directory=dir, type=type, flush_every=flush_every)
+
+    return s
+
+def test_Series_storeload():
+    s = tempseries()
+
+    s.append(12.0)
+    s.append_list([13.0,14.0,15.0])
+
+    s2 = BaseSeries(name=s.name, directory=s.directory, flush_every=15)
+    # also check that elements appended before load_from_file() (and never
+    # flushed) are dropped: either they're deleted or the array is restarted
+    # from scratch; both behaviors are acceptable
+    s2.append(10.0)
+    s2.append_list([30.0,40.0])
+    s2.load_from_file()
+
+    assert s2.tolist() == [12.0,13.0,14.0,15.0]
+
+
+def test_AccumulatorSeries_mean():
+    dir, filename = tempname()
+
+    s = AccumulatorSeries(reduce_every=15, mean=True, name=filename, directory=dir)
+
+    for i in range(50):
+        s.append(i)
+
+    assert s.tolist() == [7.0,22.0,37.0]
+
+def test_BasicStatsSeries_commoncase():
+    a1 = numpy.arange(25).reshape((5,5))
+    a2 = numpy.arange(40).reshape((8,5))
+    
+    parent_dir, dir = tempdir()
+
+    bss = BasicStatsSeries(parent_directory=parent_dir, name=dir)
+
+    bss.append(a1)
+    bss.append(a2)
+
+    assert bss.means.tolist() == [12.0, 19.5]
+    assert bss.mins.tolist() == [0.0, 0.0]
+    assert bss.maxes.tolist() == [24.0, 39.0]
+    assert abs(bss.stds.tolist()[0] - 7.211102) < 1e-3
+    assert abs(bss.stds.tolist()[1] - 11.54339) < 1e-3
+
+    # try to reload
+
+    bss2 = BasicStatsSeries(parent_directory=parent_dir, name=dir)
+    bss2.load_from_directory()
+
+    assert bss2.means.tolist() == [12.0, 19.5]
+    assert bss2.mins.tolist() == [0.0, 0.0]
+    assert bss2.maxes.tolist() == [24.0, 39.0]
+    assert abs(bss2.stds.tolist()[0] - 7.211102) < 1e-3
+    assert abs(bss2.stds.tolist()[1] - 11.54339) < 1e-3
+
+def test_BasicStatsSeries_reload():
+    a1 = numpy.arange(25).reshape((5,5))
+    a2 = numpy.arange(40).reshape((8,5))
+    
+    parent_dir, dir = tempdir()
+
+    bss = BasicStatsSeries(parent_directory=parent_dir, name=dir)
+
+    bss.append(a1)
+    bss.append(a2)
+
+    # try to reload
+
+    bss2 = BasicStatsSeries(parent_directory=parent_dir, name=dir)
+    bss2.load_from_directory()
+
+    assert bss2.means.tolist() == [12.0, 19.5]
+    assert bss2.mins.tolist() == [0.0, 0.0]
+    assert bss2.maxes.tolist() == [24.0, 39.0]
+    assert abs(bss2.stds.tolist()[0] - 7.211102) < 1e-3
+    assert abs(bss2.stds.tolist()[1] - 11.54339) < 1e-3
+
+
+def test_BasicStatsSeries_withaccumulator():
+    a1 = numpy.arange(25).reshape((5,5))
+    a2 = numpy.arange(40).reshape((8,5))
+    a3 = numpy.arange(20).reshape((4,5))
+    a4 = numpy.arange(48).reshape((6,8))
+    
+    parent_dir, dir = tempdir()
+
+    sc = AccumulatorSeries.series_constructor(reduce_every=2, mean=False)
+
+    bss = BasicStatsSeries(parent_directory=parent_dir, name=dir, series_constructor=sc)
+
+    bss.append(a1)
+    bss.append(a2)
+    bss.append(a3)
+    bss.append(a4)
+
+    assert bss.means.tolist() == [31.5, 33.0]
+
+def test_SeriesList_withbasicstats():
+    dir = tempfile.mkdtemp(dir=BASEDIR)
+
+    bscstr = BasicStatsSeries.series_constructor()
+
+    slist = SeriesList(num_elements=5, name="foo", directory=dir, series_constructor=bscstr)
+
+    for i in range(10): # 10 elements in each list
+        curlist = []
+        for j in range(5): # 5 = num_elements, i.e. the number of lists to append to
+            dist = numpy.arange(i*j, i*j+10)
+            curlist.append(dist)
+        slist.append(curlist)
+
+    slist2 = SeriesList(num_elements=5, name="foo", directory=dir, series_constructor=bscstr)
+
+    slist2.load_from_files()
+
+    l1 = slist2._subseries[0].means.tolist()
+    l2 = slist2._subseries[4].means.tolist()
+
+    print l1
+    print l2
+
+    assert l1 == [4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5]
+    assert l2 == [4.5, 8.5, 12.5, 16.5, 20.5, 24.5, 28.5, 32.5, 36.5, 40.5]
+
+# same test as above, just with the shortcut
+def test_ParamsArrayStats_reload():
+    dir = tempfile.mkdtemp(dir=BASEDIR)
+
+    slist = ParamsArrayStats(5, name="foo", directory=dir)
+
+    for i in range(10): # 10 elements in each list
+        curlist = []
+        for j in range(5): # 5 = num_elements, i.e. the number of lists to append to
+            dist = numpy.arange(i*j, i*j+10)
+            curlist.append(dist)
+        slist.append(curlist)
+
+    slist2 = ParamsArrayStats(5, name="foo", directory=dir)
+
+    slist2.load_from_files()
+
+    l1 = slist2._subseries[0].means.tolist()
+    l2 = slist2._subseries[4].means.tolist()
+
+    print l1
+    print l2
+
+    assert l1 == [4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5]
+    assert l2 == [4.5, 8.5, 12.5, 16.5, 20.5, 24.5, 28.5, 32.5, 36.5, 40.5]
+
+
+def manual_BasicStatsSeries_graph():
+    parent_dir, dir = tempdir()
+
+    bss = BasicStatsSeries(parent_directory=parent_dir, name=dir)
+
+    for i in range(50):
+        bss.append(1.0/numpy.arange(i*5, i*5+5))
+
+    bss.graph()
+
+#if __name__ == '__main__':
+#    import pylab
+#    manual_BasicStatsSeries_graph()
+#    pylab.show()
+