# HG changeset patch
# User fsavard
# Date 1268692221 14400
# Node ID 42005ec87747360de11e3a7313053bc00baed41a
# Parent 9fc641d7adda89273fb5272805e1c6279266ea3e
Merged (manually) Sylvain's changes to use Arnaud's dataset code, except that
I don't use the givens. I probably also take a different approach to limiting
the size of the dataset while debugging.
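
To illustrate the "no givens" part of this change: the sketch below is not
code from this changeset; the model (a bare logistic regression) and the
names build_trainer / n_ins / n_outs / lr are placeholders. The point is only
that the compiled Theano functions now take the minibatch arrays as ordinary
inputs, instead of taking a minibatch index and slicing dataset-sized shared
variables through givens, so they can be fed from any iterator of (x, y)
pairs.

    # Illustrative sketch only (assumed names, simplified model).
    import numpy
    import theano
    import theano.tensor as T

    def build_trainer(n_ins, n_outs, lr=0.1):
        x = T.matrix('x')    # a minibatch of examples, one row per example
        y = T.ivector('y')   # the integer labels of that minibatch
        W = theano.shared(numpy.zeros((n_ins, n_outs), dtype=theano.config.floatX))
        b = theano.shared(numpy.zeros((n_outs,), dtype=theano.config.floatX))
        p_y_given_x = T.nnet.softmax(T.dot(x, W) + b)
        cost = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
        gW, gb = T.grad(cost, [W, b])
        updates = [(W, W - lr * gW), (b, b - lr * gb)]
        # No givens: x and y are plain inputs, so the caller decides where the
        # minibatches come from (here, the ift6266 dataset iterators).
        return theano.function([x, y], cost, updates=updates)

This is why pretrain() and finetune() below can simply loop over
dataset.train(minibatch_size) and call the compiled functions on each (x, y)
pair.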
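
For the "limiting the dataset size" part: instead of truncating the arrays
(as the old NIST.reduce_train_set did), the number of minibatches drawn from
the iterator is capped. Again only a sketch under assumed names (run_epoch,
trainer), with dataset.train standing in for the ift6266 dataset iterators:

    # Illustrative sketch only: stop an arbitrary minibatch iterator after
    # max_count minibatches, so test runs stay short without touching the
    # underlying data files.
    def itermax(it, max_count):
        for i, minibatch in enumerate(it):
            if i >= max_count:
                break
            yield minibatch

    def run_epoch(trainer, dataset, minibatch_size=20, max_minibatches=None):
        it = dataset.train(minibatch_size)        # yields (x, y) minibatches
        if max_minibatches:
            it = itermax(it, max_minibatches)     # debugging: shorter epoch
        costs = [trainer(x, y) for x, y in it]
        return sum(costs) / len(costs)
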
diff -r 9fc641d7adda -r 42005ec87747 deep/stacked_dae/v2/config.py.example
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/stacked_dae/v2/config.py.example	Mon Mar 15 18:30:21 2010 -0400
@@ -0,0 +1,64 @@
+'''
+These are parameters used by nist_sda.py. They'll end up as globals in there.
+
+Rename this file to config.py and configure as needed.
+DON'T add the renamed file to the repository, as others might use it
+without realizing it, with dire consequences.
+'''
+
+# Set this to True when you want to run cluster tests, ie. you want
+# to run on the cluster, many jobs, but want to reduce the training
+# set size and the number of epochs, so you know everything runs
+# fine on the cluster.
+# Set this PRIOR to inserting your test jobs in the DB.
+TEST_CONFIG = False
+
+NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
+NIST_ALL_TRAIN_SIZE = 649081
+# valid and test = 82587 82587
+
+# change "sandbox" when you're ready
+JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/yourtablenamehere'
+EXPERIMENT_PATH = "ift6266.deep.stacked_dae.v2.nist_sda.jobman_entrypoint"
+
+# reduce training set to that many examples
+REDUCE_TRAIN_TO = None
+# that's a max, it usually doesn't get to that point
+MAX_FINETUNING_EPOCHS = 1000
+# number of minibatches before taking means for valid error etc.
+REDUCE_EVERY = 100
+
+if TEST_CONFIG:
+    REDUCE_TRAIN_TO = 1000
+    MAX_FINETUNING_EPOCHS = 2
+    REDUCE_EVERY = 10
+
+
+# This is to configure insertion of jobs on the cluster.
+# Possible values the hyperparameters can take. These are then
+# combined with produit_cartesien_jobs so we get a list of all
+# possible combinations, each one resulting in a job inserted
+# in the jobman DB.
+JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001],
+        'pretraining_epochs_per_layer': [10,20],
+        'hidden_layers_sizes': [300,800],
+        'corruption_levels': [0.1,0.2,0.3],
+        'minibatch_size': [20],
+        'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS],
+        'finetuning_lr':[0.1, 0.01], #0.001 was very bad, so we leave it out
+        'num_hidden_layers':[2,3]}
+
+# Just useful for tests... minimal number of epochs
+# (This is used when running a single job, locally, when
+# calling ./nist_sda.py test_jobman_entrypoint)
+DEFAULT_HP_NIST = DD({'finetuning_lr':0.1,
+                       'pretraining_lr':0.1,
+                       'pretraining_epochs_per_layer':2,
+                       'max_finetuning_epochs':2,
+                       'hidden_layers_sizes':800,
+                       'corruption_levels':0.2,
+                       'minibatch_size':20,
+                       'reduce_train_to':10000,
+                       'num_hidden_layers':1})
+
+
diff -r 9fc641d7adda -r 42005ec87747 deep/stacked_dae/v2/nist_sda.py
--- a/deep/stacked_dae/v2/nist_sda.py	Mon Mar 15 13:22:20 2010 -0400
+++ b/deep/stacked_dae/v2/nist_sda.py	Mon Mar 15 18:30:21 2010 -0400
@@ -29,48 +29,8 @@
 from ift6266.utils.seriestables import *
 import tables
 
-##############################################################################
-# GLOBALS
-
-TEST_CONFIG = False
-
-NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
-JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/fsavard_sda_v2'
-EXPERIMENT_PATH = "ift6266.deep.stacked_dae.v2.nist_sda.jobman_entrypoint"
-
-REDUCE_TRAIN_TO = None
-MAX_FINETUNING_EPOCHS = 1000
-# number of minibatches before taking means for valid error etc.
-REDUCE_EVERY = 100
-
-if TEST_CONFIG:
-    REDUCE_TRAIN_TO = 1000
-    MAX_FINETUNING_EPOCHS = 2
-    REDUCE_EVERY = 10
-
-# Possible values the hyperparameters can take. These are then
-# combined with produit_cartesien_jobs so we get a list of all
-# possible combinations, each one resulting in a job inserted
-# in the jobman DB.
-JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001],
-        'pretraining_epochs_per_layer': [10,20],
-        'hidden_layers_sizes': [300,800],
-        'corruption_levels': [0.1,0.2,0.3],
-        'minibatch_size': [20],
-        'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS],
-        'finetuning_lr':[0.1, 0.01], #0.001 was very bad, so we leave it out
-        'num_hidden_layers':[2,3]}
-
-# Just useful for tests... minimal number of epochs
-DEFAULT_HP_NIST = DD({'finetuning_lr':0.1,
-                       'pretraining_lr':0.1,
-                       'pretraining_epochs_per_layer':2,
-                       'max_finetuning_epochs':2,
-                       'hidden_layers_sizes':800,
-                       'corruption_levels':0.2,
-                       'minibatch_size':20,
-                       'reduce_train_to':10000,
-                       'num_hidden_layers':1})
+from ift6266 import datasets
+from config import *
 
 '''
 Function called by jobman upon launching each job
@@ -82,14 +42,6 @@
     # TODO: remove this, bad for number of simultaneous requests on DB
     channel.save()
 
-    workingdir = os.getcwd()
-
-    print "Will load NIST"
-
-    nist = NIST(minibatch_size=20)
-
-    print "NIST loaded"
-
     # For test runs, we don't want to use the whole dataset so
     # reduce it to fewer elements if asked to.
     rtt = None
@@ -97,29 +49,27 @@
         rtt = state['reduce_train_to']
     elif REDUCE_TRAIN_TO:
         rtt = REDUCE_TRAIN_TO
-
-    if rtt:
-        print "Reducing training set to "+str(rtt)+ " examples"
-        nist.reduce_train_set(rtt)
-
-    train,valid,test = nist.get_tvt()
-    dataset = (train,valid,test)
-
+    
     n_ins = 32*32
     n_outs = 62 # 10 digits, 26*2 (lower, capitals)
+
+    examples_per_epoch = NIST_ALL_TRAIN_SIZE
 
     series = create_series(state.num_hidden_layers)
 
     print "Creating optimizer with state, ", state
 
-    optimizer = SdaSgdOptimizer(dataset=dataset, hyperparameters=state, \
+    optimizer = SdaSgdOptimizer(dataset=datasets.nist_all,
+                                    hyperparameters=state, \
                                     n_ins=n_ins, n_outs=n_outs,\
-                                    input_divider=255.0, series=series)
+                                    examples_per_epoch=examples_per_epoch, \
+                                    series=series,
+                                    max_minibatches=rtt)
 
-    optimizer.pretrain()
+    optimizer.pretrain(datasets.nist_all)
     channel.save()
 
-    optimizer.finetune()
+    optimizer.finetune(datasets.nist_all)
     channel.save()
 
     return channel.COMPLETE
@@ -200,93 +150,14 @@
 
     print "inserted"
 
-class NIST:
-    def __init__(self, minibatch_size, basepath=None, reduce_train_to=None):
-        global NIST_ALL_LOCATION
-
-        self.minibatch_size = minibatch_size
-        self.basepath = basepath and basepath or NIST_ALL_LOCATION
-
-        self.set_filenames()
-
-        # arrays of 2 elements: .x, .y
-        self.train = [None, None]
-        self.test = [None, None]
-
-        self.load_train_test()
-
-        self.valid = [[], []]
-        self.split_train_valid()
-        if reduce_train_to:
-            self.reduce_train_set(reduce_train_to)
-
-    def get_tvt(self):
-        return self.train, self.valid, self.test
-
-    def set_filenames(self):
-        self.train_files = ['all_train_data.ft',
-                            'all_train_labels.ft']
-
-        self.test_files = ['all_test_data.ft',
-                            'all_test_labels.ft']
-
-    def load_train_test(self):
-        self.load_data_labels(self.train_files, self.train)
-        self.load_data_labels(self.test_files, self.test)
-
-    def load_data_labels(self, filenames, pair):
-        for i, fn in enumerate(filenames):
-            f = open(os.path.join(self.basepath, fn))
-            pair[i] = filetensor.read(f)
-            f.close()
-
-    def reduce_train_set(self, max):
-        self.train[0] = self.train[0][:max]
-        self.train[1] = self.train[1][:max]
-
-        if max < len(self.test[0]):
-            for ar in (self.test, self.valid):
-                ar[0] = ar[0][:max]
-                ar[1] = ar[1][:max]
-
-    def split_train_valid(self):
-        test_len = len(self.test[0])
-
-        new_train_x = self.train[0][:-test_len]
-        new_train_y = self.train[1][:-test_len]
-
-        self.valid[0] = self.train[0][-test_len:]
-        self.valid[1] = self.train[1][-test_len:]
-
-        self.train[0] = new_train_x
-        self.train[1] = new_train_y
-
-def test_load_nist():
-    print "Will load NIST"
-
-    import time
-    t1 = time.time()
-    nist = NIST(20)
-    t2 = time.time()
-
-    print "NIST loaded. time delta = ", t2-t1
-
-    tr,v,te = nist.get_tvt()
-
-    print "Lenghts: ", len(tr[0]), len(v[0]), len(te[0])
-
-    raw_input("Press any key")
-
 if __name__ == '__main__':
 
-    import sys
-
     args = sys.argv[1:]
 
-    if len(args) > 0 and args[0] == 'load_nist':
-        test_load_nist()
+    #if len(args) > 0 and args[0] == 'load_nist':
+    #    test_load_nist()
 
-    elif len(args) > 0 and args[0] == 'jobman_insert':
+    if len(args) > 0 and args[0] == 'jobman_insert':
         jobman_insert_nist()
 
     elif len(args) > 0 and args[0] == 'test_jobman_entrypoint':
diff -r 9fc641d7adda -r 42005ec87747 deep/stacked_dae/v2/sgd_optimization.py
--- a/deep/stacked_dae/v2/sgd_optimization.py	Mon Mar 15 13:22:20 2010 -0400
+++ b/deep/stacked_dae/v2/sgd_optimization.py	Mon Mar 15 18:30:21 2010 -0400
@@ -17,19 +17,6 @@
 
 from ift6266.utils.seriestables import *
 
-def shared_dataset(data_xy):
-    data_x, data_y = data_xy
-    if theano.config.device.startswith("gpu"):
-        print "TRANSFERING DATASETS (via shared()) TO GPU"
-        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX))
-        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
-        shared_y = T.cast(shared_y, 'int32')
-    else:
-        print "WILL RUN ON CPU, NOT GPU, SO DATASETS REMAIN IN BYTES"
-        shared_x = theano.shared(data_x)
-        shared_y = theano.shared(data_y)
-    return shared_x, shared_y
-
 default_series = { \
     'reconstruction_error' : DummySeries(),
     'training_error' : DummySeries(),
@@ -38,37 +25,33 @@
     'params' : DummySeries()
     }
 
 
+def itermax(iter, max):
+    for i,it in enumerate(iter):
+        if i >= max:
+            break
+        yield it
+
 class SdaSgdOptimizer:
-    def __init__(self, dataset, hyperparameters, n_ins, n_outs, input_divider=1.0, series=default_series):
+    def __init__(self, dataset, hyperparameters, n_ins, n_outs,
+                    examples_per_epoch, series=default_series, max_minibatches=None):
         self.dataset = dataset
         self.hp = hyperparameters
         self.n_ins = n_ins
         self.n_outs = n_outs
-        self.input_divider = input_divider
+
+        self.max_minibatches = max_minibatches
+        print "SdaSgdOptimizer, max_minibatches =", max_minibatches
+
+        self.ex_per_epoch = examples_per_epoch
+        self.mb_per_epoch = examples_per_epoch / self.hp.minibatch_size
+
         self.series = series
 
         self.rng = numpy.random.RandomState(1234)
 
-        self.init_datasets()
         self.init_classifier()
 
         sys.stdout.flush()
-
-    def init_datasets(self):
-        print "init_datasets"
-        sys.stdout.flush()
-
-        train_set, valid_set, test_set = self.dataset
-        self.test_set_x, self.test_set_y = shared_dataset(test_set)
-        self.valid_set_x, self.valid_set_y = shared_dataset(valid_set)
-        self.train_set_x, self.train_set_y = shared_dataset(train_set)
-
-        # compute number of minibatches for training, validation and testing
-        self.n_train_batches = self.train_set_x.value.shape[0] / self.hp.minibatch_size
-        self.n_valid_batches = self.valid_set_x.value.shape[0] / self.hp.minibatch_size
-        # remove last batch in case it's incomplete
-        self.n_test_batches = (self.test_set_x.value.shape[0] / self.hp.minibatch_size) - 1
 
     def init_classifier(self):
         print "Constructing classifier"
@@ -81,8 +64,6 @@
 
         # construct the stacked denoising autoencoder class
         self.classifier = SdA( \
-                      train_set_x= self.train_set_x, \
-                      train_set_y = self.train_set_y,\
                       batch_size = self.hp.minibatch_size, \
                       n_ins= self.n_ins, \
                       hidden_layers_sizes = layers_sizes, \
@@ -90,18 +71,17 @@
                      corruption_levels = corruption_levels,\
                       rng = self.rng,\
                       pretrain_lr = self.hp.pretraining_lr, \
-                      finetune_lr = self.hp.finetuning_lr,\
-                      input_divider = self.input_divider )
+                      finetune_lr = self.hp.finetuning_lr)
 
         #theano.printing.pydotprint(self.classifier.pretrain_functions[0], "function.graph")
 
         sys.stdout.flush()
 
     def train(self):
-        self.pretrain()
-        self.finetune()
+        self.pretrain(self.dataset)
+        self.finetune(self.dataset)
 
-    def pretrain(self):
+    def pretrain(self,dataset):
         print "STARTING PRETRAINING, time = ", datetime.datetime.now()
         sys.stdout.flush()
 
@@ -111,10 +91,19 @@
         # go through pretraining epochs
         for epoch in xrange(self.hp.pretraining_epochs_per_layer):
             # go through the training set
-            for batch_index in xrange(self.n_train_batches):
-                c = self.classifier.pretrain_functions[i](batch_index)
+            batch_index=0
+            for x,y in dataset.train(self.hp.minibatch_size):
+                c = self.classifier.pretrain_functions[i](x)
 
                 self.series["reconstruction_error"].append((epoch, batch_index), c)
+                batch_index+=1
+
+                if batch_index % 10000 == 0:
+                    print "10000 batches"
+
+                # useful when doing tests
+                if self.max_minibatches and batch_index >= self.max_minibatches:
+                    break
 
            print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c
            sys.stdout.flush()
@@ -128,24 +117,26 @@
 
        sys.stdout.flush()
 
-    def finetune(self):
+    def finetune(self,dataset):
        print "STARTING FINETUNING, time = ", datetime.datetime.now()
 
-       index   = T.lscalar()    # index to a [mini]batch
        minibatch_size = self.hp.minibatch_size
 
        # create a function to compute the mistakes that are made by the model
        # on the validation set, or testing set
-       shared_divider = theano.shared(numpy.asarray(self.input_divider, dtype=theano.config.floatX))
-       test_model = theano.function([index], self.classifier.errors,
-                givens = {
-                  self.classifier.x: self.test_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider,
-                  self.classifier.y: self.test_set_y[index*minibatch_size:(index+1)*minibatch_size]})
+       test_model = \
+           theano.function(
+               [self.classifier.x,self.classifier.y], self.classifier.errors)
+       #         givens = {
+       #           self.classifier.x: ensemble_x,
+       #           self.classifier.y: ensemble_y]})
 
-       validate_model = theano.function([index], self.classifier.errors,
-               givens = {
-                  self.classifier.x: self.valid_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider,
-                  self.classifier.y: self.valid_set_y[index*minibatch_size:(index+1)*minibatch_size]})
+       validate_model = \
+           theano.function(
+               [self.classifier.x,self.classifier.y], self.classifier.errors)
+       #         givens = {
+       #           self.classifier.x: ,
+       #           self.classifier.y: ]})
 
 
       # early-stopping parameters
@@ -154,7 +145,7 @@
                                      # found
       improvement_threshold = 0.995 # a relative improvement of this much is
                                     # considered significant
-       validation_frequency  = min(self.n_train_batches, patience/2)
+       validation_frequency  = min(self.mb_per_epoch, patience/2)
                                     # go through this many
                                     # minibatche before checking the network
                                     # on the validation set; in this case we
@@ -168,18 +159,24 @@
 
        done_looping = False
        epoch = 0
 
+       total_mb_index = 0
+
        while (epoch < self.hp.max_finetuning_epochs) and (not done_looping):
            epoch = epoch + 1
-           for minibatch_index in xrange(self.n_train_batches):
-
-               cost_ij = self.classifier.finetune(minibatch_index)
-               iter    = epoch * self.n_train_batches + minibatch_index
+           minibatch_index = -1
+           for x,y in dataset.train(minibatch_size):
+               minibatch_index += 1
+               cost_ij = self.classifier.finetune(x,y)
+               total_mb_index += 1
 
               self.series["training_error"].append((epoch, minibatch_index), cost_ij)
 
-               if (iter+1) % validation_frequency == 0:
+               if (total_mb_index+1) % validation_frequency == 0:
 
-                   validation_losses = [validate_model(i) for i in xrange(self.n_valid_batches)]
+                   iter = dataset.valid(minibatch_size)
+                   if self.max_minibatches:
+                       iter = itermax(iter, self.max_minibatches)
+                   validation_losses = [validate_model(x,y) for x,y in iter]
                   this_validation_loss = numpy.mean(validation_losses)
 
                   self.series["validation_error"].\
@@ -196,14 +193,17 @@
                   #improve patience if loss improvement is good enough
                   if this_validation_loss < best_validation_loss *  \
                          improvement_threshold :
-                       patience = max(patience, iter * patience_increase)
+                       patience = max(patience, total_mb_index * patience_increase)
 
                       # save best validation score and iteration number
                       best_validation_loss = this_validation_loss
-                       best_iter = iter
+                       best_iter = total_mb_index
 
                       # test it on the test set
-                       test_losses = [test_model(i) for i in xrange(self.n_test_batches)]
+                       iter = dataset.test(minibatch_size)
+                       if self.max_minibatches:
+                           iter = itermax(iter, self.max_minibatches)
+                       test_losses = [test_model(x,y) for x,y in iter]
                       test_score = numpy.mean(test_losses)
 
                       self.series["test_error"].\
@@ -216,9 +216,13 @@
 
 
                   sys.stdout.flush()
 
+               # useful when doing tests
+               if self.max_minibatches and minibatch_index >= self.max_minibatches:
+                   break
+
           self.series['params'].append((epoch,), self.classifier.all_params)
 
-           if patience <= iter :
+           if patience <= total_mb_index:
               done_looping = True
               break
 
diff -r 9fc641d7adda -r 42005ec87747 deep/stacked_dae/v2/stacked_dae.py
--- a/deep/stacked_dae/v2/stacked_dae.py	Mon Mar 15 13:22:20 2010 -0400
+++ b/deep/stacked_dae/v2/stacked_dae.py	Mon Mar 15 18:30:21 2010 -0400
@@ -165,9 +165,9 @@
 
 
 class SdA(object):
-    def __init__(self, train_set_x, train_set_y, batch_size, n_ins,
+    def __init__(self, batch_size, n_ins,
                  hidden_layers_sizes, n_outs,
-                 corruption_levels, rng, pretrain_lr, finetune_lr, input_divider=1.0):
+                 corruption_levels, rng, pretrain_lr, finetune_lr):
         # Just to make sure those are not modified somewhere else afterwards
         hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes)
         corruption_levels = copy.deepcopy(corruption_levels)
@@ -190,17 +190,14 @@
         print "n_outs", n_outs
         print "pretrain_lr", pretrain_lr
         print "finetune_lr", finetune_lr
-        print "input_divider", input_divider
         print "----"
 
-        self.shared_divider = theano.shared(numpy.asarray(input_divider, dtype=theano.config.floatX))
-
         if len(hidden_layers_sizes) < 1 :
             raiseException (' You must have at least one hidden layer ')
 
 
         # allocate symbolic variables for the data
-        index   = T.lscalar()    # index to a [mini]batch
+        #index   = T.lscalar()    # index to a [mini]batch
         self.x  = T.matrix('x')  # the data is presented as rasterized images
         self.y  = T.ivector('y') # the labels are presented as 1D vector of
                                  # [int] labels
@@ -247,10 +244,15 @@
                 updates[param] = param - gparam * pretrain_lr
 
             # create a function that trains the dA
-            update_fn = theano.function([index], dA_layer.cost, \
-                  updates = updates,
-                  givens = {
-                     self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider})
+            update_fn = theano.function([self.x], dA_layer.cost, \
+                  updates = updates)#,
+            #      givens = {
+            #         self.x : ensemble})
+            # collect this function into a list
+            #update_fn = theano.function([index], dA_layer.cost, \
+            #      updates = updates,
+            #      givens = {
+            #         self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider})
             # collect this function into a list
             self.pretrain_functions += [update_fn]
 
@@ -273,11 +275,11 @@
         for param,gparam in zip(self.params, gparams):
             updates[param] = param - gparam*finetune_lr
 
-        self.finetune = theano.function([index], cost,
-                updates = updates,
-                givens = {
-                  self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider,
-                  self.y : train_set_y[index*batch_size:(index+1)*batch_size]} )
+        self.finetune = theano.function([self.x,self.y], cost,
+                updates = updates)#,
+        #        givens = {
+        #          self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider,
+        #          self.y : train_set_y[index*batch_size:(index+1)*batch_size]} )
 
         # symbolic variable that points to the number of errors made on the
         # minibatch given by self.x and self.y