diff deep/stacked_dae/v2/sgd_optimization.py @ 239:42005ec87747

Mergé (manuellement) les changements de Sylvain pour utiliser le code de dataset d'Arnaud, à cette différence près que je n'utilise pas les givens. J'ai probablement une approche différente pour limiter la taille du dataset dans mon débuggage, aussi.
author fsavard
date Mon, 15 Mar 2010 18:30:21 -0400
parents 02eb98d051fe
children f213a0fb2b08
line wrap: on
line diff
--- a/deep/stacked_dae/v2/sgd_optimization.py	Mon Mar 15 13:22:20 2010 -0400
+++ b/deep/stacked_dae/v2/sgd_optimization.py	Mon Mar 15 18:30:21 2010 -0400
@@ -17,19 +17,6 @@
 
 from ift6266.utils.seriestables import *
 
-def shared_dataset(data_xy):
-    data_x, data_y = data_xy
-    if theano.config.device.startswith("gpu"):
-        print "TRANSFERING DATASETS (via shared()) TO GPU"
-        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX))
-        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
-        shared_y = T.cast(shared_y, 'int32')
-    else:
-        print "WILL RUN ON CPU, NOT GPU, SO DATASETS REMAIN IN BYTES"
-        shared_x = theano.shared(data_x)
-        shared_y = theano.shared(data_y)
-    return shared_x, shared_y
-
 default_series = { \
         'reconstruction_error' : DummySeries(),
         'training_error' : DummySeries(),
@@ -38,37 +25,33 @@
         'params' : DummySeries()
         }
 
+def itermax(iter, max):  # generator: yield at most `max` leading items of `iter`
+    for i,it in enumerate(iter):
+        if i >= max:  # cap reached; stop without consuming further items
+            break
+        yield it  # the item itself, not its index: callers unpack (x, y) pairs
+
 class SdaSgdOptimizer:
-    def __init__(self, dataset, hyperparameters, n_ins, n_outs, input_divider=1.0, series=default_series):
+    def __init__(self, dataset, hyperparameters, n_ins, n_outs,
+                    examples_per_epoch, series=default_series, max_minibatches=None):
         self.dataset = dataset
         self.hp = hyperparameters
         self.n_ins = n_ins
         self.n_outs = n_outs
-        self.input_divider = input_divider
    
+        self.max_minibatches = max_minibatches
+        print "SdaSgdOptimizer, max_minibatches =", max_minibatches
+
+        self.ex_per_epoch = examples_per_epoch
+        self.mb_per_epoch = examples_per_epoch / self.hp.minibatch_size
+
         self.series = series
 
         self.rng = numpy.random.RandomState(1234)
 
-        self.init_datasets()
         self.init_classifier()
 
         sys.stdout.flush()
-     
-    def init_datasets(self):
-        print "init_datasets"
-        sys.stdout.flush()
-
-        train_set, valid_set, test_set = self.dataset
-        self.test_set_x, self.test_set_y = shared_dataset(test_set)
-        self.valid_set_x, self.valid_set_y = shared_dataset(valid_set)
-        self.train_set_x, self.train_set_y = shared_dataset(train_set)
-
-        # compute number of minibatches for training, validation and testing
-        self.n_train_batches = self.train_set_x.value.shape[0] / self.hp.minibatch_size
-        self.n_valid_batches = self.valid_set_x.value.shape[0] / self.hp.minibatch_size
-        # remove last batch in case it's incomplete
-        self.n_test_batches  = (self.test_set_x.value.shape[0]  / self.hp.minibatch_size) - 1
 
     def init_classifier(self):
         print "Constructing classifier"
@@ -81,8 +64,6 @@
 
         # construct the stacked denoising autoencoder class
         self.classifier = SdA( \
-                          train_set_x= self.train_set_x, \
-                          train_set_y = self.train_set_y,\
                           batch_size = self.hp.minibatch_size, \
                           n_ins= self.n_ins, \
                           hidden_layers_sizes = layers_sizes, \
@@ -90,18 +71,17 @@
                           corruption_levels = corruption_levels,\
                           rng = self.rng,\
                           pretrain_lr = self.hp.pretraining_lr, \
-                          finetune_lr = self.hp.finetuning_lr,\
-                          input_divider = self.input_divider )
+                          finetune_lr = self.hp.finetuning_lr)
 
         #theano.printing.pydotprint(self.classifier.pretrain_functions[0], "function.graph")
 
         sys.stdout.flush()
 
     def train(self):
-        self.pretrain()
-        self.finetune()
+        self.pretrain(self.dataset)
+        self.finetune(self.dataset)
 
-    def pretrain(self):
+    def pretrain(self,dataset):
         print "STARTING PRETRAINING, time = ", datetime.datetime.now()
         sys.stdout.flush()
 
@@ -111,10 +91,19 @@
             # go through pretraining epochs 
             for epoch in xrange(self.hp.pretraining_epochs_per_layer):
                 # go through the training set
-                for batch_index in xrange(self.n_train_batches):
-                    c = self.classifier.pretrain_functions[i](batch_index)
+                batch_index=0
+                for x,y in dataset.train(self.hp.minibatch_size):
+                    c = self.classifier.pretrain_functions[i](x)
 
                     self.series["reconstruction_error"].append((epoch, batch_index), c)
+                    batch_index+=1
+
+                    if batch_index % 10000 == 0:
+                        print "10000 batches"
+
+                    # useful when doing tests
+                    if self.max_minibatches and batch_index >= self.max_minibatches:
+                        break
                         
                 print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c
                 sys.stdout.flush()
@@ -128,24 +117,26 @@
 
         sys.stdout.flush()
 
-    def finetune(self):
+    def finetune(self,dataset):
         print "STARTING FINETUNING, time = ", datetime.datetime.now()
 
-        index   = T.lscalar()    # index to a [mini]batch 
         minibatch_size = self.hp.minibatch_size
 
         # create a function to compute the mistakes that are made by the model
         # on the validation set, or testing set
-        shared_divider = theano.shared(numpy.asarray(self.input_divider, dtype=theano.config.floatX))
-        test_model = theano.function([index], self.classifier.errors,
-                 givens = {
-                   self.classifier.x: self.test_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider,
-                   self.classifier.y: self.test_set_y[index*minibatch_size:(index+1)*minibatch_size]})
+        test_model = \
+            theano.function(
+                [self.classifier.x,self.classifier.y], self.classifier.errors)
+        #         givens = {
+        #           self.classifier.x: ensemble_x,
+        #           self.classifier.y: ensemble_y]})
 
-        validate_model = theano.function([index], self.classifier.errors,
-                givens = {
-                   self.classifier.x: self.valid_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider,
-                   self.classifier.y: self.valid_set_y[index*minibatch_size:(index+1)*minibatch_size]})
+        validate_model = \
+            theano.function(
+                [self.classifier.x,self.classifier.y], self.classifier.errors)
+        #        givens = {
+        #           self.classifier.x: ,
+        #           self.classifier.y: ]})
 
 
         # early-stopping parameters
@@ -154,7 +145,7 @@
                                       # found
         improvement_threshold = 0.995 # a relative improvement of this much is 
                                       # considered significant
-        validation_frequency  = min(self.n_train_batches, patience/2)
+        validation_frequency  = min(self.mb_per_epoch, patience/2)
                                       # go through this many 
                                       # minibatche before checking the network 
                                       # on the validation set; in this case we 
@@ -168,18 +159,24 @@
         done_looping = False
         epoch = 0
 
+        total_mb_index = 0
+
         while (epoch < self.hp.max_finetuning_epochs) and (not done_looping):
             epoch = epoch + 1
-            for minibatch_index in xrange(self.n_train_batches):
-
-                cost_ij = self.classifier.finetune(minibatch_index)
-                iter    = epoch * self.n_train_batches + minibatch_index
+            minibatch_index = -1
+            for x,y in dataset.train(minibatch_size):
+                minibatch_index += 1
+                cost_ij = self.classifier.finetune(x,y)
+                total_mb_index += 1
 
                 self.series["training_error"].append((epoch, minibatch_index), cost_ij)
 
-                if (iter+1) % validation_frequency == 0: 
+                if (total_mb_index+1) % validation_frequency == 0: 
                     
-                    validation_losses = [validate_model(i) for i in xrange(self.n_valid_batches)]
+                    iter = dataset.valid(minibatch_size)
+                    if self.max_minibatches:
+                        iter = itermax(iter, self.max_minibatches)
+                    validation_losses = [validate_model(x,y) for x,y in iter]
                     this_validation_loss = numpy.mean(validation_losses)
 
                     self.series["validation_error"].\
@@ -196,14 +193,17 @@
                         #improve patience if loss improvement is good enough
                         if this_validation_loss < best_validation_loss *  \
                                improvement_threshold :
-                            patience = max(patience, iter * patience_increase)
+                            patience = max(patience, total_mb_index * patience_increase)
 
                         # save best validation score and iteration number
                         best_validation_loss = this_validation_loss
-                        best_iter = iter
+                        best_iter = total_mb_index
 
                         # test it on the test set
-                        test_losses = [test_model(i) for i in xrange(self.n_test_batches)]
+                        iter = dataset.test(minibatch_size)
+                        if self.max_minibatches:
+                            iter = itermax(iter, self.max_minibatches)
+                        test_losses = [test_model(x,y) for x,y in iter]
                         test_score = numpy.mean(test_losses)
 
                         self.series["test_error"].\
@@ -216,9 +216,13 @@
 
                     sys.stdout.flush()
 
+                # useful when doing tests
+                if self.max_minibatches and batch_index >= self.max_minibatches:
+                    break
+
             self.series['params'].append((epoch,), self.classifier.all_params)
 
-            if patience <= iter :
+            if patience <= total_mb_index:
                 done_looping = True
                 break