changeset 143:f341a4efb44a

added adaptive lr, weight file save, training error and error curves
author XavierMuller
date Tue, 23 Feb 2010 18:08:11 -0500
parents 93b4b84d86cf
children c958941c1b9d
files baseline_algorithms/mlp/mlp_nist.py
diffstat 1 files changed, 99 insertions(+), 51 deletions(-)
line diff
--- a/baseline_algorithms/mlp/mlp_nist.py	Tue Feb 16 17:12:35 2010 -0500
+++ b/baseline_algorithms/mlp/mlp_nist.py	Tue Feb 23 18:08:11 2010 -0500
@@ -30,6 +30,7 @@
 import theano.tensor as T
 import time 
 import theano.tensor.nnet
+import pylearn
 from pylearn.io import filetensor as ft
 
 data_path = '/data/lisa/data/nist/by_class/'
@@ -45,7 +46,7 @@
 
 
 
-    def __init__(self, input, n_in, n_hidden, n_out):
+    def __init__(self, input, n_in, n_hidden, n_out,learning_rate):
         """Initialize the parameters for the multilayer perceptron
 
         :param input: symbolic variable that describes the input of the 
@@ -94,8 +95,14 @@
         self.b2 = theano.shared( value = numpy.zeros((n_out,), 
                                                 dtype= theano.config.floatX))
 
+        #include the learning rate in the classifier so
+        #we can modify it on the fly when we want
+        lr_value=learning_rate
+        self.lr=theano.shared(value=lr_value)
         # symbolic expression computing the values of the hidden layer
         self.hidden = T.tanh(T.dot(input, self.W1)+ self.b1)
+        
+        
 
         # symbolic expression computing the values of the top layer 
         self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden, self.W2)+self.b2)
@@ -103,6 +110,10 @@
         # compute prediction as class whose probability is maximal in 
         # symbolic form
         self.y_pred = T.argmax( self.p_y_given_x, axis =1)
+        # prediction restricted to the first 10 (digit) classes
+        self.y_pred_num = T.argmax( self.p_y_given_x[:,0:10], axis =1)
+        
+        
         
         # L1 norm ; one regularization option is to enforce L1 norm to 
         # be small 
@@ -150,21 +161,9 @@
         else:
             raise NotImplementedError()
 
-#def jobman_mlp(state,channel):
-#    (validation_error,test_error,nb_exemples,time)=mlp_full_nist(state.learning_rate,\
- #                                                                state.n_iter,\
- #                                                                state.batch_size,\
- #                                                                state.nb_hidden_units)
- #   state.validation_error = validation_error
- #   state.test_error = test_error
- #   state.nb_exemples = nb_exemples
-  #  state.time=time
-   # return channel.COMPLETE
-
-
-                                                                 
 
 def mlp_full_nist(      verbose = False,\
+                        adaptive_lr = False,\
                         train_data = 'all/all_train_data.ft',\
                         train_labels = 'all/all_train_labels.ft',\
                         test_data = 'all/all_test_data.ft',\
@@ -178,6 +177,14 @@
                         nb_targets = 62):
    
     
+    configuration = [learning_rate,nb_max_exemples,nb_hidden,adaptive_lr]
+    
+    total_validation_error_list = []
+    total_train_error_list = []
+    learning_rate_list=[]
+    best_training_error=float('inf')
+    
+    
    
     f = open(data_path+train_data)
     g= open(data_path+train_labels)
@@ -235,11 +242,17 @@
     y = T.lvector()  # the labels are presented as 1D vector of 
                           # [long int] labels
 
+    if verbose==True:
+        print 'finished parsing the data'
     # construct the logistic regression class
     classifier = MLP( input=x.reshape((batch_size,32*32)),\
                         n_in=32*32,\
                         n_hidden=nb_hidden,\
-                        n_out=nb_targets)
+                        n_out=nb_targets,
+                        learning_rate=learning_rate)
+                        
+                        
+   
 
     # the cost we minimize during training is the negative log likelihood of 
     # the model plus the regularization terms (L1 and L2); cost is expressed
@@ -260,10 +273,10 @@
 
     # specify how to update the parameters of the model as a dictionary
     updates = \
-        { classifier.W1: classifier.W1 - learning_rate*g_W1 \
-        , classifier.b1: classifier.b1 - learning_rate*g_b1 \
-        , classifier.W2: classifier.W2 - learning_rate*g_W2 \
-        , classifier.b2: classifier.b2 - learning_rate*g_b2 }
+        { classifier.W1: classifier.W1 - classifier.lr*g_W1 \
+        , classifier.b1: classifier.b1 - classifier.lr*g_b1 \
+        , classifier.W2: classifier.W2 - classifier.lr*g_W2 \
+        , classifier.b2: classifier.b2 - classifier.lr*g_b2 }
 
     # compiling a theano function `train_model` that returns the cost, but in 
     # the same time updates the parameter of the model based on the rules 
@@ -273,13 +286,17 @@
 
    
    
+    
+   
    
    #conditions for stopping the adaptation:
    #1) we have reached  nb_max_exemples (this is rounded up to be a multiple of the train size)
-   #2) validation error is going up (probable overfitting)
+   #2) the validation error goes up twice in a row (probable overfitting)
    
    # This means we no longer stop on slow convergence as low learning rates stopped
    # too fast. 
+   
+   # the patience_increase / improvement_threshold settings below are no longer used
     patience              =nb_max_exemples/batch_size
     patience_increase     = 2     # wait this much longer when a new best is 
                                   # found
@@ -296,9 +313,9 @@
     test_score           = 0.
     start_time = time.clock()
     n_iter = nb_max_exemples/batch_size  # nb of max times we are allowed to run through all exemples
-    n_iter = n_iter/n_minibatches + 1
+    n_iter = n_iter/n_minibatches + 1 #round up
     n_iter=max(1,n_iter) # run at least once on short debug call
-    # have a maximum of `n_iter` iterations through the entire dataset
+    
    
     if verbose == True:
         print 'looping at most %d times through the data set' %n_iter
@@ -307,7 +324,9 @@
         # get epoch and minibatch index
         epoch           = iter / n_minibatches
         minibatch_index =  iter % n_minibatches
-
+        
+      
+        
         # get the minibatches corresponding to `iter` modulo
         # `len(train_batches)`
         x,y = train_batches[ minibatch_index ]
@@ -317,7 +336,7 @@
 
         if (iter+1) % validation_frequency == 0: 
             # compute zero-one loss on validation set 
-           
+            
             this_validation_loss = 0.
             for x,y in validation_batches:
                 # sum up the errors for each minibatch
@@ -325,26 +344,40 @@
                 this_validation_loss += test_model(x_float,y)
             # get the average by dividing with the number of minibatches
             this_validation_loss /= len(validation_batches)
+            #save the validation loss
+            total_validation_error_list.append(this_validation_loss)
+            
+            #get the training error rate
+            this_train_loss=0
+            for x,y in train_batches:
+                # sum up the errors for each minibatch
+                x_float = x/255.0
+                this_train_loss += test_model(x_float,y)
+            # get the average by dividing with the number of minibatches
+            this_train_loss /= len(train_batches)
+            #save the training error
+            total_train_error_list.append(this_train_loss)
+            if this_train_loss < best_training_error:
+                best_training_error=this_train_loss
+                
             if verbose == True:
-                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
+                print('epoch %i, minibatch %i/%i, validation error %f %%, training error %f %%' % \
                     (epoch, minibatch_index+1, n_minibatches, \
-                        this_validation_loss*100.))
+                        this_validation_loss*100.,this_train_loss*100.))
+                        
+                        
+            #save the learning rate
+            learning_rate_list.append(classifier.lr.value)
 
 
             # if we got the best validation score until now
             if this_validation_loss < best_validation_loss:
-
-                #improve patience if loss improvement is good enough
-                if this_validation_loss < best_validation_loss *  \
-                       improvement_threshold :
-                    patience = max(patience, iter * patience_increase)
-                elif verbose == True:
-                    print 'slow convergence stop'
-
                 # save best validation score and iteration number
                 best_validation_loss = this_validation_loss
                 best_iter = iter
-
+                # reset the patience since the validation error is improving again
+                # so we keep exploring
+                patience=nb_max_exemples/batch_size
                 # test it on the test set
                 test_score = 0.
                 for x,y in test_batches:
@@ -357,33 +390,40 @@
                                 (epoch, minibatch_index+1, n_minibatches,
                                 test_score*100.))
                                 
-            #if the validation error is going up, we are overfitting
-            #stop converging
-            elif this_validation_loss > best_validation_loss:
+            # if the validation error is going up, we are overfitting (or oscillating)
+            # stop training, but run at least until the next validation check
+            # to tell overfitting from oscillation
+            # the saved weights of the model will be a bit off in that case
+            elif this_validation_loss >= best_validation_loss:
                 #calculate the test error at this point and exit
                 # test it on the test set
-                if verbose==True:
-                    print ' We are diverging'
-                best_iter = iter
+                # however, if adaptive_lr is true, try reducing the lr to
+                # get us out of an oscillation
+                if adaptive_lr==True:
+                    classifier.lr.value=classifier.lr.value/2.0
+
                 test_score = 0.
+                #cap the patience so we are allowed one more validation error
+                #calculation before aborting
+                patience = iter+validation_frequency+1
                 for x,y in test_batches:
                     x_float=x/255.0
                     test_score += test_model(x_float,y)
                 test_score /= len(test_batches)
                 if verbose == True:
-                    print ' validation error is going up, stopping now'
+                    print ' validation error is going up, possibly stopping soon'
                     print(('     epoch %i, minibatch %i/%i, test error of best '
                         'model %f %%') % 
                                 (epoch, minibatch_index+1, n_minibatches,
                                 test_score*100.))
                                 
-                break
+                
 
 
-            
-            if patience <= iter :
-               break 
-        
+        if iter>patience:
+            print 'we have diverged'
+            break
+
 
     end_time = time.clock()
     if verbose == True:
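A note on the adaptive learning rate introduced in the hunk above: when adaptive_lr is true, the shared learning rate is simply halved each time the validation error fails to improve. A minimal standalone sketch of that rule in plain Python (illustrative names, not part of the changeset):

def update_learning_rate(lr, this_validation_loss, best_validation_loss, adaptive_lr=True):
    # halve the learning rate whenever the validation error fails to improve,
    # mirroring classifier.lr.value = classifier.lr.value/2.0 above
    if adaptive_lr and this_validation_loss >= best_validation_loss:
        lr = lr / 2.0
    return lr

# e.g. update_learning_rate(0.01, 0.12, 0.10) returns 0.005
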
@@ -391,17 +431,25 @@
             'obtained at iteration %i, with test performance %f %%') %  
                     (best_validation_loss * 100., best_iter, test_score*100.))
         print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
-    print iter
-    return (best_validation_loss * 100.,test_score*100.,best_iter*batch_size,(end_time-start_time)/60)
+        print iter
+        
+    #save the model and the weights
+    numpy.savez('model.npy', config=configuration, W1=classifier.W1.value,W2=classifier.W2.value, b1=classifier.b1.value,b2=classifier.b2.value)
+    numpy.savez('results.npy',config=configuration,total_train_error_list=total_train_error_list,total_validation_error_list=total_validation_error_list,\
+    learning_rate_list=learning_rate_list)
+    
+    return (best_training_error*100.0,best_validation_loss * 100.,test_score*100.,best_iter*batch_size,(end_time-start_time)/60)
 
 
 if __name__ == '__main__':
     mlp_full_mnist()
 
 def jobman_mlp_full_nist(state,channel):
-    (validation_error,test_error,nb_exemples,time)=mlp_full_nist(learning_rate=state.learning_rate,\
+    (train_error,validation_error,test_error,nb_exemples,time)=mlp_full_nist(learning_rate=state.learning_rate,\
                                                                 nb_max_exemples=state.nb_max_exemples,\
-                                                                nb_hidden=state.nb_hidden)
+                                                                nb_hidden=state.nb_hidden,\
+                                                                adaptive_lr=state.adaptive_lr)
+    state.train_error=train_error
     state.validation_error=validation_error
     state.test_error=test_error
     state.nb_exemples=nb_exemples
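
The files written by numpy.savez above can be read back with numpy.load. Note that numpy.savez appends a '.npz' suffix when the given name does not already end in one, so the calls above actually produce 'model.npy.npz' and 'results.npy.npz'. A minimal sketch of loading the results and plotting the error curves (matplotlib is assumed here; it is not part of the changeset):

import numpy
import matplotlib.pyplot as plt

results = numpy.load('results.npy.npz')
plt.plot(results['total_train_error_list'], label='train error')
plt.plot(results['total_validation_error_list'], label='validation error')
plt.xlabel('validation step')
plt.ylabel('error rate')
plt.legend()
plt.show()

# the saved weights can be recovered the same way:
# model = numpy.load('model.npy.npz'); W1 = model['W1']; b1 = model['b1']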