# HG changeset patch
# User xaviermuller
# Date 1270584052 14400
# Node ID 7439073664766a050410677633b73a019b49c046
# Parent 403b9e6ecfaa10cdf53cadd73fe525ee03773f4c
code clean up in progress

diff -r 403b9e6ecfaa -r 743907366476 baseline/mlp/mlp_nist.py
--- a/baseline/mlp/mlp_nist.py	Fri Apr 02 14:54:05 2010 -0400
+++ b/baseline/mlp/mlp_nist.py	Tue Apr 06 16:00:52 2010 -0400
@@ -33,6 +33,7 @@
 import pylearn
 import theano,pylearn.version,ift6266
 from pylearn.io import filetensor as ft
+from ift6266 import datasets
 
 data_path = '/data/lisa/data/nist/by_class/'
 
@@ -165,16 +166,13 @@
 
 def mlp_full_nist( verbose = 1,\
                    adaptive_lr = 0,\
-                   train_data = 'all/all_train_data.ft',\
-                   train_labels = 'all/all_train_labels.ft',\
-                   test_data = 'all/all_test_data.ft',\
-                   test_labels = 'all/all_test_labels.ft',\
+                   data_set=0,\
                    learning_rate=0.01,\
                    L1_reg = 0.00,\
                    L2_reg = 0.0001,\
                    nb_max_exemples=1000000,\
                    batch_size=20,\
-                   nb_hidden = 500,\
+                   nb_hidden = 30,\
                    nb_targets = 62,
                    tau=1e6,\
                    lr_t2_factor=0.5):
@@ -190,57 +188,11 @@
     learning_rate_list=[]
     best_training_error=float('inf');
     
+    if data_set==0:
+        dataset=datasets.nist_all()
     
-    
-    f = open(data_path+train_data)
-    g = open(data_path+train_labels)
-    h = open(data_path+test_data)
-    i = open(data_path+test_labels)
-    raw_train_data = ft.read(f)
-    raw_train_labels = ft.read(g)
-    raw_test_data = ft.read(h)
-    raw_test_labels = ft.read(i)
-    
-    f.close()
-    g.close()
-    i.close()
-    h.close()
-    #create a validation set the same size as the test size
-    #use the end of the training array for this purpose
-    #discard the last remaining so we get a %batch_size number
-    test_size=len(raw_test_labels)
-    test_size = int(test_size/batch_size)
-    test_size*=batch_size
-    train_size = len(raw_train_data)
-    train_size = int(train_size/batch_size)
-    train_size*=batch_size
-    validation_size = test_size
-    offset = train_size-test_size
-    if verbose == 1:
-        print 'train size = %d' %train_size
-        print 'test size = %d' %test_size
-        print 'valid size = %d' %validation_size
-        print 'offset = %d' %offset
-    
-    
-    train_set = (raw_train_data,raw_train_labels)
-    train_batches = []
-    for i in xrange(0, train_size-test_size, batch_size):
-        train_batches = train_batches + \
-            [(raw_train_data[i:i+batch_size], raw_train_labels[i:i+batch_size])]
-    
-    test_batches = []
-    for i in xrange(0, test_size, batch_size):
-        test_batches = test_batches + \
-            [(raw_test_data[i:i+batch_size], raw_test_labels[i:i+batch_size])]
-    
-    validation_batches = []
-    for i in xrange(0, test_size, batch_size):
-        validation_batches = validation_batches + \
-            [(raw_train_data[offset+i:offset+i+batch_size], raw_train_labels[offset+i:offset+i+batch_size])]
-    
     
     ishape     = (32,32) # this is the size of NIST images
 
@@ -249,10 +201,9 @@
     y = T.lvector()  # the labels are presented as 1D vector of
                      # [long int] labels
 
-    if verbose==1:
-        print 'finished parsing the data'
+    
     # construct the logistic regression class
-    classifier = MLP( input=x.reshape((batch_size,32*32)),\
+    classifier = MLP( input=x,\
                         n_in=32*32,\
                         n_hidden=nb_hidden,\
                         n_out=nb_targets,
@@ -289,7 +240,9 @@
     # the same time updates the parameter of the model based on the rules
    # defined in `updates`
     train_model = theano.function([x, y], cost, updates = updates )
-    n_minibatches = len(train_batches)
+    
+    
+    
 
 
 
@@ -303,8 +256,13 @@
     # This means we no longer stop on slow convergence as low learning rates stopped
     # too fast.
 
-    # no longer relevant
-    patience              =nb_max_exemples/batch_size
+    #approximate number of samples in the training set
+    #this is just to have a validation frequency
+    #roughly proportional to the training set
+    n_minibatches = 650000/batch_size
+    
+    
+    patience              =nb_max_exemples/batch_size #in units of minibatch
     patience_increase     = 2       # wait this much longer when a new best is
                                     # found
     improvement_threshold = 0.995   # a relative improvement of this much is
@@ -314,139 +272,121 @@
 
 
-    best_params          = None
+    
     best_validation_loss = float('inf')
     best_iter            = 0
     test_score           = 0.
     start_time = time.clock()
-    n_iter = nb_max_exemples/batch_size  # nb of max times we are allowed to run through all exemples
-    n_iter = n_iter/n_minibatches + 1 #round up
-    n_iter=max(1,n_iter) # run at least once on short debug call
     time_n=0 #in unit of exemples
+    minibatch_index=0
+    epoch=0
+    temp=0
+    
 
-    if verbose == 1:
-        print 'looping at most %d times through the data set' %n_iter
-    for iter in xrange(n_iter* n_minibatches):
-
-        # get epoch and minibatch index
-        epoch           = iter / n_minibatches
-        minibatch_index =  iter % n_minibatches
-
-
-        if adaptive_lr==2:
-            classifier.lr.value = tau*initial_lr/(tau+time_n)
-
+    print 'looking at at most %i examples' %nb_max_exemples
+    while(minibatch_index*batch_size<nb_max_exemples):
+        
+        for x, y in dataset.train(batch_size):
+            
+            minibatch_index = minibatch_index + 1
+            if adaptive_lr==2:
+                classifier.lr.value = tau*initial_lr/(tau+time_n)
+            
+            #train model
+            cost_ij = train_model(x,y)
+            
+            if (minibatch_index+1) % validation_frequency == 0:
+                
+                #save the current learning rate
+                learning_rate_list.append(classifier.lr.value)
+                
+                # compute the validation error
+                this_validation_loss = 0.
+                temp = 0
+                for xv,yv in dataset.valid(batch_size):
+                    # sum up the errors for each minibatch
+                    this_validation_loss += test_model(xv,yv)
+                    temp = temp+1
+                # get the average by dividing with the number of minibatches
+                this_validation_loss /= temp
 
-        #train model
-        x_float = x/255.0
-        cost_ij = train_model(x_float,y)
-
-        if (iter+1) % validation_frequency == 0:
-
-            #save the current learning rate
-            learning_rate_list.append(classifier.lr.value)
-
-            # compute the validation error
-            this_validation_loss = 0.
-            for x,y in validation_batches:
-                # sum up the errors for each minibatch
-                x_float = x/255.0
-                this_validation_loss += test_model(x_float,y)
-            # get the average by dividing with the number of minibatches
-            this_validation_loss /= len(validation_batches)
-            if verbose == 1:
-                print(('epoch %i, minibatch %i/%i, learning rate %f, validation error %f %%') %
-                    (epoch, minibatch_index+1, n_minibatches,
-                    classifier.lr.value, this_validation_loss*100.))
-
-            # if we got the best validation score until now
-            if this_validation_loss < best_validation_loss:
-                # save best validation score and iteration number
-                best_validation_loss = this_validation_loss
-                best_iter = iter
-                # reset patience if we are going down again
-                # so we continue exploring
-                patience=nb_max_exemples/batch_size
-                # test it on the test set
-                test_score = 0.
-                for x,y in test_batches:
-                    x_float=x/255.0
-                    test_score += test_model(x_float,y)
-                test_score /= len(test_batches)
-                if verbose == 1:
-                    print(('epoch %i, minibatch %i/%i, test error of best '
-                        'model %f %%') %
-                        (epoch, minibatch_index+1, n_minibatches,
-                        test_score*100.))
-
-            # if the validation error is going up, we are overfitting
-            # stop converging but run at least to next validation
-            # to check overfitting or oscillation
-            # the saved weights of the model will be a bit off in that case
-            elif this_validation_loss >= best_validation_loss:
-                #calculate the test error at this point and exit
-                # test it on the test set
-                # however, if adaptive_lr is true, try reducing the lr to
-                # get us out of an oscilliation
-                if adaptive_lr==1:
-                    classifier.lr.value=classifier.lr.value*lr_t2_factor
-
-                test_score = 0.
-                #cap the patience so we are allowed one more validation error
-                #calculation before aborting
-                patience = iter+validation_frequency+1
-                for x,y in test_batches:
-                    x_float=x/255.0
-                    test_score += test_model(x_float,y)
-                test_score /= len(test_batches)
-                if verbose == 1:
-                    print ' validation error is going up, possibly stopping soon'
-                    print(('     epoch %i, minibatch %i/%i, test error of best '
-                        'model %f %%') %
-                        (epoch, minibatch_index+1, n_minibatches,
-                        test_score*100.))
-
-
-
-
-        if iter>patience:
-            print 'we have diverged'
-            break
-
-
-        time_n= time_n + batch_size
+                print(('epoch %i, minibatch %i, learning rate %f current validation error %f ') %
+                        (epoch, minibatch_index+1,classifier.lr.value,
+                        this_validation_loss*100.))
+
+                # if we got the best validation score until now
+                if this_validation_loss < best_validation_loss:
+                    # save best validation score and iteration number
+                    best_validation_loss = this_validation_loss
+                    best_iter = minibatch_index
+                    # reset patience if we are going down again
+                    # so we continue exploring
+                    patience=nb_max_exemples/batch_size
+                    # test it on the test set
+                    test_score = 0.
+                    temp =0
+                    for xt,yt in dataset.test(batch_size):
+                        test_score += test_model(xt,yt)
+                        temp = temp+1
+                    test_score /= temp
+                    if verbose == 1:
+                        print(('epoch %i, minibatch %i, test error of best '
+                            'model %f %%') %
+                            (epoch, minibatch_index+1,
+                            test_score*100.))
+
+                # if the validation error is going up, we are overfitting (or oscillating)
+                # stop converging but run at least to next validation
+                # to check overfitting or oscillation
+                # the saved weights of the model will be a bit off in that case
+                elif this_validation_loss >= best_validation_loss:
+                    #calculate the test error at this point and exit
+                    # test it on the test set
+                    # however, if adaptive_lr is true, try reducing the lr to
+                    # get us out of an oscillation
+                    if adaptive_lr==1:
+                        classifier.lr.value=classifier.lr.value*lr_t2_factor
+
+                    test_score = 0.
+                    #cap the patience so we are allowed one more validation error
+                    #calculation before aborting
+                    patience = minibatch_index+validation_frequency+1
+                    temp=0
+                    for xt,yt in dataset.test(batch_size):
+                        test_score += test_model(xt,yt)
+                        temp=temp+1
+                    test_score /= temp
+                    if verbose == 1:
+                        print ' validation error is going up, possibly stopping soon'
+                        print(('     epoch %i, minibatch %i, test error of best '
+                            'model %f %%') %
+                            (epoch, minibatch_index+1,
+                            test_score*100.))
+
+
+
+
+            if minibatch_index>patience:
+                print 'we have diverged'
+                break
+
+
+            time_n= time_n + batch_size
+        epoch = epoch+1
     end_time = time.clock()
     if verbose == 1:
         print(('Optimization complete. Best validation score of %f %% '
               'obtained at iteration %i, with test performance %f %%') %
              (best_validation_loss * 100., best_iter, test_score*100.))
         print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
-    print iter
+    print minibatch_index
 
     #save the model and the weights
     numpy.savez('model.npy', config=configuration, W1=classifier.W1.value,W2=classifier.W2.value, b1=classifier.b1.value,b2=classifier.b2.value)
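
For reference, a minimal sketch of how the refactored entry point might be invoked once this patch is applied. The import path and the driver itself are illustrative assumptions, not part of the changeset; only the keyword arguments come from the new signature above, and data_set=0 (selecting datasets.nist_all()) is the only value the new code handles so far.

    # Hypothetical Python 2 driver script (illustrative, not part of the patch).
    # Assumes the repository layout makes baseline/mlp/mlp_nist.py importable
    # as shown; adjust the import to your checkout.
    from ift6266.baseline.mlp import mlp_nist

    # adaptive_lr=1 multiplies the learning rate by lr_t2_factor (0.5 here)
    # whenever the validation error rises, as in the elif branch above;
    # nb_hidden=30 mirrors the new default set by this changeset.
    mlp_nist.mlp_full_nist(verbose=1,
                           adaptive_lr=1,
                           data_set=0,
                           learning_rate=0.01,
                           nb_max_exemples=1000000,
                           batch_size=20,
                           nb_hidden=30,
                           tau=1e6,
                           lr_t2_factor=0.5)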