# HG changeset patch
# User Arnaud Bergeron <abergeron@gmail.com>
# Date 1267571809 18000
# Node ID 5d88ed99c0af2cde30d490f388358c127f091946
# Parent  9116cfe8e4ab6407f2ec597c12793a7e12c96611
Modify the log_reg.py tutorial code to use the datasets module.

diff -r 9116cfe8e4ab -r 5d88ed99c0af baseline/log_reg/log_reg.py
--- a/baseline/log_reg/log_reg.py	Tue Mar 02 18:03:42 2010 -0500
+++ b/baseline/log_reg/log_reg.py	Tue Mar 02 18:16:49 2010 -0500
@@ -35,11 +35,11 @@
 """
 __docformat__ = 'restructedtext en'
 
-import numpy, time, cPickle, gzip
+import numpy, time
 
 import theano
 import theano.tensor as T
-
+from ift6266 import datasets
 
 class LogisticRegression(object):
     """Multi-class Logistic Regression Class
@@ -135,107 +135,12 @@
         else:
             raise NotImplementedError()
         
-def shared_dataset( data_xy ):
-        """ Function that loads the dataset into shared variables
-        
-        The reason we store our dataset in shared variables is to allow 
-        Theano to copy it into the GPU memory (when code is run on GPU). 
-        Since copying data into the GPU is slow, copying a minibatch everytime
-        is needed (the default behaviour if the data is not in a shared 
-        variable) would lead to a large decrease in performance.
-        """
-        data_x, data_y = data_xy
-        shared_x = theano.shared( numpy.asarray( data_x, dtype = theano.config.floatX ) )
-        shared_y = theano.shared( numpy.asarray( data_y, dtype = theano.config.floatX ) )
-        # When storing data on the GPU it has to be stored as floats
-        # therefore we will store the labels as ``floatX`` as well
-        # (``shared_y`` does exactly that). But during our computations
-        # we need them as ints (we use labels as index, and if they are 
-        # floats it doesn't make sense) therefore instead of returning 
-        # ``shared_y`` we will have to cast it to int. This little hack
-        # lets ous get around this issue
-        return shared_x, T.cast( shared_y, 'int32' )
-
-def load_data_pkl_gz( dataset ):
-    ''' Loads the dataset
-
-    :type dataset: string
-    :param dataset: the path to the dataset (here MNIST)
-    '''
-
-    #--------------------------------------------------------------------------------------------------------------------
-    # Load Data
-    #--------------------------------------------------------------------------------------------------------------------
-
-
-    print '... loading data'
-
-    # Load the dataset 
-    f = gzip.open(dataset,'rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    f.close()
-    
-    test_set_x,  test_set_y  = shared_dataset( test_set )
-    valid_set_x, valid_set_y = shared_dataset( valid_set )
-    train_set_x, train_set_y = shared_dataset( train_set )
-
-    rval = [ ( train_set_x, train_set_y ), ( valid_set_x,valid_set_y ), ( test_set_x, test_set_y ) ]
-    return rval
-
-##def load_data_ft(      verbose = False,\
-##                                    data_path = '/data/lisa/data/nist/by_class/'\
-##                                    train_data = 'all/all_train_data.ft',\
-##                                    train_labels = 'all/all_train_labels.ft',\
-##                                    test_data = 'all/all_test_data.ft',\
-##                                    test_labels = 'all/all_test_labels.ft'):
-##   
-##    train_data_file = open(data_path + train_data)
-##    train_labels_file = open(data_path + train_labels)
-##    test_labels_file = open(data_path + test_data)
-##    test_data_file = open(data_path + test_labels)
-##    
-##    raw_train_data = ft.read( train_data_file)
-##    raw_train_labels = ft.read(train_labels_file)
-##    raw_test_data = ft.read( test_labels_file)
-##    raw_test_labels = ft.read( test_data_file)
-##    
-##    f.close()
-##    g.close()
-##    i.close()
-##    h.close()
-##    
-##    
-##    test_set_x,  test_set_y  = shared_dataset(test_set)
-##    valid_set_x, valid_set_y = shared_dataset(valid_set)
-##    train_set_x, train_set_y = shared_dataset(train_set)
-##
-##    rval = [(train_set_x, train_set_y), (valid_set_x,valid_set_y), (test_set_x, test_set_y)]
-##    return rval
-##    #create a validation set the same size as the test size
-##    #use the end of the training array for this purpose
-##    #discard the last remaining so we get a %batch_size number
-##    test_size=len(raw_test_labels)
-##    test_size = int(test_size/batch_size)
-##    test_size*=batch_size
-##    train_size = len(raw_train_data)
-##    train_size = int(train_size/batch_size)
-##    train_size*=batch_size
-##    validation_size =test_size 
-##    offset = train_size-test_size
-##    if verbose == True:
-##        print 'train size = %d' %train_size
-##        print 'test size = %d' %test_size
-##        print 'valid size = %d' %validation_size
-##        print 'offset = %d' %offset
-##    
-##    
-
 #--------------------------------------------------------------------------------------------------------------------
 # MAIN
 #--------------------------------------------------------------------------------------------------------------------
 
 def log_reg( learning_rate = 0.13, nb_max_examples =1000000, batch_size = 50, \
-                    dataset_name = 'mnist.pkl.gz', image_size = 28 * 28, nb_class = 10,  \
+                    dataset=datasets.nist_digits, image_size = 32 * 32, nb_class = 10,  \
                     patience = 5000, patience_increase = 2, improvement_threshold = 0.995):
     
     """
@@ -254,9 +159,8 @@
     :type batch_size: int  
     :param batch_size:  size of the minibatch
 
-    :type dataset_name: string
-    :param dataset: the path of the MNIST dataset file from 
-                         http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
+    :type dataset: dataset object from ift6266.datasets (e.g. nist_digits)
+    :param dataset: source of (x, y) minibatches via train/valid/test(batch_size)
                         
     :type image_size: int
     :param image_size: size of the input image in pixels (width * height)
@@ -275,17 +179,6 @@
 
 
     """
-    datasets = load_data_pkl_gz( dataset_name )
-
-    train_set_x, train_set_y = datasets[0]
-    valid_set_x, valid_set_y = datasets[1]
-    test_set_x , test_set_y   = datasets[2]
-
-    # compute number of minibatches for training, validation and testing
-    n_train_batches = train_set_x.value.shape[0] / batch_size
-    n_valid_batches = valid_set_x.value.shape[0] / batch_size
-    n_test_batches  = test_set_x.value.shape[0]  / batch_size
-
     #--------------------------------------------------------------------------------------------------------------------
     # Build actual model
     #--------------------------------------------------------------------------------------------------------------------
@@ -308,17 +201,11 @@
 
     # compiling a Theano function that computes the mistakes that are made by 
     # the model on a minibatch
-    test_model = theano.function( inputs = [ index ], 
-            outputs = classifier.errors( y ),
-            givens = {
-                x:test_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
-                y:test_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )
+    test_model = theano.function( inputs = [ x, y ], 
+            outputs = classifier.errors( y ))
 
-    validate_model = theano.function( inputs = [ index ], 
-            outputs = classifier.errors( y ),
-            givens = {
-                x:valid_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
-                y:valid_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )
+    validate_model = theano.function( inputs = [ x, y ], 
+            outputs = classifier.errors( y ))
 
     # compute the gradient of cost with respect to theta = ( W, b ) 
     g_W = T.grad( cost = cost, wrt = classifier.W )
@@ -331,12 +218,9 @@
     # compiling a Theano function `train_model` that returns the cost, but in 
     # the same time updates the parameter of the model based on the rules 
     # defined in `updates`
-    train_model = theano.function( inputs = [ index ], 
+    train_model = theano.function( inputs = [ x, y ], 
             outputs = cost, 
-            updates = updates,
-            givens = {
-                x: train_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
-                y: train_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )
+            updates = updates)
 
     #--------------------------------------------------------------------------------------------------------------------
     # Train model
@@ -349,38 +233,38 @@
                                   # found
     improvement_threshold = 0.995 # a relative improvement of this much is 
                                   # considered significant
-    validation_frequency  = min( n_train_batches, patience * 0.5 )  
+    validation_frequency  = patience * 0.5
                                   # go through this many 
                                   # minibatche before checking the network 
                                   # on the validation set; in this case we 
                                   # check every epoch 
 
-    best_params             = None
+    best_params          = None
     best_validation_loss = float('inf')
-    test_score                 = 0.
-    start_time                  = time.clock()
+    test_score           = 0.
+    start_time           = time.clock()
 
     done_looping = False 
-    n_epochs       = nb_max_examples / train_set_x.value.shape[0]
-    epoch             = 0  
+    n_iters      = nb_max_examples / batch_size
+    epoch        = 0
+    iter        = 0
     
-    while ( epoch < n_epochs ) and ( not done_looping ):
+    while ( iter < n_iters ) and ( not done_looping ):
         
       epoch = epoch + 1
-      for minibatch_index in xrange( n_train_batches ):
+      for x, y in dataset.train(batch_size):
 
-        minibatch_avg_cost = train_model( minibatch_index )
+        minibatch_avg_cost = train_model( x, y )
         # iteration number
-        iter = epoch * n_train_batches + minibatch_index
+        iter += 1
 
-        if ( iter + 1 ) % validation_frequency == 0: 
+        if iter % validation_frequency == 0: 
             # compute zero-one loss on validation set 
-            validation_losses     = [ validate_model( i ) for i in xrange( n_valid_batches ) ]
+            validation_losses     = [ validate_model( xv, yv ) for xv, yv in dataset.valid(batch_size) ]
             this_validation_loss = numpy.mean( validation_losses )
 
-            print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                 ( epoch, minibatch_index + 1,n_train_batches, \
-                  this_validation_loss*100. ) )
+            print('epoch %i, iter %i, validation error %f %%' % \
+                 ( epoch, iter, this_validation_loss*100. ) )
 
 
             # if we got the best validation score until now
@@ -393,12 +277,12 @@
                 best_validation_loss = this_validation_loss
                 # test it on the test set
 
-                test_losses = [test_model(i) for i in xrange(n_test_batches)]
+                test_losses = [test_model(xt, yt) for xt, yt in dataset.test(batch_size)]
                 test_score  = numpy.mean(test_losses)
 
-                print(('     epoch %i, minibatch %i/%i, test error of best ' 
+                print(('     epoch %i, iter %i, test error of best ' 
                        'model %f %%') % \
-                  (epoch, minibatch_index+1, n_train_batches,test_score*100.))
+                  (epoch, iter, test_score*100.))
 
         if patience <= iter :
                 done_looping = True