changeset 202:6ea5dcf0541e

Branch merge.
author Arnaud Bergeron <abergeron@gmail.com>
date Wed, 03 Mar 2010 16:52:10 -0500
parents 25444fc301e0 (diff) c69c1d832a53 (current diff)
children fd1b5237e49e
files deep/stacked_dae/nist_sda.py test.py
diffstat 10 files changed, 823 insertions(+), 415 deletions(-) [+]
line wrap: on
line diff
--- a/baseline/conv_mlp/convolutional_mlp.py	Tue Mar 02 17:28:14 2010 -0500
+++ b/baseline/conv_mlp/convolutional_mlp.py	Wed Mar 03 16:52:10 2010 -0500
@@ -26,7 +26,8 @@
 import theano.sandbox.softsign
 import pylearn.datasets.MNIST
 from pylearn.io import filetensor as ft
-from theano.sandbox import conv, downsample
+from theano.tensor.signal import downsample
+from theano.tensor.nnet import conv
 
 class LeNetConvPoolLayer(object):
 
--- a/baseline/log_reg/log_reg.py	Tue Mar 02 17:28:14 2010 -0500
+++ b/baseline/log_reg/log_reg.py	Wed Mar 03 16:52:10 2010 -0500
@@ -35,11 +35,11 @@
 """
 __docformat__ = 'restructedtext en'
 
-import numpy, time, cPickle, gzip
+import numpy, time
 
 import theano
 import theano.tensor as T
-
+from ift6266 import datasets
 
 class LogisticRegression(object):
     """Multi-class Logistic Regression Class
@@ -112,6 +112,8 @@
         # i.e., the mean log-likelihood across the minibatch.
         return -T.mean( T.log( self.p_y_given_x )[ T.arange( y.shape[0] ), y ] )
 
+    def MSE(self, y):
+        return -T.mean(abs((self.p_t_given_x)[T.arange(y.shape[0]), y]-y)**2)
 
     def errors( self, y ):
         """Return a float representing the number of errors in the minibatch 
@@ -135,107 +137,12 @@
         else:
             raise NotImplementedError()
         
-def shared_dataset( data_xy ):
-        """ Function that loads the dataset into shared variables
-        
-        The reason we store our dataset in shared variables is to allow 
-        Theano to copy it into the GPU memory (when code is run on GPU). 
-        Since copying data into the GPU is slow, copying a minibatch everytime
-        is needed (the default behaviour if the data is not in a shared 
-        variable) would lead to a large decrease in performance.
-        """
-        data_x, data_y = data_xy
-        shared_x = theano.shared( numpy.asarray( data_x, dtype = theano.config.floatX ) )
-        shared_y = theano.shared( numpy.asarray( data_y, dtype = theano.config.floatX ) )
-        # When storing data on the GPU it has to be stored as floats
-        # therefore we will store the labels as ``floatX`` as well
-        # (``shared_y`` does exactly that). But during our computations
-        # we need them as ints (we use labels as index, and if they are 
-        # floats it doesn't make sense) therefore instead of returning 
-        # ``shared_y`` we will have to cast it to int. This little hack
-        # lets ous get around this issue
-        return shared_x, T.cast( shared_y, 'int32' )
-
-def load_data_pkl_gz( dataset ):
-    ''' Loads the dataset
-
-    :type dataset: string
-    :param dataset: the path to the dataset (here MNIST)
-    '''
-
-    #--------------------------------------------------------------------------------------------------------------------
-    # Load Data
-    #--------------------------------------------------------------------------------------------------------------------
-
-
-    print '... loading data'
-
-    # Load the dataset 
-    f = gzip.open(dataset,'rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    f.close()
-    
-    test_set_x,  test_set_y  = shared_dataset( test_set )
-    valid_set_x, valid_set_y = shared_dataset( valid_set )
-    train_set_x, train_set_y = shared_dataset( train_set )
-
-    rval = [ ( train_set_x, train_set_y ), ( valid_set_x,valid_set_y ), ( test_set_x, test_set_y ) ]
-    return rval
-
-##def load_data_ft(      verbose = False,\
-##                                    data_path = '/data/lisa/data/nist/by_class/'\
-##                                    train_data = 'all/all_train_data.ft',\
-##                                    train_labels = 'all/all_train_labels.ft',\
-##                                    test_data = 'all/all_test_data.ft',\
-##                                    test_labels = 'all/all_test_labels.ft'):
-##   
-##    train_data_file = open(data_path + train_data)
-##    train_labels_file = open(data_path + train_labels)
-##    test_labels_file = open(data_path + test_data)
-##    test_data_file = open(data_path + test_labels)
-##    
-##    raw_train_data = ft.read( train_data_file)
-##    raw_train_labels = ft.read(train_labels_file)
-##    raw_test_data = ft.read( test_labels_file)
-##    raw_test_labels = ft.read( test_data_file)
-##    
-##    f.close()
-##    g.close()
-##    i.close()
-##    h.close()
-##    
-##    
-##    test_set_x,  test_set_y  = shared_dataset(test_set)
-##    valid_set_x, valid_set_y = shared_dataset(valid_set)
-##    train_set_x, train_set_y = shared_dataset(train_set)
-##
-##    rval = [(train_set_x, train_set_y), (valid_set_x,valid_set_y), (test_set_x, test_set_y)]
-##    return rval
-##    #create a validation set the same size as the test size
-##    #use the end of the training array for this purpose
-##    #discard the last remaining so we get a %batch_size number
-##    test_size=len(raw_test_labels)
-##    test_size = int(test_size/batch_size)
-##    test_size*=batch_size
-##    train_size = len(raw_train_data)
-##    train_size = int(train_size/batch_size)
-##    train_size*=batch_size
-##    validation_size =test_size 
-##    offset = train_size-test_size
-##    if verbose == True:
-##        print 'train size = %d' %train_size
-##        print 'test size = %d' %test_size
-##        print 'valid size = %d' %validation_size
-##        print 'offset = %d' %offset
-##    
-##    
-
 #--------------------------------------------------------------------------------------------------------------------
 # MAIN
 #--------------------------------------------------------------------------------------------------------------------
 
 def log_reg( learning_rate = 0.13, nb_max_examples =1000000, batch_size = 50, \
-                    dataset_name = 'mnist.pkl.gz', image_size = 28 * 28, nb_class = 10,  \
+                    dataset=datasets.nist_digits, image_size = 32 * 32, nb_class = 10,  \
                     patience = 5000, patience_increase = 2, improvement_threshold = 0.995):
     
     """
@@ -254,9 +161,8 @@
     :type batch_size: int  
     :param batch_size:  size of the minibatch
 
-    :type dataset_name: string
-    :param dataset: the path of the MNIST dataset file from 
-                         http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
+    :type dataset: dataset
+    :param dataset: a dataset instance from ift6266.datasets
                         
     :type image_size: int
     :param image_size: size of the input image in pixels (width * height)
@@ -275,17 +181,6 @@
 
 
     """
-    datasets = load_data_pkl_gz( dataset_name )
-
-    train_set_x, train_set_y = datasets[0]
-    valid_set_x, valid_set_y = datasets[1]
-    test_set_x , test_set_y   = datasets[2]
-
-    # compute number of minibatches for training, validation and testing
-    n_train_batches = train_set_x.value.shape[0] / batch_size
-    n_valid_batches = valid_set_x.value.shape[0] / batch_size
-    n_test_batches  = test_set_x.value.shape[0]  / batch_size
-
     #--------------------------------------------------------------------------------------------------------------------
     # Build actual model
     #--------------------------------------------------------------------------------------------------------------------
@@ -308,17 +203,11 @@
 
     # compiling a Theano function that computes the mistakes that are made by 
     # the model on a minibatch
-    test_model = theano.function( inputs = [ index ], 
-            outputs = classifier.errors( y ),
-            givens = {
-                x:test_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
-                y:test_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )
+    test_model = theano.function( inputs = [ x, y ], 
+            outputs = classifier.errors( y ))
 
-    validate_model = theano.function( inputs = [ index ], 
-            outputs = classifier.errors( y ),
-            givens = {
-                x:valid_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
-                y:valid_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )
+    validate_model = theano.function( inputs = [ x, y ], 
+            outputs = classifier.errors( y ))
 
     # compute the gradient of cost with respect to theta = ( W, b ) 
     g_W = T.grad( cost = cost, wrt = classifier.W )
@@ -331,12 +220,9 @@
     # compiling a Theano function `train_model` that returns the cost, but in 
     # the same time updates the parameter of the model based on the rules 
     # defined in `updates`
-    train_model = theano.function( inputs = [ index ], 
+    train_model = theano.function( inputs = [ x, y ], 
             outputs = cost, 
-            updates = updates,
-            givens = {
-                x: train_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
-                y: train_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )
+            updates = updates)
 
     #--------------------------------------------------------------------------------------------------------------------
     # Train model
@@ -349,38 +235,38 @@
                                   # found
     improvement_threshold = 0.995 # a relative improvement of this much is 
                                   # considered significant
-    validation_frequency  = min( n_train_batches, patience * 0.5 )  
+    validation_frequency  = patience * 0.5
                                   # go through this many 
                                   # minibatche before checking the network 
                                   # on the validation set; in this case we 
                                   # check every epoch 
 
-    best_params             = None
+    best_params          = None
     best_validation_loss = float('inf')
-    test_score                 = 0.
-    start_time                  = time.clock()
+    test_score           = 0.
+    start_time           = time.clock()
 
     done_looping = False 
-    n_epochs       = nb_max_examples / train_set_x.value.shape[0]
-    epoch             = 0  
+    n_iters      = nb_max_examples / batch_size
+    epoch        = 0
+    iter        = 0
     
-    while ( epoch < n_epochs ) and ( not done_looping ):
+    while ( iter < n_iters ) and ( not done_looping ):
         
       epoch = epoch + 1
-      for minibatch_index in xrange( n_train_batches ):
+      for x, y in dataset.train(batch_size):
 
-        minibatch_avg_cost = train_model( minibatch_index )
+        minibatch_avg_cost = train_model( x, y )
         # iteration number
-        iter = epoch * n_train_batches + minibatch_index
+        iter += 1
 
-        if ( iter + 1 ) % validation_frequency == 0: 
+        if iter % validation_frequency == 0: 
             # compute zero-one loss on validation set 
-            validation_losses     = [ validate_model( i ) for i in xrange( n_valid_batches ) ]
+            validation_losses     = [ validate_model( xv, yv ) for xv, yv in dataset.valid(batch_size) ]
             this_validation_loss = numpy.mean( validation_losses )
 
-            print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                 ( epoch, minibatch_index + 1,n_train_batches, \
-                  this_validation_loss*100. ) )
+            print('epoch %i, iter %i, validation error %f %%' % \
+                 ( epoch, iter, this_validation_loss*100. ) )
 
 
             # if we got the best validation score until now
@@ -393,12 +279,12 @@
                 best_validation_loss = this_validation_loss
                 # test it on the test set
 
-                test_losses = [test_model(i) for i in xrange(n_test_batches)]
+                test_losses = [test_model(xt, yt) for xt, yt in dataset.test(batch_size)]
                 test_score  = numpy.mean(test_losses)
 
-                print(('     epoch %i, minibatch %i/%i, test error of best ' 
+                print(('     epoch %i, iter %i, test error of best ' 
                        'model %f %%') % \
-                  (epoch, minibatch_index+1, n_train_batches,test_score*100.))
+                  (epoch, iter, test_score*100.))
 
         if patience <= iter :
                 done_looping = True
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/autoencoder/DA_training.py	Wed Mar 03 16:52:10 2010 -0500
@@ -0,0 +1,601 @@
+"""
+ This tutorial introduces stacked denoising auto-encoders (SdA) using Theano.
+
+ Denoising autoencoders are the building blocks for SDAE. 
+ They are based on auto-encoders as the ones used in Bengio et al. 2007.
+ An autoencoder takes an input x and first maps it to a hidden representation
+ y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting 
+ latent representation y is then mapped back to a "reconstructed" vector 
+ z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b').  The weight 
+ matrix W' can optionally be constrained such that W' = W^T, in which case 
+ the autoencoder is said to have tied weights. The network is trained such 
+ that to minimize the reconstruction error (the error between x and z).
+
+ For the denosing autoencoder, during training, first x is corrupted into 
+ \tilde{x}, where \tilde{x} is a partially destroyed version of x by means 
+ of a stochastic mapping. Afterwards y is computed as before (using 
+ \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction 
+ error is now measured between z and the uncorrupted input x, which is 
+ computed as the cross-entropy : 
+      - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)]
+
+ For X iteration of the main program loop it takes *** minutes on an 
+ Intel Core i7 and *** minutes on GPU (NVIDIA GTX 285 graphics processor).
+
+
+ References :
+   - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and 
+   Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103,
+   2008
+   - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise
+   Training of Deep Networks, Advances in Neural Information Processing 
+   Systems 19, 2007
+
+"""
+
+import numpy 
+import theano
+import time
+import theano.tensor as T
+from theano.tensor.shared_randomstreams import RandomStreams
+
+import gzip
+import cPickle
+
+from pylearn.io import filetensor as ft
+
+class dA():
+  """Denoising Auto-Encoder class (dA) 
+
+  A denoising autoencoders tries to reconstruct the input from a corrupted 
+  version of it by projecting it first in a latent space and reprojecting 
+  it afterwards back in the input space. Please refer to Vincent et al.,2008
+  for more details. If x is the input then equation (1) computes a partially
+  destroyed version of x by means of a stochastic mapping q_D. Equation (2) 
+  computes the projection of the input into the latent space. Equation (3) 
+  computes the reconstruction of the input, while equation (4) computes the 
+  reconstruction error.
+  
+  .. math::
+
+    \tilde{x} ~ q_D(\tilde{x}|x)                                         (1)
+
+    y = s(W \tilde{x} + b)                                               (2)
+
+    z = s(W' y  + b')                                                    (3)
+
+    L(x,z) = -sum_{k=1}^d [x_k \log z_k + (1-x_k) \log( 1-z_k)]          (4)
+
+  """
+
+  def __init__(self, n_visible= 784, n_hidden= 500, complexity = 0.1, input= None):
+        """
+        Initialize the DAE class by specifying the number of visible units (the 
+        dimension d of the input ), the number of hidden units ( the dimension 
+        d' of the latent or hidden space ) and by giving a symbolic variable 
+        for the input. Such a symbolic variable is useful when the input is 
+        the result of some computations. For example when dealing with SDAEs,
+        the dA on layer 2 gets as input the output of the DAE on layer 1. 
+        This output can be written as a function of the input to the entire 
+        model, and as such can be computed by theano whenever needed. 
+        
+        :param n_visible: number of visible units
+
+        :param n_hidden:  number of hidden units
+
+        :param input:     a symbolic description of the input or None 
+
+        """
+        self.n_visible = n_visible
+        self.n_hidden  = n_hidden
+        
+        # create a Theano random generator that gives symbolic random values
+        theano_rng = RandomStreams()
+        # create a numpy random generator
+        numpy_rng = numpy.random.RandomState()
+        
+         
+        # initial values for weights and biases
+        # note : W' was written as `W_prime` and b' as `b_prime`
+
+        # W is initialized with `initial_W` which is uniformely sampled
+        # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible)
+        # the output of uniform if converted using asarray to dtype 
+        # theano.config.floatX so that the code is runable on GPU
+        initial_W = numpy.asarray( numpy.random.uniform( \
+                  low = -numpy.sqrt(6./(n_visible+n_hidden)), \
+                  high = numpy.sqrt(6./(n_visible+n_hidden)), \
+                  size = (n_visible, n_hidden)), dtype = theano.config.floatX)
+        initial_b       = numpy.zeros(n_hidden)
+
+        # W' is initialized with `initial_W_prime` which is uniformely sampled
+        # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible)
+        # the output of uniform if converted using asarray to dtype 
+        # theano.config.floatX so that the code is runable on GPU
+        initial_b_prime= numpy.zeros(n_visible)
+         
+        
+        # theano shared variables for weights and biases
+        self.W       = theano.shared(value = initial_W,       name = "W")
+        self.b       = theano.shared(value = initial_b,       name = "b")
+        # tied weights, therefore W_prime is W transpose
+        self.W_prime = self.W.T 
+        self.b_prime = theano.shared(value = initial_b_prime, name = "b'")
+
+        # if no input is given, generate a variable representing the input
+        if input == None : 
+            # we use a matrix because we expect a minibatch of several examples,
+            # each example being a row
+            x = T.dmatrix(name = 'input') 
+        else:
+            x = input
+        # Equation (1)
+        # note : first argument of theano.rng.binomial is the shape(size) of 
+        #        random numbers that it should produce
+        #        second argument is the number of trials 
+        #        third argument is the probability of success of any trial
+        #
+        #        this will produce an array of 0s and 1s where 1 has a 
+        #        probability of 0.9 and 0 of 0.1
+
+        tilde_x  = theano_rng.binomial( x.shape,  1,  1-complexity) * x
+        # Equation (2)
+        # note  : y is stored as an attribute of the class so that it can be 
+        #         used later when stacking dAs. 
+        self.y   = T.nnet.sigmoid(T.dot(tilde_x, self.W      ) + self.b)
+        # Equation (3)
+        z        = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
+        # Equation (4)
+        self.L = - T.sum( x*T.log(z) + (1-x)*T.log(1-z), axis=1 ) 
+        # note : L is now a vector, where each element is the cross-entropy cost 
+        #        of the reconstruction of the corresponding example of the 
+        #        minibatch. We need to compute the average of all these to get 
+        #        the cost of the minibatch
+        self.cost = T.mean(self.L)
+        # note : y is computed from the corrupted `tilde_x`. Later on, 
+        #        we will need the hidden layer obtained from the uncorrupted 
+        #        input when for example we will pass this as input to the layer 
+        #        above
+        self.hidden_values = T.nnet.sigmoid( T.dot(x, self.W) + self.b)
+
+
+
+def sgd_optimization_nist( learning_rate=0.01,  \
+                            n_iter = 300, n_code_layer = 400, \
+                            complexity = 0.1):
+    """
+    Demonstrate stochastic gradient descent optimization for a denoising autoencoder
+
+    This is demonstrated on MNIST.
+
+    :param learning_rate: learning rate used (factor for the stochastic 
+    gradient
+
+    :param pretraining_epochs: number of epoch to do pretraining
+
+    :param pretrain_lr: learning rate to be used during pre-training
+
+    :param n_iter: maximal number of iterations ot run the optimizer 
+
+    """
+    #open file to save the validation and test curve
+    filename = 'lr_' + str(learning_rate) + 'ni_' + str(n_iter) + 'nc_' + str(n_code_layer) + \
+    'c_' + str(complexity) + '.txt'
+
+    result_file = open(filename, 'w')
+
+
+
+    data_path = '/data/lisa/data/nist/by_class/'
+    f = open(data_path+'all/all_train_data.ft')
+    g = open(data_path+'all/all_train_labels.ft')
+    h = open(data_path+'all/all_test_data.ft')
+    i = open(data_path+'all/all_test_labels.ft')
+    
+    train_set_x = ft.read(f)
+    train_set_y = ft.read(g)
+    test_set_x = ft.read(h)
+    test_set_y = ft.read(i)
+    
+    f.close()
+    g.close()
+    i.close()
+    h.close()
+
+    # make minibatches of size 20 
+    batch_size = 20    # sized of the minibatch
+
+    #create a validation set the same size as the test size
+    #use the end of the training array for this purpose
+    #discard the last remaining so we get a %batch_size number
+    test_size=len(test_set_y)
+    test_size = int(test_size/batch_size)
+    test_size*=batch_size
+    train_size = len(train_set_x)
+    train_size = int(train_size/batch_size)
+    train_size*=batch_size
+    validation_size =test_size 
+    offset = train_size-test_size
+    if True:
+        print 'train size = %d' %train_size
+        print 'test size = %d' %test_size
+        print 'valid size = %d' %validation_size
+        print 'offset = %d' %offset
+    
+    
+    #train_set = (train_set_x,train_set_y)
+    train_batches = []
+    for i in xrange(0, train_size-test_size, batch_size):
+        train_batches = train_batches + \
+            [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
+            
+    test_batches = []
+    for i in xrange(0, test_size, batch_size):
+        test_batches = test_batches + \
+            [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
+    
+    valid_batches = []
+    for i in xrange(0, test_size, batch_size):
+        valid_batches = valid_batches + \
+            [(train_set_x[offset+i:offset+i+batch_size], \
+            train_set_y[offset+i:offset+i+batch_size])]
+
+
+    ishape     = (32,32) # this is the size of NIST images
+
+    # allocate symbolic variables for the data
+    x = T.fmatrix()  # the data is presented as rasterized images
+    y = T.lvector()  # the labels are presented as 1D vector of 
+                          # [long int] labels
+
+    # construct the denoising autoencoder class
+    n_ins = 32*32
+    encoder = dA(n_ins, n_code_layer, input = x.reshape((batch_size,n_ins)))
+
+    # Train autoencoder
+    
+    # compute gradients of the layer parameters
+    gW       = T.grad(encoder.cost, encoder.W)
+    gb       = T.grad(encoder.cost, encoder.b)
+    gb_prime = T.grad(encoder.cost, encoder.b_prime)
+    # compute the updated value of the parameters after one step
+    updated_W       = encoder.W       - gW       * learning_rate
+    updated_b       = encoder.b       - gb       * learning_rate
+    updated_b_prime = encoder.b_prime - gb_prime * learning_rate
+
+    # defining the function that evaluate the symbolic description of 
+    # one update step 
+    train_model = theano.function([x], encoder.cost, updates=\
+                    { encoder.W       : updated_W, \
+                      encoder.b       : updated_b, \
+                      encoder.b_prime : updated_b_prime } )
+
+
+ 
+
+    # compiling a theano function that computes the mistakes that are made  
+    # by the model on a minibatch
+    test_model = theano.function([x], encoder.cost)
+
+    normalize = numpy.asarray(255, dtype=theano.config.floatX)
+
+  
+    n_minibatches        = len(train_batches) 
+ 
+    # early-stopping parameters
+    patience              = 10000000 / batch_size # look as this many examples regardless
+    patience_increase     = 2     # wait this much longer when a new best is 
+                                  # found
+    improvement_threshold = 0.995 # a relative improvement of this much is 
+                                  # considered significant
+    validation_frequency  = n_minibatches  # go through this many 
+                                  # minibatche before checking the network 
+                                  # on the validation set; in this case we 
+                                  # check every epoch 
+
+
+    best_params          = None
+    best_validation_loss = float('inf')
+    best_iter            = 0
+    test_score           = 0.
+    start_time = time.clock()
+    # have a maximum of `n_iter` iterations through the entire dataset
+    for iter in xrange(n_iter* n_minibatches):
+
+        # get epoch and minibatch index
+        epoch           = iter / n_minibatches
+        minibatch_index =  iter % n_minibatches
+
+        # get the minibatches corresponding to `iter` modulo
+        # `len(train_batches)`
+        x,y = train_batches[ minibatch_index ]
+        '''
+        if iter == 0:
+            b = numpy.asarray(255, dtype=theano.config.floatX)
+            x = x / b
+            print x
+            print y
+            print x.__class__
+            print x.shape
+            print x.dtype.name
+            print y.dtype.name
+            print x.min(), x.max()
+        '''
+        
+        cost_ij = train_model(x/normalize)
+
+        if (iter+1) % validation_frequency == 0: 
+            # compute zero-one loss on validation set 
+            this_validation_loss = 0.
+            for x,y in valid_batches:
+                # sum up the errors for each minibatch
+                this_validation_loss += test_model(x/normalize)
+            # get the average by dividing with the number of minibatches
+            this_validation_loss /= len(valid_batches)
+
+            print('epoch %i, minibatch %i/%i, validation error %f ' % \
+                   (epoch, minibatch_index+1, n_minibatches, \
+                    this_validation_loss))
+
+            # save value in file
+            result_file.write(str(epoch) + ' ' + str(this_validation_loss)+ '\n')
+
+
+            # if we got the best validation score until now
+            if this_validation_loss < best_validation_loss:
+
+                #improve patience if loss improvement is good enough
+                if this_validation_loss < best_validation_loss *  \
+                       improvement_threshold :
+                    patience = max(patience, iter * patience_increase)
+
+                best_validation_loss = this_validation_loss
+                best_iter = iter
+                # test it on the test set
+            
+                test_score = 0.
+                for x,y in test_batches:
+                    test_score += test_model(x/normalize)
+                test_score /= len(test_batches)
+                print(('     epoch %i, minibatch %i/%i, test error of best '
+                      'model %f ') % 
+                             (epoch, minibatch_index+1, n_minibatches,
+                              test_score))
+
+        if patience <= iter :
+                print('iter (%i) is superior than patience(%i). break', iter, patience)
+                break
+
+        
+
+    end_time = time.clock()
+    print(('Optimization complete with best validation score of %f ,'
+           'with test performance %f ') %  
+                 (best_validation_loss, test_score))
+    print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
+
+    
+    result_file.close()
+
+    return (best_validation_loss, test_score, (end_time-start_time)/60, best_iter)
+
+def sgd_optimization_mnist( learning_rate=0.01,  \
+                            n_iter = 1, n_code_layer = 400, \
+                            complexity = 0.1):
+    """
+    Demonstrate stochastic gradient descent optimization for a denoising autoencoder
+
+    This is demonstrated on MNIST.
+
+    :param learning_rate: learning rate used (factor for the stochastic 
+    gradient
+
+    :param pretraining_epochs: number of epoch to do pretraining
+
+    :param pretrain_lr: learning rate to be used during pre-training
+
+    :param n_iter: maximal number of iterations ot run the optimizer 
+
+    """
+    #open file to save the validation and test curve
+    filename = 'lr_' + str(learning_rate) + 'ni_' + str(n_iter) + 'nc_' + str(n_code_layer) + \
+    'c_' + str(complexity) + '.txt'
+
+    result_file = open(filename, 'w')
+
+    # Load the dataset 
+    f = gzip.open('/u/lisa/HTML/deep/data/mnist/mnist.pkl.gz','rb')
+    train_set, valid_set, test_set = cPickle.load(f)
+    f.close()
+
+    # make minibatches of size 20 
+    batch_size = 20    # sized of the minibatch
+
+    # Dealing with the training set
+    # get the list of training images (x) and their labels (y)
+    (train_set_x, train_set_y) = train_set
+    # initialize the list of training minibatches with empty list
+    train_batches = []
+    for i in xrange(0, len(train_set_x), batch_size):
+        # add to the list of minibatches the minibatch starting at 
+        # position i, ending at position i+batch_size
+        # a minibatch is a pair ; the first element of the pair is a list 
+        # of datapoints, the second element is the list of corresponding 
+        # labels
+        train_batches = train_batches + \
+               [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
+
+    # Dealing with the validation set
+    (valid_set_x, valid_set_y) = valid_set
+    # initialize the list of validation minibatches 
+    valid_batches = []
+    for i in xrange(0, len(valid_set_x), batch_size):
+        valid_batches = valid_batches + \
+               [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
+
+    # Dealing with the testing set
+    (test_set_x, test_set_y) = test_set
+    # initialize the list of testing minibatches 
+    test_batches = []
+    for i in xrange(0, len(test_set_x), batch_size):
+        test_batches = test_batches + \
+              [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
+
+
+    ishape     = (28,28) # this is the size of MNIST images
+
+    # allocate symbolic variables for the data
+    x = T.fmatrix()  # the data is presented as rasterized images
+    y = T.lvector()  # the labels are presented as 1D vector of 
+                          # [long int] labels
+
+    # construct the denoising autoencoder class
+    n_ins = 28*28
+    encoder = dA(n_ins, n_code_layer, input = x.reshape((batch_size,n_ins)))
+
+    # Train autoencoder
+    
+    # compute gradients of the layer parameters
+    gW       = T.grad(encoder.cost, encoder.W)
+    gb       = T.grad(encoder.cost, encoder.b)
+    gb_prime = T.grad(encoder.cost, encoder.b_prime)
+    # compute the updated value of the parameters after one step
+    updated_W       = encoder.W       - gW       * learning_rate
+    updated_b       = encoder.b       - gb       * learning_rate
+    updated_b_prime = encoder.b_prime - gb_prime * learning_rate
+
+    # defining the function that evaluate the symbolic description of 
+    # one update step 
+    train_model = theano.function([x], encoder.cost, updates=\
+                    { encoder.W       : updated_W, \
+                      encoder.b       : updated_b, \
+                      encoder.b_prime : updated_b_prime } )
+
+
+ 
+
+    # compiling a theano function that computes the mistakes that are made  
+    # by the model on a minibatch
+    test_model = theano.function([x], encoder.cost)
+
+
+
+  
+    n_minibatches        = len(train_batches) 
+ 
+    # early-stopping parameters
+    patience              = 10000# look as this many examples regardless
+    patience_increase     = 2     # wait this much longer when a new best is 
+                                  # found
+    improvement_threshold = 0.995 # a relative improvement of this much is 
+                                  # considered significant
+    validation_frequency  = n_minibatches  # go through this many 
+                                  # minibatche before checking the network 
+                                  # on the validation set; in this case we 
+                                  # check every epoch 
+
+
+    best_params          = None
+    best_validation_loss = float('inf')
+    best_iter            = 0
+    test_score           = 0.
+    start_time = time.clock()
+    # have a maximum of `n_iter` iterations through the entire dataset
+    for iter in xrange(n_iter* n_minibatches):
+
+        # get epoch and minibatch index
+        epoch           = iter / n_minibatches
+        minibatch_index =  iter % n_minibatches
+
+        # get the minibatches corresponding to `iter` modulo
+        # `len(train_batches)`
+        x,y = train_batches[ minibatch_index ]
+        cost_ij = train_model(x)
+
+        if (iter+1) % validation_frequency == 0: 
+            # compute zero-one loss on validation set 
+            this_validation_loss = 0.
+            for x,y in valid_batches:
+                # sum up the errors for each minibatch
+                this_validation_loss += test_model(x)
+            # get the average by dividing with the number of minibatches
+            this_validation_loss /= len(valid_batches)
+
+            print('epoch %i, minibatch %i/%i, validation error %f ' % \
+                   (epoch, minibatch_index+1, n_minibatches, \
+                    this_validation_loss))
+
+            # save value in file
+            result_file.write(str(epoch) + ' ' + str(this_validation_loss)+ '\n')
+
+
+            # if we got the best validation score until now
+            if this_validation_loss < best_validation_loss:
+
+                #improve patience if loss improvement is good enough
+                if this_validation_loss < best_validation_loss *  \
+                       improvement_threshold :
+                    patience = max(patience, iter * patience_increase)
+
+                best_validation_loss = this_validation_loss
+                best_iter = iter
+                # test it on the test set
+            
+                test_score = 0.
+                for x,y in test_batches:
+                    test_score += test_model(x)
+                test_score /= len(test_batches)
+                print(('     epoch %i, minibatch %i/%i, test error of best '
+                      'model %f ') % 
+                             (epoch, minibatch_index+1, n_minibatches,
+                              test_score))
+
+        if patience <= iter :
+                print('iter (%i) is superior than patience(%i). break', iter, patience)
+                break
+
+
+    end_time = time.clock()
+    print(('Optimization complete with best validation score of %f ,'
+           'with test performance %f ') %  
+                 (best_validation_loss, test_score))
+    print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
+
+    
+    result_file.close()
+
+    return (best_validation_loss, test_score, (end_time-start_time)/60, best_iter)
+
+
+def experiment(state,channel):
+
+    (best_validation_loss, test_score, minutes_trained, iter) = \
+        sgd_optimization_mnist(state.learning_rate, state.n_iter, state.n_code_layer,
+            state.complexity)
+
+    state.best_validation_loss = best_validation_loss
+    state.test_score = test_score
+    state.minutes_trained = minutes_trained
+    state.iter = iter
+
+    return channel.COMPLETE
+
+def experiment_nist(state,channel):
+
+    (best_validation_loss, test_score, minutes_trained, iter) = \
+        sgd_optimization_nist(state.learning_rate, state.n_iter, state.n_code_layer,
+            state.complexity)
+
+    state.best_validation_loss = best_validation_loss
+    state.test_score = test_score
+    state.minutes_trained = minutes_trained
+    state.iter = iter
+
+    return channel.COMPLETE
+
+
+if __name__ == '__main__':
+
+    sgd_optimization_nist()
+
+
--- a/deep/convolutional_dae/stacked_convolutional_dae.py	Tue Mar 02 17:28:14 2010 -0500
+++ b/deep/convolutional_dae/stacked_convolutional_dae.py	Wed Mar 03 16:52:10 2010 -0500
@@ -7,44 +7,10 @@
 
 from theano.tensor.signal import downsample
 from theano.tensor.nnet import conv 
-import gzip
-import cPickle
- 
- 
-class LogisticRegression(object):
- 
-    def __init__(self, input, n_in, n_out):
- 
-        self.W = theano.shared( value=numpy.zeros((n_in,n_out),
-                                            dtype = theano.config.floatX) )
-
-        self.b = theano.shared( value=numpy.zeros((n_out,),
-                                            dtype = theano.config.floatX) )
-
-        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b)
-        
 
-        self.y_pred=T.argmax(self.p_y_given_x, axis=1)
- 
-        self.params = [self.W, self.b]
- 
-    def negative_log_likelihood(self, y):
-        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
- 
-    def MSE(self, y):
-        return -T.mean(abs((self.p_y_given_x)[T.arange(y.shape[0]),y]-y)**2)
+from ift6266 import datasets
 
-    def errors(self, y):
-        if y.ndim != self.y_pred.ndim:
-            raise TypeError('y should have the same shape as self.y_pred',
-                ('y', target.type, 'y_pred', self.y_pred.type))
- 
-
-        if y.dtype.startswith('int'):
-            return T.mean(T.neq(self.y_pred, y))
-        else:
-            raise NotImplementedError()
- 
+from ift6266.baseline.log_reg.log_reg import LogisticRegression
  
 class SigmoidalLayer(object):
     def __init__(self, rng, input, n_in, n_out):
@@ -65,8 +31,9 @@
  
 class dA_conv(object):
  
-  def __init__(self, corruption_level = 0.1, input = None, shared_W = None,\
-                   shared_b = None, filter_shape = None, image_shape = None, poolsize = (2,2)):
+  def __init__(self, input, filter_shape, corruption_level = 0.1, 
+               shared_W = None, shared_b = None, image_shape = None, 
+               poolsize = (2,2)):
 
     theano_rng = RandomStreams()
     
@@ -80,13 +47,11 @@
         self.W = shared_W
         self.b = shared_b
     else:
-        initial_W = numpy.asarray( numpy.random.uniform( \
-              low = -numpy.sqrt(6./(fan_in+fan_out)), \
-              high = numpy.sqrt(6./(fan_in+fan_out)), \
+        initial_W = numpy.asarray( numpy.random.uniform(
+              low = -numpy.sqrt(6./(fan_in+fan_out)),
+              high = numpy.sqrt(6./(fan_in+fan_out)),
               size = filter_shape), dtype = theano.config.floatX)
-        initial_b = numpy.zeros((filter_shape[0],), dtype= theano.config.floatX)
-    
-    
+        initial_b = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
         self.W = theano.shared(value = initial_W, name = "W")
         self.b = theano.shared(value = initial_b, name = "b")
     
@@ -101,9 +66,8 @@
 
     self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level) * self.x
 
-    conv1_out = conv.conv2d(self.tilde_x, self.W, \
-                             filter_shape=filter_shape, \
-                                image_shape=image_shape, border_mode='valid')
+    conv1_out = conv.conv2d(self.tilde_x, self.W, filter_shape=filter_shape,
+                            image_shape=image_shape, border_mode='valid')
 
     
     self.y = T.tanh(conv1_out + self.b.dimshuffle('x', 0, 'x', 'x'))
@@ -111,19 +75,15 @@
     
     da_filter_shape = [ filter_shape[1], filter_shape[0], filter_shape[2],\
                        filter_shape[3] ]
-    da_image_shape = [ image_shape[0],filter_shape[0],image_shape[2]-filter_shape[2]+1, \
-                         image_shape[3]-filter_shape[3]+1 ]
     initial_W_prime =  numpy.asarray( numpy.random.uniform( \
               low = -numpy.sqrt(6./(fan_in+fan_out)), \
               high = numpy.sqrt(6./(fan_in+fan_out)), \
               size = da_filter_shape), dtype = theano.config.floatX)
     self.W_prime = theano.shared(value = initial_W_prime, name = "W_prime")
 
-    #import pdb;pdb.set_trace()
-
-    conv2_out = conv.conv2d(self.y, self.W_prime, \
-                               filter_shape = da_filter_shape, image_shape = da_image_shape ,\
-                                border_mode='full')
+    conv2_out = conv.conv2d(self.y, self.W_prime,
+                            filter_shape = da_filter_shape,
+                            border_mode='full')
 
     self.z =  (T.tanh(conv2_out + self.b_prime.dimshuffle('x', 0, 'x', 'x'))+center) / scale
 
@@ -134,19 +94,16 @@
     self.cost = T.mean(self.L)
 
     self.params = [ self.W, self.b, self.b_prime ] 
- 
- 
 
 class LeNetConvPoolLayer(object):
-    def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2,2)):
-        assert image_shape[1]==filter_shape[1]
+    def __init__(self, rng, input, filter_shape, image_shape=None, poolsize=(2,2)):
         self.input = input
   
         W_values = numpy.zeros(filter_shape, dtype=theano.config.floatX)
-        self.W = theano.shared(value = W_values)
+        self.W = theano.shared(value=W_values)
  
-        b_values = numpy.zeros((filter_shape[0],), dtype= theano.config.floatX)
-        self.b = theano.shared(value= b_values)
+        b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
+        self.b = theano.shared(value=b_values)
  
         conv_out = conv.conv2d(input, self.W,
                 filter_shape=filter_shape, image_shape=image_shape)
@@ -168,67 +125,60 @@
  
 
 class SdA():
-    def __init__(self, input, n_ins_conv, n_ins_mlp, train_set_x, train_set_y, batch_size, \
-                     conv_hidden_layers_sizes, mlp_hidden_layers_sizes, corruption_levels, \
-                     rng, n_out, pretrain_lr, finetune_lr):
-
+    def __init__(self, input, n_ins_mlp, conv_hidden_layers_sizes,
+                 mlp_hidden_layers_sizes, corruption_levels, rng, n_out, 
+                 pretrain_lr, finetune_lr):
+        
         self.layers = []
         self.pretrain_functions = []
         self.params = []
         self.conv_n_layers = len(conv_hidden_layers_sizes)
         self.mlp_n_layers = len(mlp_hidden_layers_sizes)
-         
-        index = T.lscalar() # index to a [mini]batch
+        
         self.x = T.dmatrix('x') # the data is presented as rasterized images
         self.y = T.ivector('y') # the labels are presented as 1D vector of
         
- 
-        
         for i in xrange( self.conv_n_layers ):
-
             filter_shape=conv_hidden_layers_sizes[i][0]
             image_shape=conv_hidden_layers_sizes[i][1]
             max_poolsize=conv_hidden_layers_sizes[i][2]
                 
             if i == 0 :
-                layer_input=self.x.reshape((batch_size,1,28,28))
+                layer_input=self.x.reshape((self.x.shape[0], 1, 32, 32))
             else:
                 layer_input=self.layers[-1].output
-
-            layer = LeNetConvPoolLayer(rng, input=layer_input, \
-                                image_shape=image_shape, \
-                                filter_shape=filter_shape,poolsize=max_poolsize)
-            print 'Convolutional layer '+str(i+1)+' created'
-                
+            
+            layer = LeNetConvPoolLayer(rng, input=layer_input,
+                                       image_shape=image_shape,
+                                       filter_shape=filter_shape,
+                                       poolsize=max_poolsize)
+            print 'Convolutional layer', str(i+1), 'created'
+            
             self.layers += [layer]
             self.params += layer.params
-                
-            da_layer = dA_conv(corruption_level = corruption_levels[0],\
-                                  input = layer_input, \
-                                  shared_W = layer.W, shared_b = layer.b,\
-                                  filter_shape = filter_shape , image_shape = image_shape )
-                
-                
+            
+            da_layer = dA_conv(corruption_level = corruption_levels[0],
+                               input = layer_input,
+                               shared_W = layer.W, shared_b = layer.b,
+                               filter_shape = filter_shape,
+                               image_shape = image_shape )
+            
             gparams = T.grad(da_layer.cost, da_layer.params)
-                
+            
             updates = {}
             for param, gparam in zip(da_layer.params, gparams):
-                    updates[param] = param - gparam * pretrain_lr
-                    
-                
-            update_fn = theano.function([index], da_layer.cost, \
-                                        updates = updates,
-                                        givens = {
-                    self.x : train_set_x[index*batch_size:(index+1)*batch_size]} )
-             
+                updates[param] = param - gparam * pretrain_lr
+            
+            update_fn = theano.function([self.x], da_layer.cost, updates = updates)
+            
             self.pretrain_functions += [update_fn]
-
+            
         for i in xrange( self.mlp_n_layers ): 
             if i == 0 :
                 input_size = n_ins_mlp
             else:
                 input_size = mlp_hidden_layers_sizes[i-1]
-
+            
             if i == 0 :
                 if len( self.layers ) == 0 :
                     layer_input=self.x
@@ -236,72 +186,43 @@
                     layer_input = self.layers[-1].output.flatten(2)
             else:
                 layer_input = self.layers[-1].output
-     
+            
             layer = SigmoidalLayer(rng, layer_input, input_size,
                                         mlp_hidden_layers_sizes[i] )
-              
+            
             self.layers += [layer]
             self.params += layer.params
             
-
-            print 'MLP layer '+str(i+1)+' created'
+            print 'MLP layer', str(i+1), 'created'
             
         self.logLayer = LogisticRegression(input=self.layers[-1].output, \
                                                      n_in=mlp_hidden_layers_sizes[-1], n_out=n_out)
         self.params += self.logLayer.params
-
+        
         cost = self.logLayer.negative_log_likelihood(self.y)
+        
+        gparams = T.grad(cost, self.params)
 
-        gparams = T.grad(cost, self.params)
         updates = {}
-
         for param,gparam in zip(self.params, gparams):
             updates[param] = param - gparam*finetune_lr
-            
-        self.finetune = theano.function([index], cost,
-                updates = updates,
-                givens = {
-                  self.x : train_set_x[index*batch_size:(index+1)*batch_size],
-                  self.y : train_set_y[index*batch_size:(index+1)*batch_size]} )
- 
+        
+        self.finetune = theano.function([self.x, self.y], cost, updates = updates)
+        
+        self.errors = self.logLayer.errors(self.y)
 
-        self.errors = self.logLayer.errors(self.y)
- 
- 
- 
 def sgd_optimization_mnist( learning_rate=0.1, pretraining_epochs = 2, \
                             pretrain_lr = 0.01, training_epochs = 1000, \
-                            dataset='mnist.pkl.gz'):
-
-    f = gzip.open(dataset,'rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    f.close()
- 
- 
-    def shared_dataset(data_xy):
-        data_x, data_y = data_xy
-        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX))
-        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
-        return shared_x, T.cast(shared_y, 'int32')
- 
-
-    test_set_x, test_set_y = shared_dataset(test_set)
-    valid_set_x, valid_set_y = shared_dataset(valid_set)
-    train_set_x, train_set_y = shared_dataset(train_set)
- 
+                            dataset=datasets.nist_digits):
+    
     batch_size = 500 # size of the minibatch
  
-
-    n_train_batches = train_set_x.value.shape[0] / batch_size
-    n_valid_batches = valid_set_x.value.shape[0] / batch_size
-    n_test_batches = test_set_x.value.shape[0] / batch_size
- 
     # allocate symbolic variables for the data
     index = T.lscalar() # index to a [mini]batch
     x = T.matrix('x') # the data is presented as rasterized images
     y = T.ivector('y') # the labels are presented as 1d vector of
-                           # [int] labels
-    layer0_input = x.reshape((batch_size,1,28,28))
+    # [int] labels
+    layer0_input = x.reshape((x.shape[0],1,32,32))
     
 
     # Setup the convolutional layers with their DAs(add as many as you want)
@@ -310,45 +231,34 @@
     ker1=2
     ker2=2
     conv_layers=[]
-    conv_layers.append([[ker1,1,5,5], [batch_size,1,28,28], [2,2] ])
-    conv_layers.append([[ker2,ker1,5,5], [batch_size,ker1,12,12], [2,2] ])
+    conv_layers.append([[ker1,1,5,5], None, [2,2] ])
+    conv_layers.append([[ker2,ker1,5,5], None, [2,2] ])
 
     # Setup the MLP layers of the network
     mlp_layers=[500]
   
-    network = SdA(input = layer0_input, n_ins_conv = 28*28, n_ins_mlp = ker2*4*4, \
-                      train_set_x = train_set_x, train_set_y = train_set_y, batch_size = batch_size,
-                      conv_hidden_layers_sizes = conv_layers,  \
-                      mlp_hidden_layers_sizes = mlp_layers, \
-                      corruption_levels = corruption_levels , n_out = 10, \
-                      rng = rng , pretrain_lr = pretrain_lr , finetune_lr = learning_rate )
+    network = SdA(input = layer0_input, n_ins_mlp = ker2*4*4,
+                  conv_hidden_layers_sizes = conv_layers,
+                  mlp_hidden_layers_sizes = mlp_layers,
+                  corruption_levels = corruption_levels , n_out = 10,
+                  rng = rng , pretrain_lr = pretrain_lr ,
+                  finetune_lr = learning_rate )
 
-    test_model = theano.function([index], network.errors,
-             givens = {
-                network.x: test_set_x[index*batch_size:(index+1)*batch_size],
-                network.y: test_set_y[index*batch_size:(index+1)*batch_size]})
+    test_model = theano.function([network.x, network.y], network.errors)
  
-    validate_model = theano.function([index], network.errors,
-           givens = {
-                network.x: valid_set_x[index*batch_size:(index+1)*batch_size],
-                network.y: valid_set_y[index*batch_size:(index+1)*batch_size]})
-
-
-
     start_time = time.clock()
     for i in xrange(len(network.layers)-len(mlp_layers)):
         for epoch in xrange(pretraining_epochs):
-            for batch_index in xrange(n_train_batches):
-                c = network.pretrain_functions[i](batch_index)
-            print 'pre-training convolution layer %i, epoch %d, cost '%(i,epoch),c
+            for x, y in dataset.train(batch_size):
+                c = network.pretrain_functions[i](x)
+            print 'pre-training convolution layer %i, epoch %d, cost '%(i,epoch), c
 
     patience = 10000 # look as this many examples regardless
     patience_increase = 2. # WAIT THIS MUCH LONGER WHEN A NEW BEST IS
                                   # FOUND
     improvement_threshold = 0.995 # a relative improvement of this much is
 
-    validation_frequency = min(n_train_batches, patience/2)
- 
+    validation_frequency = patience/2
  
     best_params = None
     best_validation_loss = float('inf')
@@ -357,23 +267,21 @@
  
     done_looping = False
     epoch = 0
- 
+    iter = 0
+
     while (epoch < training_epochs) and (not done_looping):
       epoch = epoch + 1
-      for minibatch_index in xrange(n_train_batches):
+      for x, y in dataset.train(batch_size):
  
-        cost_ij = network.finetune(minibatch_index)
-        iter = epoch * n_train_batches + minibatch_index
- 
-        if (iter+1) % validation_frequency == 0:
+        cost_ij = network.finetune(x, y)
+        iter += 1
+        
+        if iter % validation_frequency == 0:
+            validation_losses = [test_model(xv, yv) for xv, yv in dataset.valid(batch_size)]
+            this_validation_loss = numpy.mean(validation_losses)
+            print('epoch %i, iter %i, validation error %f %%' % \
+                   (epoch, iter, this_validation_loss*100.))
             
-            validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
-            this_validation_loss = numpy.mean(validation_losses)
-            print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                   (epoch, minibatch_index+1, n_train_batches, \
-                    this_validation_loss*100.))
- 
- 
             # if we got the best validation score until now
             if this_validation_loss < best_validation_loss:
  
@@ -381,35 +289,28 @@
                 if this_validation_loss < best_validation_loss * \
                        improvement_threshold :
                     patience = max(patience, iter * patience_increase)
- 
+                
                 # save best validation score and iteration number
                 best_validation_loss = this_validation_loss
                 best_iter = iter
- 
+                
                 # test it on the test set
-                test_losses = [test_model(i) for i in xrange(n_test_batches)]
+                test_losses = [test_model(xt, yt) for xt, yt in dataset.test(batch_size)]
                 test_score = numpy.mean(test_losses)
-                print((' epoch %i, minibatch %i/%i, test error of best '
+                print((' epoch %i, iter %i, test error of best '
                       'model %f %%') %
-                             (epoch, minibatch_index+1, n_train_batches,
-                              test_score*100.))
- 
- 
+                             (epoch, iter, test_score*100.))
+                
         if patience <= iter :
-                done_looping = True
-                break
- 
+            done_looping = True
+            break
+    
     end_time = time.clock()
     print(('Optimization complete with best validation score of %f %%,'
            'with test performance %f %%') %
                  (best_validation_loss * 100., test_score*100.))
     print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
  
- 
- 
- 
- 
- 
 if __name__ == '__main__':
     sgd_optimization_mnist()
  
--- a/deep/stacked_dae/mnist_sda.py	Tue Mar 02 17:28:14 2010 -0500
+++ b/deep/stacked_dae/mnist_sda.py	Wed Mar 03 16:52:10 2010 -0500
@@ -1,6 +1,7 @@
 #!/usr/bin/python
 # coding: utf-8
 
+# TODO: This probably doesn't work anymore, adapt to new code in sgd_opt
 # Parameterize call to sgd_optimization for MNIST
 
 import numpy 
--- a/deep/stacked_dae/nist_sda.py	Tue Mar 02 17:28:14 2010 -0500
+++ b/deep/stacked_dae/nist_sda.py	Wed Mar 03 16:52:10 2010 -0500
@@ -21,22 +21,17 @@
 import jobman, jobman.sql
 from pylearn.io import filetensor
 
-from utils import produit_croise_jobs
+from utils import produit_cartesien_jobs
 
 from sgd_optimization import SdaSgdOptimizer
 
-SERIES_AVAILABLE = False
-try:
-    from ift6266.utils.scalar_series import *
-    SERIES_AVAILABLE = True
-except ImportError:
-    print "Could not import Series"
+from ift6266.utils.scalar_series import *
 
 TEST_CONFIG = False
 
 NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
 
-JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_db/fsavard_sda2'
+JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_db/fsavard_sda4'
 
 REDUCE_TRAIN_TO = None
 MAX_FINETUNING_EPOCHS = 1000
@@ -48,6 +43,10 @@
 
 EXPERIMENT_PATH = "ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint"
 
+# Possible values the hyperparameters can take. These are then
+# combined with produit_cartesien_jobs so we get a list of all
+# possible combinations, each one resulting in a job inserted
+# in the jobman DB.
 JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001],
         'pretraining_epochs_per_layer': [10,20],
         'hidden_layers_sizes': [300,800],
@@ -58,30 +57,34 @@
         'num_hidden_layers':[2,3]}
 
 # Just useful for tests... minimal number of epochs
-DEFAULT_HP_NIST = DD({'finetuning_lr':0.01,
-                       'pretraining_lr':0.01,
-                       'pretraining_epochs_per_layer':1,
-                       'max_finetuning_epochs':1,
-                       'hidden_layers_sizes':1000,
+DEFAULT_HP_NIST = DD({'finetuning_lr':0.1,
+                       'pretraining_lr':0.1,
+                       'pretraining_epochs_per_layer':20,
+                       'max_finetuning_epochs':2,
+                       'hidden_layers_sizes':300,
                        'corruption_levels':0.2,
                        'minibatch_size':20,
-                       'reduce_train_to':1000,
-                       'num_hidden_layers':1})
+                       #'reduce_train_to':300,
+                       'num_hidden_layers':2})
 
+# Function called by jobman upon launching each job
+# Its path is the one given when inserting jobs:
+# ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint
 def jobman_entrypoint(state, channel):
+    # record mercurial versions of each package
     pylearn.version.record_versions(state,[theano,ift6266,pylearn])
     channel.save()
 
     workingdir = os.getcwd()
 
     print "Will load NIST"
-    sys.stdout.flush()
 
-    nist = NIST(20)
+    nist = NIST(minibatch_size=20)
 
     print "NIST loaded"
-    sys.stdout.flush()
 
+    # For test runs, we don't want to use the whole dataset so
+    # reduce it to fewer elements if asked to.
     rtt = None
     if state.has_key('reduce_train_to'):
         rtt = state['reduce_train_to']
@@ -89,7 +92,7 @@
         rtt = REDUCE_TRAIN_TO
 
     if rtt:
-        print "Reducing training set to ", rtt, " examples"
+        print "Reducing training set to "+str(rtt)+ " examples"
         nist.reduce_train_set(rtt)
 
     train,valid,test = nist.get_tvt()
@@ -98,17 +101,13 @@
     n_ins = 32*32
     n_outs = 62 # 10 digits, 26*2 (lower, capitals)
 
-    hls = state.hidden_layers_sizes
-    cl = state.corruption_levels
-    nhl = state.num_hidden_layers
-    state.hidden_layers_sizes = [hls] * nhl
-    state.corruption_levels = [cl] * nhl
+    # b,b',W for each hidden layer 
+    # + b,W of last layer (logreg)
+    numparams = state.num_hidden_layers * 3 + 2
+    series_mux = None
+    series_mux = create_series(workingdir, numparams)
 
-    # b,b',W for each hidden layer + b,W of last layer (logreg)
-    numparams = nhl * 3 + 2
-    series_mux = None
-    if SERIES_AVAILABLE:
-        series_mux = create_series(workingdir, numparams)
+    print "Creating optimizer with state, ", state
 
     optimizer = SdaSgdOptimizer(dataset=dataset, hyperparameters=state, \
                                     n_ins=n_ins, n_outs=n_outs,\
@@ -120,11 +119,10 @@
     optimizer.finetune()
     channel.save()
 
-    pylearn.version.record_versions(state,[theano,ift6266,pylearn])
-    channel.save()
-
     return channel.COMPLETE
 
+# These Series objects are used to save various statistics
+# during the training.
 def create_series(basedir, numparams):
     mux = SeriesMultiplexer()
 
@@ -146,8 +144,11 @@
 
     return mux
 
+# Perform insertion into the Postgre DB based on combination
+# of hyperparameter values above
+# (see comment for produit_cartesien_jobs() to know how it works)
 def jobman_insert_nist():
-    jobs = produit_croise_jobs(JOB_VALS)
+    jobs = produit_cartesien_jobs(JOB_VALS)
 
     db = jobman.sql.db(JOBDB)
     for job in jobs:
@@ -233,35 +234,6 @@
 
     raw_input("Press any key")
 
-# hp for hyperparameters
-def sgd_optimization_nist(hp=None, dataset_dir='/data/lisa/data/nist'):
-    global DEFAULT_HP_NIST
-    hp = hp and hp or DEFAULT_HP_NIST
-
-    print "Will load NIST"
-
-    import time
-    t1 = time.time()
-    nist = NIST(20, reduce_train_to=100)
-    t2 = time.time()
-
-    print "NIST loaded. time delta = ", t2-t1
-
-    train,valid,test = nist.get_tvt()
-    dataset = (train,valid,test)
-
-    print train[0][15]
-    print type(train[0][1])
-
-
-    print "Lengths train, valid, test: ", len(train[0]), len(valid[0]), len(test[0])
-
-    n_ins = 32*32
-    n_outs = 62 # 10 digits, 26*2 (lower, capitals)
-
-    optimizer = SdaSgdOptimizer(dataset, hp, n_ins, n_outs, input_divider=255.0)
-    optimizer.train()
-
 if __name__ == '__main__':
 
     import sys
@@ -275,11 +247,12 @@
         jobman_insert_nist()
 
     elif len(args) > 0 and args[0] == 'test_jobman_entrypoint':
-        chanmock = DD({'COMPLETE':0})
+        def f():
+            pass
+        chanmock = DD({'COMPLETE':0,'save':f})
         jobman_entrypoint(DEFAULT_HP_NIST, chanmock)
 
     elif len(args) > 0 and args[0] == 'estimate':
         estimate_total_time()
     else:
-        sgd_optimization_nist()
-
+        print "Bad arguments"
--- a/deep/stacked_dae/sgd_optimization.py	Tue Mar 02 17:28:14 2010 -0500
+++ b/deep/stacked_dae/sgd_optimization.py	Wed Mar 03 16:52:10 2010 -0500
@@ -6,6 +6,7 @@
 import numpy 
 import theano
 import time
+import datetime
 import theano.tensor as T
 import sys
 
@@ -59,20 +60,27 @@
         # compute number of minibatches for training, validation and testing
         self.n_train_batches = self.train_set_x.value.shape[0] / self.hp.minibatch_size
         self.n_valid_batches = self.valid_set_x.value.shape[0] / self.hp.minibatch_size
-        self.n_test_batches  = self.test_set_x.value.shape[0]  / self.hp.minibatch_size
+        # remove last batch in case it's incomplete
+        self.n_test_batches  = (self.test_set_x.value.shape[0]  / self.hp.minibatch_size) - 1
 
     def init_classifier(self):
         print "Constructing classifier"
 
+        # we don't want to save arrays in DD objects, so
+        # we recreate those arrays here
+        nhl = self.hp.num_hidden_layers
+        layers_sizes = [self.hp.hidden_layers_sizes] * nhl
+        corruption_levels = [self.hp.corruption_levels] * nhl
+
         # construct the stacked denoising autoencoder class
         self.classifier = SdA( \
                           train_set_x= self.train_set_x, \
                           train_set_y = self.train_set_y,\
                           batch_size = self.hp.minibatch_size, \
                           n_ins= self.n_ins, \
-                          hidden_layers_sizes = self.hp.hidden_layers_sizes, \
+                          hidden_layers_sizes = layers_sizes, \
                           n_outs = self.n_outs, \
-                          corruption_levels = self.hp.corruption_levels,\
+                          corruption_levels = corruption_levels,\
                           rng = self.rng,\
                           pretrain_lr = self.hp.pretraining_lr, \
                           finetune_lr = self.hp.finetuning_lr,\
@@ -85,7 +93,7 @@
         self.finetune()
 
     def pretrain(self):
-        print "STARTING PRETRAINING"
+        print "STARTING PRETRAINING, time = ", datetime.datetime.now()
         sys.stdout.flush()
 
         start_time = time.clock()  
@@ -101,6 +109,8 @@
                         
                 print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c
                 sys.stdout.flush()
+
+                self.series_mux.append("params", self.classifier.all_params)
      
         end_time = time.clock()
 
@@ -110,7 +120,7 @@
         sys.stdout.flush()
 
     def finetune(self):
-        print "STARTING FINETUNING"
+        print "STARTING FINETUNING, time = ", datetime.datetime.now()
 
         index   = T.lscalar()    # index to a [mini]batch 
         minibatch_size = self.hp.minibatch_size
--- a/deep/stacked_dae/stacked_dae.py	Tue Mar 02 17:28:14 2010 -0500
+++ b/deep/stacked_dae/stacked_dae.py	Wed Mar 03 16:52:10 2010 -0500
@@ -10,6 +10,15 @@
 
 from utils import update_locals
 
+# taken from LeDeepNet/daa.py
+# has a special case when taking log(0) (defined =0)
+# modified to not take the mean anymore
+from theano.tensor.xlogx import xlogx, xlogy0
+# it's target*log(output)
+def binary_cross_entropy(target, output, sum_axis=1):
+    XE = xlogy0(target, output) + xlogy0((1 - target), (1 - output))
+    return -T.sum(XE, axis=sum_axis)
+
 class LogisticRegression(object):
     def __init__(self, input, n_in, n_out):
         # initialize with 0 the weights W as a matrix of shape (n_in, n_out) 
@@ -128,7 +137,8 @@
     # Equation (4)
     # note : we sum over the size of a datapoint; if we are using minibatches,
     #        L will  be a vector, with one entry per example in minibatch
-    self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) 
+    #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) 
+    self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1)
     # note : L is now a vector, where each element is the cross-entropy cost 
     #        of the reconstruction of the corresponding example of the 
     #        minibatch. We need to compute the average of all these to get 
@@ -138,8 +148,6 @@
     self.params = [ self.W, self.b, self.b_prime ]
 
 
-
-
 class SdA(object):
     def __init__(self, train_set_x, train_set_y, batch_size, n_ins, 
                  hidden_layers_sizes, n_outs, 
@@ -147,6 +155,7 @@
         # Just to make sure those are not modified somewhere else afterwards
         hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes)
         corruption_levels = copy.deepcopy(corruption_levels)
+
         update_locals(self, locals())      
  
         self.layers             = []
@@ -157,6 +166,17 @@
         self.all_params         = []
         self.n_layers           = len(hidden_layers_sizes)
 
+        print "Creating SdA with params:"
+        print "batch_size", batch_size
+        print "hidden_layers_sizes", hidden_layers_sizes
+        print "corruption_levels", corruption_levels
+        print "n_ins", n_ins
+        print "n_outs", n_outs
+        print "pretrain_lr", pretrain_lr
+        print "finetune_lr", finetune_lr
+        print "input_divider", input_divider
+        print "----"
+
         self.shared_divider = theano.shared(numpy.asarray(input_divider, dtype=theano.config.floatX))
 
         if len(hidden_layers_sizes) < 1 :
--- a/deep/stacked_dae/utils.py	Tue Mar 02 17:28:14 2010 -0500
+++ b/deep/stacked_dae/utils.py	Wed Mar 03 16:52:10 2010 -0500
@@ -1,14 +1,26 @@
 #!/usr/bin/python
+# coding: utf-8
+
+from __future__ import with_statement
 
 from jobman import DD
 
 # from pylearn codebase
+# useful in __init__(param1, param2, etc.) to save
+# values in self.param1, self.param2... just call
+# update_locals(self, locals())
 def update_locals(obj, dct):
     if 'self' in dct:
         del dct['self']
     obj.__dict__.update(dct)
 
-def produit_croise_jobs(val_dict):
+# from a dictionary of possible values for hyperparameters, e.g.
+# hp_values = {'learning_rate':[0.1, 0.01], 'num_layers': [1,2]}
+# create a list of other dictionaries representing all the possible
+# combinations, thus in this example creating:
+# [{'learning_rate': 0.1, 'num_layers': 1}, ...]
+# (similarly for combinations (0.1, 2), (0.01, 1), (0.01, 2))
+def produit_cartesien_jobs(val_dict):
     job_list = [DD()]
     all_keys = val_dict.keys()
 
@@ -24,9 +36,9 @@
 
     return job_list
 
-def test_produit_croise_jobs():
+def test_produit_cartesien_jobs():
     vals = {'a': [1,2], 'b': [3,4,5]}
-    print produit_croise_jobs(vals)
+    print produit_cartesien_jobs(vals)
 
 
 # taken from http://stackoverflow.com/questions/276052/how-to-get-current-cpu-and-ram-usage-in-python
--- a/test.py	Tue Mar 02 17:28:14 2010 -0500
+++ b/test.py	Wed Mar 03 16:52:10 2010 -0500
@@ -12,16 +12,19 @@
                 continue
             test(name)
 
-def test(name, options = doctest.ELLIPSIS or doctest.DONT_ACCEPT_TRUE_FOR_1):
+def test(name):
     import ift6266
     predefs = ift6266.__dict__
+    options = doctest.ELLIPSIS or doctest.DONT_ACCEPT_TRUE_FOR_1
     print "Testing:", name
     __import__(name)
     doctest.testmod(sys.modules[name], extraglobs=predefs, optionflags=options)
-    
+
 if __name__ == '__main__':
     if len(sys.argv) > 1:
         for mod in sys.argv[1:]:
+            if mod.endswith('.py'):
+                mod = mod[:-3]
             test(mod)
     else:
         runTests()