# HG changeset patch # User Dumitru Erhan # Date 1267210527 18000 # Node ID 4bc5eeec6394549d304ee65b26031da9deb2ef46 # Parent e3de934a98b6728ee0c76ec9a0615e8a7237357c Updating the tutorial code to the latest revisions. diff -r e3de934a98b6 -r 4bc5eeec6394 code_tutoriel/DBN.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/DBN.py Fri Feb 26 13:55:27 2010 -0500 @@ -0,0 +1,384 @@ +""" +""" +import os + +import numpy, time, cPickle, gzip + +import theano +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +from logistic_sgd import LogisticRegression, load_data +from mlp import HiddenLayer +from rbm import RBM + + + +class DBN(object): + """ + """ + + def __init__(self, numpy_rng, theano_rng = None, n_ins = 784, + hidden_layers_sizes = [500,500], n_outs = 10): + """This class is made to support a variable number of layers. + + :type numpy_rng: numpy.random.RandomState + :param numpy_rng: numpy random number generator used to draw initial + weights + + :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams + :param theano_rng: Theano random generator; if None is given one is + generated based on a seed drawn from `rng` + + :type n_ins: int + :param n_ins: dimension of the input to the DBN + + :type n_layers_sizes: list of ints + :param n_layers_sizes: intermidiate layers size, must contain + at least one value + + :type n_outs: int + :param n_outs: dimension of the output of the network + """ + + self.sigmoid_layers = [] + self.rbm_layers = [] + self.params = [] + self.n_layers = len(hidden_layers_sizes) + + assert self.n_layers > 0 + + if not theano_rng: + theano_rng = RandomStreams(numpy_rng.randint(2**30)) + + # allocate symbolic variables for the data + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + + # The DBN is an MLP, for which all weights of intermidiate layers are shared with a + # different RBM. We will first construct the DBN as a deep multilayer perceptron, and + # when constructing each sigmoidal layer we also construct an RBM that shares weights + # with that layer. During pretraining we will train these RBMs (which will lead + # to chainging the weights of the MLP as well) During finetuning we will finish + # training the DBN by doing stochastic gradient descent on the MLP. + + for i in xrange( self.n_layers ): + # construct the sigmoidal layer + + # the size of the input is either the number of hidden units of the layer below or + # the input size if we are on the first layer + if i == 0 : + input_size = n_ins + else: + input_size = hidden_layers_sizes[i-1] + + # the input to this layer is either the activation of the hidden layer below or the + # input of the DBN if you are on the first layer + if i == 0 : + layer_input = self.x + else: + layer_input = self.sigmoid_layers[-1].output + + sigmoid_layer = HiddenLayer(rng = numpy_rng, + input = layer_input, + n_in = input_size, + n_out = hidden_layers_sizes[i], + activation = T.nnet.sigmoid) + + # add the layer to our list of layers + self.sigmoid_layers.append(sigmoid_layer) + + # its arguably a philosophical question... but we are going to only declare that + # the parameters of the sigmoid_layers are parameters of the DBN. The visible + # biases in the RBM are parameters of those RBMs, but not of the DBN. 
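+            # Illustrative aside (not executed): because the RBM constructed just
+            # below reuses this layer's shared variables (W = sigmoid_layer.W,
+            # hbias = sigmoid_layer.b), the contrastive-divergence updates applied
+            # during pretraining modify the MLP weights in place.  Assuming the
+            # RBM class keeps the shared variables it is given, one would expect
+            #     self.rbm_layers[i].W is self.sigmoid_layers[i].W
+            # to hold after construction.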
+ self.params.extend(sigmoid_layer.params) + + # Construct an RBM that shared weights with this layer + rbm_layer = RBM(numpy_rng = numpy_rng, theano_rng = theano_rng, + input = layer_input, + n_visible = input_size, + n_hidden = hidden_layers_sizes[i], + W = sigmoid_layer.W, + hbias = sigmoid_layer.b) + self.rbm_layers.append(rbm_layer) + + + # We now need to add a logistic layer on top of the MLP + self.logLayer = LogisticRegression(\ + input = self.sigmoid_layers[-1].output,\ + n_in = hidden_layers_sizes[-1], n_out = n_outs) + self.params.extend(self.logLayer.params) + + # construct a function that implements one step of fine-tuning compute the cost for + # second phase of training, defined as the negative log likelihood + self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) + + # compute the gradients with respect to the model parameters + # symbolic variable that points to the number of errors made on the + # minibatch given by self.x and self.y + self.errors = self.logLayer.errors(self.y) + + def pretraining_functions(self, train_set_x, batch_size): + ''' Generates a list of functions, for performing one step of gradient descent at a + given layer. The function will require as input the minibatch index, and to train an + RBM you just need to iterate, calling the corresponding function on all minibatch + indexes. + + :type train_set_x: theano.tensor.TensorType + :param train_set_x: Shared var. that contains all datapoints used for training the RBM + :type batch_size: int + :param batch_size: size of a [mini]batch + ''' + + # index to a [mini]batch + index = T.lscalar('index') # index to a minibatch + learning_rate = T.scalar('lr') # learning rate to use + + # number of batches + n_batches = train_set_x.value.shape[0] / batch_size + # begining of a batch, given `index` + batch_begin = index * batch_size + # ending of a batch given `index` + batch_end = batch_begin+batch_size + + pretrain_fns = [] + for rbm in self.rbm_layers: + + # get the cost and the updates list + # TODO: change cost function to reconstruction error + cost,updates = rbm.cd(learning_rate, persistent=None) + + # compile the theano function + fn = theano.function(inputs = [index, + theano.Param(learning_rate, default = 0.1)], + outputs = cost, + updates = updates, + givens = {self.x :train_set_x[batch_begin:batch_end]}) + # append `fn` to the list of functions + pretrain_fns.append(fn) + + return pretrain_fns + + + def build_finetune_functions(self, datasets, batch_size, learning_rate): + '''Generates a function `train` that implements one step of finetuning, a function + `validate` that computes the error on a batch from the validation set, and a function + `test` that computes the error on a batch from the testing set + + :type datasets: list of pairs of theano.tensor.TensorType + :param datasets: It is a list that contain all the datasets; the has to contain three + pairs, `train`, `valid`, `test` in this order, where each pair is formed of two Theano + variables, one for the datapoints, the other for the labels + :type batch_size: int + :param batch_size: size of a minibatch + :type learning_rate: float + :param learning_rate: learning rate used during finetune stage + ''' + + (train_set_x, train_set_y) = datasets[0] + (valid_set_x, valid_set_y) = datasets[1] + (test_set_x , test_set_y ) = datasets[2] + + # compute number of minibatches for training, validation and testing + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size + + index = 
T.lscalar('index') # index to a [mini]batch + + # compute the gradients with respect to the model parameters + gparams = T.grad(self.finetune_cost, self.params) + + # compute list of fine-tuning updates + updates = {} + for param, gparam in zip(self.params, gparams): + updates[param] = param - gparam*learning_rate + + train_fn = theano.function(inputs = [index], + outputs = self.finetune_cost, + updates = updates, + givens = { + self.x : train_set_x[index*batch_size:(index+1)*batch_size], + self.y : train_set_y[index*batch_size:(index+1)*batch_size]}) + + test_score_i = theano.function([index], self.errors, + givens = { + self.x: test_set_x[index*batch_size:(index+1)*batch_size], + self.y: test_set_y[index*batch_size:(index+1)*batch_size]}) + + valid_score_i = theano.function([index], self.errors, + givens = { + self.x: valid_set_x[index*batch_size:(index+1)*batch_size], + self.y: valid_set_y[index*batch_size:(index+1)*batch_size]}) + + # Create a function that scans the entire validation set + def valid_score(): + return [valid_score_i(i) for i in xrange(n_valid_batches)] + + # Create a function that scans the entire test set + def test_score(): + return [test_score_i(i) for i in xrange(n_test_batches)] + + return train_fn, valid_score, test_score + + + + + + +def test_DBN( finetune_lr = 0.1, pretraining_epochs = 10, \ + pretrain_lr = 0.1, training_epochs = 1000, \ + dataset='mnist.pkl.gz'): + """ + Demonstrates how to train and test a Deep Belief Network. + + This is demonstrated on MNIST. + + :type learning_rate: float + :param learning_rate: learning rate used in the finetune stage + :type pretraining_epochs: int + :param pretraining_epochs: number of epoch to do pretraining + :type pretrain_lr: float + :param pretrain_lr: learning rate to be used during pre-training + :type n_iter: int + :param n_iter: maximal number of iterations ot run the optimizer + :type dataset: string + :param dataset: path the the pickled dataset + """ + + print 'finetune_lr = ', finetune_lr + print 'pretrain_lr = ', pretrain_lr + + datasets = load_data(dataset) + + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x , test_set_y = datasets[2] + + + batch_size = 20 # size of the minibatch + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + + # numpy random generator + numpy_rng = numpy.random.RandomState(123) + print '... building the model' + # construct the Deep Belief Network + dbn = DBN(numpy_rng = numpy_rng, n_ins = 28*28, + hidden_layers_sizes = [1000,1000,1000], + n_outs = 10) + + + ######################### + # PRETRAINING THE MODEL # + ######################### + print '... getting the pretraining functions' + pretraining_fns = dbn.pretraining_functions( + train_set_x = train_set_x, + batch_size = batch_size ) + + print '... 
pre-training the model' + start_time = time.clock() + ## Pre-train layer-wise + for i in xrange(dbn.n_layers): + # go through pretraining epochs + for epoch in xrange(pretraining_epochs): + # go through the training set + c = [] + for batch_index in xrange(n_train_batches): + c.append(pretraining_fns[i](index = batch_index, + lr = pretrain_lr ) ) + print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),numpy.mean(c) + + end_time = time.clock() + + print ('Pretraining took %f minutes' %((end_time-start_time)/60.)) + + ######################## + # FINETUNING THE MODEL # + ######################## + + # get the training, validation and testing function for the model + print '... getting the finetuning functions' + train_fn, validate_model, test_model = dbn.build_finetune_functions ( + datasets = datasets, batch_size = batch_size, + learning_rate = finetune_lr) + + print '... finetunning the model' + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2. # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(n_train_batches, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + + best_params = None + best_validation_loss = float('inf') + test_score = 0. + start_time = time.clock() + + done_looping = False + epoch = 0 + + while (epoch < training_epochs) and (not done_looping): + epoch = epoch + 1 + for minibatch_index in xrange(n_train_batches): + + minibatch_avg_cost = train_fn(minibatch_index) + iter = epoch * n_train_batches + minibatch_index + + if (iter+1) % validation_frequency == 0: + + validation_losses = validate_model() + this_validation_loss = numpy.mean(validation_losses) + print('epoch %i, minibatch %i/%i, validation error %f %%' % \ + (epoch, minibatch_index+1, n_train_batches, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, iter * patience_increase) + + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = iter + + # test it on the test set + test_losses = test_model() + test_score = numpy.mean(test_losses) + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, n_train_batches, + test_score*100.)) + + + if patience <= iter : + done_looping = True + break + + end_time = time.clock() + print(('Optimization complete with best validation score of %f %%,' + 'with test performance %f %%') % + (best_validation_loss * 100., test_score*100.)) + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + + + + + +if __name__ == '__main__': + pretrain_lr = numpy.float(os.sys.argv[1]) + finetune_lr = numpy.float(os.sys.argv[2]) + test_DBN(pretrain_lr=pretrain_lr, finetune_lr=finetune_lr) diff -r e3de934a98b6 -r 4bc5eeec6394 code_tutoriel/SdA.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/SdA.py Fri Feb 26 13:55:27 2010 -0500 @@ -0,0 +1,441 @@ +""" + This tutorial introduces stacked denoising auto-encoders (SdA) using Theano. + + Denoising autoencoders are the building blocks for SdA. 
+ They are based on auto-encoders as the ones used in Bengio et al. 2007. + An autoencoder takes an input x and first maps it to a hidden representation + y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting + latent representation y is then mapped back to a "reconstructed" vector + z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight + matrix W' can optionally be constrained such that W' = W^T, in which case + the autoencoder is said to have tied weights. The network is trained such + that to minimize the reconstruction error (the error between x and z). + + For the denosing autoencoder, during training, first x is corrupted into + \tilde{x}, where \tilde{x} is a partially destroyed version of x by means + of a stochastic mapping. Afterwards y is computed as before (using + \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction + error is now measured between z and the uncorrupted input x, which is + computed as the cross-entropy : + - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)] + + + References : + - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and + Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103, + 2008 + - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise + Training of Deep Networks, Advances in Neural Information Processing + Systems 19, 2007 + +""" + +import numpy, time, cPickle, gzip + +import theano +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +from logistic_sgd import LogisticRegression, load_data +from mlp import HiddenLayer +from dA import dA + + + +class SdA(object): + """Stacked denoising auto-encoder class (SdA) + + A stacked denoising autoencoder model is obtained by stacking several + dAs. The hidden layer of the dA at layer `i` becomes the input of + the dA at layer `i+1`. The first layer dA gets as input the input of + the SdA, and the hidden layer of the last dA represents the output. + Note that after pretraining, the SdA is dealt with as a normal MLP, + the dAs are only used to initialize the weights. + """ + + def __init__(self, numpy_rng, theano_rng = None, n_ins = 784, + hidden_layers_sizes = [500,500], n_outs = 10, + corruption_levels = [0.1, 0.1]): + """ This class is made to support a variable number of layers. 
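+
+        A minimal usage sketch (the sizes are illustrative; ``test_SdA`` below
+        uses its own settings)::
+
+            numpy_rng = numpy.random.RandomState(123)
+            sda = SdA(numpy_rng=numpy_rng, n_ins=28 * 28,
+                      hidden_layers_sizes=[500, 500], n_outs=10,
+                      corruption_levels=[0.1, 0.1])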
+ + :type numpy_rng: numpy.random.RandomState + :param numpy_rng: numpy random number generator used to draw initial + weights + + :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams + :param theano_rng: Theano random generator; if None is given one is + generated based on a seed drawn from `rng` + + :type n_ins: int + :param n_ins: dimension of the input to the sdA + + :type n_layers_sizes: list of ints + :param n_layers_sizes: intermidiate layers size, must contain + at least one value + + :type n_outs: int + :param n_outs: dimension of the output of the network + + :type corruption_levels: list of float + :param corruption_levels: amount of corruption to use for each + layer + """ + + self.sigmoid_layers = [] + self.dA_layers = [] + self.params = [] + self.n_layers = len(hidden_layers_sizes) + + assert self.n_layers > 0 + + if not theano_rng: + theano_rng = RandomStreams(numpy_rng.randint(2**30)) + # allocate symbolic variables for the data + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + + # The SdA is an MLP, for which all weights of intermidiate layers + # are shared with a different denoising autoencoders + # We will first construct the SdA as a deep multilayer perceptron, + # and when constructing each sigmoidal layer we also construct a + # denoising autoencoder that shares weights with that layer + # During pretraining we will train these autoencoders (which will + # lead to chainging the weights of the MLP as well) + # During finetunining we will finish training the SdA by doing + # stochastich gradient descent on the MLP + + for i in xrange( self.n_layers ): + # construct the sigmoidal layer + + # the size of the input is either the number of hidden units of + # the layer below or the input size if we are on the first layer + if i == 0 : + input_size = n_ins + else: + input_size = hidden_layers_sizes[i-1] + + # the input to this layer is either the activation of the hidden + # layer below or the input of the SdA if you are on the first + # layer + if i == 0 : + layer_input = self.x + else: + layer_input = self.sigmoid_layers[-1].output + + sigmoid_layer = HiddenLayer(rng = numpy_rng, + input = layer_input, + n_in = input_size, + n_out = hidden_layers_sizes[i], + activation = T.nnet.sigmoid) + # add the layer to our list of layers + self.sigmoid_layers.append(sigmoid_layer) + # its arguably a philosophical question... 
+ # but we are going to only declare that the parameters of the + # sigmoid_layers are parameters of the StackedDAA + # the visible biases in the dA are parameters of those + # dA, but not the SdA + self.params.extend(sigmoid_layer.params) + + # Construct a denoising autoencoder that shared weights with this + # layer + dA_layer = dA(numpy_rng = numpy_rng, theano_rng = theano_rng, input = layer_input, + n_visible = input_size, + n_hidden = hidden_layers_sizes[i], + W = sigmoid_layer.W, bhid = sigmoid_layer.b) + self.dA_layers.append(dA_layer) + + + # We now need to add a logistic layer on top of the MLP + self.logLayer = LogisticRegression(\ + input = self.sigmoid_layers[-1].output,\ + n_in = hidden_layers_sizes[-1], n_out = n_outs) + + self.params.extend(self.logLayer.params) + # construct a function that implements one step of finetunining + + # compute the cost for second phase of training, + # defined as the negative log likelihood + self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) + # compute the gradients with respect to the model parameters + # symbolic variable that points to the number of errors made on the + # minibatch given by self.x and self.y + self.errors = self.logLayer.errors(self.y) + + def pretraining_functions(self, train_set_x, batch_size): + ''' Generates a list of functions, each of them implementing one + step in trainnig the dA corresponding to the layer with same index. + The function will require as input the minibatch index, and to train + a dA you just need to iterate, calling the corresponding function on + all minibatch indexes. + + :type train_set_x: theano.tensor.TensorType + :param train_set_x: Shared variable that contains all datapoints used + for training the dA + + :type batch_size: int + :param batch_size: size of a [mini]batch + + :type learning_rate: float + :param learning_rate: learning rate used during training for any of + the dA layers + ''' + + # index to a [mini]batch + index = T.lscalar('index') # index to a minibatch + corruption_level = T.scalar('corruption') # amount of corruption to use + learning_rate = T.scalar('lr') # learning rate to use + # number of batches + n_batches = train_set_x.value.shape[0] / batch_size + # begining of a batch, given `index` + batch_begin = index * batch_size + # ending of a batch given `index` + batch_end = batch_begin+batch_size + + pretrain_fns = [] + for dA in self.dA_layers: + # get the cost and the updates list + cost,updates = dA.get_cost_updates( corruption_level, learning_rate) + # compile the theano function + fn = theano.function( inputs = [index, + theano.Param(corruption_level, default = 0.2), + theano.Param(learning_rate, default = 0.1)], + outputs = cost, + updates = updates, + givens = {self.x :train_set_x[batch_begin:batch_end]}) + # append `fn` to the list of functions + pretrain_fns.append(fn) + + return pretrain_fns + + + def build_finetune_functions(self, datasets, batch_size, learning_rate): + '''Generates a function `train` that implements one step of + finetuning, a function `validate` that computes the error on + a batch from the validation set, and a function `test` that + computes the error on a batch from the testing set + + :type datasets: list of pairs of theano.tensor.TensorType + :param datasets: It is a list that contain all the datasets; + the has to contain three pairs, `train`, + `valid`, `test` in this order, where each pair + is formed of two Theano variables, one for the + datapoints, the other for the labels + + :type batch_size: int + :param 
batch_size: size of a minibatch + + :type learning_rate: float + :param learning_rate: learning rate used during finetune stage + ''' + + (train_set_x, train_set_y) = datasets[0] + (valid_set_x, valid_set_y) = datasets[1] + (test_set_x , test_set_y ) = datasets[2] + + # compute number of minibatches for training, validation and testing + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size + + index = T.lscalar('index') # index to a [mini]batch + + # compute the gradients with respect to the model parameters + gparams = T.grad(self.finetune_cost, self.params) + + # compute list of fine-tuning updates + updates = {} + for param, gparam in zip(self.params, gparams): + updates[param] = param - gparam*learning_rate + + train_fn = theano.function(inputs = [index], + outputs = self.finetune_cost, + updates = updates, + givens = { + self.x : train_set_x[index*batch_size:(index+1)*batch_size], + self.y : train_set_y[index*batch_size:(index+1)*batch_size]}) + + test_score_i = theano.function([index], self.errors, + givens = { + self.x: test_set_x[index*batch_size:(index+1)*batch_size], + self.y: test_set_y[index*batch_size:(index+1)*batch_size]}) + + valid_score_i = theano.function([index], self.errors, + givens = { + self.x: valid_set_x[index*batch_size:(index+1)*batch_size], + self.y: valid_set_y[index*batch_size:(index+1)*batch_size]}) + + # Create a function that scans the entire validation set + def valid_score(): + return [valid_score_i(i) for i in xrange(n_valid_batches)] + + # Create a function that scans the entire test set + def test_score(): + return [test_score_i(i) for i in xrange(n_test_batches)] + + return train_fn, valid_score, test_score + + + + + + +def test_SdA( finetune_lr = 0.1, pretraining_epochs = 15, \ + pretrain_lr = 0.1, training_epochs = 1000, \ + dataset='mnist.pkl.gz'): + """ + Demonstrates how to train and test a stochastic denoising autoencoder. + + This is demonstrated on MNIST. + + :type learning_rate: float + :param learning_rate: learning rate used in the finetune stage + (factor for the stochastic gradient) + + :type pretraining_epochs: int + :param pretraining_epochs: number of epoch to do pretraining + + :type pretrain_lr: float + :param pretrain_lr: learning rate to be used during pre-training + + :type n_iter: int + :param n_iter: maximal number of iterations ot run the optimizer + + :type dataset: string + :param dataset: path the the pickled dataset + + """ + + datasets = load_data(dataset) + + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x , test_set_y = datasets[2] + + + batch_size = 20 # size of the minibatch + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + + # numpy random generator + numpy_rng = numpy.random.RandomState(123) + print '... building the model' + # construct the stacked denoising autoencoder class + sda = SdA( numpy_rng = numpy_rng, n_ins = 28*28, + hidden_layers_sizes = [1000,1000,1000], + n_outs = 10) + + + ######################### + # PRETRAINING THE MODEL # + ######################### + print '... getting the pretraining functions' + pretraining_fns = sda.pretraining_functions( + train_set_x = train_set_x, + batch_size = batch_size ) + + print '... 
pre-training the model' + start_time = time.clock() + ## Pre-train layer-wise + for i in xrange(sda.n_layers): + # go through pretraining epochs + for epoch in xrange(pretraining_epochs): + # go through the training set + c = [] + for batch_index in xrange(n_train_batches): + c.append( pretraining_fns[i](index = batch_index, + corruption = 0.2, lr = pretrain_lr ) ) + print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),numpy.mean(c) + + end_time = time.clock() + + print ('Pretraining took %f minutes' %((end_time-start_time)/60.)) + + ######################## + # FINETUNING THE MODEL # + ######################## + + # get the training, validation and testing function for the model + print '... getting the finetuning functions' + train_fn, validate_model, test_model = sda.build_finetune_functions ( + datasets = datasets, batch_size = batch_size, + learning_rate = finetune_lr) + + print '... finetunning the model' + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2. # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(n_train_batches, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + + best_params = None + best_validation_loss = float('inf') + test_score = 0. + start_time = time.clock() + + done_looping = False + epoch = 0 + + while (epoch < training_epochs) and (not done_looping): + epoch = epoch + 1 + for minibatch_index in xrange(n_train_batches): + + minibatch_avg_cost = train_fn(minibatch_index) + iter = epoch * n_train_batches + minibatch_index + + if (iter+1) % validation_frequency == 0: + + validation_losses = validate_model() + this_validation_loss = numpy.mean(validation_losses) + print('epoch %i, minibatch %i/%i, validation error %f %%' % \ + (epoch, minibatch_index+1, n_train_batches, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, iter * patience_increase) + + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = iter + + # test it on the test set + test_losses = test_model() + test_score = numpy.mean(test_losses) + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, n_train_batches, + test_score*100.)) + + + if patience <= iter : + done_looping = True + break + + end_time = time.clock() + print(('Optimization complete with best validation score of %f %%,' + 'with test performance %f %%') % + (best_validation_loss * 100., test_score*100.)) + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + + + + + + +if __name__ == '__main__': + test_SdA() + + diff -r e3de934a98b6 -r 4bc5eeec6394 code_tutoriel/convolutional_mlp.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/convolutional_mlp.py Fri Feb 26 13:55:27 2010 -0500 @@ -0,0 +1,292 @@ +""" +This tutorial introduces the LeNet5 neural network architecture using Theano. LeNet5 is a +convolutional neural network, good for classifying images. 
This tutorial shows how to build the +architecture, and comes with all the hyper-parameters you need to reproduce the paper's MNIST +results. + + +This implementation simplifies the model in the following ways: + + - LeNetConvPool doesn't implement location-specific gain and bias parameters + - LeNetConvPool doesn't implement pooling by average, it implements pooling by max. + - Digit classification is implemented with a logistic regression rather than an RBF network + - LeNet5 was not fully-connected convolutions at second layer + +References: + - Y. LeCun, L. Bottou, Y. Bengio and P. Haffner: Gradient-Based Learning Applied to Document + Recognition, Proceedings of the IEEE, 86(11):2278-2324, November 1998. + http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf +""" + +import numpy, time, cPickle, gzip + +import theano +import theano.tensor as T +from theano.tensor.signal import downsample +from theano.tensor.nnet import conv + +from logistic_sgd import LogisticRegression, load_data +from mlp import HiddenLayer + + +class LeNetConvPoolLayer(object): + """Pool Layer of a convolutional network """ + + def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2,2)): + """ + Allocate a LeNetConvPoolLayer with shared variable internal parameters. + + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + + :type input: theano.tensor.dtensor4 + :param input: symbolic image tensor, of shape image_shape + + :type filter_shape: tuple or list of length 4 + :param filter_shape: (number of filters, num input feature maps, + filter height,filter width) + + :type image_shape: tuple or list of length 4 + :param image_shape: (batch size, num input feature maps, + image height, image width) + + :type poolsize: tuple or list of length 2 + :param poolsize: the downsampling (pooling) factor (#rows,#cols) + """ + + assert image_shape[1]==filter_shape[1] + self.input = input + + # initialize weights to temporary values until we know the shape of the output feature + # maps + W_values = numpy.zeros(filter_shape, dtype=theano.config.floatX) + self.W = theano.shared(value = W_values) + + # the bias is a 1D tensor -- one bias per output feature map + b_values = numpy.zeros((filter_shape[0],), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + # convolve input feature maps with filters + conv_out = conv.conv2d(input = input, filters = self.W, + filter_shape=filter_shape, image_shape=image_shape) + + # there are "num input feature maps * filter height * filter width" inputs + # to each hidden unit + fan_in = numpy.prod(filter_shape[1:]) + # each unit in the lower layer receives a gradient from: + # "num output feature maps * filter height * filter width" / pooling size + fan_out = filter_shape[0] * numpy.prod(filter_shape[2:]) / numpy.prod(poolsize) + # replace weight values with random weights + W_bound = numpy.sqrt(6./(fan_in + fan_out)) + self.W.value = numpy.asarray( + rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), + dtype = theano.config.floatX) + + # downsample each feature map individually, using maxpooling + pooled_out = downsample.max_pool2D( input = conv_out, + ds = poolsize, ignore_border=True) + + # add the bias term. Since the bias is a vector (1D array), we first + # reshape it to a tensor of shape (1,n_filters,1,1). 
Each bias will thus + # be broadcasted across mini-batches and feature map width & height + self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) + + # store parameters of this layer + self.params = [self.W, self.b] + + + +def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset='mnist.pkl.gz', nkerns=[20,50]): + """ Demonstrates lenet on MNIST dataset + + :type learning_rate: float + :param learning_rate: learning rate used (factor for the stochastic + gradient) + + :type n_epochs: int + :param n_epochs: maximal number of epochs to run the optimizer + + :type dataset: string + :param dataset: path to the dataset used for training /testing (MNIST here) + + :type nkerns: list of ints + :param nkerns: number of kernels on each layer + """ + + rng = numpy.random.RandomState(23455) + + datasets = load_data(dataset) + + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x , test_set_y = datasets[2] + + + batch_size = 500 # size of the minibatch + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size + + # allocate symbolic variables for the data + index = T.lscalar() # index to a [mini]batch + x = T.matrix('x') # the data is presented as rasterized images + y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + + + ishape = (28,28) # this is the size of MNIST images + + ###################### + # BUILD ACTUAL MODEL # + ###################### + print '... building the model' + + # Reshape matrix of rasterized images of shape (batch_size,28*28) + # to a 4D tensor, compatible with our LeNetConvPoolLayer + layer0_input = x.reshape((batch_size,1,28,28)) + + # Construct the first convolutional pooling layer: + # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) + # maxpooling reduces this further to (24/2,24/2) = (12,12) + # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) + layer0 = LeNetConvPoolLayer(rng, input=layer0_input, + image_shape=(batch_size,1,28,28), + filter_shape=(nkerns[0],1,5,5), poolsize=(2,2)) + + # Construct the second convolutional pooling layer + # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) + # maxpooling reduces this further to (8/2,8/2) = (4,4) + # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) + layer1 = LeNetConvPoolLayer(rng, input=layer0.output, + image_shape=(batch_size,nkerns[0],12,12), + filter_shape=(nkerns[1],nkerns[0],5,5), poolsize=(2,2)) + + # the TanhLayer being fully-connected, it operates on 2D matrices of + # shape (batch_size,num_pixels) (i.e matrix of rasterized images). 
+ # This will generate a matrix of shape (20,32*4*4) = (20,512) + layer2_input = layer1.output.flatten(2) + + # construct a fully-connected sigmoidal layer + layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1]*4*4, + n_out=500, activation = T.tanh) + + # classify the values of the fully-connected sigmoidal layer + layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) + + # the cost we minimize during training is the NLL of the model + cost = layer3.negative_log_likelihood(y) + + # create a function to compute the mistakes that are made by the model + test_model = theano.function([index], layer3.errors(y), + givens = { + x: test_set_x[index*batch_size:(index+1)*batch_size], + y: test_set_y[index*batch_size:(index+1)*batch_size]}) + + validate_model = theano.function([index], layer3.errors(y), + givens = { + x: valid_set_x[index*batch_size:(index+1)*batch_size], + y: valid_set_y[index*batch_size:(index+1)*batch_size]}) + + # create a list of all model parameters to be fit by gradient descent + params = layer3.params+ layer2.params+ layer1.params + layer0.params + + # create a list of gradients for all model parameters + grads = T.grad(cost, params) + + # train_model is a function that updates the model parameters by SGD + # Since this model has many parameters, it would be tedious to manually + # create an update rule for each model parameter. We thus create the updates + # dictionary by automatically looping over all (params[i],grads[i]) pairs. + updates = {} + for param_i, grad_i in zip(params, grads): + updates[param_i] = param_i - learning_rate * grad_i + + train_model = theano.function([index], cost, updates=updates, + givens = { + x: train_set_x[index*batch_size:(index+1)*batch_size], + y: train_set_y[index*batch_size:(index+1)*batch_size]}) + + + ############### + # TRAIN MODEL # + ############### + print '... training' + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2 # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(n_train_batches, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + best_params = None + best_validation_loss = float('inf') + best_iter = 0 + test_score = 0. 
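+    # Illustrative note on the early-stopping bookkeeping above: with
+    # batch_size = 500 and the 50,000-example MNIST training set,
+    # n_train_batches = 100, so validation_frequency = min(100, 10000/2) = 100
+    # and the model is validated once per epoch.  Whenever the validation error
+    # drops below improvement_threshold times the previous best, patience is
+    # pushed out to max(patience, iter * patience_increase); the loop below
+    # stops once the iteration counter reaches the current patience.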
+    start_time = time.clock()
+
+    epoch = 0
+    done_looping = False
+
+    while (epoch < n_epochs) and (not done_looping):
+        epoch = epoch + 1
+        for minibatch_index in xrange(n_train_batches):
+
+            iter = epoch * n_train_batches + minibatch_index
+
+            if iter % 100 == 0:
+                print 'training @ iter = ', iter
+            cost_ij = train_model(minibatch_index)
+
+            if (iter+1) % validation_frequency == 0:
+
+                # compute zero-one loss on validation set
+                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
+                this_validation_loss = numpy.mean(validation_losses)
+                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
+                      (epoch, minibatch_index+1, n_train_batches, \
+                       this_validation_loss*100.))
+
+                # if we got the best validation score until now
+                if this_validation_loss < best_validation_loss:
+
+                    # improve patience if loss improvement is good enough
+                    if this_validation_loss < best_validation_loss * \
+                       improvement_threshold:
+                        patience = max(patience, iter * patience_increase)
+
+                    # save best validation score and iteration number
+                    best_validation_loss = this_validation_loss
+                    best_iter = iter
+
+                    # test it on the test set
+                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
+                    test_score = numpy.mean(test_losses)
+                    print((' epoch %i, minibatch %i/%i, test error of best '
+                           'model %f %%') %
+                          (epoch, minibatch_index+1, n_train_batches,
+                           test_score*100.))
+
+            if patience <= iter:
+                done_looping = True
+                break
+
+    end_time = time.clock()
+    print('Optimization complete.')
+    print(('Best validation score of %f %% obtained at iteration %i, '
+           'with test performance %f %%') %
+          (best_validation_loss * 100., best_iter, test_score*100.))
+    print('The code ran for %f minutes' % ((end_time-start_time)/60.))
+
+if __name__ == '__main__':
+    evaluate_lenet5()
+
+def experiment(state, channel):
+    evaluate_lenet5(state.learning_rate, dataset=state.dataset)
diff -r e3de934a98b6 -r 4bc5eeec6394 code_tutoriel/dA.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code_tutoriel/dA.py	Fri Feb 26 13:55:27 2010 -0500
@@ -0,0 +1,330 @@
+"""
+ This tutorial introduces denoising auto-encoders (dA) using Theano.
+
+ Denoising autoencoders are the building blocks for SdA.
+ They are based on auto-encoders such as the ones used in Bengio et al. 2007.
+ An autoencoder takes an input x and first maps it to a hidden representation
+ y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting
+ latent representation y is then mapped back to a "reconstructed" vector
+ z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight
+ matrix W' can optionally be constrained such that W' = W^T, in which case
+ the autoencoder is said to have tied weights. The network is trained to
+ minimize the reconstruction error (the error between x and z).
+
+ For the denoising autoencoder, during training, first x is corrupted into
+ \tilde{x}, where \tilde{x} is a partially destroyed version of x by means
+ of a stochastic mapping. Afterwards y is computed as before (using
+ \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction
+ error is now measured between z and the uncorrupted input x, and is
+ computed as the cross-entropy :
+
+    - \sum_{k=1}^d [ x_k \log z_k + (1-x_k) \log(1-z_k) ]
+
+
+ References :
+   - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and
+     Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103,
+     2008
+   - Y. Bengio, P. Lamblin, D. Popovici, H.
Larochelle: Greedy Layer-Wise + Training of Deep Networks, Advances in Neural Information Processing + Systems 19, 2007 + +""" + +import numpy, time, cPickle, gzip + +import theano +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +from logistic_sgd import load_data +from utils import tile_raster_images + +import PIL.Image + + +class dA(object): + """Denoising Auto-Encoder class (dA) + + A denoising autoencoders tries to reconstruct the input from a corrupted + version of it by projecting it first in a latent space and reprojecting + it afterwards back in the input space. Please refer to Vincent et al.,2008 + for more details. If x is the input then equation (1) computes a partially + destroyed version of x by means of a stochastic mapping q_D. Equation (2) + computes the projection of the input into the latent space. Equation (3) + computes the reconstruction of the input, while equation (4) computes the + reconstruction error. + + .. math:: + + \tilde{x} ~ q_D(\tilde{x}|x) (1) + + y = s(W \tilde{x} + b) (2) + + x = s(W' y + b') (3) + + L(x,z) = -sum_{k=1}^d [x_k \log z_k + (1-x_k) \log( 1-z_k)] (4) + + """ + + def __init__(self, numpy_rng, theano_rng = None, input = None, n_visible= 784, n_hidden= 500, + W = None, bhid = None, bvis = None): + """ + Initialize the dA class by specifying the number of visible units (the + dimension d of the input ), the number of hidden units ( the dimension + d' of the latent or hidden space ) and the corruption level. The + constructor also receives symbolic variables for the input, weights and + bias. Such a symbolic variables are useful when, for example the input is + the result of some computations, or when weights are shared between the + dA and an MLP layer. When dealing with SdAs this always happens, + the dA on layer 2 gets as input the output of the dA on layer 1, + and the weights of the dA are used in the second stage of training + to construct an MLP. 
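+
+        A minimal sketch of the standalone case (this mirrors ``test_dA``
+        below; ``x`` is assumed to be a symbolic matrix of rasterized
+        images)::
+
+            rng = numpy.random.RandomState(123)
+            theano_rng = RandomStreams(rng.randint(2 ** 30))
+            da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x,
+                    n_visible=28 * 28, n_hidden=500)
+            cost, updates = da.get_cost_updates(corruption_level=0.3,
+                                                learning_rate=0.1)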
+
+        :type numpy_rng: numpy.random.RandomState
+        :param numpy_rng: numpy random number generator used to generate weights
+
+        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
+        :param theano_rng: Theano random generator; if None is given one is
+                           generated based on a seed drawn from `numpy_rng`
+
+        :type input: theano.tensor.TensorType
+        :param input: a symbolic description of the input, or None for a
+                      standalone dA
+
+        :type n_visible: int
+        :param n_visible: number of visible units
+
+        :type n_hidden: int
+        :param n_hidden: number of hidden units
+
+        :type W: theano.tensor.TensorType
+        :param W: Theano variable pointing to a set of weights that should be
+                  shared between the dA and another architecture; if the dA
+                  should be standalone set this to None
+
+        :type bhid: theano.tensor.TensorType
+        :param bhid: Theano variable pointing to a set of bias values (for
+                     hidden units) that should be shared between the dA and
+                     another architecture; if the dA should be standalone set
+                     this to None
+
+        :type bvis: theano.tensor.TensorType
+        :param bvis: Theano variable pointing to a set of bias values (for
+                     visible units) that should be shared between the dA and
+                     another architecture; if the dA should be standalone set
+                     this to None
+
+        """
+        self.n_visible = n_visible
+        self.n_hidden = n_hidden
+
+        # create a Theano random generator that gives symbolic random values
+        if not theano_rng:
+            theano_rng = RandomStreams(numpy_rng.randint(2**30))
+
+        # note : W' was written as `W_prime` and b' as `b_prime`
+        if not W:
+            # W is initialized with `initial_W`, which is uniformly sampled
+            # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible);
+            # the output of uniform is converted using asarray to dtype
+            # theano.config.floatX so that the code is runnable on GPU
+            initial_W = numpy.asarray( numpy_rng.uniform(
+                  low  = -numpy.sqrt(6./(n_hidden+n_visible)),
+                  high =  numpy.sqrt(6./(n_hidden+n_visible)),
+                  size = (n_visible, n_hidden)), dtype = theano.config.floatX)
+            W = theano.shared(value = initial_W, name = 'W')
+
+        if not bvis:
+            bvis = theano.shared(value = numpy.zeros(n_visible,
+                                         dtype = theano.config.floatX))
+
+        if not bhid:
+            bhid = theano.shared(value = numpy.zeros(n_hidden,
+                                         dtype = theano.config.floatX))
+
+        self.W = W
+        # b corresponds to the bias of the hidden units
+        self.b = bhid
+        # b_prime corresponds to the bias of the visible units
+        self.b_prime = bvis
+        # tied weights, therefore W_prime is W transpose
+        self.W_prime = self.W.T
+        self.theano_rng = theano_rng
+        # if no input is given, generate a variable representing the input
+        if input is None:
+            # we use a matrix because we expect a minibatch of several
+            # examples, each example being a row
+            self.x = T.dmatrix(name = 'input')
+        else:
+            self.x = input
+
+        self.params = [self.W, self.b, self.b_prime]
+
+    def get_corrupted_input(self, input, corruption_level):
+        """ This function keeps ``1-corruption_level`` entries of the input the
+        same and zeroes out a randomly selected subset of size
+        ``corruption_level``.
+        Note : the first argument of theano_rng.binomial is the shape (size) of
+               the random numbers it should produce, the second argument is the
+               number of trials, and the third argument is the probability of
+               success of any trial.
+
+        This produces an array of 0s and 1s where 1 has a probability of
+        1 - ``corruption_level`` and 0 has a probability of ``corruption_level``.
+        """
+        return self.theano_rng.binomial( size = input.shape, n = 1, prob = 1 - corruption_level) * input
+
+    def get_hidden_values(self, input):
+        """ Computes the values of the hidden layer """
+        return
T.nnet.sigmoid(T.dot(input, self.W) + self.b) + + def get_reconstructed_input(self, hidden ): + """ Computes the reconstructed input given the values of the hidden layer """ + return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime) + + def get_cost_updates(self, corruption_level, learning_rate): + """ This function computes the cost and the updates for one trainng + step of the dA """ + + tilde_x = self.get_corrupted_input(self.x, corruption_level) + y = self.get_hidden_values( tilde_x) + z = self.get_reconstructed_input(y) + # note : we sum over the size of a datapoint; if we are using minibatches, + # L will be a vector, with one entry per example in minibatch + L = - T.sum( self.x*T.log(z) + (1-self.x)*T.log(1-z), axis=1 ) + # note : L is now a vector, where each element is the cross-entropy cost + # of the reconstruction of the corresponding example of the + # minibatch. We need to compute the average of all these to get + # the cost of the minibatch + cost = T.mean(L) + + # compute the gradients of the cost of the `dA` with respect + # to its parameters + gparams = T.grad(cost, self.params) + # generate the list of updates + updates = {} + for param, gparam in zip(self.params, gparams): + updates[param] = param - learning_rate*gparam + + return (cost, updates) + + + + +def test_dA( learning_rate = 0.1, training_epochs = 15, dataset ='mnist.pkl.gz' ): + + """ + This demo is tested on MNIST + + :type learning_rate: float + :param learning_rate: learning rate used for training the DeNosing AutoEncoder + + :type training_epochs: int + :param training_epochs: number of epochs used for training + + :type dataset: string + :param dataset: path to the picked dataset + + """ + datasets = load_data(dataset) + train_set_x, train_set_y = datasets[0] + + batch_size = 20 # size of the minibatch + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + + # allocate symbolic variables for the data + index = T.lscalar() # index to a [mini]batch + x = T.matrix('x') # the data is presented as rasterized images + + #################################### + # BUILDING THE MODEL NO CORRUPTION # + #################################### + + rng = numpy.random.RandomState(123) + theano_rng = RandomStreams( rng.randint(2**30)) + + da = dA(numpy_rng = rng, theano_rng = theano_rng, input = x, + n_visible = 28*28, n_hidden = 500) + + cost, updates = da.get_cost_updates(corruption_level = 0., + learning_rate = learning_rate) + + + train_da = theano.function([index], cost, updates = updates, + givens = {x:train_set_x[index*batch_size:(index+1)*batch_size]}) + + start_time = time.clock() + + ############ + # TRAINING # + ############ + + # go through training epochs + for epoch in xrange(training_epochs): + # go through trainng set + c = [] + for batch_index in xrange(n_train_batches): + c.append(train_da(batch_index)) + + print 'Training epoch %d, cost '%epoch, numpy.mean(c) + + end_time = time.clock() + + training_time = (end_time - start_time) + + print ('Training took %f minutes' %(training_time/60.)) + + image = PIL.Image.fromarray(tile_raster_images( X = da.W.value.T, + img_shape = (28,28),tile_shape = (10,10), + tile_spacing=(1,1))) + image.save('filters_corruption_0.png') + + ##################################### + # BUILDING THE MODEL CORRUPTION 30% # + ##################################### + + rng = numpy.random.RandomState(123) + theano_rng = RandomStreams( rng.randint(2**30)) + + da = dA(numpy_rng = rng, theano_rng = 
theano_rng, input = x, + n_visible = 28*28, n_hidden = 500) + + cost, updates = da.get_cost_updates(corruption_level = 0.3, + learning_rate = learning_rate) + + + train_da = theano.function([index], cost, updates = updates, + givens = {x:train_set_x[index*batch_size:(index+1)*batch_size]}) + + start_time = time.clock() + + ############ + # TRAINING # + ############ + + # go through training epochs + for epoch in xrange(training_epochs): + # go through trainng set + c = [] + for batch_index in xrange(n_train_batches): + c.append(train_da(batch_index)) + + print 'Training epoch %d, cost '%epoch, numpy.mean(c) + + end_time = time.clock() + + training_time = (end_time - start_time) + + print ('Training took %f minutes' %(training_time/60.)) + + image = PIL.Image.fromarray(tile_raster_images( X = da.W.value.T, + img_shape = (28,28),tile_shape = (10,10), + tile_spacing=(1,1))) + image.save('filters_corruption_30.png') + + + +if __name__ == '__main__': + test_dA() diff -r e3de934a98b6 -r 4bc5eeec6394 code_tutoriel/deep.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/deep.py Fri Feb 26 13:55:27 2010 -0500 @@ -0,0 +1,880 @@ +""" +Draft of DBN, DAA, SDAA, RBM tutorial code + +""" +import sys +import numpy +import theano +import time +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams +from theano import shared, function + +import gzip +import cPickle +import pylearn.io.image_tiling +import PIL + +# NNET STUFF + +class LogisticRegression(object): + """Multi-class Logistic Regression Class + + The logistic regression is fully described by a weight matrix :math:`W` + and bias vector :math:`b`. Classification is done by projecting data + points onto a set of hyperplanes, the distance to which is used to + determine a class membership probability. + """ + + def __init__(self, input, n_in, n_out): + """ Initialize the parameters of the logistic regression + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + :type n_in: int + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + :type n_out: int + :param n_out: number of output units, the dimension of the space in + which the labels lie + """ + + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) + self.W = theano.shared( value=numpy.zeros((n_in,n_out), + dtype = theano.config.floatX) ) + # initialize the baises b as a vector of n_out 0s + self.b = theano.shared( value=numpy.zeros((n_out,), + dtype = theano.config.floatX) ) + # compute vector of class-membership probabilities in symbolic form + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred=T.argmax(self.p_y_given_x, axis=1) + + # list of parameters for this layer + self.params = [self.W, self.b] + + def negative_log_likelihood(self, y): + """Return the mean of the negative log-likelihood of the prediction + of this model under a given target distribution. 
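+
+        Written out, the value computed below is the mean negative
+        log-likelihood over the minibatch (the symbols follow the code:
+        W and b are the layer parameters):
+
+        .. math::
+
+            \ell(W, b) = -\frac{1}{N} \sum_{i=0}^{N-1}
+                \log P(Y = y^{(i)} \mid x^{(i)}, W, b)
+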
+ :param y: corresponds to a vector that gives for each example the + correct label + Note: we use the mean instead of the sum so that + the learning rate is less dependent on the batch size + """ + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + def errors(self, y): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch ; zero one + loss over the size of the minibatch + """ + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + +class SigmoidalLayer(object): + def __init__(self, rng, input, n_in, n_out): + """ + Typical hidden layer of a MLP: units are fully-connected and have + sigmoidal activation function. Weight matrix W is of shape (n_in,n_out) + and the bias vector b is of shape (n_out,). + + Hidden unit activation is given by: sigmoid(dot(input,W) + b) + + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + :type input: theano.tensor.matrix + :param input: a symbolic tensor of shape (n_examples, n_in) + :type n_in: int + :param n_in: dimensionality of input + :type n_out: int + :param n_out: number of hidden units + """ + self.input = input + + W_values = numpy.asarray( rng.uniform( \ + low = -numpy.sqrt(6./(n_in+n_out)), \ + high = numpy.sqrt(6./(n_in+n_out)), \ + size = (n_in, n_out)), dtype = theano.config.floatX) + self.W = theano.shared(value = W_values) + + b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + self.output = T.nnet.sigmoid(T.dot(input, self.W) + self.b) + self.params = [self.W, self.b] + +# PRETRAINING LAYERS + +class RBM(object): + """ + *** WRITE THE ENERGY FUNCTION USE SAME LETTERS AS VARIABLE NAMES IN CODE + """ + + def __init__(self, input=None, n_visible=None, n_hidden=None, + W=None, hbias=None, vbias=None, + numpy_rng=None, theano_rng=None): + """ + RBM constructor. Defines the parameters of the model along with + basic operations for inferring hidden from visible (and vice-versa), + as well as for performing CD updates. + + :param input: None for standalone RBMs or symbolic variable if RBM is + part of a larger graph. + + :param n_visible: number of visible units (necessary when W or vbias is None) + + :param n_hidden: number of hidden units (necessary when W or hbias is None) + + :param W: weights to use for the RBM. None means that a shared variable will be + created with a randomly chosen matrix of size (n_visible, n_hidden). 
+ + :param hbias: *** + + :param vbias: *** + + :param numpy_rng: random number generator (necessary when W is None) + + """ + + params = [] + if W is None: + # choose initial values for weight matrix of RBM + initial_W = numpy.asarray( + numpy_rng.uniform( \ + low=-numpy.sqrt(6./(n_hidden+n_visible)), \ + high=numpy.sqrt(6./(n_hidden+n_visible)), \ + size=(n_visible, n_hidden)), \ + dtype=theano.config.floatX) + W = theano.shared(value=initial_W, name='W') + params.append(W) + + if hbias is None: + # theano shared variables for hidden biases + hbias = theano.shared(value=numpy.zeros(n_hidden, + dtype=theano.config.floatX), name='hbias') + params.append(hbias) + + if vbias is None: + # theano shared variables for visible biases + vbias = theano.shared(value=numpy.zeros(n_visible, + dtype=theano.config.floatX), name='vbias') + params.append(vbias) + + if input is None: + # initialize input layer for standalone RBM or layer0 of DBN + input = T.matrix('input') + + # setup theano random number generator + if theano_rng is None: + theano_rng = RandomStreams(numpy_rng.randint(2**30)) + + self.visible = self.input = input + self.W = W + self.hbias = hbias + self.vbias = vbias + self.theano_rng = theano_rng + self.params = params + self.hidden_mean = T.nnet.sigmoid(T.dot(input, W)+hbias) + self.hidden_sample = theano_rng.binomial(self.hidden_mean.shape, 1, self.hidden_mean) + + def gibbs_k(self, v_sample, k): + ''' This function implements k steps of Gibbs sampling ''' + + # We compute the visible after k steps of Gibbs by iterating + # over ``gibs_1`` for k times; this can be done in Theano using + # the `scan op`. For a more comprehensive description of scan see + # http://deeplearning.net/software/theano/library/scan.html . + + def gibbs_1(v0_sample, W, hbias, vbias): + ''' This function implements one Gibbs step ''' + + # compute the activation of the hidden units given a sample of the + # vissibles + h0_mean = T.nnet.sigmoid(T.dot(v0_sample, W) + hbias) + # get a sample of the hiddens given their activation + h0_sample = self.theano_rng.binomial(h0_mean.shape, 1, h0_mean) + # compute the activation of the visible given the hidden sample + v1_mean = T.nnet.sigmoid(T.dot(h0_sample, W.T) + vbias) + # get a sample of the visible given their activation + v1_act = self.theano_rng.binomial(v1_mean.shape, 1, v1_mean) + return [v1_mean, v1_act] + + + # DEBUGGING TO DO ALL WITHOUT SCAN + if k == 1: + return gibbs_1(v_sample, self.W, self.hbias, self.vbias) + + + # Because we require as output two values, namely the mean field + # approximation of the visible and the sample obtained after k steps, + # scan needs to know the shape of those two outputs. Scan takes + # this information from the variables containing the initial state + # of the outputs. Since we do not need a initial state of ``v_mean`` + # we provide a dummy one used only to get the correct shape + v_mean = T.zeros_like(v_sample) + + # ``outputs_taps`` is an argument of scan which describes at each + # time step what past values of the outputs the function applied + # recursively needs. 
This is given in the form of a dictionary, + # where the keys are outputs indexes, and values are a list of + # of the offsets used by the corresponding outputs + # In our case the function ``gibbs_1`` applied recursively, requires + # at time k the past value k-1 for the first output (index 0) and + # no past value of the second output + outputs_taps = { 0 : [-1], 1 : [] } + + v_means, v_samples = theano.scan( fn = gibbs_1, + sequences = [], + initial_states = [v_sample, v_mean], + non_sequences = [self.W, self.hbias, self.vbias], + outputs_taps = outputs_taps, + n_steps = k) + return v_means[-1], v_samples[-1] + + def free_energy(self, v_sample): + wx_b = T.dot(v_sample, self.W) + self.hbias + vbias_term = T.sum(T.dot(v_sample, self.vbias)) + hidden_term = T.sum(T.log(1+T.exp(wx_b))) + return -hidden_term - vbias_term + + def cd(self, visible = None, persistent = None, steps = 1): + """ + Return a 5-tuple of values related to contrastive divergence: (cost, + end-state of negative-phase chain, gradient on weights, gradient on + hidden bias, gradient on visible bias) + + If visible is None, it defaults to self.input + If persistent is None, it defaults to self.input + + CD aka CD1 - cd() + CD-10 - cd(steps=10) + PCD - cd(persistent=shared(numpy.asarray(initializer))) + PCD-k - cd(persistent=shared(numpy.asarray(initializer)), + steps=10) + """ + if visible is None: + visible = self.input + + if visible is None: + raise TypeError('visible argument is required when self.input is None') + + if steps is None: + steps = self.gibbs_1 + + if persistent is None: + chain_start = visible + else: + chain_start = persistent + + chain_end_mean, chain_end_sample = self.gibbs_k(chain_start, steps) + + #print >> sys.stderr, "WARNING: DEBUGGING with wrong FREE ENERGY" + #free_energy_delta = - self.free_energy(chain_end_sample) + free_energy_delta = self.free_energy(visible) - self.free_energy(chain_end_sample) + + # we will return all of these regardless of what is in self.params + all_params = [self.W, self.hbias, self.vbias] + + gparams = T.grad(free_energy_delta, all_params, + consider_constant = [chain_end_sample]) + + cross_entropy = T.mean(T.sum( + visible*T.log(chain_end_mean) + (1 - visible)*T.log(1-chain_end_mean), + axis = 1)) + + return (cross_entropy, chain_end_sample,) + tuple(gparams) + + def cd_updates(self, lr, visible = None, persistent = None, steps = 1): + """ + Return the learning updates for the RBM parameters that are shared variables. + + Also returns an update for the persistent if it is a shared variable. + + These updates are returned as a dictionary. + + :param lr: [scalar] learning rate for contrastive divergence learning + :param visible: see `cd_grad` + :param persistent: see `cd_grad` + :param steps: see `cd_grad` + + """ + + cross_entropy, chain_end, gW, ghbias, gvbias = self.cd(visible, + persistent, steps) + + updates = {} + if hasattr(self.W, 'value'): + updates[self.W] = self.W - lr * gW + if hasattr(self.hbias, 'value'): + updates[self.hbias] = self.hbias - lr * ghbias + if hasattr(self.vbias, 'value'): + updates[self.vbias] = self.vbias - lr * gvbias + if persistent: + #if persistent is a shared var, then it means we should use + updates[persistent] = chain_end + + return updates + +# DEEP MODELS + +class DBN(object): + """ + *** WHAT IS A DBN? + """ + + def __init__(self, input_len, hidden_layers_sizes, n_classes, rng): + """ This class is made to support a variable number of layers. 
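+
+        As an illustrative sketch (the sizes simply mirror the call made in
+        ``dbn_main`` below; the seed is arbitrary), the class is used as::
+
+            rng = numpy.random.RandomState(123)
+            dbn = DBN(input_len=28*28,
+                      hidden_layers_sizes=[500, 150, 100],
+                      n_classes=10, rng=rng)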
+ + :param train_set_x: symbolic variable pointing to the training dataset + + :param train_set_y: symbolic variable pointing to the labels of the + training dataset + + :param input_len: dimension of the input to the sdA + + :param n_layers_sizes: intermidiate layers size, must contain + at least one value + + :param n_classes: dimension of the output of the network + + :param corruption_levels: amount of corruption to use for each + layer + + :param rng: numpy random number generator used to draw initial weights + + :param pretrain_lr: learning rate used during pre-trainnig stage + + :param finetune_lr: learning rate used during finetune stage + """ + + self.sigmoid_layers = [] + self.rbm_layers = [] + self.pretrain_functions = [] + self.params = [] + + theano_rng = RandomStreams(rng.randint(2**30)) + + # allocate symbolic variables for the data + index = T.lscalar() # index to a [mini]batch + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + input = self.x + + # The SdA is an MLP, for which all weights of intermidiate layers + # are shared with a different denoising autoencoders + # We will first construct the SdA as a deep multilayer perceptron, + # and when constructing each sigmoidal layer we also construct a + # denoising autoencoder that shares weights with that layer, and + # compile a training function for that denoising autoencoder + + for n_hid in hidden_layers_sizes: + # construct the sigmoidal layer + + sigmoid_layer = SigmoidalLayer(rng, input, input_len, n_hid) + self.sigmoid_layers.append(sigmoid_layer) + + self.rbm_layers.append(RBM(input=input, + W=sigmoid_layer.W, + hbias=sigmoid_layer.b, + n_visible = input_len, + n_hidden = n_hid, + numpy_rng=rng, + theano_rng=theano_rng)) + + # its arguably a philosophical question... + # but we are going to only declare that the parameters of the + # sigmoid_layers are parameters of the StackedDAA + # the hidden-layer biases in the daa_layers are parameters of those + # daa_layers, but not the StackedDAA + self.params.extend(self.sigmoid_layers[-1].params) + + # get ready for the next loop iteration + input_len = n_hid + input = self.sigmoid_layers[-1].output + + # We now need to add a logistic layer on top of the MLP + self.logistic_regressor = LogisticRegression(input = input, + n_in = input_len, n_out = n_classes) + + self.params.extend(self.logistic_regressor.params) + + def pretraining_functions(self, train_set_x, batch_size, learning_rate, k=1): + if k!=1: + raise NotImplementedError() + index = T.lscalar() # index to a [mini]batch + n_train_batches = train_set_x.value.shape[0] / batch_size + batch_begin = (index % n_train_batches) * batch_size + batch_end = batch_begin+batch_size + + print 'TRAIN_SET X', train_set_x.value.shape + rval = [] + for rbm in self.rbm_layers: + # N.B. 
these cd() samples are independent from the + # samples used for learning + outputs = list(rbm.cd())[0:2] + rval.append(function([index], outputs, + updates = rbm.cd_updates(lr=learning_rate), + givens = {self.x: train_set_x[batch_begin:batch_end]})) + if rbm is self.rbm_layers[0]: + f = rval[-1] + AA=len(outputs) + for i, implicit_out in enumerate(f.maker.env.outputs): #[len(outputs):]: + print 'OUTPUT ', i + theano.printing.debugprint(implicit_out, file=sys.stdout) + + return rval + + def finetune(self, datasets, lr, batch_size): + + # unpack the various datasets + (train_set_x, train_set_y) = datasets[0] + (valid_set_x, valid_set_y) = datasets[1] + (test_set_x, test_set_y) = datasets[2] + + # compute number of minibatches for training, validation and testing + assert train_set_x.value.shape[0] % batch_size == 0 + assert valid_set_x.value.shape[0] % batch_size == 0 + assert test_set_x.value.shape[0] % batch_size == 0 + n_train_batches = train_set_x.value.shape[0] / batch_size + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size + + index = T.lscalar() # index to a [mini]batch + target = self.y + + train_index = index % n_train_batches + + classifier = self.logistic_regressor + cost = classifier.negative_log_likelihood(target) + # compute the gradients with respect to the model parameters + gparams = T.grad(cost, self.params) + + # compute list of fine-tuning updates + updates = [(param, param - gparam*finetune_lr) + for param,gparam in zip(self.params, gparams)] + + train_fn = theano.function([index], cost, + updates = updates, + givens = { + self.x : train_set_x[train_index*batch_size:(train_index+1)*batch_size], + target : train_set_y[train_index*batch_size:(train_index+1)*batch_size]}) + + test_score_i = theano.function([index], classifier.errors(target), + givens = { + self.x: test_set_x[index*batch_size:(index+1)*batch_size], + target: test_set_y[index*batch_size:(index+1)*batch_size]}) + + valid_score_i = theano.function([index], classifier.errors(target), + givens = { + self.x: valid_set_x[index*batch_size:(index+1)*batch_size], + target: valid_set_y[index*batch_size:(index+1)*batch_size]}) + + def test_scores(): + return [test_score_i(i) for i in xrange(n_test_batches)] + + def valid_scores(): + return [valid_score_i(i) for i in xrange(n_valid_batches)] + + return train_fn, valid_scores, test_scores + +def load_mnist(filename): + f = gzip.open(filename,'rb') + train_set, valid_set, test_set = cPickle.load(f) + f.close() + + def shared_dataset(data_xy): + data_x, data_y = data_xy + shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX)) + shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX)) + return shared_x, T.cast(shared_y, 'int32') + + n_train_examples = train_set[0].shape[0] + datasets = shared_dataset(train_set), shared_dataset(valid_set), shared_dataset(test_set) + + return n_train_examples, datasets + +def dbn_main(finetune_lr = 0.01, + pretraining_epochs = 10, + pretrain_lr = 0.1, + training_epochs = 1000, + batch_size = 20, + mnist_file='mnist.pkl.gz'): + """ + Demonstrate stochastic gradient descent optimization for a multilayer perceptron + + This is demonstrated on MNIST. 
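+
+    A sketch of a typical invocation (these are simply the default
+    hyper-parameters of this function, repeated for illustration)::
+
+        dbn_main(finetune_lr=0.01, pretraining_epochs=10, pretrain_lr=0.1,
+                 training_epochs=1000, batch_size=20,
+                 mnist_file='mnist.pkl.gz')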
+ + :param learning_rate: learning rate used in the finetune stage + (factor for the stochastic gradient) + + :param pretraining_epochs: number of epoch to do pretraining + + :param pretrain_lr: learning rate to be used during pre-training + + :param n_iter: maximal number of iterations ot run the optimizer + + :param mnist_file: path the the pickled mnist_file + + """ + + n_train_examples, train_valid_test = load_mnist(mnist_file) + + print "Creating a Deep Belief Network" + deep_model = DBN( + input_len=28*28, + hidden_layers_sizes = [500, 150, 100], + n_classes=10, + rng = numpy.random.RandomState()) + + #### + #### Phase 1: Pre-training + #### + print "Pretraining (unsupervised learning) ..." + + pretrain_functions = deep_model.pretraining_functions( + batch_size=batch_size, + train_set_x=train_valid_test[0][0], + learning_rate=pretrain_lr, + ) + + start_time = time.clock() + for layer_idx, pretrain_fn in enumerate(pretrain_functions): + # go through pretraining epochs + print 'Pre-training layer %i'% layer_idx + for i in xrange(pretraining_epochs * n_train_examples / batch_size): + outstuff = pretrain_fn(i) + xe, negsample = outstuff[:2] + print (layer_idx, i, + n_train_examples / batch_size, + float(xe), + 'Wmin', deep_model.rbm_layers[0].W.value.min(), + 'Wmax', deep_model.rbm_layers[0].W.value.max(), + 'vmin', deep_model.rbm_layers[0].vbias.value.min(), + 'vmax', deep_model.rbm_layers[0].vbias.value.max(), + #'x>0.3', (input_i>0.3).sum(), + ) + sys.stdout.flush() + if i % 1000 == 0: + PIL.Image.fromarray( + pylearn.io.image_tiling.tile_raster_images(negsample, (28,28), (10,10), + tile_spacing=(1,1))).save('samples_%i_%i.png'%(layer_idx,i)) + + PIL.Image.fromarray( + pylearn.io.image_tiling.tile_raster_images( + deep_model.rbm_layers[0].W.value.T, + (28,28), (10,10), + tile_spacing=(1,1))).save('filters_%i_%i.png'%(layer_idx,i)) + end_time = time.clock() + print 'Pretraining took %f minutes' %((end_time - start_time)/60.) + + return + + print "Fine tuning (supervised learning) ..." + train_fn, valid_scores, test_scores =\ + deep_model.finetune_functions(train_valid_test[0][0], + learning_rate=finetune_lr, # the learning rate + batch_size = batch_size) # number of examples to use at once + + #### + #### Phase 2: Fine Tuning + #### + + patience = 10000 # look as this many examples regardless + patience_increase = 2. 
# wait this much longer when a new best is
+                                  # found
+    improvement_threshold = 0.995 # a relative improvement of this much is
+                                  # considered significant
+    validation_frequency  = min(n_train_examples, patience/2)
+                                  # go through this many
+                                  # minibatches before checking the network
+                                  # on the validation set; in this case we
+                                  # check every epoch
+
+    patience_max = n_train_examples * training_epochs
+
+    best_epoch             = None
+    best_epoch_test_score  = None
+    best_epoch_valid_score = float('inf')
+    start_time = time.clock()
+
+    for i in xrange(patience_max):
+        if i >= patience:
+            break
+
+        cost_i = train_fn(i)
+
+        if i % validation_frequency == 0:
+            validation_i = numpy.mean([score for score in valid_scores()])
+
+            # if we got the best validation score until now
+            if validation_i < best_epoch_valid_score:
+
+                # improve patience if loss improvement is good enough
+                threshold_i = best_epoch_valid_score * improvement_threshold
+                if validation_i < threshold_i:
+                    patience = max(patience, i * patience_increase)
+
+                # save best validation score and the epoch at which it occurred
+                best_epoch_valid_score = validation_i
+                best_epoch = i / validation_frequency
+                best_epoch_test_score = numpy.mean(
+                        [score for score in test_scores()])
+
+                print('epoch %i, validation error %f %%, test error %f %%' % (
+                    i / validation_frequency, validation_i * 100.,
+                    best_epoch_test_score * 100.))
+            else:
+                print('epoch %i, validation error %f %%' % (
+                    i / validation_frequency, validation_i * 100.))
+    end_time = time.clock()
+
+    print(('Optimization complete with best validation score of %f %%, '
+           'with test performance %f %%') %
+          (best_epoch_valid_score * 100., best_epoch_test_score * 100.))
+    print ('The code ran for %f minutes' % ((end_time - start_time) / 60.))
+
+def rbm_main():
+    rbm = RBM(n_visible=20, n_hidden=30,
+            numpy_rng = numpy.random.RandomState(34))
+
+    cd_updates = rbm.cd_updates(lr=0.25)
+
+    print cd_updates
+
+    f = function([rbm.input], [],
+            updates={rbm.W: cd_updates[rbm.W]})
+
+    theano.printing.debugprint(f.maker.env.outputs[0],
+            file=sys.stdout)
+
+
+if __name__ == '__main__':
+    dbn_main()
+    #rbm_main()
+
+
+if 0:
+    class DAA(object):
+        def __init__(self, n_visible= 784, n_hidden= 500, corruption_level = 0.1,\
+                   input = None, shared_W = None, shared_b = None):
+            """
+            Initialize the dA class by specifying the number of visible units (the
+            dimension d of the input), the number of hidden units (the dimension
+            d' of the latent or hidden space) and the corruption level. The
+            constructor also receives symbolic variables for the input, weights and
+            bias. Such symbolic variables are useful when, for example, the input is
+            the result of some computations, or when weights are shared between the
+            dA and an MLP layer. When dealing with SdAs this always happens:
+            the dA on layer 2 gets as input the output of the dA on layer 1,
+            and the weights of the dA are used in the second stage of training
+            to construct an MLP.
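+
+            As a sketch (the sizes are just this constructor's defaults and
+            ``x`` is an arbitrary symbolic variable), a standalone dA is
+            built as::
+
+                x = T.matrix('x')
+                da = DAA(n_visible=784, n_hidden=500,
+                         corruption_level=0.1, input=x)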
+ + :param n_visible: number of visible units + + :param n_hidden: number of hidden units + + :param input: a symbolic description of the input or None + + :param corruption_level: the corruption mechanism picks up randomly this + fraction of entries of the input and turns them to 0 + + + """ + self.n_visible = n_visible + self.n_hidden = n_hidden + + # create a Theano random generator that gives symbolic random values + theano_rng = RandomStreams() + + if shared_W != None and shared_b != None : + self.W = shared_W + self.b = shared_b + else: + # initial values for weights and biases + # note : W' was written as `W_prime` and b' as `b_prime` + + # W is initialized with `initial_W` which is uniformely sampled + # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + initial_W = numpy.asarray( numpy.random.uniform( \ + low = -numpy.sqrt(6./(n_hidden+n_visible)), \ + high = numpy.sqrt(6./(n_hidden+n_visible)), \ + size = (n_visible, n_hidden)), dtype = theano.config.floatX) + initial_b = numpy.zeros(n_hidden, dtype = theano.config.floatX) + + + # theano shared variables for weights and biases + self.W = theano.shared(value = initial_W, name = "W") + self.b = theano.shared(value = initial_b, name = "b") + + + initial_b_prime= numpy.zeros(n_visible) + # tied weights, therefore W_prime is W transpose + self.W_prime = self.W.T + self.b_prime = theano.shared(value = initial_b_prime, name = "b'") + + # if no input is given, generate a variable representing the input + if input == None : + # we use a matrix because we expect a minibatch of several examples, + # each example being a row + self.x = T.matrix(name = 'input') + else: + self.x = input + # Equation (1) + # keep 90% of the inputs the same and zero-out randomly selected subset of 10% of the inputs + # note : first argument of theano.rng.binomial is the shape(size) of + # random numbers that it should produce + # second argument is the number of trials + # third argument is the probability of success of any trial + # + # this will produce an array of 0s and 1s where 1 has a + # probability of 1 - ``corruption_level`` and 0 with + # ``corruption_level`` + self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level) * self.x + # Equation (2) + # note : y is stored as an attribute of the class so that it can be + # used later when stacking dAs. + self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b) + # Equation (3) + self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime) + # Equation (4) + # note : we sum over the size of a datapoint; if we are using minibatches, + # L will be a vector, with one entry per example in minibatch + self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) + # note : L is now a vector, where each element is the cross-entropy cost + # of the reconstruction of the corresponding example of the + # minibatch. We need to compute the average of all these to get + # the cost of the minibatch + self.cost = T.mean(self.L) + + self.params = [ self.W, self.b, self.b_prime ] + + class StackedDAA(DeepLayerwiseModel): + """Stacked denoising auto-encoder class (SdA) + + A stacked denoising autoencoder model is obtained by stacking several + dAs. The hidden layer of the dA at layer `i` becomes the input of + the dA at layer `i+1`. 
The first layer dA gets as input the input of
+        the SdA, and the hidden layer of the last dA represents the output.
+        Note that after pretraining, the SdA is dealt with as a normal MLP;
+        the dAs are only used to initialize the weights.
+        """
+
+        def __init__(self, n_ins, hidden_layers_sizes, n_outs,
+                     corruption_levels, rng):
+            """ This class is made to support a variable number of layers.
+
+            :param train_set_x: symbolic variable pointing to the training dataset
+
+            :param train_set_y: symbolic variable pointing to the labels of the
+            training dataset
+
+            :param n_ins: dimension of the input to the sdA
+
+            :param n_layers_sizes: intermediate layers size, must contain
+            at least one value
+
+            :param n_outs: dimension of the output of the network
+
+            :param corruption_levels: amount of corruption to use for each
+            layer
+
+            :param rng: numpy random number generator used to draw initial weights
+
+            :param pretrain_lr: learning rate used during pre-training stage
+
+            :param finetune_lr: learning rate used during finetune stage
+            """
+
+            self.sigmoid_layers = []
+            self.daa_layers = []
+            self.pretrain_functions = []
+            self.params = []
+            self.n_layers = len(hidden_layers_sizes)
+
+            if len(hidden_layers_sizes) < 1:
+                raise ValueError('You must have at least one hidden layer')
+
+            theano_rng = RandomStreams(rng.randint(2**30))
+
+            # allocate symbolic variables for the data
+            index   = T.lscalar()    # index to a [mini]batch
+            self.x  = T.matrix('x')  # the data is presented as rasterized images
+            self.y  = T.ivector('y') # the labels are presented as 1D vector of
+                                     # [int] labels
+
+            # The SdA is an MLP, for which all weights of intermediate layers
+            # are shared with a different denoising autoencoder.
+            # We will first construct the SdA as a deep multilayer perceptron,
+            # and when constructing each sigmoidal layer we also construct a
+            # denoising autoencoder that shares weights with that layer, and
+            # compile a training function for that denoising autoencoder
+
+            for i in xrange( self.n_layers ):
+                # construct the sigmoidal layer
+
+                sigmoid_layer = SigmoidalLayer(rng,
+                        self.sigmoid_layers[-1].output if i else self.x,
+                        hidden_layers_sizes[i-1] if i else n_ins,
+                        hidden_layers_sizes[i])
+
+                daa_layer = DAA(corruption_level = corruption_levels[i],
+                              input = sigmoid_layer.input,
+                              shared_W = sigmoid_layer.W,
+                              shared_b = sigmoid_layer.b)
+
+                # add the layers to the corresponding lists
+                self.sigmoid_layers.append(sigmoid_layer)
+                self.daa_layers.append(daa_layer)
+
+                # it's arguably a philosophical question...
+ # but we are going to only declare that the parameters of the + # sigmoid_layers are parameters of the StackedDAA + # the hidden-layer biases in the daa_layers are parameters of those + # daa_layers, but not the StackedDAA + self.params.extend(sigmoid_layer.params) + + # We now need to add a logistic layer on top of the MLP + self.logistic_regressor = LogisticRegression( + input = self.sigmoid_layers[-1].output, + n_in = hidden_layers_sizes[-1], + n_out = n_outs) + + self.params.extend(self.logLayer.params) + + def pretraining_functions(self, train_set_x, batch_size): + + # compiles update functions for each layer, and + # returns them as a list + # + # Construct a function that trains this dA + # compute gradients of layer parameters + gparams = T.grad(dA_layer.cost, dA_layer.params) + # compute the list of updates + updates = {} + for param, gparam in zip(dA_layer.params, gparams): + updates[param] = param - gparam * pretrain_lr + + # create a function that trains the dA + update_fn = theano.function([index], dA_layer.cost, \ + updates = updates, + givens = { + self.x : train_set_x[index*batch_size:(index+1)*batch_size]}) + # collect this function into a list + self.pretrain_functions += [update_fn] + + diff -r e3de934a98b6 -r 4bc5eeec6394 code_tutoriel/logistic_cg.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/logistic_cg.py Fri Feb 26 13:55:27 2010 -0500 @@ -0,0 +1,310 @@ +""" +This tutorial introduces logistic regression using Theano and conjugate +gradient descent. + +Logistic regression is a probabilistic, linear classifier. It is parametrized +by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is +done by projecting data points onto a set of hyperplanes, the distance to +which is used to determine a class membership probability. + +Mathematically, this can be written as: + +.. math:: + P(Y=i|x, W,b) &= softmax_i(W x + b) \\ + &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}} + + +The output of the model or prediction is then done by taking the argmax of +the vector whose i'th element is P(Y=i|x). + +.. math:: + + y_{pred} = argmax_i P(Y=i|x,W,b) + + +This tutorial presents a stochastic gradient descent optimization method +suitable for large datasets, and a conjugate gradient optimization method +that is suitable for smaller datasets. + + +References: + + - textbooks: "Pattern Recognition and Machine Learning" - + Christopher M. Bishop, section 4.3.2 + + +""" +__docformat__ = 'restructedtext en' + + +import numpy, time, cPickle, gzip + +import theano +import theano.tensor as T + + +class LogisticRegression(object): + """Multi-class Logistic Regression Class + + The logistic regression is fully described by a weight matrix :math:`W` + and bias vector :math:`b`. Classification is done by projecting data + points onto a set of hyperplanes, the distance to which is used to + determine a class membership probability. 
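+
+    A minimal usage sketch (mirroring the construction done in
+    ``cg_optimization_mnist`` below; ``x`` and ``y`` are assumed to be the
+    usual symbolic minibatch variables)::
+
+        x = T.matrix()
+        y = T.ivector()
+        classifier = LogisticRegression(input=x, n_in=28*28, n_out=10)
+        cost = classifier.negative_log_likelihood(y).mean()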
+ """ + + + + + def __init__(self, input, n_in, n_out): + """ Initialize the parameters of the logistic regression + + :type input: theano.tensor.TensorType + :param input: symbolic variable that describes the input of the + architecture ( one minibatch) + + :type n_in: int + :param n_in: number of input units, the dimension of the space in + which the datapoint lies + + :type n_out: int + :param n_out: number of output units, the dimension of the space in + which the target lies + + """ + + # initialize theta = (W,b) with 0s; W gets the shape (n_in, n_out), + # while b is a vector of n_out elements, making theta a vector of + # n_in*n_out + n_out elements + self.theta = theano.shared( value = numpy.zeros(n_in*n_out+n_out, dtype = theano.config.floatX) ) + # W is represented by the fisr n_in*n_out elements of theta + self.W = self.theta[0:n_in*n_out].reshape((n_in,n_out)) + # b is the rest (last n_out elements) + self.b = self.theta[n_in*n_out:n_in*n_out+n_out] + + + # compute vector of class-membership probabilities in symbolic form + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred=T.argmax(self.p_y_given_x, axis=1) + + + + + + def negative_log_likelihood(self, y): + """Return the negative log-likelihood of the prediction of this model + under a given target distribution. + + .. math:: + + \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = + \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ + \ell (\theta=\{W,b\}, \mathcal{D}) + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label + """ + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + + + + + def errors(self, y): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example + the correct label + """ + + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + + + + + + + +def cg_optimization_mnist( n_epochs=50, mnist_pkl_gz='mnist.pkl.gz' ): + """Demonstrate conjugate gradient optimization of a log-linear model + + This is demonstrated on MNIST. + + :type n_epochs: int + :param n_epochs: number of epochs to run the optimizer + + :type mnist_pkl_gz: string + :param mnist_pkl_gz: the path of the mnist training file from + http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz + + """ + ############# + # LOAD DATA # + ############# + print '... loading data' + + # Load the dataset + f = gzip.open(mnist_pkl_gz,'rb') + train_set, valid_set, test_set = cPickle.load(f) + f.close() + + def shared_dataset(data_xy): + """ Function that loads the dataset into shared variables + + The reason we store our dataset in shared variables is to allow + Theano to copy it into the GPU memory (when code is run on GPU). 
+ Since copying data into the GPU is slow, copying a minibatch everytime + is needed (the default behaviour if the data is not in a shared + variable) would lead to a large decrease in performance. + """ + data_x, data_y = data_xy + shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX)) + shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX)) + # When storing data on the GPU it has to be stored as floats + # therefore we will store the labels as ``floatX`` as well + # (``shared_y`` does exactly that). But during our computations + # we need them as ints (we use labels as index, and if they are + # floats it doesn't make sense) therefore instead of returning + # ``shared_y`` we will have to cast it to int. This little hack + # lets ous get around this issue + return shared_x, T.cast(shared_y, 'int32') + + + test_set_x, test_set_y = shared_dataset(test_set) + valid_set_x, valid_set_y = shared_dataset(valid_set) + train_set_x, train_set_y = shared_dataset(train_set) + + batch_size = 600 # size of the minibatch + + n_train_batches = train_set_x.value.shape[0] / batch_size + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size + + + ishape = (28,28) # this is the size of MNIST images + n_in = 28*28 # number of input units + n_out = 10 # number of output units + + + ###################### + # BUILD ACTUAL MODEL # + ###################### + print '... building the model' + + # allocate symbolic variables for the data + minibatch_offset = T.lscalar() # offset to the start of a [mini]batch + x = T.matrix() # the data is presented as rasterized images + y = T.ivector() # the labels are presented as 1D vector of + # [int] labels + + + # construct the logistic regression class + classifier = LogisticRegression( input=x, n_in=28*28, n_out=10) + + # the cost we minimize during training is the negative log likelihood of + # the model in symbolic format + cost = classifier.negative_log_likelihood(y).mean() + + # compile a theano function that computes the mistakes that are made by + # the model on a minibatch + test_model = theano.function([minibatch_offset], classifier.errors(y), + givens={ + x:test_set_x[minibatch_offset:minibatch_offset+batch_size], + y:test_set_y[minibatch_offset:minibatch_offset+batch_size]}) + + validate_model = theano.function([minibatch_offset],classifier.errors(y), + givens={ + x:valid_set_x[minibatch_offset:minibatch_offset+batch_size], + y:valid_set_y[minibatch_offset:minibatch_offset+batch_size]}) + + # compile a thenao function that returns the cost of a minibatch + batch_cost = theano.function([minibatch_offset], cost, + givens= { + x : train_set_x[minibatch_offset:minibatch_offset+batch_size], + y : train_set_y[minibatch_offset:minibatch_offset+batch_size]}) + + + + # compile a theano function that returns the gradient of the minibatch + # with respect to theta + batch_grad = theano.function([minibatch_offset], T.grad(cost,classifier.theta), + givens= { + x : train_set_x[minibatch_offset:minibatch_offset+batch_size], + y : train_set_y[minibatch_offset:minibatch_offset+batch_size]}) + + + # creates a function that computes the average cost on the training set + def train_fn(theta_value): + classifier.theta.value = theta_value + train_losses = [batch_cost(i*batch_size) for i in xrange(n_train_batches)] + return numpy.mean(train_losses) + + # creates a function that computes the average gradient of cost with + # respect to theta + def train_fn_grad(theta_value): + 
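+        # Note: this function is the ``fprime`` argument handed to
+        # scipy.optimize.fmin_cg below.  It returns the average of the
+        # per-minibatch gradients, which is exactly the gradient of the
+        # average per-minibatch cost computed by ``train_fn`` above
+        # (the gradient is linear, so the mean of gradients equals the
+        # gradient of the mean).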
classifier.theta.value = theta_value + grad = batch_grad(0) + for i in xrange(1,n_train_batches): + grad += batch_grad(i*batch_size) + return grad/n_train_batches + + + validation_scores = [float('inf'), 0] + + # creates the validation function + def callback(theta_value): + classifier.theta.value = theta_value + #compute the validation loss + validation_losses = [validate_model(i*batch_size) for i in xrange(n_valid_batches)] + this_validation_loss = numpy.mean(validation_losses) + print('validation error %f %%' % (this_validation_loss*100.,)) + + # check if it is better then best validation score got until now + if this_validation_loss < validation_scores[0]: + # if so, replace the old one, and compute the score on the + # testing dataset + validation_scores[0] = this_validation_loss + test_loses = [test_model(i*batch_size) for i in xrange(n_test_batches)] + validation_scores[1] = numpy.mean(test_loses) + + ############### + # TRAIN MODEL # + ############### + + # using scipy conjugate gradient optimizer + import scipy.optimize + print ("Optimizing using scipy.optimize.fmin_cg...") + start_time = time.clock() + best_w_b = scipy.optimize.fmin_cg( + f = train_fn, + x0 = numpy.zeros((n_in+1)*n_out, dtype=x.dtype), + fprime = train_fn_grad, + callback = callback, + disp = 0, + maxiter = n_epochs) + end_time = time.clock() + print(('Optimization complete with best validation score of %f %%, with ' + 'test performance %f %%') % + (validation_scores[0]*100., validation_scores[1]*100.)) + + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + + +if __name__ == '__main__': + cg_optimization_mnist() + diff -r e3de934a98b6 -r 4bc5eeec6394 code_tutoriel/logistic_sgd.py --- a/code_tutoriel/logistic_sgd.py Thu Feb 25 18:53:02 2010 -0500 +++ b/code_tutoriel/logistic_sgd.py Fri Feb 26 13:55:27 2010 -0500 @@ -32,20 +32,14 @@ - textbooks: "Pattern Recognition and Machine Learning" - Christopher M. 
Bishop, section 4.3.2 - """ __docformat__ = 'restructedtext en' - -import numpy, cPickle, gzip - -import time +import numpy, time, cPickle, gzip import theano import theano.tensor as T -import theano.tensor.nnet - class LogisticRegression(object): """Multi-class Logistic Regression Class @@ -62,23 +56,26 @@ def __init__(self, input, n_in, n_out): """ Initialize the parameters of the logistic regression + :type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the - architecture (one minibatch) - + architecture (one minibatch) + + :type n_in: int :param n_in: number of input units, the dimension of the space in - which the datapoints lie + which the datapoints lie + :type n_out: int :param n_out: number of output units, the dimension of the space in - which the labels lie + which the labels lie """ # initialize with 0 the weights W as a matrix of shape (n_in, n_out) - self.W = theano.shared( value=numpy.zeros((n_in,n_out), - dtype = theano.config.floatX) ) + self.W = theano.shared(value=numpy.zeros((n_in,n_out), dtype = theano.config.floatX), + name='W') # initialize the baises b as a vector of n_out 0s - self.b = theano.shared( value=numpy.zeros((n_out,), - dtype = theano.config.floatX) ) + self.b = theano.shared(value=numpy.zeros((n_out,), dtype = theano.config.floatX), + name='b') # compute vector of class-membership probabilities in symbolic form @@ -88,6 +85,9 @@ # symbolic form self.y_pred=T.argmax(self.p_y_given_x, axis=1) + # parameters of the model + self.params = [self.W, self.b] + @@ -102,23 +102,30 @@ \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ \ell (\theta=\{W,b\}, \mathcal{D}) - + :type y: theano.tensor.TensorType :param y: corresponds to a vector that gives for each example the - :correct label + correct label Note: we use the mean instead of the sum so that - the learning rate is less dependent on the batch size + the learning rate is less dependent on the batch size """ + # y.shape[0] is (symbolically) the number of rows in y, i.e., number of examples (call it n) in the minibatch + # T.arange(y.shape[0]) is a symbolic vector which will contain [0,1,2,... n-1] + # T.log(self.p_y_given_x) is a matrix of Log-Probabilities (call it LP) with one row per example and one column per class + # LP[T.arange(y.shape[0]),y] is a vector v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., LP[n-1,y[n-1]]] + # and T.mean(LP[T.arange(y.shape[0]),y]) is the mean (across minibatch examples) of the elements in v, + # i.e., the mean log-likelihood across the minibatch. return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) - - - def errors(self, y): """Return a float representing the number of errors in the minibatch over the total number of examples of the minibatch ; zero one loss over the size of the minibatch + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label """ # check if y has same dimension of y_pred @@ -134,72 +141,103 @@ raise NotImplementedError() +def load_data(dataset): + ''' Loads the dataset + + :type dataset: string + :param dataset: the path to the dataset (here MNIST) + ''' + + ############# + # LOAD DATA # + ############# + print '... 
loading data' + + # Load the dataset + f = gzip.open(dataset,'rb') + train_set, valid_set, test_set = cPickle.load(f) + f.close() + + + def shared_dataset(data_xy): + """ Function that loads the dataset into shared variables + + The reason we store our dataset in shared variables is to allow + Theano to copy it into the GPU memory (when code is run on GPU). + Since copying data into the GPU is slow, copying a minibatch everytime + is needed (the default behaviour if the data is not in a shared + variable) would lead to a large decrease in performance. + """ + data_x, data_y = data_xy + shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX)) + shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX)) + # When storing data on the GPU it has to be stored as floats + # therefore we will store the labels as ``floatX`` as well + # (``shared_y`` does exactly that). But during our computations + # we need them as ints (we use labels as index, and if they are + # floats it doesn't make sense) therefore instead of returning + # ``shared_y`` we will have to cast it to int. This little hack + # lets ous get around this issue + return shared_x, T.cast(shared_y, 'int32') + + test_set_x, test_set_y = shared_dataset(test_set) + valid_set_x, valid_set_y = shared_dataset(valid_set) + train_set_x, train_set_y = shared_dataset(train_set) + + rval = [(train_set_x, train_set_y), (valid_set_x,valid_set_y), (test_set_x, test_set_y)] + return rval -def sgd_optimization_mnist( learning_rate=0.01, n_iter=100): + +def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, dataset='mnist.pkl.gz'): """ Demonstrate stochastic gradient descent optimization of a log-linear model This is demonstrated on MNIST. + :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic - gradient + gradient) - :param n_iter: maximal number of iterations ot run the optimizer + :type n_epochs: int + :param n_epochs: maximal number of epochs to run the optimizer + + :type dataset: string + :param dataset: the path of the MNIST dataset file from + http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz """ + datasets = load_data(dataset) - # Load the dataset - f = gzip.open('mnist.pkl.gz','rb') - train_set, valid_set, test_set = cPickle.load(f) - f.close() - - # make minibatches of size 20 - batch_size = 20 # sized of the minibatch + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x , test_set_y = datasets[2] - # Dealing with the training set - # get the list of training images (x) and their labels (y) - (train_set_x, train_set_y) = train_set - # initialize the list of training minibatches with empty list - train_batches = [] - for i in xrange(0, len(train_set_x), batch_size): - # add to the list of minibatches the minibatch starting at - # position i, ending at position i+batch_size - # a minibatch is a pair ; the first element of the pair is a list - # of datapoints, the second element is the list of corresponding - # labels - train_batches = train_batches + \ - [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])] + batch_size = 600 # size of the minibatch - # Dealing with the validation set - (valid_set_x, valid_set_y) = valid_set - # initialize the list of validation minibatches - valid_batches = [] - for i in xrange(0, len(valid_set_x), batch_size): - valid_batches = valid_batches + \ - [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])] - - # Dealing with the testing set - (test_set_x, 
test_set_y) = test_set - # initialize the list of testing minibatches - test_batches = [] - for i in xrange(0, len(test_set_x), batch_size): - test_batches = test_batches + \ - [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])] + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size - ishape = (28,28) # this is the size of MNIST images + ###################### + # BUILD ACTUAL MODEL # + ###################### + print '... building the model' + # allocate symbolic variables for the data - x = T.fmatrix() # the data is presented as rasterized images - y = T.lvector() # the labels are presented as 1D vector of - # [long int] labels + index = T.lscalar() # index to a [mini]batch + x = T.matrix('x') # the data is presented as rasterized images + y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels # construct the logistic regression class - classifier = LogisticRegression( \ - input=x.reshape((batch_size,28*28)), n_in=28*28, n_out=10) + # Each MNIST image has size 28*28 + classifier = LogisticRegression( input=x, n_in=28*28, n_out=10) # the cost we minimize during training is the negative log likelihood of # the model in symbolic format @@ -207,11 +245,21 @@ # compiling a Theano function that computes the mistakes that are made by # the model on a minibatch - test_model = theano.function([x,y], classifier.errors(y)) + test_model = theano.function(inputs = [index], + outputs = classifier.errors(y), + givens={ + x:test_set_x[index*batch_size:(index+1)*batch_size], + y:test_set_y[index*batch_size:(index+1)*batch_size]}) + + validate_model = theano.function( inputs = [index], + outputs = classifier.errors(y), + givens={ + x:valid_set_x[index*batch_size:(index+1)*batch_size], + y:valid_set_y[index*batch_size:(index+1)*batch_size]}) # compute the gradient of cost with respect to theta = (W,b) - g_W = T.grad(cost, classifier.W) - g_b = T.grad(cost, classifier.b) + g_W = T.grad(cost = cost, wrt = classifier.W) + g_b = T.grad(cost = cost, wrt = classifier.b) # specify how to update the parameters of the model as a dictionary updates ={classifier.W: classifier.W - learning_rate*g_W,\ @@ -220,17 +268,25 @@ # compiling a Theano function `train_model` that returns the cost, but in # the same time updates the parameter of the model based on the rules # defined in `updates` - train_model = theano.function([x, y], cost, updates = updates ) + train_model = theano.function(inputs = [index], + outputs = cost, + updates = updates, + givens={ + x:train_set_x[index*batch_size:(index+1)*batch_size], + y:train_set_y[index*batch_size:(index+1)*batch_size]}) - n_minibatches = len(train_batches) # number of minibatchers - + ############### + # TRAIN MODEL # + ############### + print '... training the model' # early-stopping parameters patience = 5000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant - validation_frequency = n_minibatches # go through this many + validation_frequency = min(n_train_batches, patience/2) + # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch @@ -239,29 +295,24 @@ best_validation_loss = float('inf') test_score = 0. 
start_time = time.clock() - # have a maximum of `n_iter` iterations through the entire dataset - for iter in xrange(n_iter* n_minibatches): - # get epoch and minibatch index - epoch = iter / n_minibatches - minibatch_index = iter % n_minibatches + done_looping = False + epoch = 0 + while (epoch < n_epochs) and (not done_looping): + epoch = epoch + 1 + for minibatch_index in xrange(n_train_batches): - # get the minibatches corresponding to `iter` modulo - # `len(train_batches)` - x,y = train_batches[ minibatch_index ] - cost_ij = train_model(x,y) + minibatch_avg_cost = train_model(minibatch_index) + # iteration number + iter = epoch * n_train_batches + minibatch_index if (iter+1) % validation_frequency == 0: # compute zero-one loss on validation set - this_validation_loss = 0. - for x,y in valid_batches: - # sum up the errors for each minibatch - this_validation_loss += test_model(x,y) - # get the average by dividing with the number of minibatches - this_validation_loss /= len(valid_batches) + validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] + this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % \ - (epoch, minibatch_index+1,n_minibatches, \ + (epoch, minibatch_index+1,n_train_batches, \ this_validation_loss*100.)) @@ -275,15 +326,15 @@ best_validation_loss = this_validation_loss # test it on the test set - test_score = 0. - for x,y in test_batches: - test_score += test_model(x,y) - test_score /= len(test_batches) + test_losses = [test_model(i) for i in xrange(n_test_batches)] + test_score = numpy.mean(test_losses) + print((' epoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % \ - (epoch, minibatch_index+1, n_minibatches,test_score*100.)) + (epoch, minibatch_index+1, n_train_batches,test_score*100.)) if patience <= iter : + done_looping = True break end_time = time.clock() @@ -292,12 +343,6 @@ (best_validation_loss * 100., test_score*100.)) print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) - - - - - - if __name__ == '__main__': sgd_optimization_mnist() diff -r e3de934a98b6 -r 4bc5eeec6394 code_tutoriel/mlp.py --- a/code_tutoriel/mlp.py Thu Feb 25 18:53:02 2010 -0500 +++ b/code_tutoriel/mlp.py Fri Feb 26 13:55:27 2010 -0500 @@ -17,22 +17,65 @@ - textbooks: "Pattern Recognition and Machine Learning" - Christopher M. Bishop, section 5 -TODO: recommended preprocessing, lr ranges, regularization ranges (explain - to do lr first, then add regularization) - """ __docformat__ = 'restructedtext en' -import numpy, cPickle, gzip - +import numpy, time, cPickle, gzip import theano import theano.tensor as T -import time + +from logistic_sgd import LogisticRegression, load_data + + +class HiddenLayer(object): + def __init__(self, rng, input, n_in, n_out, activation = T.tanh): + """ + Typical hidden layer of a MLP: units are fully-connected and have + sigmoidal activation function. Weight matrix W is of shape (n_in,n_out) + and the bias vector b is of shape (n_out,). 
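+
+        As an illustrative sketch (the sizes follow the MLP constructed in
+        ``test_mlp`` below; ``x`` is an arbitrary symbolic matrix)::
+
+            x = T.matrix('x')
+            rng = numpy.random.RandomState(1234)
+            layer = HiddenLayer(rng=rng, input=x, n_in=28*28, n_out=500,
+                                activation=T.tanh)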
+ + NOTE : The nonlinearity used here is tanh + + Hidden unit activation is given by: tanh(dot(input,W) + b) + + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + + :type input: theano.tensor.dmatrix + :param input: a symbolic tensor of shape (n_examples, n_in) + + :type n_in: int + :param n_in: dimensionality of input -import theano.tensor.nnet + :type n_out: int + :param n_out: number of hidden units + + :type activation: theano.Op or function + :param activation: Non linearity to be applied in the hidden + layer + """ + self.input = input + + # `W` is initialized with `W_values` which is uniformely sampled + # from -6./sqrt(n_in+n_hidden) and 6./sqrt(n_in+n_hidden) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + W_values = numpy.asarray( rng.uniform( \ + low = -numpy.sqrt(6./(n_in+n_out)), \ + high = numpy.sqrt(6./(n_in+n_out)), \ + size = (n_in, n_out)), dtype = theano.config.floatX) + self.W = theano.shared(value = W_values) + + b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + self.output = activation(T.dot(input, self.W) + self.b) + # parameters of the model + self.params = [self.W, self.b] + class MLP(object): """Multi-Layer Perceptron Class @@ -40,188 +83,132 @@ A multilayer perceptron is a feedforward artificial neural network model that has one layer or more of hidden units and nonlinear activations. Intermidiate layers usually have as activation function thanh or the - sigmoid function while the top layer is a softamx layer. + sigmoid function (defined here by a ``SigmoidalLayer`` class) while the + top layer is a softamx layer (defined here by a ``LogisticRegression`` + class). """ - def __init__(self, input, n_in, n_hidden, n_out): + def __init__(self, rng, input, n_in, n_hidden, n_out): """Initialize the parameters for the multilayer perceptron + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + + :type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the architecture (one minibatch) + :type n_in: int :param n_in: number of input units, the dimension of the space in which the datapoints lie + :type n_hidden: int :param n_hidden: number of hidden units + :type n_out: int :param n_out: number of output units, the dimension of the space in which the labels lie """ - # initialize the parameters theta = (W1,b1,W2,b2) ; note that this - # example contains only one hidden layer, but one can have as many - # layers as he/she wishes, making the network deeper. 
The only - # problem making the network deep this way is during learning, - # backpropagation being unable to move the network from the starting - # point towards; this is where pre-training helps, giving a good - # starting point for backpropagation, but more about this in the - # other tutorials - - # `W1` is initialized with `W1_values` which is uniformely sampled - # from -6./sqrt(n_in+n_hidden) and 6./sqrt(n_in+n_hidden) - # the output of uniform if converted using asarray to dtype - # theano.config.floatX so that the code is runable on GPU - W1_values = numpy.asarray( numpy.random.uniform( \ - low = -numpy.sqrt(6./(n_in+n_hidden)), \ - high = numpy.sqrt(6./(n_in+n_hidden)), \ - size = (n_in, n_hidden)), dtype = theano.config.floatX) - # `W2` is initialized with `W2_values` which is uniformely sampled - # from -6./sqrt(n_hidden+n_out) and 6./sqrt(n_hidden+n_out) - # the output of uniform if converted using asarray to dtype - # theano.config.floatX so that the code is runable on GPU - W2_values = numpy.asarray( numpy.random.uniform( - low = -numpy.sqrt(6./(n_hidden+n_out)), \ - high= numpy.sqrt(6./(n_hidden+n_out)),\ - size= (n_hidden, n_out)), dtype = theano.config.floatX) + # Since we are dealing with a one hidden layer MLP, this will + # translate into a TanhLayer connected to the LogisticRegression + # layer; this can be replaced by a SigmoidalLayer, or a layer + # implementing any other nonlinearity + self.hiddenLayer = HiddenLayer(rng = rng, input = input, + n_in = n_in, n_out = n_hidden, + activation = T.tanh) - self.W1 = theano.shared( value = W1_values ) - self.b1 = theano.shared( value = numpy.zeros((n_hidden,), - dtype= theano.config.floatX)) - self.W2 = theano.shared( value = W2_values ) - self.b2 = theano.shared( value = numpy.zeros((n_out,), - dtype= theano.config.floatX)) + # The logistic regression layer gets as input the hidden units + # of the hidden layer + self.logRegressionLayer = LogisticRegression( + input = self.hiddenLayer.output, + n_in = n_hidden, + n_out = n_out) - # symbolic expression computing the values of the hidden layer - self.hidden = T.tanh(T.dot(input, self.W1)+ self.b1) - - # symbolic expression computing the values of the top layer - self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden, self.W2)+self.b2) - - # compute prediction as class whose probability is maximal in - # symbolic form - self.y_pred = T.argmax( self.p_y_given_x, axis =1) - # L1 norm ; one regularization option is to enforce L1 norm to # be small - self.L1 = abs(self.W1).sum() + abs(self.W2).sum() + self.L1 = abs(self.hiddenLayer.W).sum() \ + + abs(self.logRegressionLayer.W).sum() # square of L2 norm ; one regularization option is to enforce # square of L2 norm to be small - self.L2_sqr = (self.W1**2).sum() + (self.W2**2).sum() - - - - def negative_log_likelihood(self, y): - """Return the mean of the negative log-likelihood of the prediction - of this model under a given target distribution. - - .. 
math:: + self.L2_sqr = (self.hiddenLayer.W**2).sum() \ + + (self.logRegressionLayer.W**2).sum() - \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = - \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ - \ell (\theta=\{W,b\}, \mathcal{D}) - + # negative log likelihood of the MLP is given by the negative + # log likelihood of the output of the model, computed in the + # logistic regression layer + self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood + # same holds for the function computing the number of errors + self.errors = self.logRegressionLayer.errors - :param y: corresponds to a vector that gives for each example the - :correct label - """ - return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) - + # the parameters of the model are the parameters of the two layer it is + # made out of + self.params = self.hiddenLayer.params + self.logRegressionLayer.params - - def errors(self, y): - """Return a float representing the number of errors in the minibatch - over the total number of examples of the minibatch - """ - - # check if y has same dimension of y_pred - if y.ndim != self.y_pred.ndim: - raise TypeError('y should have the same shape as self.y_pred', - ('y', target.type, 'y_pred', self.y_pred.type)) - # check if y is of the correct datatype - if y.dtype.startswith('int'): - # the T.neq operator returns a vector of 0s and 1s, where 1 - # represents a mistake in prediction - return T.mean(T.neq(self.y_pred, y)) - else: - raise NotImplementedError() - - - -def sgd_optimization_mnist( learning_rate=0.01, L1_reg = 0.00, \ - L2_reg = 0.0001, n_iter=100): +def test_mlp( learning_rate=0.01, L1_reg = 0.00, L2_reg = 0.0001, n_epochs=1000, + dataset = 'mnist.pkl.gz'): """ Demonstrate stochastic gradient descent optimization for a multilayer perceptron This is demonstrated on MNIST. 
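+
+    A sketch of a typical call (these are simply the default arguments of
+    this function)::
+
+        test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001,
+                 n_epochs=1000, dataset='mnist.pkl.gz')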
+ :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient + :type L1_reg: float :param L1_reg: L1-norm's weight when added to the cost (see regularization) + :type L2_reg: float :param L2_reg: L2-norm's weight when added to the cost (see regularization) - :param n_iter: maximal number of iterations ot run the optimizer + :type n_epochs: int + :param n_epochs: maximal number of epochs to run the optimizer + + :type dataset: string + :param dataset: the path of the MNIST dataset file from + http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz + """ - - # Load the dataset - f = gzip.open('mnist.pkl.gz','rb') - train_set, valid_set, test_set = cPickle.load(f) - f.close() - - # make minibatches of size 20 - batch_size = 20 # sized of the minibatch + datasets = load_data(dataset) - # Dealing with the training set - # get the list of training images (x) and their labels (y) - (train_set_x, train_set_y) = train_set - # initialize the list of training minibatches with empty list - train_batches = [] - for i in xrange(0, len(train_set_x), batch_size): - # add to the list of minibatches the minibatch starting at - # position i, ending at position i+batch_size - # a minibatch is a pair ; the first element of the pair is a list - # of datapoints, the second element is the list of corresponding - # labels - train_batches = train_batches + \ - [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])] + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x , test_set_y = datasets[2] - # Dealing with the validation set - (valid_set_x, valid_set_y) = valid_set - # initialize the list of validation minibatches - valid_batches = [] - for i in xrange(0, len(valid_set_x), batch_size): - valid_batches = valid_batches + \ - [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])] - - # Dealing with the testing set - (test_set_x, test_set_y) = test_set - # initialize the list of testing minibatches - test_batches = [] - for i in xrange(0, len(test_set_x), batch_size): - test_batches = test_batches + \ - [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])] - ishape = (28,28) # this is the size of MNIST images + batch_size = 20 # size of the minibatch + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size + + ###################### + # BUILD ACTUAL MODEL # + ###################### + print '... 
building the model' # allocate symbolic variables for the data - x = T.fmatrix() # the data is presented as rasterized images - y = T.lvector() # the labels are presented as 1D vector of - # [long int] labels + index = T.lscalar() # index to a [mini]batch + x = T.matrix('x') # the data is presented as rasterized images + y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels - # construct the logistic regression class - classifier = MLP( input=x.reshape((batch_size,28*28)),\ - n_in=28*28, n_hidden = 500, n_out=10) + rng = numpy.random.RandomState(1234) + + # construct the MLP class + classifier = MLP( rng = rng, input=x, n_in=28*28, n_hidden = 500, n_out=10) # the cost we minimize during training is the negative log likelihood of # the model plus the regularization terms (L1 and L2); cost is expressed @@ -230,36 +217,59 @@ + L1_reg * classifier.L1 \ + L2_reg * classifier.L2_sqr - # compiling a theano function that computes the mistakes that are made by - # the model on a minibatch - test_model = theano.function([x,y], classifier.errors(y)) + # compiling a Theano function that computes the mistakes that are made + # by the model on a minibatch + test_model = theano.function(inputs = [index], + outputs = classifier.errors(y), + givens={ + x:test_set_x[index*batch_size:(index+1)*batch_size], + y:test_set_y[index*batch_size:(index+1)*batch_size]}) - # compute the gradient of cost with respect to theta = (W1, b1, W2, b2) - g_W1 = T.grad(cost, classifier.W1) - g_b1 = T.grad(cost, classifier.b1) - g_W2 = T.grad(cost, classifier.W2) - g_b2 = T.grad(cost, classifier.b2) + validate_model = theano.function(inputs = [index], + outputs = classifier.errors(y), + givens={ + x:valid_set_x[index*batch_size:(index+1)*batch_size], + y:valid_set_y[index*batch_size:(index+1)*batch_size]}) + + # compute the gradient of cost with respect to theta (sotred in params) + # the resulting gradients will be stored in a list gparams + gparams = [] + for param in classifier.params: + gparam = T.grad(cost, param) + gparams.append(gparam) + # specify how to update the parameters of the model as a dictionary - updates = \ - { classifier.W1: classifier.W1 - learning_rate*g_W1 \ - , classifier.b1: classifier.b1 - learning_rate*g_b1 \ - , classifier.W2: classifier.W2 - learning_rate*g_W2 \ - , classifier.b2: classifier.b2 - learning_rate*g_b2 } + updates = {} + # given two list the zip A = [ a1,a2,a3,a4] and B = [b1,b2,b3,b4] of + # same length, zip generates a list C of same size, where each element + # is a pair formed from the two lists : + # C = [ (a1,b1), (a2,b2), (a3,b3) , (a4,b4) ] + for param, gparam in zip(classifier.params, gparams): + updates[param] = param - learning_rate*gparam - # compiling a theano function `train_model` that returns the cost, but in - # the same time updates the parameter of the model based on the rules + # compiling a Theano function `train_model` that returns the cost, but + # in the same time updates the parameter of the model based on the rules # defined in `updates` - train_model = theano.function([x, y], cost, updates = updates ) - n_minibatches = len(train_batches) - + train_model =theano.function( inputs = [index], outputs = cost, + updates = updates, + givens={ + x:train_set_x[index*batch_size:(index+1)*batch_size], + y:train_set_y[index*batch_size:(index+1)*batch_size]}) + + ############### + # TRAIN MODEL # + ############### + print '... 
+    print '... training'
+
     # early-stopping parameters
     patience              = 10000  # look as this many examples regardless
     patience_increase     = 2      # wait this much longer when a new best is
                                    # found
     improvement_threshold = 0.995  # a relative improvement of this much is
                                    # considered significant
-    validation_frequency  = n_minibatches # go through this many
+    validation_frequency  = min(n_train_batches,patience/2)
+                                   # go through this many
                                    # minibatche before checking the network
                                    # on the validation set; in this case we
                                    # check every epoch
@@ -270,56 +280,49 @@
     best_iter = 0
     test_score = 0.
     start_time = time.clock()
-    # have a maximum of `n_iter` iterations through the entire dataset
-    for iter in xrange(n_iter* n_minibatches):
+
+    epoch = 0
+    done_looping = False
-        # get epoch and minibatch index
-        epoch           = iter / n_minibatches
-        minibatch_index = iter % n_minibatches
+    while (epoch < n_epochs) and (not done_looping):
+        epoch = epoch + 1
+        for minibatch_index in xrange(n_train_batches):
-        # get the minibatches corresponding to `iter` modulo
-        # `len(train_batches)`
-        x,y = train_batches[ minibatch_index ]
-        cost_ij = train_model(x,y)
+            minibatch_avg_cost = train_model(minibatch_index)
+            # iteration number
+            iter = epoch * n_train_batches + minibatch_index
            if (iter+1) % validation_frequency == 0:
                # compute zero-one loss on validation set
-               this_validation_loss = 0.
-               for x,y in valid_batches:
-                   # sum up the errors for each minibatch
-                   this_validation_loss += test_model(x,y)
-               # get the average by dividing with the number of minibatches
-               this_validation_loss /= len(valid_batches)
+               validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
+               this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                    (epoch, minibatch_index+1, n_minibatches, \
-                     this_validation_loss*100.))
+                    (epoch, minibatch_index+1, n_train_batches, \
+                     this_validation_loss*100.))
                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
-                   #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                           improvement_threshold :
                        patience = max(patience, iter * patience_increase)
-                   # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
-                   best_iter = iter
-                   # test it on the test set
-                   test_score = 0.
-                   for x,y in test_batches:
-                       test_score += test_model(x,y)
-                   test_score /= len(test_batches)
-                   print(('     epoch %i, minibatch %i/%i, test error of best '
-                          'model %f %%') %
-                             (epoch, minibatch_index+1, n_minibatches,
-                              test_score*100.))
+
+                   test_losses = [test_model(i) for i in xrange(n_test_batches)]
+                   test_score = numpy.mean(test_losses)
+
+                   print(('     epoch %i, minibatch %i/%i, test error of best '
+                          'model %f %%') % \
+                         (epoch, minibatch_index+1, n_train_batches, test_score*100.))
            if patience <= iter :
-               break
+               done_looping = True
+               break
+
    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
@@ -329,5 +332,5 @@
 if __name__ == '__main__':
-    sgd_optimization_mnist()
+    test_mlp()
diff -r e3de934a98b6 -r 4bc5eeec6394 code_tutoriel/rbm.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code_tutoriel/rbm.py	Fri Feb 26 13:55:27 2010 -0500
@@ -0,0 +1,360 @@
+"""This tutorial introduces Restricted Boltzmann Machines (RBM) using Theano.
+
+Boltzmann Machines (BMs) are a particular form of energy-based model that
+contains hidden variables.
+Restricted Boltzmann Machines further restrict BMs to those without
+visible-visible and hidden-hidden connections.
+"""
+
+
+import numpy, time, cPickle, gzip, PIL.Image
+
+import theano
+import theano.tensor as T
+import os
+
+from theano.tensor.shared_randomstreams import RandomStreams
+
+from utils import tile_raster_images
+from logistic_sgd import load_data
+
+class RBM(object):
+    """Restricted Boltzmann Machine (RBM) """
+    def __init__(self, input=None, n_visible=784, n_hidden=500, \
+        W = None, hbias = None, vbias = None, numpy_rng = None,
+        theano_rng = None):
+        """
+        RBM constructor. Defines the parameters of the model along with
+        basic operations for inferring hidden from visible (and vice-versa),
+        as well as for performing CD updates.
+
+        :param input: None for standalone RBMs or symbolic variable if RBM is
+        part of a larger graph.
+
+        :param n_visible: number of visible units
+
+        :param n_hidden: number of hidden units
+
+        :param W: None for standalone RBMs or symbolic variable pointing to a
+        shared weight matrix in case RBM is part of a DBN network; in a DBN,
+        the weights are shared between RBMs and layers of an MLP
+
+        :param hbias: None for standalone RBMs or symbolic variable pointing
+        to a shared hidden units bias vector in case RBM is part of a
+        different network
+
+        :param vbias: None for standalone RBMs or a symbolic variable
+        pointing to a shared visible units bias
+        """
+
+        self.n_visible = n_visible
+        self.n_hidden  = n_hidden
+
+
+        if W is None :
+           # W is initialized with `initial_W` which is uniformly sampled
+           # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible)
+           # the output of uniform is converted using asarray to dtype
+           # theano.config.floatX so that the code is runnable on GPU
+           initial_W = numpy.asarray( numpy.random.uniform(
+                     low = -numpy.sqrt(6./(n_hidden+n_visible)),
+                     high = numpy.sqrt(6./(n_hidden+n_visible)),
+                     size = (n_visible, n_hidden)),
+                     dtype = theano.config.floatX)
+           # theano shared variables for weights and biases
+           W = theano.shared(value = initial_W, name = 'W')
+
+        if hbias is None :
+           # create shared variable for hidden units bias
+           hbias = theano.shared(value = numpy.zeros(n_hidden,
+                               dtype = theano.config.floatX), name='hbias')
+
+        if vbias is None :
+            # create shared variable for visible units bias
+            vbias = theano.shared(value = numpy.zeros(n_visible,
+                                dtype = theano.config.floatX), name='vbias')
+
+        if numpy_rng is None:
+            # create a numpy random number generator
+            numpy_rng = numpy.random.RandomState(1234)
+
+        if theano_rng is None :
+            theano_rng = RandomStreams(numpy_rng.randint(2**30))
+
+
+        # initialize input layer for standalone RBM or layer0 of DBN
+        self.input = input if input else T.dmatrix('input')
+
+        self.W          = W
+        self.hbias      = hbias
+        self.vbias      = vbias
+        self.theano_rng = theano_rng
+        # **** WARNING: It is not a good idea to put things in this list
+        # other than shared variables created in this function.
+        self.params     = [self.W, self.hbias, self.vbias]
+        self.batch_size = self.input.shape[0]
+
+    def free_energy(self, v_sample):
+        ''' Function to compute the free energy '''
+        wx_b = T.dot(v_sample, self.W) + self.hbias
+        vbias_term = T.sum(T.dot(v_sample, self.vbias))
+        hidden_term = T.sum(T.log(1+T.exp(wx_b)))
+        return -hidden_term - vbias_term
+
+    def sample_h_given_v(self, v0_sample):
+        ''' This function infers state of hidden units given visible units '''
+        # compute the activation of the hidden units given a sample of the visibles
+        h1_mean = T.nnet.sigmoid(T.dot(v0_sample, self.W) + self.hbias)
+        # get a sample of the hiddens given their activation
+        h1_sample = self.theano_rng.binomial(size = h1_mean.shape, n = 1, prob = h1_mean)
+        return [h1_mean, h1_sample]
+
+    def sample_v_given_h(self, h0_sample):
+        ''' This function infers state of visible units given hidden units '''
+        # compute the activation of the visible units given the hidden sample
+        v1_mean = T.nnet.sigmoid(T.dot(h0_sample, self.W.T) + self.vbias)
+        # get a sample of the visibles given their activation
+        v1_sample = self.theano_rng.binomial(size = v1_mean.shape, n = 1, prob = v1_mean)
+        return [v1_mean, v1_sample]
+
+    def gibbs_hvh(self, h0_sample):
+        ''' This function implements one step of Gibbs sampling,
+            starting from the hidden state'''
+        v1_mean, v1_sample = self.sample_v_given_h(h0_sample)
+        h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
+        return [v1_mean, v1_sample, h1_mean, h1_sample]
+
+    def gibbs_vhv(self, v0_sample):
+        ''' This function implements one step of Gibbs sampling,
+            starting from the visible state'''
+        h1_mean, h1_sample = self.sample_h_given_v(v0_sample)
+        v1_mean, v1_sample = self.sample_v_given_h(h1_sample)
+        return [h1_mean, h1_sample, v1_mean, v1_sample]
+
+    def cd(self, lr = 0.1, persistent=None):
+        """
+        This function implements one step of CD-1 or PCD-1
+
+        :param lr: learning rate used to train the RBM
+        :param persistent: None for CD. For PCD, shared variable containing old state
+        of Gibbs chain. This must be a shared variable of size (batch size, number of
+        hidden units).
+
+        Returns the cost and the updates dictionary. The dictionary contains the
+        update rules for weights and biases but also an update of the shared
+        variable used to store the persistent chain, if one is used.
+ """ + + # compute positive phase + ph_mean, ph_sample = self.sample_h_given_v(self.input) + + # decide how to initialize persistent chain: + # for CD, we use the newly generate hidden sample + # for PCD, we initialize from the old state of the chain + if persistent is None: + chain_start = ph_sample + else: + chain_start = persistent + + # perform actual negative phase + [nv_mean, nv_sample, nh_mean, nh_sample] = self.gibbs_hvh(chain_start) + + # determine gradients on RBM parameters + g_vbias = T.sum( self.input - nv_mean, axis = 0)/self.batch_size + g_hbias = T.sum( ph_mean - nh_mean, axis = 0)/self.batch_size + g_W = T.dot(ph_mean.T, self.input )/ self.batch_size - \ + T.dot(nh_mean.T, nv_mean )/ self.batch_size + + gparams = [g_W.T, g_hbias, g_vbias] + + # constructs the update dictionary + updates = {} + for gparam, param in zip(gparams, self.params): + updates[param] = param + gparam * lr + + if persistent: + # Note that this works only if persistent is a shared variable + updates[persistent] = T.cast(nh_sample, dtype=theano.config.floatX) + # pseudo-likelihood is a better proxy for PCD + cost = self.get_pseudo_likelihood_cost(updates) + else: + # reconstruction cross-entropy is a better proxy for CD + cost = self.get_reconstruction_cost(updates, nv_mean) + + return cost, updates + + def get_pseudo_likelihood_cost(self, updates): + """Stochastic approximation to the pseudo-likelihood""" + + # index of bit i in expression p(x_i | x_{\i}) + bit_i_idx = theano.shared(value=0, name = 'bit_i_idx') + + # binarize the input image by rounding to nearest integer + xi = T.iround(self.input) + + # calculate free energy for the given bit configuration + fe_xi = self.free_energy(xi) + + # flip bit x_i of matrix xi and preserve all other bits x_{\i} + # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx] + # NB: slice(start,stop,step) is the python object used for + # slicing, e.g. to index matrix x as follows: x[start:stop:step] + xi_flip = T.setsubtensor(xi, 1-xi[:, bit_i_idx], + idx_list=(slice(None,None,None),bit_i_idx)) + + # calculate free energy with bit flipped + fe_xi_flip = self.free_energy(xi_flip) + + # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i}))) + cost = self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi)) + + # increment bit_i_idx % number as part of updates + updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible + + return cost + + def get_reconstruction_cost(self, updates, nv_mean): + """Approximation to the reconstruction error""" + + cross_entropy = T.mean( + T.sum(self.input*T.log(nv_mean) + + (1 - self.input)*T.log(1-nv_mean), axis = 1)) + + return cross_entropy + + + +def test_rbm(learning_rate=0.1, training_epochs = 15, + dataset='mnist.pkl.gz'): + """ + Demonstrate *** + + This is demonstrated on MNIST. 
+
+    :param learning_rate: learning rate used for training the RBM
+
+    :param training_epochs: number of epochs used for training
+
+    :param dataset: path to the pickled dataset
+
+    """
+    datasets = load_data(dataset)
+
+    train_set_x, train_set_y = datasets[0]
+    test_set_x , test_set_y  = datasets[2]
+
+
+    batch_size = 20    # size of the minibatch
+
+    # compute number of minibatches for training, validation and testing
+    n_train_batches = train_set_x.value.shape[0] / batch_size
+
+    # allocate symbolic variables for the data
+    index = T.lscalar()    # index to a [mini]batch
+    x     = T.matrix('x')  # the data is presented as rasterized images
+
+    rng        = numpy.random.RandomState(123)
+    theano_rng = RandomStreams( rng.randint(2**30))
+
+    # initialize storage for the persistent chain (state = hidden layer of chain)
+    persistent_chain = theano.shared(numpy.zeros((batch_size, 500)))
+
+    # construct the RBM class
+    rbm = RBM( input = x, n_visible=28*28, \
+               n_hidden = 500, numpy_rng = rng, theano_rng = theano_rng)
+
+    # get the cost and the gradient corresponding to one step of CD
+    cost, updates = rbm.cd(lr=learning_rate, persistent=persistent_chain)
+
+
+    #################################
+    #     Training the RBM          #
+    #################################
+    dirname = 'lr=%.5f'%learning_rate
+    os.makedirs(dirname)
+    os.chdir(dirname)
+
+    # train_rbm returns the CD cost and, through `updates`, modifies the
+    # RBM parameters
+    train_rbm = theano.function([index], cost,
+           updates = updates,
+           givens = { x: train_set_x[index*batch_size:(index+1)*batch_size]})
+
+    plotting_time = 0.
+    start_time = time.clock()
+
+
+    # go through training epochs
+    for epoch in xrange(training_epochs):
+
+        # go through the training set
+        mean_cost = []
+        for batch_index in xrange(n_train_batches):
+            mean_cost += [train_rbm(batch_index)]
+
+        print 'Training epoch %d, cost is '%epoch, numpy.mean(mean_cost)
+
+        # Plot filters after each training epoch
+        plotting_start = time.clock()
+        # Construct image from the weight matrix
+        image = PIL.Image.fromarray(tile_raster_images( X = rbm.W.value.T,
+                 img_shape = (28,28), tile_shape = (10,10),
+                 tile_spacing = (1,1)))
+        image.save('filters_at_epoch_%i.png'%epoch)
+        plotting_stop = time.clock()
+        plotting_time += (plotting_stop - plotting_start)
+
+    end_time = time.clock()
+
+    pretraining_time = (end_time - start_time) - plotting_time
+
+    print ('Training took %f minutes' %(pretraining_time/60.))
+
+
+    #################################
+    #     Sampling from the RBM     #
+    #################################
+
+    # find out the number of test samples
+    number_of_test_samples = test_set_x.value.shape[0]
+
+    # pick random test examples, with which to initialize the persistent chain
+    test_idx = rng.randint(number_of_test_samples-20)
+    persistent_vis_chain = theano.shared(test_set_x.value[test_idx:test_idx+20])
+
+    # define one step of Gibbs sampling (mf = mean-field)
+    [hid_mf, hid_sample, vis_mf, vis_sample] = rbm.gibbs_vhv(persistent_vis_chain)
+
+    # the visible sample returned by ``gibbs_vhv`` is computed as a binomial
+    # draw, therefore it is formed of ints (0 and 1) and needs to be
+    # converted to the same dtype as ``persistent_vis_chain``
+    vis_sample = T.cast(vis_sample, dtype=theano.config.floatX)
+
+    # construct the function that implements our persistent chain
+    # we generate the "mean field" activations for plotting and the actual samples for
+    # reinitializing the state of our persistent chain
+    sample_fn = theano.function([], [vis_mf, vis_sample],
+                      updates = { persistent_vis_chain:vis_sample})
+
+    # sample the RBM, plotting every `plot_every`-th sample; do this
+    # until you plot at least `n_samples`
+    n_samples = 10
+    plot_every = 1000
+
+    for idx in xrange(n_samples):
+
+        # run `plot_every` intermediate sampling steps that we discard
+        for jdx in xrange(plot_every):
+            vis_mf, vis_sample = sample_fn()
+
+        # construct image
+        image = PIL.Image.fromarray(tile_raster_images(
+                                         X = vis_mf,
+                                         img_shape = (28,28),
+                                         tile_shape = (10,10),
+                                         tile_spacing = (1,1) ) )
+        print ' ... plotting sample ', idx
+        image.save('sample_%i_step_%i.png'%(idx,idx*jdx))
+
+if __name__ == '__main__':
+    test_rbm()
diff -r e3de934a98b6 -r 4bc5eeec6394 code_tutoriel/test.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code_tutoriel/test.py	Fri Feb 26 13:55:27 2010 -0500
@@ -0,0 +1,18 @@
+#import convolutional_mlp, dbn, logistic_cg, logistic_sgd, mlp, rbm, SdA_loops, SdA
+import convolutional_mlp, logistic_cg, logistic_sgd, mlp, SdA
+from nose.plugins.skip import SkipTest
+#TODO: dbn, rbm, SdA, SdA_loops, convolutional_mlp
+def test_logistic_sgd():
+    logistic_sgd.sgd_optimization_mnist(n_epochs=10)
+def test_logistic_cg():
+    logistic_cg.cg_optimization_mnist(n_epochs=10)
+def test_mlp():
+    mlp.test_mlp(n_epochs=5)
+def test_convolutional_mlp():
+    convolutional_mlp.evaluate_lenet5(n_epochs=5,nkerns=[5,5])
+def test_dbn():
+    raise SkipTest('Implementation not finished')
+def test_rbm():
+    raise SkipTest('Implementation not finished')
+def test_SdA():
+    SdA.test_SdA(pretraining_epochs = 2, training_epochs = 3)
diff -r e3de934a98b6 -r 4bc5eeec6394 code_tutoriel/utils.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code_tutoriel/utils.py	Fri Feb 26 13:55:27 2010 -0500
@@ -0,0 +1,125 @@
+""" This file contains different utility functions that are not connected
+in any way to the networks presented in the tutorials, but rather help in
+processing the outputs into a more understandable way.
+
+For example ``tile_raster_images`` helps in generating an easy-to-grasp
+image from a set of samples or weights.
+"""
+
+
+import numpy
+
+
+def scale_to_unit_interval(ndar, eps=1e-8):
+    """ Scales all values in the ndarray ndar to be between 0 and 1 """
+    ndar = ndar.copy()
+    ndar -= ndar.min()
+    ndar *= 1.0 / (ndar.max()+eps)
+    return ndar
+
+
+def tile_raster_images(X, img_shape, tile_shape, tile_spacing = (0,0),
+              scale_rows_to_unit_interval = True, output_pixel_vals = True):
+    """
+    Transform an array with one flattened image per row, into an array in
+    which images are reshaped and laid out like tiles on a floor.
+
+    This function is useful for visualizing datasets whose rows are images,
+    and also columns of matrices for transforming those rows
+    (such as the first layer of a neural net).
+
+    :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
+    be 2-D ndarrays or None
+    :param X: a 2-D array in which every row is a flattened image.
+
+    :type img_shape: tuple; (height, width)
+    :param img_shape: the original shape of each image
+
+    :type tile_shape: tuple; (rows, cols)
+    :param tile_shape: the number of images to tile (rows, cols)
+
+    :param output_pixel_vals: whether the output should be pixel values
+    (i.e. int8 values) or floats
+
+    :param scale_rows_to_unit_interval: whether the values should be scaled
+    to [0,1] before being plotted
+
+
+    :returns: array suitable for viewing as an image.
+    (See: `PIL.Image.fromarray`.)
+    :rtype: a 2-d array with same dtype as X.
+ + """ + + assert len(img_shape) == 2 + assert len(tile_shape) == 2 + assert len(tile_spacing) == 2 + + # The expression below can be re-written in a more C style as + # follows : + # + # out_shape = [0,0] + # out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] - + # tile_spacing[0] + # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] - + # tile_spacing[1] + out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp + in zip(img_shape, tile_shape, tile_spacing)] + + if isinstance(X, tuple): + assert len(X) == 4 + # Create an output numpy ndarray to store the image + if output_pixel_vals: + out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype='uint8') + else: + out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype=X.dtype) + + #colors default to 0, alpha defaults to 1 (opaque) + if output_pixel_vals: + channel_defaults = [0,0,0,255] + else: + channel_defaults = [0.,0.,0.,1.] + + for i in xrange(4): + if X[i] is None: + # if channel is None, fill it with zeros of the correct + # dtype + out_array[:,:,i] = numpy.zeros(out_shape, + dtype='uint8' if output_pixel_vals else out_array.dtype + )+channel_defaults[i] + else: + # use a recurrent call to compute the channel and store it + # in the output + out_array[:,:,i] = tile_raster_images(X[i], img_shape, tile_shape, tile_spacing, scale_rows_to_unit_interval, output_pixel_vals) + return out_array + + else: + # if we are dealing with only one channel + H, W = img_shape + Hs, Ws = tile_spacing + + # generate a matrix to store the output + out_array = numpy.zeros(out_shape, dtype='uint8' if output_pixel_vals else X.dtype) + + + for tile_row in xrange(tile_shape[0]): + for tile_col in xrange(tile_shape[1]): + if tile_row * tile_shape[1] + tile_col < X.shape[0]: + if scale_rows_to_unit_interval: + # if we should scale values to be between 0 and 1 + # do this by calling the `scale_to_unit_interval` + # function + this_img = scale_to_unit_interval(X[tile_row * tile_shape[1] + tile_col].reshape(img_shape)) + else: + this_img = X[tile_row * tile_shape[1] + tile_col].reshape(img_shape) + # add the slice to the corresponding position in the + # output array + out_array[ + tile_row * (H+Hs):tile_row*(H+Hs)+H, + tile_col * (W+Ws):tile_col*(W+Ws)+W + ] \ + = this_img * (255 if output_pixel_vals else 1) + return out_array + + +