changeset 0:fda5f787baa6

commit initial
author Dumitru Erhan <dumitru.erhan@gmail.com>
date Thu, 21 Jan 2010 11:26:43 -0500
parents
children 0fda55a7de99
files code_tutoriel/convolutional_mlp.py code_tutoriel/dae.py code_tutoriel/dbn.py code_tutoriel/logistic_cg.py code_tutoriel/logistic_sgd.py code_tutoriel/mlp.py code_tutoriel/rbm.py
diffstat 7 files changed, 1543 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code_tutoriel/convolutional_mlp.py	Thu Jan 21 11:26:43 2010 -0500
@@ -0,0 +1,230 @@
+
+"""
+This tutorial introduces the LeNet5 neural network architecture using Theano.  LeNet5 is a
+convolutional neural network, good for classifying images. This tutorial shows how to build the
+architecture, and comes with all the hyper-parameters you need to reproduce the paper's MNIST
+results.
+
+The best results are obtained after X iterations of the main program loop, which takes ***
+minutes on my workstation (an Intel Core i7, circa July 2009), and *** minutes on my GPU (an
+NVIDIA GTX 285 graphics processor).
+
+This implementation simplifies the model in the following ways:
+
+ - LeNetConvPool doesn't implement location-specific gain and bias parameters
+
+ - LeNetConvPool implements pooling by max rather than pooling by average.
+
+ - Digit classification is implemented with a logistic regression rather than an RBF network
+
+ - Unlike LeNet5, the convolutions at the second layer are fully connected (LeNet5 used a
+   sparse connection scheme between feature maps)
+
+References:
+
+ - Y. LeCun, L. Bottou, Y. Bengio and P. Haffner: Gradient-Based Learning Applied to Document
+   Recognition, Proceedings of the IEEE, 86(11):2278-2324, November 1998.
+   http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf
+
+
+"""
+import numpy
+from theano.compile.sandbox import shared, pfunc
+from theano import tensor
+from theano.tensor import nnet
+# ops used by the LeNetConvPool layer below
+from theano.tensor.nnet.conv import ConvOp
+from theano.tensor.signal.downsample import DownsampleFactorMax
+import pylearn.datasets.MNIST
+
+
+try:
+    # this tells theano to use the GPU if possible
+    from theano.sandbox.cuda import use
+    use()
+except Exception, e:
+    print('Warning: Attempt to use GPU resulted in error "%s"' % str(e))
+
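+# A small sketch of the shape arithmetic used by the layers in evaluate_lenet5
+# below; the helper name `conv_pool_output_shape` is illustrative only and is
+# not part of Theano or of the classes in this file.  A 'valid' convolution
+# with a (fr, fc) filter maps an (r, c) image to (r-fr+1, c-fc+1), and a
+# (pr, pc) max-pooling then divides each dimension by the pool size.
+def conv_pool_output_shape(img_shape, filter_shape=(5,5), poolsize=(2,2)):
+    rows = (img_shape[0] - filter_shape[0] + 1) // poolsize[0]
+    cols = (img_shape[1] - filter_shape[1] + 1) // poolsize[1]
+    return (rows, cols)
+# e.g. conv_pool_output_shape((28,28)) == (12,12) and
+# conv_pool_output_shape((12,12)) == (4,4), which is why layer1 uses
+# img_shape=(12,12) and layer2 flattens 16*4*4 = 256 inputs.
+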
+class LeNetConvPool(object):
+    """WRITEME 
+
+    Math of what the layer does, and what symbolic variables are created by the class (w, b,
+    output).
+
+    """
+
+    #TODO: implement biases & scales properly. There are supposed to be more parameters.
+    #    - one bias & scale per filter
+    #    - one bias & scale per downsample feature location (a 2d bias)
+    #    - more?
+
+    def __init__(self, rng, input, n_examples, n_imgs, img_shape, n_filters, filter_shape=(5,5),
+            poolsize=(2,2)):
+        """
+        Allocate a LeNetConvPool layer with shared variable internal parameters.
+
+        :param rng: a random number generator used to initialize weights
+        
+        :param input: symbolic images.  Shape: (n_examples, n_imgs, img_shape[0], img_shape[1])
+
+        :param n_examples: input's shape[0] at runtime
+
+        :param n_imgs: input's shape[1] at runtime
+
+        :param img_shape: input's shape[2:4] at runtime
+
+        :param n_filters: the number of filters to apply to the image.
+
+        :param filter_shape: the size of the filters to apply
+        :type filter_shape: pair (rows, cols)
+
+        :param poolsize: the downsampling (pooling) factor
+        :type poolsize: pair (rows, cols)
+        """
+
+        #TODO: make a simpler convolution constructor!!
+        #    - make dx and dy optional
+        #    - why do we have to pass shapes? (Can we make them optional at least?)
+        conv_op = ConvOp((n_imgs,)+img_shape, filter_shape, n_filters, n_examples,
+                dx=1, dy=1, output_mode='valid')
+
+        # - why is poolsize an op parameter here?
+        # - can we just have a maxpool function that creates this Op internally?
+        ds_op = DownsampleFactorMax(poolsize, ignore_border=True)
+
+        # the filter tensor that we will apply is a 4D tensor
+        w_shp = (n_filters, n_imgs) + filter_shape
+
+        # the bias we add is a 1D tensor
+        b_shp = (n_filters,)
+
+        self.w = shared(
+                numpy.asarray(
+                    rng.uniform(
+                        low=-1.0 / numpy.sqrt(filter_shape[0] * filter_shape[1] * n_imgs), 
+                        high=1.0 / numpy.sqrt(filter_shape[0] * filter_shape[1] * n_imgs),
+                        size=w_shp), 
+                    dtype=input.dtype))
+        self.b = shared(
+                numpy.asarray(
+                    numpy.zeros(b_shp),
+                    dtype=input.dtype))
+
+        self.input = input
+        conv_out = conv_op(input, self.w)
+        self.output = tensor.tanh(ds_op(conv_out) + self.b.dimshuffle('x', 0, 'x', 'x'))
+        self.params = [self.w, self.b]
+
+class SigmoidalLayer(object):
+    def __init__(self, rng, input, n_in, n_out):
+        """
+        :param rng: a random number generator used to initialize weights
+        :param input: a symbolic tensor of shape (n_examples, n_in)
+        :param n_in: dimensionality of the input
+        :param n_out: number of hidden units
+        """
+        self.input = input
+        self.w = shared(
+                numpy.asarray(
+                    rng.uniform(low=-2/numpy.sqrt(n_in), high=2/numpy.sqrt(n_in),
+                    size=(n_in, n_out)), dtype=input.dtype))
+        self.b = shared(numpy.asarray(numpy.zeros(n_out), dtype=input.dtype))
+        self.output = tensor.tanh(tensor.dot(input, self.w) + self.b)
+        self.params = [self.w, self.b]
+
+class LogisticRegression(object):
+    """WRITEME"""
+
+    def __init__(self, input, n_in, n_out):
+        self.w = shared(numpy.zeros((n_in, n_out), dtype=input.dtype))
+        self.b = shared(numpy.zeros((n_out,), dtype=input.dtype))
+        self.l1=abs(self.w).sum()
+        self.l2_sqr = (self.w**2).sum()
+        self.output = nnet.softmax(tensor.dot(input, self.w) + self.b)
+        self.argmax = tensor.argmax(self.output, axis=1)
+        self.params = [self.w, self.b]
+
+    def nll(self, target):
+        """Return the negative log-likelihood of the prediction of this model under a given
+        target distribution.  Passing symbolic integers here means 1-hot.
+        WRITEME
+        """
+        return nnet.categorical_crossentropy(self.output, target)
+
+    def errors(self, target):
+        """Return a vector of 0s and 1s, with 1s on every line that was mis-classified.
+        """
+        if target.ndim != self.argmax.ndim:
+            raise TypeError('target should have the same shape as self.argmax', ('target', target.type,
+                'argmax', self.argmax.type))
+        if target.dtype.startswith('int'):
+            return tensor.neq(self.argmax, target)
+        else:
+            raise NotImplementedError()
+
+def evaluate_lenet5(batch_size=30, n_iter=1000):
+    rng = numpy.random.RandomState(23455)
+
+    mnist = pylearn.datasets.MNIST.train_valid_test()
+
+    ishape=(28,28) #this is the size of MNIST images
+
+    # allocate symbolic variables for the data
+    x = tensor.fmatrix()  # the data is presented as rasterized images
+    y = tensor.lvector()  # the labels are presented as 1D vector of [long int] labels
+
+    # construct the first convolutional pooling layer
+    layer0 = LeNetConvPool(rng, input=x.reshape((batch_size,1,28,28)), n_examples=batch_size, 
+            n_imgs=1, img_shape=ishape, 
+            n_filters=6, filter_shape=(5,5), 
+            poolsize=(2,2))
+
+    # construct the second convolutional pooling layer
+    layer1 = LeNetConvPool(rng, input=layer0.output, n_examples=batch_size, 
+            n_imgs=6, img_shape=(12,12),
+            n_filters=16, filter_shape=(5,5),
+            poolsize=(2,2))
+
+    # construct a fully-connected sigmoidal layer
+    layer2 = SigmoidalLayer(rng, input=layer1.output.flatten(2), n_in=16*4*4, n_out=128) # 16 maps of 4x4; 128 hidden units
+
+    # classify the values of the fully-connected sigmoidal layer
+    layer3 = LogisticRegression(input=layer2.output, n_in=128, n_out=10)
+
+    # the cost we minimize during training is the NLL of the model
+    cost = layer3.nll(y).mean()
+
+    # create a function to compute the mistakes that are made by the model
+    test_model = pfunc([x,y], layer3.errors(y))
+
+    # create a list of all model parameters to be fit by gradient descent
+    params = layer3.params+ layer2.params+ layer1.params + layer0.params
+    learning_rate = numpy.asarray(0.01, dtype='float32')
+
+    # train_model is a function that updates the model parameters by SGD
+    train_model = pfunc([x, y], cost, 
+            updates=[(p, p - learning_rate*gp) for p,gp in zip(params, tensor.grad(cost, params))])
+
+    # IS IT MORE SIMPLE TO USE A MINIMIZER OR THE DIRECT CODE?
+
+    best_valid_score = float('inf')
+    for i in xrange(n_iter):
+        for j in xrange(len(mnist.train.x)/batch_size):
+            cost_ij = train_model(
+                    mnist.train.x[j*batch_size:(j+1)*batch_size],
+                    mnist.train.y[j*batch_size:(j+1)*batch_size])
+            #if 0 == j % 100:
+                #print('epoch %i:%i, training error %f' % (i, j*batch_size, cost_ij))
+        valid_score = numpy.mean([test_model(
+                    mnist.valid.x[j*batch_size:(j+1)*batch_size],
+                    mnist.valid.y[j*batch_size:(j+1)*batch_size])
+                for j in xrange(len(mnist.valid.x)/batch_size)])
+        print('epoch %i, validation error %f' % (i, valid_score))
+        if valid_score < best_valid_score:
+            best_valid_score = valid_score
+            test_score = numpy.mean([test_model(
+                        mnist.test.x[j*batch_size:(j+1)*batch_size],
+                        mnist.test.y[j*batch_size:(j+1)*batch_size])
+                    for j in xrange(len(mnist.test.x)/batch_size)])
+            print('epoch %i, test error of best model %f' % (i, test_score))
+
+if __name__ == '__main__':
+    evaluate_lenet5()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code_tutoriel/dae.py	Thu Jan 21 11:26:43 2010 -0500
@@ -0,0 +1,240 @@
+"""
+ This tutorial introduces denoising auto-encoders using Theano. 
+
+ Denoising autoencoders can be used as building blocks for deep networks. 
+ They are based on auto-encoders such as the ones used in Bengio et al. 2007.
+ An autoencoder takes an input x and first maps it to a hidden representation
+ y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting 
+ latent representation y is then mapped back to a "reconstructed" vector 
+ z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b').  The weight 
+ matrix W' can optionally be constrained such that W' = W^T, in which case 
+ the autoencoder is said to have tied weights. The network is trained to 
+ minimize the reconstruction error (the error between x and z).
+
+ For the denoising autoencoder, during training, x is first corrupted into 
+ \tilde{x}, where \tilde{x} is a partially destroyed version of x by means 
+ of a stochastic mapping. Afterwards y is computed as before (using 
+ \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction 
+ error is now measured between z and the uncorrupted input x, which is 
+ computed as the cross-entropy : 
+      - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)]
+ 
+ For X iterations of the main program loop it takes *** minutes on an 
+ Intel Core i7 and *** minutes on GPU (NVIDIA GTX 285 graphics processor).
+
+
+ References :
+   - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and 
+   Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103,
+   2008
+   - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise
+   Training of Deep Networks, Advances in Neural Information Processing 
+   Systems 19, 2007
+
+"""
+
+import numpy 
+from theano import tensor
+from theano.compile.sandbox import shared, pfunc
+from theano.compile.sandbox.shared_randomstreams import RandomStreams
+from theano.tensor import nnet
+import pylearn.datasets.MNIST
+
+
+try:
+    #this tells theano to use the GPU if possible
+    from theano.sandbox.cuda import use
+    use()
+except Exception,e:
+    print ('Warning: Attempt to use GPU resulted in error "%s"'%str(e))
+
+
+def load_mnist_batches(batch_size):
+    """
+    We should remove the dependency on pylearn.datasets.MNIST .. and maybe
+    provide a pickled version of the dataset.. 
+    """
+    mnist = pylearn.datasets.MNIST.train_valid_test()
+    train_batches = [(mnist.train.x[i:i+batch_size],mnist.train.y[i:i+batch_size])
+            for i in xrange(0, len(mnist.train.x), batch_size)]
+    valid_batches = [(mnist.valid.x[i:i+batch_size], mnist.valid.y[i:i+batch_size])
+            for i in xrange(0, len(mnist.valid.x), batch_size)]
+    test_batches = [(mnist.test.x[i:i+batch_size], mnist.test.y[i:i+batch_size])
+            for i in xrange(0, len(mnist.test.x), batch_size)]
+    return train_batches, valid_batches, test_batches
+
+
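+# A minimal NumPy sketch of equations (1)-(4) from the DAE class below, for a
+# batch of examples; the function name `dae_reconstruction_cost` is
+# illustrative only and is not used by the rest of this file.
+def dae_reconstruction_cost(x, W, b, W_prime, b_prime, corruption_level=0.1,
+                            rng=numpy.random):
+    sigmoid = lambda a: 1.0 / (1.0 + numpy.exp(-a))
+    tilde_x = rng.binomial(1, 1.0 - corruption_level, size=x.shape) * x   # (1)
+    y = sigmoid(numpy.dot(tilde_x, W) + b)                                # (2)
+    z = sigmoid(numpy.dot(y, W_prime) + b_prime)                          # (3)
+    return -numpy.sum(x * numpy.log(z) + (1 - x) * numpy.log(1 - z))      # (4)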
+
+
+class DAE():
+  """Denoising Auto-Encoder class 
+
+  A denoising autoencoder tries to reconstruct the input from a corrupted 
+  version of it by first projecting it into a latent space and then 
+  reprojecting it into the input space. Please refer to Vincent et al., 2008 for more 
+  details. If x is the input then equation (1) computes a partially destroyed
+  version of x by means of a stochastic mapping q_D. Equation (2) computes 
+  the projection of the input into the latent space. Equation (3) computes 
+  the reconstruction of the input, while equation (4) computes the 
+  reconstruction error.
+  
+  .. latex-eqn:
+    \tilde{x} ~ q_D(\tilde{x}|x)                                         (1)
+    y = s(W \tilde{x} + b)                                               (2)
+    z = s(W' y  + b')                                                    (3)
+    L(x,z) = -\sum_{k=1}^d [x_k \log z_k + (1-x_k) \log( 1-z_k)]         (4)
+
+  Tips and rules of thumb for DAEs:
+     - the learning rate should be searched on a logarithmic scale ...
+  """
+
+  def __init__(self, n_visible= 784, n_hidden= 500, lr= 1e-1, input= None):
+    """
+    Initialize the DAE class by specifying the number of visible units (the 
+    dimension d of the input ), the number of hidden units ( the dimension 
+    d' of the latent or hidden space ), an initial value for the learning rate
+    and by giving a symbolic description of the input. Such a symbolic 
+    description is of no importance for the simple DAE and therefore can be 
+    ignored. This feature is useful when stacking DAEs, since the input of 
+    intermediate layers can be symbolically described in terms of the hidden
+    units of the previous layer. See the tutorial on SDAE for more details.
+    
+    :param n_visible: number of visible units
+    :param n_hidden:  number of hidden units
+    :param lr:        an initial value for the learning rate
+    :param input:     a symbolic description of the input or None 
+    """
+    self.n_visible = n_visible
+    self.n_hidden  = n_hidden
+    
+    # create a Theano random generator that gives symbolic random values
+    theano_rng = RandomStreams( seed = 1234 )
+    # create a numpy random generator
+    numpy_rng = numpy.random.RandomState( seed = 52432 )
+    
+     
+    # initial values for weights and biases
+    # note : W' was written as W_prime and b' as b_prime
+    initial_W       = numpy_rng.uniform(size = (n_visible, n_hidden))
+    # transform W such that all values are between -.01 and .01
+    initial_W       = (initial_W*2.0       - 1.0)*.01 
+    initial_b       = numpy.zeros(n_hidden)
+    initial_W_prime = numpy_rng.uniform(size = (n_hidden, n_visible))
+    # transform W_prime such that all values are between -.01 and .01
+    initial_W_prime = (initial_W_prime*2.0 - 1.0)*.01 
+    initial_b_prime= numpy.zeros(n_visible)
+     
+    
+    # theano shared variables for weights and biases
+    self.W       = shared(value = initial_W      , name = "W")
+    self.b       = shared(value = initial_b      , name = "b")
+    self.W_prime = shared(value = initial_W_prime, name = "W'") 
+    self.b_prime = shared(value = initial_b_prime, name = "b'")
+
+    # theano shared variable for the learning rate 
+    self.lr      = shared(value = lr             , name = "learning_rate")
+      
+    # if no input is given generate a variable representing the input
+    if input is None: 
+        # we use a matrix because we expect a minibatch of several examples,
+        # each example being a row
+        x = tensor.dmatrix(name = 'input') 
+    else:
+        x = input
+    # Equation (1)
+    # note : the first argument of theano_rng.binomial is the shape (size) of 
+    #        random numbers that it should produce
+    #        second argument is the number of trials 
+    #        third argument is the probability of success of any trial
+    #
+    #        this will produce an array of 0s and 1s where 1 has a 
+    #        probability of 0.9 and 0 has a probability of 0.1
+    tilde_x  = theano_rng.binomial( x.shape,  1,  0.9) * x
+    # Equation (2)
+    # note  : y is stored as an attribute of the class so that it can be 
+    #         used later when stacking DAEs. 
+    self.y   = nnet.sigmoid(tensor.dot(tilde_x, self.W      ) + self.b)
+    # Equation (3)
+    z        = nnet.sigmoid(tensor.dot(self.y,  self.W_prime) + self.b_prime)
+    # Equation (4)
+    L = - tensor.sum( x*tensor.log(z) + (1-x)*tensor.log(1-z), axis=1 ) 
+    # note : L is now a vector, where each element is the cross-entropy cost 
+    #        of the reconstruction of the corresponding example of the 
+    #        minibatch. We need to sum all these to get the cost of the
+    #        minibatch
+    cost = tensor.sum(L)
+    # parameters with respect to whom we need to compute the gradient
+    self.params = [ self.W, self.b, self.W_prime, self.b_prime]
+    # use theano automatic differentiation to get the gradients
+    gW, gb, gW_prime, gb_prime = tensor.grad(cost, self.params)
+    # update the parameters in the direction of the gradient using the 
+    # learning rate
+    updated_W       = self.W       - gW       * self.lr
+    updated_b       = self.b       - gb       * self.lr
+    updated_W_prime = self.W_prime - gW_prime * self.lr
+    updated_b_prime = self.b_prime - gb_prime * self.lr
+
+    # define the function that evaluates the symbolic description of 
+    # one update step 
+    self.update  = pfunc(params = [x], outputs = cost, updates = 
+                                { self.W       : updated_W, 
+                                  self.b       : updated_b,
+                                  self.W_prime : updated_W_prime,
+                                  self.b_prime : updated_b_prime } )
+    self.get_cost = pfunc(params = [x], outputs = cost)
+
+
+def train_DAE_mnist():
+  """
+  Trains a DAE on the MNIST dataset (http://yann.lecun.com/exdb/mnist)
+  """
+
+  # load dataset as batches  
+  train_batches,valid_batches,test_batches=load_mnist_batches(batch_size=16)
+
+  # Create a denoising auto-encoder with 28*28 = 784 input units, and 500
+  # units in the hidden layer (latent layer); the learning rate is set to 1e-2
+  dae = DAE( n_visible = 784,  n_hidden = 500, lr = 1e-2)
+
+  # Number of iterations (epochs) to run
+  n_iter = 30
+  best_valid_score = float('inf')
+  test_score       = float('inf')
+  for i in xrange(n_iter):
+    # train once over the dataset
+    for x,y in train_batches:
+        cost = dae.update(x)
+     
+    # compute validation error
+    valid_cost = 0.
+    for x,y in valid_batches:
+        valid_cost = valid_cost + dae.get_cost(x)
+    valid_cost = valid_cost / len(valid_batches)
+    print('epoch %i, validation reconstruction error %f '%(i,valid_cost))
+
+    if valid_cost < best_valid_score :
+        best_valid_score = valid_cost
+        # compute test error !?
+        test_score = 0.
+        for x,y in test_batches:
+            test_score = test_score + dae.get_cost(x)
+        test_score = test_score / len(test_batches)
+        print('epoch %i, test error of best model %f' % (i, test_score))
+    
+  print('Optimization done. Best validation score %f, test performance %f' %
+            (best_valid_score, test_score))
+
+
+
+if __name__ == "__main__":
+    train_DAE_mnist()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code_tutoriel/dbn.py	Thu Jan 21 11:26:43 2010 -0500
@@ -0,0 +1,24 @@
+import numpy
+import theano
+import theano.tensor as T
+
+import rbm  # rbm.py in this same directory
+
+class DBN():
+
+    def __init__(self, vsize=None, hsizes=[], lr=None, bsize=10, seed=123):
+        assert vsize and hsizes and lr
+
+        input = T.dmatrix('global_input')
+
+        self.layers = []
+        for hsize in hsizes:
+            r = rbm.RBM(input=input, vsize=vsize, hsize=hsize, bsize=bsize,
+                        lr=lr, seed=seed)
+            self.layers.append(r)
+
+            # configure inputs for subsequent layer
+            input = self.layers[-1].hid
+            vsize = hsize
+
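+# A hypothetical usage sketch, assuming the RBM class from rbm.py in this
+# directory (the variable names are illustrative only):
+#
+#     dbn = DBN(vsize=784, hsizes=[500, 200], lr=0.1, bsize=10)
+#     first_rbm, second_rbm = dbn.layers
+#
+# Each RBM's hidden representation becomes the symbolic input of the next one,
+# which is the greedy layer-wise stacking used for pre-training deep networks.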
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code_tutoriel/logistic_cg.py	Thu Jan 21 11:26:43 2010 -0500
@@ -0,0 +1,282 @@
+"""
+This tutorial introduces logistic regression using Theano and conjugate 
+gradient descent.  
+
+Logistic regression is a probabilistic, linear classifier. It is parametrized
+by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
+done by projecting data points onto a set of hyperplanes, the distance to
+which is used to determine a class membership probability. 
+
+Mathematically, this can be written as:
+
+.. math::
+  P(Y=i|x, W,b) &= softmax_i(W x + b) \\
+                &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}
+
+
+The prediction of the model is then obtained by taking the argmax of 
+the vector whose i'th element is P(Y=i|x).
+
+.. math::
+
+  y_{pred} = argmax_i P(Y=i|x,W,b)
+
+
+This tutorial presents a stochastic gradient descent optimization method 
+suitable for large datasets, and a conjugate gradient optimization method 
+that is suitable for smaller datasets.
+
+
+References:
+
+   - textbooks: "Pattern Recognition and Machine Learning" - 
+                 Christopher M. Bishop, section 4.3.2
+
+
+"""
+__docformat__ = 'restructedtext en'
+
+
+import numpy, cPickle, gzip
+
+import time
+
+import theano
+import theano.tensor as T
+import theano.tensor.nnet
+
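+# A minimal NumPy sketch of the prediction rule from the docstring above:
+# the class-membership probabilities are softmax(x.W + b) (the same
+# dot(input, W) convention used by the class below) and the prediction is
+# their argmax.  The helper name `softmax_prediction` is illustrative only
+# and is not used by the rest of this file.
+def softmax_prediction(x, W, b):
+    scores = numpy.dot(x, W) + b
+    scores = scores - scores.max()   # subtract the max for numerical stability
+    p_y_given_x = numpy.exp(scores) / numpy.exp(scores).sum()
+    return p_y_given_x, p_y_given_x.argmax()
+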
+
+class LogisticRegression(object):
+    """Multi-class Logistic Regression Class
+
+    The logistic regression is fully described by a weight matrix :math:`W` 
+    and bias vector :math:`b`. Classification is done by projecting data 
+    points onto a set of hyperplanes, the distance to which is used to 
+    determine a class membership probability. 
+    """
+
+
+
+
+    def __init__(self, input, n_in, n_out):
+        """ Initialize the parameters of the logistic regression
+
+        :param input: symbolic variable that describes the input of the 
+        architecture ( one minibatch)
+
+        :param n_in: number of input units, the dimension of the space in 
+        which the datapoint lies
+
+        :param n_out: number of output units, the dimension of the space in 
+        which the target lies
+
+        """ 
+
+        # initialize theta = (W,b) with 0s; W gets the shape (n_in, n_out), 
+        # while b is a vector of n_out elements, making theta a vector of
+        # n_in*n_out + n_out elements
+        self.theta = theano.shared( value = numpy.zeros(n_in*n_out+n_out) )
+        # W is represented by the first n_in*n_out elements of theta
+        self.W = self.theta[0:n_in*n_out].reshape((n_in,n_out))
+        # b is the rest (last n_out elements)
+        self.b = self.theta[n_in*n_out:n_in*n_out+n_out]
+
+
+        # compute vector of class-membership probabilities in symbolic form
+        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b)
+
+        # compute prediction as class whose probability is maximal in 
+        # symbolic form
+        self.y_pred=T.argmax(self.p_y_given_x, axis=1)
+
+
+
+
+
+    def negative_log_likelihood(self, y):
+        """Return the negative log-likelihood of the prediction of this model
+        under a given target distribution.  
+
+        .. math::
+
+            \ell (\theta=\{W,b\}, \mathcal{D}) = - \frac{1}{|\mathcal{D}|}
+            \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W, b))
+
+
+        :param y: a vector that gives, for each example, the correct label
+        """
+        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
+
+
+
+
+
+    def errors(self, y):
+        """Return a float representing the number of errors in the minibatch 
+        over the total number of examples of the minibatch 
+        """
+
+        # check if y has same dimension of y_pred 
+        if y.ndim != self.y_pred.ndim:
+            raise TypeError('y should have the same shape as self.y_pred', 
+                ('y', y.type, 'y_pred', self.y_pred.type))
+        # check if y is of the correct datatype        
+        if y.dtype.startswith('int'):
+            # the T.neq operator returns a vector of 0s and 1s, where 1
+            # represents a mistake in prediction
+            return T.mean(T.neq(self.y_pred, y))
+        else:
+            raise NotImplementedError()
+
+
+
+
+
+
+
+def cg_optimization_mnist( n_iter=50 ):
+    """Demonstrate conjugate gradient optimization of a log-linear model 
+
+    This is demonstrated on MNIST.
+    
+    :param n_iter: number of iterations to run the optimizer 
+
+    """
+
+    # Load the dataset 
+    f = gzip.open('mnist.pkl.gz','rb')
+    train_set, valid_set, test_set = cPickle.load(f)
+    f.close()
+
+    # make minibatches of size 20 
+    batch_size = 20    # size of the minibatch
+
+    # Dealing with the training set
+    # get the list of training images (x) and their labels (y)
+    (train_set_x, train_set_y) = train_set
+    # initialize the list of training minibatches with empty list
+    train_batches = []
+    for i in xrange(0, len(train_set_x), batch_size):
+        # add to the list of minibatches the minibatch starting at 
+        # position i, ending at position i+batch_size
+        # a minibatch is a pair ; the first element of the pair is a list 
+        # of datapoints, the second element is the list of corresponding 
+        # labels
+        train_batches = train_batches + \
+               [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
+
+    # Dealing with the validation set
+    (valid_set_x, valid_set_y) = valid_set
+    # initialize the list of validation minibatches 
+    valid_batches = []
+    for i in xrange(0, len(valid_set_x), batch_size):
+        valid_batches = valid_batches + \
+               [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
+
+    # Dealing with the testing set
+    (test_set_x, test_set_y) = test_set
+    # initialize the list of testing minibatches 
+    test_batches = []
+    for i in xrange(0, len(test_set_x), batch_size):
+        test_batches = test_batches + \
+              [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
+
+
+    ishape     = (28,28) # this is the size of MNIST images
+    n_in       = 28*28   # number of input units
+    n_out      = 10      # number of output units
+    # allocate symbolic variables for the data
+    x = T.fmatrix()  # the data is presented as rasterized images
+    y = T.lvector()  # the labels are presented as 1D vector of 
+                          # [long int] labels
+
+ 
+    # construct the logistic regression class
+    classifier = LogisticRegression( \
+                   input=x.reshape((batch_size,28*28)), n_in=28*28, n_out=10)
+
+    # the cost we minimize during training is the negative log likelihood of 
+    # the model in symbolic format
+    cost = classifier.negative_log_likelihood(y).mean() 
+
+    # compile a theano function that computes the mistakes that are made by 
+    # the model on a minibatch
+    test_model = theano.function([x,y], classifier.errors(y))
+    # compile a theano function that returns the gradient of the minibatch 
+    # with respect to theta
+    batch_grad = theano.function([x, y], T.grad(cost, classifier.theta))
+    #  compile a theano function that returns the cost of a minibatch
+    batch_cost = theano.function([x, y], cost)
+
+    # creates a function that computes the average cost on the training set
+    def train_fn(theta_value):
+        classifier.theta.value = theta_value
+        cost = 0.
+        for x,y in train_batches :
+            cost += batch_cost(x,y)
+        return cost / len(train_batches)
+
+    # creates a function that computes the average gradient of cost with 
+    # respect to theta
+    def train_fn_grad(theta_value):
+        classifier.theta.value = theta_value
+        grad = numpy.zeros(n_in * n_out + n_out)
+        for x,y in train_batches:
+            grad += batch_grad(x,y)
+        return grad/ len(train_batches)
+
+
+
+    validation_scores = [float('inf'), 0]
+ 
+    # creates the validation function
+    def callback(theta_value):
+        classifier.theta.value = theta_value
+        #compute the validation loss
+        this_validation_loss = 0.
+        for x,y in valid_batches:
+            this_validation_loss += test_model(x,y)
+
+        this_validation_loss /= len(valid_batches)
+
+        print('validation error %f %%' % (this_validation_loss*100.,))
+        
+        # check if it is better than the best validation score obtained so far
+        if this_validation_loss < validation_scores[0]:
+            # if so, replace the old one, and compute the score on the 
+            # testing dataset
+            validation_scores[0] = this_validation_loss
+            test_score = 0.
+            for x,y in test_batches:
+                test_score += test_model(x,y)
+            validation_scores[1] = test_score / len(test_batches)
+
+    # using scipy conjugate gradient optimizer 
+    import scipy.optimize
+    print ("Optimizing using scipy.optimize.fmin_cg...")
+    start_time = time.clock()
+    best_w_b = scipy.optimize.fmin_cg(
+            f=train_fn, 
+            x0=numpy.zeros((n_in+1)*n_out, dtype=x.dtype),
+            fprime=train_fn_grad,
+            callback=callback,
+            disp=0,
+            maxiter=n_iter)
+    end_time = time.clock()
+    print(('Optimization complete with best validation score of %f %%, with '
+          'test performance %f %%') % 
+               (validation_scores[0]*100., validation_scores[1]*100.))
+
+    print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
+
+
+
+
+
+
+
+if __name__ == '__main__':
+    cg_optimization_mnist()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code_tutoriel/logistic_sgd.py	Thu Jan 21 11:26:43 2010 -0500
@@ -0,0 +1,303 @@
+"""
+This tutorial introduces logistic regression using Theano and stochastic 
+gradient descent.  
+
+Logistic regression is a probabilistic, linear classifier. It is parametrized
+by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
+done by projecting data points onto a set of hyperplanes, the distance to
+which is used to determine a class membership probability. 
+
+Mathematically, this can be written as:
+
+.. math::
+  P(Y=i|x, W,b) &= softmax_i(W x + b) \\
+                &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}
+
+
+The prediction of the model is then obtained by taking the argmax of 
+the vector whose i'th element is P(Y=i|x).
+
+.. math::
+
+  y_{pred} = argmax_i P(Y=i|x,W,b)
+
+
+This tutorial presents a stochastic gradient descent optimization method 
+suitable for large datasets, and a conjugate gradient optimization method 
+that is suitable for smaller datasets.
+
+
+References:
+
+    - textbooks: "Pattern Recognition and Machine Learning" - 
+                 Christopher M. Bishop, section 4.3.2
+
+
+"""
+__docformat__ = 'restructedtext en'
+
+
+import numpy, cPickle, gzip
+
+import time
+
+import theano
+import theano.tensor as T
+
+import theano.tensor.nnet
+
+
+class LogisticRegression(object):
+    """Multi-class Logistic Regression Class
+
+    The logistic regression is fully described by a weight matrix :math:`W` 
+    and bias vector :math:`b`. Classification is done by projecting data 
+    points onto a set of hyperplanes, the distance to which is used to 
+    determine a class membership probability. 
+    """
+
+
+
+
+    def __init__(self, input, n_in, n_out):
+        """ Initialize the parameters of the logistic regression
+
+        :param input: symbolic variable that describes the input of the 
+        architecture (one minibatch)
+
+        :param n_in: number of input units, the dimension of the space in 
+        which the datapoints lie
+
+        :param n_out: number of output units, the dimension of the space in 
+        which the labels lie
+
+        """ 
+
+        # initialize with 0 the weights W as a matrix of shape (n_in, n_out) 
+        self.W = theano.shared( value=numpy.zeros((n_in,n_out),
+                                            dtype = theano.config.floatX) )
+        # initialize the biases b as a vector of n_out 0s
+        self.b = theano.shared( value=numpy.zeros((n_out,), 
+                                            dtype = theano.config.floatX) )
+
+
+        # compute vector of class-membership probabilities in symbolic form
+        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b)
+
+        # compute prediction as class whose probability is maximal in 
+        # symbolic form
+        self.y_pred=T.argmax(self.p_y_given_x, axis=1)
+
+
+
+
+
+    def negative_log_likelihood(self, y):
+        """Return the mean of the negative log-likelihood of the prediction
+        of this model under a given target distribution.
+
+        .. math::
+
+            \ell (\theta=\{W,b\}, \mathcal{D}) = - \frac{1}{|\mathcal{D}|}
+            \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W, b))
+
+
+        :param y: a vector that gives, for each example, the correct label
+
+        Note: we use the mean instead of the sum so that
+        the learning rate is less dependent on the batch size
+        """
+        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
+
+
+
+
+
+    def errors(self, y):
+        """Return a float representing the number of errors in the minibatch 
+        over the total number of examples of the minibatch ; zero one
+        loss over the size of the minibatch
+        """
+
+        # check if y has same dimension of y_pred 
+        if y.ndim != self.y_pred.ndim:
+            raise TypeError('y should have the same shape as self.y_pred', 
+                ('y', y.type, 'y_pred', self.y_pred.type))
+        # check if y is of the correct datatype        
+        if y.dtype.startswith('int'):
+            # the T.neq operator returns a vector of 0s and 1s, where 1
+            # represents a mistake in prediction
+            return T.mean(T.neq(self.y_pred, y))
+        else:
+            raise NotImplementedError()
+
+
+
+
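+# A small NumPy sketch of the indexing trick used in negative_log_likelihood
+# above: log_p[arange(n), y] selects, for each row i, the log-probability of
+# the correct class y[i], and the cost is minus their mean.  The helper name
+# `mean_negative_log_likelihood` is illustrative only and is not used
+# elsewhere in this file.
+def mean_negative_log_likelihood(p_y_given_x, y):
+    n = y.shape[0]
+    return -numpy.mean(numpy.log(p_y_given_x)[numpy.arange(n), y])
+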
+
+def sgd_optimization_mnist( learning_rate=0.01, n_iter=100):
+    """
+    Demonstrate stochastic gradient descent optimization of a log-linear 
+    model
+
+    This is demonstrated on MNIST.
+    
+    :param learning_rate: learning rate used (factor for the stochastic 
+    gradient)
+
+    :param n_iter: number of iterations to run the optimizer 
+
+    """
+
+    # Load the dataset 
+    f = gzip.open('mnist.pkl.gz','rb')
+    train_set, valid_set, test_set = cPickle.load(f)
+    f.close()
+
+    # make minibatches of size 20 
+    batch_size = 20    # size of the minibatch
+
+    # Dealing with the training set
+    # get the list of training images (x) and their labels (y)
+    (train_set_x, train_set_y) = train_set
+    # initialize the list of training minibatches with empty list
+    train_batches = []
+    for i in xrange(0, len(train_set_x), batch_size):
+        # add to the list of minibatches the minibatch starting at 
+        # position i, ending at position i+batch_size
+        # a minibatch is a pair ; the first element of the pair is a list 
+        # of datapoints, the second element is the list of corresponding 
+        # labels
+        train_batches = train_batches + \
+               [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
+
+    # Dealing with the validation set
+    (valid_set_x, valid_set_y) = valid_set
+    # initialize the list of validation minibatches 
+    valid_batches = []
+    for i in xrange(0, len(valid_set_x), batch_size):
+        valid_batches = valid_batches + \
+               [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
+
+    # Dealing with the testing set
+    (test_set_x, test_set_y) = test_set
+    # initialize the list of testing minibatches 
+    test_batches = []
+    for i in xrange(0, len(test_set_x), batch_size):
+        test_batches = test_batches + \
+              [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
+
+
+    ishape     = (28,28) # this is the size of MNIST images
+
+    # allocate symbolic variables for the data
+    x = T.fmatrix()  # the data is presented as rasterized images
+    y = T.lvector()  # the labels are presented as 1D vector of 
+                     # [long int] labels
+
+    # construct the logistic regression class
+    classifier = LogisticRegression( \
+                   input=x.reshape((batch_size,28*28)), n_in=28*28, n_out=10)
+
+    # the cost we minimize during training is the negative log likelihood of 
+    # the model in symbolic format
+    cost = classifier.negative_log_likelihood(y) 
+
+    # compiling a Theano function that computes the mistakes that are made by 
+    # the model on a minibatch
+    test_model = theano.function([x,y], classifier.errors(y))
+
+    # compute the gradient of cost with respect to theta = (W,b) 
+    g_W = T.grad(cost, classifier.W)
+    g_b = T.grad(cost, classifier.b)
+
+    # specify how to update the parameters of the model as a dictionary
+    updates ={classifier.W: classifier.W - learning_rate*g_W,\
+              classifier.b: classifier.b - learning_rate*g_b}
+
+    # compiling a Theano function `train_model` that returns the cost, but in 
+    # the same time updates the parameter of the model based on the rules 
+    # defined in `updates`
+    train_model = theano.function([x, y], cost, updates = updates )
+
+    n_minibatches        = len(train_batches) # number of minibatches
+ 
+    # early-stopping parameters
+    patience              = 5000  # look at this many examples regardless
+    patience_increase     = 2     # wait this much longer when a new best is 
+                                  # found
+    improvement_threshold = 0.995 # a relative improvement of this much is 
+                                  # considered significant
+    validation_frequency  = n_minibatches  # go through this many 
+                                  # minibatches before checking the network 
+                                  # on the validation set; in this case we 
+                                  # check every epoch 
+
+    best_params          = None
+    best_validation_loss = float('inf')
+    test_score           = 0.
+    start_time = time.clock()
+    # have a maximum of `n_iter` iterations through the entire dataset
+    for iter in xrange(n_iter* n_minibatches):
+
+        # get epoch and minibatch index
+        epoch           = iter / n_minibatches
+        minibatch_index =  iter % n_minibatches
+
+        # get the minibatches corresponding to `iter` modulo
+        # `len(train_batches)`
+        x,y = train_batches[ minibatch_index ]
+        cost_ij = train_model(x,y)
+
+        if (iter+1) % validation_frequency == 0: 
+            # compute zero-one loss on validation set 
+            this_validation_loss = 0.
+            for x,y in valid_batches:
+                # sum up the errors for each minibatch
+                this_validation_loss += test_model(x,y)
+            # get the average by dividing with the number of minibatches
+            this_validation_loss /= len(valid_batches)
+
+            print('epoch %i, minibatch %i/%i, validation error %f %%' % \
+                 (epoch, minibatch_index+1,n_minibatches, \
+                  this_validation_loss*100.))
+
+
+            # if we got the best validation score until now
+            if this_validation_loss < best_validation_loss:
+                #improve patience if loss improvement is good enough
+                if this_validation_loss < best_validation_loss *  \
+                       improvement_threshold :
+                    patience = max(patience, iter * patience_increase)
+
+                best_validation_loss = this_validation_loss
+                # test it on the test set
+
+                test_score = 0.
+                for x,y in test_batches:
+                    test_score += test_model(x,y)
+                test_score /= len(test_batches)
+                print(('     epoch %i, minibatch %i/%i, test error of best ' 
+                       'model %f %%') % \
+                  (epoch, minibatch_index+1, n_minibatches,test_score*100.))
+
+        if patience <= iter :
+                break
+
+    end_time = time.clock()
+    print(('Optimization complete with best validation score of %f %%, '
+           'with test performance %f %%') %  
+                 (best_validation_loss * 100., test_score*100.))
+    print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
+
+
+
+
+
+
+
+if __name__ == '__main__':
+    sgd_optimization_mnist()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code_tutoriel/mlp.py	Thu Jan 21 11:26:43 2010 -0500
@@ -0,0 +1,331 @@
+"""
+This tutorial introduces the multilayer perceptron using Theano.  
+
+ A multilayer perceptron is a logistic regressor where,
+instead of feeding the input to the logistic regression, you insert an
+intermediate layer, called the hidden layer, that has a nonlinear 
+activation function (usually tanh or sigmoid). One can use many such 
+hidden layers, making the architecture deep. The tutorial will also tackle 
+the problem of MNIST digit classification.
+
+.. math::
+
+    f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))),
+
+References:
+
+    - textbooks: "Pattern Recognition and Machine Learning" - 
+                 Christopher M. Bishop, section 5
+
+TODO: recommended preprocessing, lr ranges, regularization ranges (explain 
+      to do lr first, then add regularization)
+
+"""
+__docformat__ = 'restructedtext en'
+
+
+import numpy, cPickle, gzip
+
+
+import theano
+import theano.tensor as T
+
+import time 
+
+import theano.tensor.nnet
+
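+# A minimal NumPy sketch of the forward pass f(x) = G(b2 + W2 s(b1 + W1 x))
+# from the docstring above, with s = tanh and G = softmax, written with the
+# same dot(input, W) convention as the MLP class below.  The helper name
+# `mlp_forward` is illustrative only and is not used by the rest of this file.
+def mlp_forward(x, W1, b1, W2, b2):
+    hidden = numpy.tanh(numpy.dot(x, W1) + b1)          # hidden layer, s = tanh
+    scores = numpy.dot(hidden, W2) + b2
+    scores = scores - scores.max(axis=1)[:, None]       # numerical stability
+    e = numpy.exp(scores)
+    return e / e.sum(axis=1)[:, None]                   # rows are P(Y=i|x)
+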
+class MLP(object):
+    """Multi-Layer Perceptron Class
+
+    A multilayer perceptron is a feedforward artificial neural network model 
+    that has one or more layers of hidden units and nonlinear activations. 
+    Intermediate layers usually have tanh or the sigmoid function as their 
+    activation function, while the top layer is a softmax layer. 
+    """
+
+
+
+    def __init__(self, input, n_in, n_hidden, n_out):
+        """Initialize the parameters for the multilayer perceptron
+
+        :param input: symbolic variable that describes the input of the 
+        architecture (one minibatch)
+
+        :param n_in: number of input units, the dimension of the space in 
+        which the datapoints lie
+
+        :param n_hidden: number of hidden units 
+
+        :param n_out: number of output units, the dimension of the space in 
+        which the labels lie
+
+        """
+
+        # initialize the parameters theta = (W1,b1,W2,b2) ; note that this 
+        # example contains only one hidden layer, but one can have as many 
+        # layers as desired, making the network deeper. The only problem with 
+        # making the network deep this way is that learning becomes harder: 
+        # backpropagation alone may be unable to move the network away from 
+        # its starting point; this is where pre-training helps, giving a good 
+        # starting point for backpropagation, but more about this in the 
+        # other tutorials
+        
+        # `W1` is initialized with `W1_values` which is uniformly sampled
+        # between -sqrt(6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden));
+        # the output of uniform is converted using asarray to dtype 
+        # theano.config.floatX so that the code is runnable on GPU
+        W1_values = numpy.asarray( numpy.random.uniform( \
+              low = -numpy.sqrt(6./(n_in+n_hidden)), high = numpy.sqrt(6./(n_in+n_hidden)), \
+              size = (n_in, n_hidden)), dtype = theano.config.floatX)
+        # `W2` is initialized with `W2_values` which is uniformly sampled 
+        # between -sqrt(6./(n_hidden+n_out)) and sqrt(6./(n_hidden+n_out));
+        # the output of uniform is converted using asarray to dtype 
+        # theano.config.floatX so that the code is runnable on GPU
+        W2_values = numpy.asarray( numpy.random.uniform( 
+              low = -numpy.sqrt(6./(n_hidden+n_out)), high= numpy.sqrt(6./(n_hidden+n_out)),\
+              size= (n_hidden, n_out)), dtype = theano.config.floatX)
+
+        self.W1 = theano.shared( value = W1_values )
+        self.b1 = theano.shared( value = numpy.zeros((n_hidden,), 
+                                                dtype= theano.config.floatX))
+        self.W2 = theano.shared( value = W2_values )
+        self.b2 = theano.shared( value = numpy.zeros((n_out,), 
+                                                dtype= theano.config.floatX))
+
+        # symbolic expression computing the values of the hidden layer
+        self.hidden = T.tanh(T.dot(input, self.W1)+ self.b1)
+
+        # symbolic expression computing the values of the top layer 
+        self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden, self.W2)+self.b2)
+
+        # compute prediction as class whose probability is maximal in 
+        # symbolic form
+        self.y_pred = T.argmax( self.p_y_given_x, axis =1)
+        
+        # L1 norm ; one regularization option is to enforce L1 norm to 
+        # be small 
+        self.L1     = abs(self.W1).sum() + abs(self.W2).sum()
+
+        # square of L2 norm ; one regularization option is to enforce 
+        # square of L2 norm to be small
+        self.L2_sqr = (self.W1**2).sum() + (self.W2**2).sum()
+
+
+
+    def negative_log_likelihood(self, y):
+        """Return the mean of the negative log-likelihood of the prediction
+        of this model under a given target distribution.
+
+        .. math::
+
+            \ell (\theta=\{W,b\}, \mathcal{D}) = - \frac{1}{|\mathcal{D}|}
+            \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W, b))
+
+
+        :param y: a vector that gives, for each example, the correct label
+        """
+        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
+
+
+
+
+    def errors(self, y):
+        """Return a float representing the number of errors in the minibatch 
+        over the total number of examples of the minibatch 
+        """
+
+        # check if y has same dimension of y_pred 
+        if y.ndim != self.y_pred.ndim:
+            raise TypeError('y should have the same shape as self.y_pred', 
+                ('y', y.type, 'y_pred', self.y_pred.type))
+        # check if y is of the correct datatype        
+        if y.dtype.startswith('int'):
+            # the T.neq operator returns a vector of 0s and 1s, where 1
+            # represents a mistake in prediction
+            return T.mean(T.neq(self.y_pred, y))
+        else:
+            raise NotImplementedError()
+
+
+
+def sgd_optimization_mnist( learning_rate=0.01, L1_reg = 0.00, \
+                            L2_reg = 0.0001, n_iter=100):
+    """
+    Demonstrate stochastic gradient descent optimization for a multilayer 
+    perceptron
+
+    This is demonstrated on MNIST.
+
+    :param learning_rate: learning rate used (factor for the stochastic 
+    gradient)
+
+    :param n_iter: number of iterations to run the optimizer 
+
+    :param L1_reg: L1-norm's weight when added to the cost (see 
+    regularization)
+
+    :param L2_reg: L2-norm's weight when added to the cost (see 
+    regularization)
+    """
+
+    # Load the dataset 
+    f = gzip.open('mnist.pkl.gz','rb')
+    train_set, valid_set, test_set = cPickle.load(f)
+    f.close()
+
+    # make minibatches of size 20 
+    batch_size = 20    # size of the minibatch
+
+    # Dealing with the training set
+    # get the list of training images (x) and their labels (y)
+    (train_set_x, train_set_y) = train_set
+    # initialize the list of training minibatches with empty list
+    train_batches = []
+    for i in xrange(0, len(train_set_x), batch_size):
+        # add to the list of minibatches the minibatch starting at 
+        # position i, ending at position i+batch_size
+        # a minibatch is a pair ; the first element of the pair is a list 
+        # of datapoints, the second element is the list of corresponding 
+        # labels
+        train_batches = train_batches + \
+               [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
+
+    # Dealing with the validation set
+    (valid_set_x, valid_set_y) = valid_set
+    # initialize the list of validation minibatches 
+    valid_batches = []
+    for i in xrange(0, len(valid_set_x), batch_size):
+        valid_batches = valid_batches + \
+               [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
+
+    # Dealing with the testing set
+    (test_set_x, test_set_y) = test_set
+    # initialize the list of testing minibatches 
+    test_batches = []
+    for i in xrange(0, len(test_set_x), batch_size):
+        test_batches = test_batches + \
+              [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
+
+
+    ishape     = (28,28) # this is the size of MNIST images
+
+    # allocate symbolic variables for the data
+    x = T.fmatrix()  # the data is presented as rasterized images
+    y = T.lvector()  # the labels are presented as 1D vector of 
+                          # [long int] labels
+
+    # construct the MLP class
+    classifier = MLP( input=x.reshape((batch_size,28*28)),\
+                      n_in=28*28, n_hidden = 500, n_out=10)
+
+    # the cost we minimize during training is the negative log likelihood of 
+    # the model plus the regularization terms (L1 and L2); cost is expressed
+    # here symbolically
+    cost = classifier.negative_log_likelihood(y) \
+         + L1_reg * classifier.L1 \
+         + L2_reg * classifier.L2_sqr 
+
+    # compiling a theano function that computes the mistakes that are made by 
+    # the model on a minibatch
+    test_model = theano.function([x,y], classifier.errors(y))
+
+    # compute the gradient of cost with respect to theta = (W1, b1, W2, b2) 
+    g_W1 = T.grad(cost, classifier.W1)
+    g_b1 = T.grad(cost, classifier.b1)
+    g_W2 = T.grad(cost, classifier.W2)
+    g_b2 = T.grad(cost, classifier.b2)
+
+    # specify how to update the parameters of the model as a dictionary
+    updates = \
+        { classifier.W1: classifier.W1 - learning_rate*g_W1 \
+        , classifier.b1: classifier.b1 - learning_rate*g_b1 \
+        , classifier.W2: classifier.W2 - learning_rate*g_W2 \
+        , classifier.b2: classifier.b2 - learning_rate*g_b2 }
+
+    # compiling a theano function `train_model` that returns the cost, but in 
+    # the same time updates the parameter of the model based on the rules 
+    # defined in `updates`
+    train_model = theano.function([x, y], cost, updates = updates )
+    n_minibatches        = len(train_batches) 
+ 
+    # early-stopping parameters
+    patience              = 10000 # look at this many examples regardless
+    patience_increase     = 2     # wait this much longer when a new best is 
+                                  # found
+    improvement_threshold = 0.995 # a relative improvement of this much is 
+                                  # considered significant
+    validation_frequency  = n_minibatches  # go through this many 
+                                  # minibatches before checking the network 
+                                  # on the validation set; in this case we 
+                                  # check every epoch 
+
+
+    best_params          = None
+    best_validation_loss = float('inf')
+    test_score           = 0.
+    start_time = time.clock()
+    # have a maximum of `n_iter` iterations through the entire dataset
+    for iter in xrange(n_iter* n_minibatches):
+
+        # get epoch and minibatch index
+        epoch           = iter / n_minibatches
+        minibatch_index =  iter % n_minibatches
+
+        # get the minibatches corresponding to `iter` modulo
+        # `len(train_batches)`
+        x,y = train_batches[ minibatch_index ]
+        cost_ij = train_model(x,y)
+
+        if (iter+1) % validation_frequency == 0: 
+            # compute zero-one loss on validation set 
+            this_validation_loss = 0.
+            for x,y in valid_batches:
+                # sum up the errors for each minibatch
+                this_validation_loss += test_model(x,y)
+            # get the average by dividing with the number of minibatches
+            this_validation_loss /= len(valid_batches)
+
+            print('epoch %i, minibatch %i/%i, validation error %f %%' % \
+                   (epoch, minibatch_index+1, n_minibatches, \
+                    this_validation_loss*100.))
+
+
+            # if we got the best validation score until now
+            if this_validation_loss < best_validation_loss:
+
+                #improve patience if loss improvement is good enough
+                if this_validation_loss < best_validation_loss *  \
+                       improvement_threshold :
+                    patience = max(patience, iter * patience_increase)
+
+                best_validation_loss = this_validation_loss
+                # test it on the test set
+            
+                test_score = 0.
+                for x,y in test_batches:
+                    test_score += test_model(x,y)
+                test_score /= len(test_batches)
+                print(('     epoch %i, minibatch %i/%i, test error of best '
+                      'model %f %%') % 
+                             (epoch, minibatch_index+1, n_minibatches,
+                              test_score*100.))
+
+        if patience <= iter :
+                break
+
+    end_time = time.clock()
+    print(('Optimization complete with best validation score of %f %%, '
+           'with test performance %f %%') %  
+                 (best_validation_loss * 100., test_score*100.))
+    print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
+
+
+
+
+
+
+if __name__ == '__main__':
+    sgd_optimization_mnist()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code_tutoriel/rbm.py	Thu Jan 21 11:26:43 2010 -0500
@@ -0,0 +1,133 @@
+import numpy
+import theano
+import theano.tensor as T
+
+from theano.compile.sandbox.sharedvalue import shared
+from theano.compile.sandbox.pfunc import pfunc
+from theano.compile.sandbox.shared_randomstreams import RandomStreams
+from theano.tensor.nnet import sigmoid
+
+
+class RBM():
+
+    def __init__(self, input=None, vsize=None, hsize=None, bsize=10, lr=1e-1, seed=123):
+        """ 
+        RBM constructor. Defines the parameters of the model along with
+        basic operations for inferring hidden from visible (and vice-versa), as well
+        as for performing CD updates.
+        param input: None for standalone RBMs or symbolic variable if RBM is
+                     part of a larger graph.
+        param vsize: number of visible units
+        param hsize: number of hidden units
+        param bsize: size of minibatch
+        param lr: unsupervised learning rate
+        param seed: seed for random number generator
+        """
+        assert vsize and hsize
+
+        self.vsize = vsize
+        self.hsize = hsize
+        self.lr = shared(lr, 'lr')
+        
+        # setup theano random number generator
+        self.random = RandomStreams(seed)
+       
+        #### INITIALIZATION ####
+
+        # initialize input layer for standalone RBM or layer0 of DBN
+        self.input = input if input else T.dmatrix('input')
+        # initialize biases
+        self.b = shared(numpy.zeros(vsize), 'b')
+        self.c = shared(numpy.zeros(hsize), 'c')
+        # initialize random weights
+        rngseed = numpy.random.RandomState(seed).randint(2**30)
+        rng = numpy.random.RandomState(rngseed)
+        ubound = 1./numpy.sqrt(max(self.vsize,self.hsize))
+        self.w = shared(rng.uniform(low=-ubound, high=ubound, size=(hsize,vsize)), 'w')
+      
+
+        #### POSITIVE AND NEGATIVE PHASE ####
+
+        # define graph for positive phase
+        ph, ph_s = self.def_propup(self.input)
+        # keep the symbolic hidden activation and its sample around so that a
+        # DBN (see dbn.py) can use them as the input of its next layer
+        self.hid, self.hid_sample = ph, ph_s
+        # function which computes p(h|v=x) and a sample ~ p(h|v=x)
+        self.pos_phase = pfunc([self.input], [ph, ph_s])
+
+        # define graph for negative phase
+        nv, nv_s = self.def_propdown(ph_s)
+        nh, nh_s = self.def_propup(nv_s)
+        # function which computes p(v|h=ph_s), ~ p(v|h=ph_s) and p(h|v=nv_s)
+        self.neg_phase = pfunc([ph_s], [nv, nv_s, nh, nh_s])
+        
+        # calculate CD gradients for each parameter
+        db = T.mean(self.input, axis=0) - T.mean(nv, axis=0)
+        dc = T.mean(ph, axis=0) - T.mean(nh, axis=0)
+        dwp = T.dot(ph.T, self.input)/nv.shape[0]
+        dwn = T.dot(nh.T, nv)/nv.shape[0]
+        dw = dwp - dwn
+
+        # define dictionary of stochastic gradient update equations
+        updates = {self.b: self.b - self.lr * db,
+                   self.c: self.c - self.lr * dc,
+                   self.w: self.w - self.lr * dw}
+
+        # define private function, which performs one step in direction of CD gradient
+        self.cd_step = pfunc([self.input, ph, nv, nh], [], updates=updates)
+
+
+    def def_propup(self, vis):
+        """ Symbolic definition of p(hid|vis) """
+        hid_activation = T.dot(vis, self.w.T) + self.c
+        hid = sigmoid(hid_activation)
+        hid_sample = self.random.binomial(T.shape(hid), 1, hid)*1.0
+        return hid, hid_sample
+    
+    def def_propdown(self, hid):
+        """ Symbolic definition of p(vis|hid) """
+        vis_activation = T.dot(hid, self.w) + self.b
+        vis = sigmoid(vis_activation)
+        vis_sample = self.random.binomial(T.shape(vis), 1, vis)*1.0
+        return vis, vis_sample
+
+    def cd(self, x, k=1):
+        """ Performs actual CD update """
+        ph, ph_s = self.pos_phase(x)
+        
+        nh_s = ph_s
+        for ki in range(k):
+            nv, nv_s, nh, nh_s = self.neg_phase(nh_s)
+
+        self.cd_step(x, ph, nv_s, nh)
+
+
+
+import os
+from pylearn.datasets import MNIST
+
+if __name__ == '__main__':
+
+    bsize = 10
+
+    # initialize dataset
+    dataset = MNIST.first_1k() 
+    # initialize RBM with 784 visible units and 500 hidden units
+    r = RBM(vsize=784, hsize=500, bsize=bsize, lr=0.1)
+
+    # for a fixed number of epochs ...
+    for e in range(10):
+
+        print '@epoch %i ' % e
+
+        # iterate over all training set mini-batches
+        for i in range(len(dataset.train.x)/bsize):
+
+            rng = range(i*bsize,(i+1)*bsize) # index range of subsequent mini-batch
+            x = dataset.train.x[rng]         # next mini-batch
+            r.cd(x)                          # perform cd update
+