diff code_tutoriel/dA.py @ 165:4bc5eeec6394

Updating the tutorial code to the latest revisions.
author Dumitru Erhan <dumitru.erhan@gmail.com>
date Fri, 26 Feb 2010 13:55:27 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code_tutoriel/dA.py	Fri Feb 26 13:55:27 2010 -0500
@@ -0,0 +1,330 @@
+"""
+ This tutorial introduces denoising auto-encoders (dA) using Theano.
+
+ Denoising autoencoders are the building blocks for stacked denoising
+ autoencoders (SdA).
+ They are based on auto-encoders such as the ones used in Bengio et al. 2007.
+ An autoencoder takes an input x and first maps it to a hidden representation
+ y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting 
+ latent representation y is then mapped back to a "reconstructed" vector 
+ z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b').  The weight 
+ matrix W' can optionally be constrained such that W' = W^T, in which case 
+ the autoencoder is said to have tied weights. The network is trained to
+ minimize the reconstruction error (the error between x and z).
+
+ For the denoising autoencoder, during training, x is first corrupted into 
+ \tilde{x}, where \tilde{x} is a partially destroyed version of x by means 
+ of a stochastic mapping. Afterwards y is computed as before (using 
+ \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction 
+ error is now measured between z and the uncorrupted input x, and is
+ computed as the cross-entropy :
+      - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)]
+
+
+ References :
+   - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and 
+   Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103,
+   2008
+   - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise
+   Training of Deep Networks, Advances in Neural Information Processing 
+   Systems 19, 2007
+
+"""
+
+import numpy, time, cPickle, gzip 
+
+import theano
+import theano.tensor as T
+from theano.tensor.shared_randomstreams import RandomStreams
+
+from logistic_sgd import load_data
+from utils import tile_raster_images
+
+import PIL.Image
+
+
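+# A minimal NumPy sketch of the corruption / reconstruction / cross-entropy
+# computation described in the module docstring above (purely illustrative :
+# the helper name `_numpy_da_step` is not part of the tutorial).  The dA class
+# below builds the same computation as a symbolic Theano graph, which is what
+# provides automatic gradients and GPU execution.
+def _numpy_da_step(x, W, b, b_prime, corruption_level, rng):
+    """ One forward pass of a tied-weight denoising autoencoder in NumPy """
+    sigmoid = lambda a: 1. / (1. + numpy.exp(-a))
+    # (1) corrupt the input : keep each entry with probability 1-corruption_level
+    mask    = rng.binomial(n = 1, p = 1. - corruption_level, size = x.shape)
+    tilde_x = mask * x
+    # (2) map the corrupted input to the hidden representation
+    y = sigmoid(numpy.dot(tilde_x, W) + b)
+    # (3) reconstruct the input, using the tied weights W' = W^T
+    z = sigmoid(numpy.dot(y, W.T) + b_prime)
+    # (4) average cross-entropy reconstruction error over the minibatch
+    L = - numpy.sum(x * numpy.log(z) + (1. - x) * numpy.log(1. - z), axis = 1)
+    return numpy.mean(L)
+
+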
+class dA(object):
+    """Denoising Auto-Encoder class (dA) 
+
+    A denoising auto-encoder tries to reconstruct the input from a corrupted
+    version of it by first projecting it into a latent space and then
+    reprojecting it back into the input space. Please refer to Vincent et al., 2008
+    for more details. If x is the input then equation (1) computes a partially
+    destroyed version of x by means of a stochastic mapping q_D. Equation (2) 
+    computes the projection of the input into the latent space. Equation (3) 
+    computes the reconstruction of the input, while equation (4) computes the 
+    reconstruction error.
+  
+    .. math::
+
+        \tilde{x} \sim q_D(\tilde{x}|x)                                  (1)
+
+        y = s(W \tilde{x} + b)                                           (2)
+
+        z = s(W' y + b')                                                 (3)
+
+        L(x,z) = -\sum_{k=1}^d [x_k \log z_k + (1-x_k) \log(1-z_k)]      (4)
+
+    """
+
+    def __init__(self, numpy_rng, theano_rng = None, input = None,
+                 n_visible = 784, n_hidden = 500,
+                 W = None, bhid = None, bvis = None):
+        """
+        Initialize the dA class by specifying the number of visible units (the
+        dimension d of the input) and the number of hidden units (the dimension
+        d' of the latent or hidden space).  The constructor also receives
+        symbolic variables for the input, weights and bias.  Such symbolic
+        variables are useful when, for example, the input is the result of
+        some computation, or when weights are shared between the dA and an
+        MLP layer.  When dealing with SdAs this always happens:
+        the dA on layer 2 gets as input the output of the dA on layer 1, 
+        and the weights of the dA are used in the second stage of training 
+        to construct an MLP.
+   
+        :type numpy_rng: numpy.random.RandomState
+        :param numpy_rng: numpy random number generator used to generate weights
+
+        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
+        :param theano_rng: Theano random generator; if None is given, one is
+                     generated based on a seed drawn from `numpy_rng`
+    
+        :type input: theano.tensor.TensorType
+        :param input: a symbolic description of the input or None for standalone
+                      dA
+
+        :type n_visible: int
+        :param n_visible: number of visible units
+
+        :type n_hidden: int
+        :param n_hidden:  number of hidden units
+
+        :type W: theano.tensor.TensorType
+        :param W: Theano variable pointing to a set of weights that should be
+                  shared between the dA and another architecture; if the dA
+                  should be standalone set this to None
+              
+        :type bhid: theano.tensor.TensorType
+        :param bhid: Theano variable pointing to a set of bias values (for
+                     hidden units) that should be shared between the dA and
+                     another architecture; if the dA should be standalone set
+                     this to None
+
+        :type bvis: theano.tensor.TensorType
+        :param bvis: Theano variable pointing to a set of bias values (for
+                     visible units) that should be shared between the dA and
+                     another architecture; if the dA should be standalone set
+                     this to None
+        
+    
+        """
+        self.n_visible = n_visible
+        self.n_hidden  = n_hidden
+        
+        # create a Theano random generator that gives symbolic random values
+        if not theano_rng:
+            theano_rng = RandomStreams(numpy_rng.randint(2**30))
+    
+        # note : W' was written as `W_prime` and b' as `b_prime`
+        if not W:
+            # W is initialized with `initial_W`, which is uniformly sampled
+            # from -6./sqrt(n_hidden+n_visible) to 6./sqrt(n_hidden+n_visible);
+            # the output of uniform is converted using asarray to dtype
+            # theano.config.floatX so that the code is runnable on GPU
+            initial_W = numpy.asarray( numpy_rng.uniform( 
+                      low  = -numpy.sqrt(6./(n_hidden+n_visible)), 
+                      high = numpy.sqrt(6./(n_hidden+n_visible)), 
+                      size = (n_visible, n_hidden)), dtype = theano.config.floatX)
+            W = theano.shared(value = initial_W, name ='W')  
+    
+        if not bvis:
+            bvis = theano.shared(value = numpy.zeros(n_visible, 
+                                         dtype = theano.config.floatX))
+
+        if not bhid:
+            bhid = theano.shared(value = numpy.zeros(n_hidden,
+                                              dtype = theano.config.floatX))
+
+
+        self.W = W
+        # b corresponds to the bias of the hidden 
+        self.b = bhid
+        # b_prime corresponds to the bias of the visible
+        self.b_prime = bvis
+        # tied weights, therefore W_prime is W transpose
+        self.W_prime = self.W.T 
+        self.theano_rng = theano_rng
+        # if no input is given, generate a variable representing the input
+        if input is None:
+            # we use a matrix because we expect a minibatch of several examples,
+            # each example being a row
+            self.x = T.dmatrix(name = 'input') 
+        else:
+            self.x = input
+
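+        # note : W_prime is not listed among the parameters because the
+        #        weights are tied (W_prime is just the transpose of W and is
+        #        updated through W)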
+        self.params = [self.W, self.b, self.b_prime]
+
+    def get_corrupted_input(self, input, corruption_level):
+        """ This function keeps ``1-corruption_level`` entries of the inputs the same 
+        and zero-out randomly selected subset of size ``coruption_level`` 
+        Note : first argument of theano.rng.binomial is the shape(size) of 
+               random numbers that it should produce
+               second argument is the number of trials 
+               third argument is the probability of success of any trial
+        
+                this will produce an array of 0s and 1s where 1 has a probability of 
+                1 - ``corruption_level`` and 0 with ``corruption_level``
+        """
+        return  self.theano_rng.binomial( size = input.shape, n = 1, prob =  1 - corruption_level) * input
+
+    
+    def get_hidden_values(self, input):
+        """ Computes the values of the hidden layer """
+        return T.nnet.sigmoid(T.dot(input, self.W) + self.b)
+
+    def get_reconstructed_input(self, hidden ):
+        """ Computes the reconstructed input given the values of the hidden layer """
+        return  T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime)
+    
+    def get_cost_updates(self, corruption_level, learning_rate):
+        """ This function computes the cost and the updates for one trainng
+        step of the dA """
+
+        tilde_x = self.get_corrupted_input(self.x, corruption_level)
+        y       = self.get_hidden_values( tilde_x)
+        z       = self.get_reconstructed_input(y)
+        # note : we sum over the size of a datapoint; if we are using minibatches,
+        #        L will  be a vector, with one entry per example in minibatch
+        L = - T.sum( self.x*T.log(z) + (1-self.x)*T.log(1-z), axis=1 ) 
+        # note : L is now a vector, where each element is the cross-entropy cost 
+        #        of the reconstruction of the corresponding example of the 
+        #        minibatch. We need to compute the average of all these to get 
+        #        the cost of the minibatch
+        cost = T.mean(L)
+
+        # compute the gradients of the cost of the `dA` with respect
+        # to its parameters 
+        gparams = T.grad(cost, self.params)
+        # generate the list of updates
+        updates = {}
+        for param, gparam in zip(self.params, gparams):
+            updates[param] = param -  learning_rate*gparam
+    
+        return (cost, updates)
+
+
+
+
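+# A tiny sketch (purely illustrative; `_toy_sgd_demo` is not part of the
+# tutorial) of how a (cost, updates) pair such as the one returned by
+# ``get_cost_updates`` is consumed : once compiled into a theano.function,
+# every call performs a single gradient-descent step on the shared variable.
+def _toy_sgd_demo(learning_rate = 0.1, n_steps = 50):
+    w       = theano.shared(value = 0.0, name = 'w')
+    cost    = (w - 3.)**2
+    gw      = T.grad(cost, w)
+    updates = {w: w - learning_rate*gw}
+    step    = theano.function([], cost, updates = updates)
+    for i in xrange(n_steps):
+        step()
+    # w.value should now be close to 3.
+    return w.value
+
+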
+def test_dA( learning_rate = 0.1, training_epochs = 15, dataset ='mnist.pkl.gz' ):
+
+    """
+    This demo is tested on MNIST
+
+    :type learning_rate: float
+    :param learning_rate: learning rate used for training the Denoising AutoEncoder
+
+    :type training_epochs: int
+    :param training_epochs: number of epochs used for training 
+
+    :type dataset: string
+    :param dataset: path to the pickled dataset
+
+    """
+    datasets = load_data(dataset)
+    train_set_x, train_set_y = datasets[0]
+
+    batch_size = 20   # size of the minibatch
+
+    # compute the number of minibatches for training
+    n_train_batches = train_set_x.value.shape[0] / batch_size
+
+    # allocate symbolic variables for the data
+    index = T.lscalar()    # index to a [mini]batch 
+    x     = T.matrix('x')  # the data is presented as rasterized images
+
+    ####################################
+    # BUILDING THE MODEL NO CORRUPTION #
+    ####################################
+
+    rng        = numpy.random.RandomState(123)
+    theano_rng = RandomStreams( rng.randint(2**30))
+
+    da = dA(numpy_rng = rng, theano_rng = theano_rng, input = x,
+            n_visible = 28*28, n_hidden = 500)
+
+    cost, updates = da.get_cost_updates(corruption_level = 0.,
+                                learning_rate = learning_rate)
+
+    
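+    # note : `givens` replaces the symbolic variable ``x`` in the cost/updates
+    #        graph with the minibatch of the training set selected by ``index``,
+    #        so each call to ``train_da`` trains on one minibatch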
+    train_da = theano.function([index], cost, updates = updates,
+         givens = {x:train_set_x[index*batch_size:(index+1)*batch_size]})
+
+    start_time = time.clock()
+
+    ############
+    # TRAINING #
+    ############
+
+    # go through training epochs
+    for epoch in xrange(training_epochs):
+        # go through the training set
+        c = []
+        for batch_index in xrange(n_train_batches):
+            c.append(train_da(batch_index))
+
+        print 'Training epoch %d, cost '%epoch, numpy.mean(c)
+
+    end_time = time.clock()
+
+    training_time = (end_time - start_time)
+
+    print ('Training took %f minutes' %(training_time/60.))
+
+    image = PIL.Image.fromarray(tile_raster_images( X = da.W.value.T,
+                 img_shape = (28,28),tile_shape = (10,10), 
+                 tile_spacing=(1,1)))
+    image.save('filters_corruption_0.png') 
+ 
+    #####################################
+    # BUILDING THE MODEL CORRUPTION 30% #
+    #####################################
+
+    rng        = numpy.random.RandomState(123)
+    theano_rng = RandomStreams( rng.randint(2**30))
+
+    da = dA(numpy_rng = rng, theano_rng = theano_rng, input = x,
+            n_visible = 28*28, n_hidden = 500)
+
+    cost, updates = da.get_cost_updates(corruption_level = 0.3,
+                                learning_rate = learning_rate)
+
+    
+    train_da = theano.function([index], cost, updates = updates,
+         givens = {x:train_set_x[index*batch_size:(index+1)*batch_size]})
+
+    start_time = time.clock()
+
+    ############
+    # TRAINING #
+    ############
+
+    # go through training epochs
+    for epoch in xrange(training_epochs):
+        # go through the training set
+        c = []
+        for batch_index in xrange(n_train_batches):
+            c.append(train_da(batch_index))
+
+        print 'Training epoch %d, cost '%epoch, numpy.mean(c)
+
+    end_time = time.clock()
+
+    training_time = (end_time - start_time)
+
+    print ('Training took %f minutes' %(training_time/60.))
+
+    image = PIL.Image.fromarray(tile_raster_images( X = da.W.value.T,
+                 img_shape = (28,28),tile_shape = (10,10), 
+                 tile_spacing=(1,1)))
+    image.save('filters_corruption_30.png') 
+ 
+
+
+if __name__ == '__main__':
+    test_dA()