Mercurial > ift6266
diff code_tutoriel/dae.py @ 0:fda5f787baa6
commit initial
author | Dumitru Erhan <dumitru.erhan@gmail.com> |
---|---|
date | Thu, 21 Jan 2010 11:26:43 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/dae.py Thu Jan 21 11:26:43 2010 -0500 @@ -0,0 +1,240 @@ +""" + This tutorial introduces denoising auto-encoders using Theano. + + Denoising autoencoders can be used as building blocks for deep networks. + They are based on auto-encoders as the ones used in Bengio et al. 2007. + An autoencoder takes an input x and first maps it to a hidden representation + y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting + latent representation y is then mapped back to a "reconstructed" vector + z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight + matrix W' can optionally be constrained such that W' = W^T, in which case + the autoencoder is said to have tied weights. The network is trained such + that to minimize the reconstruction error (the error between x and z). + + For the denosing autoencoder, during training, first x is corrupted into + \tilde{x}, where \tilde{x} is a partially destroyed version of x by means + of a stochastic mapping. Afterwards y is computed as before (using + \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction + error is now measured between z and the uncorrupted input x, which is + computed as the cross-entropy : + - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)] + + For X iteration of the main program loop it takes *** minutes on an + Intel Core i7 and *** minutes on GPU (NVIDIA GTX 285 graphics processor). + + + References : + - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and + Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103, + 2008 + - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise + Training of Deep Networks, Advances in Neural Information Processing + Systems 19, 2007 + +""" + +import numpy +from theano import tensor +from theano.compile.sandbox import shared, pfunc +from theano.compile.sandbox.shared_randomstreams import RandomStreams +from theano.tensor import nnet +import pylearn.datasets.MNIST + + +try: + #this tells theano to use the GPU if possible + from theano.sandbox.cuda import use + use() +except Exception,e: + print ('Warning: Attempt to use GPU resulted in error "%s"'%str(e)) + + +def load_mnist_batches(batch_size): + """ + We should remove the dependency on pylearn.datasets.MNIST .. and maybe + provide a pickled version of the dataset.. + """ + mnist = pylearn.datasets.MNIST.train_valid_test() + train_batches = [(mnist.train.x[i:i+batch_size],mnist.train.y[i:i+batch_size]) + for i in xrange(0, len(mnist.train.x), batch_size)] + valid_batches = [(mnist.valid.x[i:i+batch_size], mnist.valid.y[i:i+batch_size]) + for i in xrange(0, len(mnist.valid.x), batch_size)] + test_batches = [(mnist.test.x[i:i+batch_size], mnist.test.y[i:i+batch_size]) + for i in xrange(0, len(mnist.test.x), batch_size)] + return train_batches, valid_batches, test_batches + + + + +class DAE(): + """Denoising Auto-Encoder class + + A denoising autoencoders tried to reconstruct the input from a corrupted + version of it by projecting it first in a latent space and reprojecting + it in the input space. Please refer to Vincent et al.,2008 for more + details. If x is the input then equation (1) computes a partially destroyed + version of x by means of a stochastic mapping q_D. Equation (2) computes + the projection of the input into the latent space. Equation (3) computes + the reconstruction of the input, while equation (4) computes the + reconstruction error. + + .. latex-eqn: + \tilde{x} ~ q_D(\tilde{x}|x) (1) + y = s(W \tilde{x} + b) (2) + x = s(W' y + b') (3) + L(x,z) = -sum_{k=1}^d [x_k \log z_k + (1-x_k) \log( 1-z_k)] (4) + + Tricks and thumbrules for DAE + - learning rate should be used in a logarithmic scale ... + """ + + def __init__(self, n_visible= 784, n_hidden= 500, lr= 1e-1, input= None): + """ + Initialize the DAE class by specifying the number of visible units (the + dimension d of the input ), the number of hidden units ( the dimension + d' of the latent or hidden space ), a initial value for the learning rate + and by giving a symbolic description of the input. Such a symbolic + description is of no importance for the simple DAE and therefore can be + ignored. This feature is useful when stacking DAEs, since the input of + intermediate layers can be symbolically described in terms of the hidden + units of the previous layer. See the tutorial on SDAE for more details. + + :param n_visible: number of visible units + :param n_hidden: number of hidden units + :param lr: a initial value for the learning rate + :param input: a symbolic description of the input or None + """ + self.n_visible = n_visible + self.n_hidden = n_hidden + + # create a Theano random generator that gives symbolic random values + theano_rng = RandomStreams( seed = 1234 ) + # create a numpy random generator + numpy_rng = numpy.random.RandomState( seed = 52432 ) + + + # initial values for weights and biases + # note : W' was written as W_prime and b' as b_prime + initial_W = numpy_rng.uniform(size = (n_visible, n_hidden)) + # transform W such that all values are between -.01 and .01 + initial_W = (initial_W*2.0 - 1.0)*.01 + initial_b = numpy.zeros(n_hidden) + initial_W_prime = numpy_rng.uniform(size = (n_hidden, n_visible)) + # transform W_prime such that all values are between -.01 and .01 + initial_W_prime = (initial_W_prime*2.0 - 1.0)*.01 + initial_b_prime= numpy.zeros(n_visible) + + + # theano shared variables for weights and biases + self.W = shared(value = initial_W , name = "W") + self.b = shared(value = initial_b , name = "b") + self.W_prime = shared(value = initial_W_prime, name = "W'") + self.b_prime = shared(value = initial_b_prime, name = "b'") + + # theano shared variable for the learning rate + self.lr = shared(value = lr , name = "learning_rate") + + # if no input is given generate a variable representing the input + if input == None : + # we use a matrix because we expect a minibatch of several examples, + # each example being a row + x = tensor.dmatrix(name = 'input') + else: + x = input + # Equation (1) + # note : first argument of theano.rng.binomial is the shape(size) of + # random numbers that it should produce + # second argument is the number of trials + # third argument is the probability of success of any trial + # + # this will produce an array of 0s and 1s where 1 has a + # probability of 0.9 and 0 if 0.1 + tilde_x = theano_rng.binomial( x.shape, 1, 0.9) * x + # Equation (2) + # note : y is stored as an attribute of the class so that it can be + # used later when stacking DAEs. + self.y = nnet.sigmoid(tensor.dot(tilde_x, self.W ) + self.b) + # Equation (3) + z = nnet.sigmoid(tensor.dot(self.y, self.W_prime) + self.b_prime) + # Equation (4) + L = - tensor.sum( x*tensor.log(z) + (1-x)*tensor.log(1-z), axis=1 ) + # note : L is now a vector, where each element is the cross-entropy cost + # of the reconstruction of the corresponding example of the + # minibatch. We need to sum all these to get the cost of the + # minibatch + cost = tensor.sum(L) + # parameters with respect to whom we need to compute the gradient + self.params = [ self.W, self.b, self.W_prime, self.b_prime] + # use theano automatic differentiation to get the gradients + gW, gb, gW_prime, gb_prime = tensor.grad(cost, self.params) + # update the parameters in the direction of the gradient using the + # learning rate + updated_W = self.W - gW * self.lr + updated_b = self.b - gb * self.lr + updated_W_prime = self.W_prime - gW_prime * self.lr + updated_b_prime = self.b_prime - gb_prime * self.lr + + # defining the function that evaluate the symbolic description of + # one update step + self.update = pfunc(params = [x], outputs = cost, updates = + { self.W : updated_W, + self.b : updated_b, + self.W_prime : updated_W_prime, + self.b_prime : updated_b_prime } ) + self.get_cost = pfunc(params = [x], outputs = cost) + + + + + + + + + + + +def train_DAE_mnist(): + """ + Trains a DAE on the MNIST dataset (http://yann.lecun.com/exdb/mnist) + """ + + # load dataset as batches + train_batches,valid_batches,test_batches=load_mnist_batches(batch_size=16) + + # Create a denoising auto-encoders with 28*28 = 784 input units, and 500 + # units in the hidden layer (latent layer); Learning rate is set to 1e-1 + dae = DAE( n_visible = 784, n_hidden = 500, lr = 1e-2) + + # Number of iterations (epochs) to run + n_iter = 30 + best_valid_score = float('inf') + test_score = float('inf') + for i in xrange(n_iter): + # train once over the dataset + for x,y in train_batches: + cost = dae.update(x) + + # compute validation error + valid_cost = 0. + for x,y in valid_batches: + valid_cost = valid_cost + dae.get_cost(x) + valid_cost = valid_cost / len(valid_batches) + print('epoch %i, validation reconstruction error %f '%(i,valid_cost)) + + if valid_cost < best_valid_score : + best_valid_score = valid_cost + # compute test error !? + test_score = 0. + for x,y in test_batches: + test_score = test_score + dae.get_cost(x) + test_score = test_score / len(test_batches) + print('epoch %i, test error of best model %f' % (i, test_score)) + + print('Optimization done. Best validation score %f, test performance %f' % + (best_valid_score, test_score)) + + + +if __name__ == "__main__": + train_DAE_mnist() +