# HG changeset patch # User Yoshua Bengio # Date 1277149632 14400 # Node ID daa355332b6601628426e8c498fc5e49a2cf8a5d # Parent f732ec90e249c1a05fe7690b111ac95769f5ea93 added sigmoid_output_SdA.py diff -r f732ec90e249 -r daa355332b66 pylearn/algorithms/sigmoid_output_SdA.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/algorithms/sigmoid_output_SdA.py Mon Jun 21 15:47:12 2010 -0400 @@ -0,0 +1,540 @@ +""" + This tutorial introduces stacked denoising auto-encoders (SdA) using Theano. + + Denoising autoencoders are the building blocks for SdA. + They are based on auto-encoders as the ones used in Bengio et al. 2007. + An autoencoder takes an input x and first maps it to a hidden representation + y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting + latent representation y is then mapped back to a "reconstructed" vector + z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight + matrix W' can optionally be constrained such that W' = W^T, in which case + the autoencoder is said to have tied weights. The network is trained such + that to minimize the reconstruction error (the error between x and z). + + For the denosing autoencoder, during training, first x is corrupted into + \tilde{x}, where \tilde{x} is a partially destroyed version of x by means + of a stochastic mapping. Afterwards y is computed as before (using + \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction + error is now measured between z and the uncorrupted input x, which is + computed as the cross-entropy : + - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)] + + + References : + - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and + Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103, + 2008 + - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise + Training of Deep Networks, Advances in Neural Information Processing + Systems 19, 2007 + +""" + +import numpy, time, cPickle, gzip, sys, os + +import theano +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +from logistic_sgd import load_data +from mlp import HiddenLayer +from dA import dA + + + +class BinaryLogisticRegressions(object): + """Multiple 2-class Logistic Regressions Class + + The logistic regressions are fully described by a weight matrix :math:`W` + and bias vector :math:`b`. Classification is done by projecting data + points onto a set of hyperplanes, the distance to which is used to + determine a class membership probability. + """ + + + + + def __init__(self, input, n_in, n_out): + """ Initialize the parameters of the logistic regression + + :type input: theano.tensor.TensorType + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + + :type n_in: int + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + + :type n_out: int + :param n_out: number of output units, the dimension of the space in + which the labels lie + + """ + + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) + self.W = theano.shared(value=numpy.zeros((n_in,n_out), dtype = theano.config.floatX), + name='W') + # initialize the baises b as a vector of n_out 0s + self.b = theano.shared(value=numpy.zeros((n_out,), dtype = theano.config.floatX), + name='b') + + + # compute vector of class-membership probabilities in symbolic form + self.p_y_given_x = T.nnet.sigmoid(T.dot(input, self.W)+self.b) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred=T.argmax(self.p_y_given_x, axis=1) + + # parameters of the model + self.params = [self.W, self.b] + + + + + + def negative_log_likelihood(self, y): + """Return the mean of the negative log-likelihood of the prediction + of this model under a given target distribution. + + .. math:: + + \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = + \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y^{(i)}=y^{(i)}|x^{(i)}, W,b)) \\ + \ell (\theta=\{W,b\}, \mathcal{D}) + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label + + Note: we use the mean instead of the sum so that + the learning rate is less dependent on the batch size + """ + return -T.mean(T.sum( y*T.log(self.p_y_given_x) + (1-y)*T.log(1-self.p_y_given_x), axis=1 ) ) + + + def errors(self, y): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch ; zero one + loss over the size of the minibatch + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label + """ + + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + + + +class SdA(object): + """Stacked denoising auto-encoder class (SdA) + + A stacked denoising autoencoder model is obtained by stacking several + dAs. The hidden layer of the dA at layer `i` becomes the input of + the dA at layer `i+1`. The first layer dA gets as input the input of + the SdA, and the hidden layer of the last dA represents the output. + Note that after pretraining, the SdA is dealt with as a normal MLP, + the dAs are only used to initialize the weights. + """ + + def __init__(self, numpy_rng, theano_rng = None, n_ins = 784, + hidden_layers_sizes = [500,500], n_outs = 10, + corruption_levels = [0.1, 0.1]): + """ This class is made to support a variable number of layers. + + :type numpy_rng: numpy.random.RandomState + :param numpy_rng: numpy random number generator used to draw initial + weights + + :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams + :param theano_rng: Theano random generator; if None is given one is + generated based on a seed drawn from `rng` + + :type n_ins: int + :param n_ins: dimension of the input to the sdA + + :type n_layers_sizes: list of ints + :param n_layers_sizes: intermidiate layers size, must contain + at least one value + + :type n_outs: int + :param n_outs: dimension of the output of the network + + :type corruption_levels: list of float + :param corruption_levels: amount of corruption to use for each + layer + """ + + self.sigmoid_layers = [] + self.dA_layers = [] + self.params = [] + self.n_layers = len(hidden_layers_sizes) + + assert self.n_layers > 0 + + if not theano_rng: + theano_rng = RandomStreams(numpy_rng.randint(2**30)) + # allocate symbolic variables for the data + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + + # The SdA is an MLP, for which all weights of intermidiate layers + # are shared with a different denoising autoencoders + # We will first construct the SdA as a deep multilayer perceptron, + # and when constructing each sigmoidal layer we also construct a + # denoising autoencoder that shares weights with that layer + # During pretraining we will train these autoencoders (which will + # lead to chainging the weights of the MLP as well) + # During finetunining we will finish training the SdA by doing + # stochastich gradient descent on the MLP + + for i in xrange( self.n_layers ): + # construct the sigmoidal layer + + # the size of the input is either the number of hidden units of + # the layer below or the input size if we are on the first layer + if i == 0 : + input_size = n_ins + else: + input_size = hidden_layers_sizes[i-1] + + # the input to this layer is either the activation of the hidden + # layer below or the input of the SdA if you are on the first + # layer + if i == 0 : + layer_input = self.x + else: + layer_input = self.sigmoid_layers[-1].output + + sigmoid_layer = HiddenLayer(rng = numpy_rng, + input = layer_input, + n_in = input_size, + n_out = hidden_layers_sizes[i], + activation = T.nnet.sigmoid) + # add the layer to our list of layers + self.sigmoid_layers.append(sigmoid_layer) + # its arguably a philosophical question... + # but we are going to only declare that the parameters of the + # sigmoid_layers are parameters of the StackedDAA + # the visible biases in the dA are parameters of those + # dA, but not the SdA + self.params.extend(sigmoid_layer.params) + + # Construct a denoising autoencoder that shared weights with this + # layer + dA_layer = dA(numpy_rng = numpy_rng, theano_rng = theano_rng, input = layer_input, + n_visible = input_size, + n_hidden = hidden_layers_sizes[i], + W = sigmoid_layer.W, bhid = sigmoid_layer.b) + self.dA_layers.append(dA_layer) + + + # We now need to add a logistic layer on top of the MLP + #self.logLayer = LogisticRegression(\ + # input = self.sigmoid_layers[-1].output,\ + # n_in = hidden_layers_sizes[-1], n_out = n_outs) + + self.logLayer = BinaryLogisticRegressions(\ + input = self.sigmoid_layers[-1].output,\ + n_in = hidden_layers_sizes[-1], n_out = n_outs) + + self.params.extend(self.logLayer.params) + # construct a function that implements one step of finetunining + + # compute the cost for second phase of training, + # defined as the negative log likelihood + #self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) + self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) + + # compute the gradients with respect to the model parameters + # symbolic variable that points to the number of errors made on the + # minibatch given by self.x and self.y + self.errors = self.logLayer.errors(self.y) + + def pretraining_functions(self, train_set_x, batch_size): + ''' Generates a list of functions, each of them implementing one + step in trainnig the dA corresponding to the layer with same index. + The function will require as input the minibatch index, and to train + a dA you just need to iterate, calling the corresponding function on + all minibatch indexes. + + :type train_set_x: theano.tensor.TensorType + :param train_set_x: Shared variable that contains all datapoints used + for training the dA + + :type batch_size: int + :param batch_size: size of a [mini]batch + + :type learning_rate: float + :param learning_rate: learning rate used during training for any of + the dA layers + ''' + + # index to a [mini]batch + index = T.lscalar('index') # index to a minibatch + corruption_level = T.scalar('corruption') # amount of corruption to use + learning_rate = T.scalar('lr') # learning rate to use + # number of batches + n_batches = train_set_x.value.shape[0] / batch_size + # begining of a batch, given `index` + batch_begin = index * batch_size + # ending of a batch given `index` + batch_end = batch_begin+batch_size + + pretrain_fns = [] + for dA in self.dA_layers: + # get the cost and the updates list + cost,updates = dA.get_cost_updates( corruption_level, learning_rate) + # compile the theano function + fn = theano.function( inputs = [index, + theano.Param(corruption_level, default = 0.2), + theano.Param(learning_rate, default = 0.1)], + outputs = cost, + updates = updates, + givens = {self.x :train_set_x[batch_begin:batch_end]}) + # append `fn` to the list of functions + pretrain_fns.append(fn) + + return pretrain_fns + + + def build_finetune_functions(self, datasets, batch_size, learning_rate): + '''Generates a function `train` that implements one step of + finetuning, a function `validate` that computes the error on + a batch from the validation set, and a function `test` that + computes the error on a batch from the testing set + + :type datasets: list of pairs of theano.tensor.TensorType + :param datasets: It is a list that contain all the datasets; + the has to contain three pairs, `train`, + `valid`, `test` in this order, where each pair + is formed of two Theano variables, one for the + datapoints, the other for the labels + + :type batch_size: int + :param batch_size: size of a minibatch + + :type learning_rate: float + :param learning_rate: learning rate used during finetune stage + ''' + + (train_set_x, train_set_y) = datasets[0] + (valid_set_x, valid_set_y) = datasets[1] + (test_set_x , test_set_y ) = datasets[2] + + # compute number of minibatches for training, validation and testing + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size + + index = T.lscalar('index') # index to a [mini]batch + + # compute the gradients with respect to the model parameters + gparams = T.grad(self.finetune_cost, self.params) + + # compute list of fine-tuning updates + updates = {} + for param, gparam in zip(self.params, gparams): + updates[param] = param - gparam*learning_rate + + train_fn = theano.function(inputs = [index], + outputs = self.finetune_cost, + updates = updates, + givens = { + self.x : train_set_x[index*batch_size:(index+1)*batch_size], + self.y : train_set_y[index*batch_size:(index+1)*batch_size]}) + + test_score_i = theano.function([index], self.errors, + givens = { + self.x: test_set_x[index*batch_size:(index+1)*batch_size], + self.y: test_set_y[index*batch_size:(index+1)*batch_size]}) + + valid_score_i = theano.function([index], self.errors, + givens = { + self.x: valid_set_x[index*batch_size:(index+1)*batch_size], + self.y: valid_set_y[index*batch_size:(index+1)*batch_size]}) + + # Create a function that scans the entire validation set + def valid_score(): + return [valid_score_i(i) for i in xrange(n_valid_batches)] + + # Create a function that scans the entire test set + def test_score(): + return [test_score_i(i) for i in xrange(n_test_batches)] + + return train_fn, valid_score, test_score + + + + + + +def test_SdA( finetune_lr = 0.1, pretraining_epochs = 15, \ + pretrain_lr = 0.05, training_epochs = 1000, \ + dataset='../data/mnist.pkl.gz', batch_size = 1): + """ + Demonstrates how to train and test a stochastic denoising autoencoder. + + This is demonstrated on MNIST. + + :type learning_rate: float + :param learning_rate: learning rate used in the finetune stage + (factor for the stochastic gradient) + + :type pretraining_epochs: int + :param pretraining_epochs: number of epoch to do pretraining + + :type pretrain_lr: float + :param pretrain_lr: learning rate to be used during pre-training + + :type n_iter: int + :param n_iter: maximal number of iterations ot run the optimizer + + :type dataset: string + :param dataset: path the the pickled dataset + + """ + + datasets = load_data(dataset) + + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x , test_set_y = datasets[2] + + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + + # numpy random generator + numpy_rng = numpy.random.RandomState(123) + print '... building the model' + # construct the stacked denoising autoencoder class + sda = SdA( numpy_rng = numpy_rng, n_ins = 28*28, + hidden_layers_sizes = [1000,1000,1000], + n_outs = 10) + + + ######################### + # PRETRAINING THE MODEL # + ######################### + print '... getting the pretraining functions' + pretraining_fns = sda.pretraining_functions( + train_set_x = train_set_x, + batch_size = batch_size ) + + print '... pre-training the model' + start_time = time.clock() + ## Pre-train layer-wise + corruption_levels = [.1,.1,.0] + for i in xrange(sda.n_layers): + # go through pretraining epochs + for epoch in xrange(pretraining_epochs): + # go through the training set + c = [] + for batch_index in xrange(n_train_batches): + c.append( pretraining_fns[i](index = batch_index, + corruption = corruption_levels[i], + lr = pretrain_lr ) ) + print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),numpy.mean(c) + + end_time = time.clock() + + print >> sys.stderr, ('The pretraining code for file '+os.path.split(__file__)[1]+' ran for %.2fm expected 4.58m in our buildbot' % ((end_time-start_time)/60.)) + + ######################## + # FINETUNING THE MODEL # + ######################## + + # get the training, validation and testing function for the model + print '... getting the finetuning functions' + train_fn, validate_model, test_model = sda.build_finetune_functions ( + datasets = datasets, batch_size = batch_size, + learning_rate = finetune_lr) + + print '... finetunning the model' + # early-stopping parameters + patience = 10*n_train_batches # look as this many examples regardless + patience_increase = 2. # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(n_train_batches, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + + best_params = None + best_validation_loss = float('inf') + test_score = 0. + start_time = time.clock() + + done_looping = False + epoch = 0 + + while (epoch < training_epochs) and (not done_looping): + for minibatch_index in xrange(n_train_batches): + minibatch_avg_cost = train_fn(minibatch_index) + iter = epoch * n_train_batches + minibatch_index + + if (iter+1) % validation_frequency == 0: + validation_losses = validate_model() + this_validation_loss = numpy.mean(validation_losses) + print('epoch %i, minibatch %i/%i, validation error %f %%' % \ + (epoch, minibatch_index+1, n_train_batches, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, iter * patience_increase) + + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = iter + + # test it on the test set + test_losses = test_model() + test_score = numpy.mean(test_losses) + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, n_train_batches, + test_score*100.)) + + + if patience <= iter : + done_looping = True + break + epoch = epoch + 1 + + end_time = time.clock() + print(('Optimization complete with best validation score of %f %%,' + 'with test performance %f %%') % + (best_validation_loss * 100., test_score*100.)) + print >> sys.stderr, ('The training code for file '+os.path.split(__file__)[1]+' ran for %.2fm expected 3.91m in our buildbot' % ((end_time-start_time)/60.)) + + + + + + +if __name__ == '__main__': + test_SdA() + +