changeset 165:4bc5eeec6394
Updating the tutorial code to the latest revisions.
author | Dumitru Erhan <dumitru.erhan@gmail.com>
date | Fri, 26 Feb 2010 13:55:27 -0500
parents | e3de934a98b6
children | 17ae5a1a4dd1
files | code_tutoriel/DBN.py code_tutoriel/SdA.py code_tutoriel/convolutional_mlp.py code_tutoriel/dA.py code_tutoriel/deep.py code_tutoriel/logistic_cg.py code_tutoriel/logistic_sgd.py code_tutoriel/mlp.py code_tutoriel/rbm.py code_tutoriel/test.py code_tutoriel/utils.py
diffstat | 11 files changed, 3472 insertions(+), 284 deletions(-)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/DBN.py Fri Feb 26 13:55:27 2010 -0500 @@ -0,0 +1,384 @@ +""" +""" +import os + +import numpy, time, cPickle, gzip + +import theano +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +from logistic_sgd import LogisticRegression, load_data +from mlp import HiddenLayer +from rbm import RBM + + + +class DBN(object): + """ + """ + + def __init__(self, numpy_rng, theano_rng = None, n_ins = 784, + hidden_layers_sizes = [500,500], n_outs = 10): + """This class is made to support a variable number of layers. + + :type numpy_rng: numpy.random.RandomState + :param numpy_rng: numpy random number generator used to draw initial + weights + + :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams + :param theano_rng: Theano random generator; if None is given one is + generated based on a seed drawn from `rng` + + :type n_ins: int + :param n_ins: dimension of the input to the DBN + + :type n_layers_sizes: list of ints + :param n_layers_sizes: intermidiate layers size, must contain + at least one value + + :type n_outs: int + :param n_outs: dimension of the output of the network + """ + + self.sigmoid_layers = [] + self.rbm_layers = [] + self.params = [] + self.n_layers = len(hidden_layers_sizes) + + assert self.n_layers > 0 + + if not theano_rng: + theano_rng = RandomStreams(numpy_rng.randint(2**30)) + + # allocate symbolic variables for the data + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + + # The DBN is an MLP, for which all weights of intermidiate layers are shared with a + # different RBM. We will first construct the DBN as a deep multilayer perceptron, and + # when constructing each sigmoidal layer we also construct an RBM that shares weights + # with that layer. During pretraining we will train these RBMs (which will lead + # to chainging the weights of the MLP as well) During finetuning we will finish + # training the DBN by doing stochastic gradient descent on the MLP. + + for i in xrange( self.n_layers ): + # construct the sigmoidal layer + + # the size of the input is either the number of hidden units of the layer below or + # the input size if we are on the first layer + if i == 0 : + input_size = n_ins + else: + input_size = hidden_layers_sizes[i-1] + + # the input to this layer is either the activation of the hidden layer below or the + # input of the DBN if you are on the first layer + if i == 0 : + layer_input = self.x + else: + layer_input = self.sigmoid_layers[-1].output + + sigmoid_layer = HiddenLayer(rng = numpy_rng, + input = layer_input, + n_in = input_size, + n_out = hidden_layers_sizes[i], + activation = T.nnet.sigmoid) + + # add the layer to our list of layers + self.sigmoid_layers.append(sigmoid_layer) + + # its arguably a philosophical question... but we are going to only declare that + # the parameters of the sigmoid_layers are parameters of the DBN. The visible + # biases in the RBM are parameters of those RBMs, but not of the DBN. 
+ self.params.extend(sigmoid_layer.params) + + # Construct an RBM that shared weights with this layer + rbm_layer = RBM(numpy_rng = numpy_rng, theano_rng = theano_rng, + input = layer_input, + n_visible = input_size, + n_hidden = hidden_layers_sizes[i], + W = sigmoid_layer.W, + hbias = sigmoid_layer.b) + self.rbm_layers.append(rbm_layer) + + + # We now need to add a logistic layer on top of the MLP + self.logLayer = LogisticRegression(\ + input = self.sigmoid_layers[-1].output,\ + n_in = hidden_layers_sizes[-1], n_out = n_outs) + self.params.extend(self.logLayer.params) + + # construct a function that implements one step of fine-tuning compute the cost for + # second phase of training, defined as the negative log likelihood + self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) + + # compute the gradients with respect to the model parameters + # symbolic variable that points to the number of errors made on the + # minibatch given by self.x and self.y + self.errors = self.logLayer.errors(self.y) + + def pretraining_functions(self, train_set_x, batch_size): + ''' Generates a list of functions, for performing one step of gradient descent at a + given layer. The function will require as input the minibatch index, and to train an + RBM you just need to iterate, calling the corresponding function on all minibatch + indexes. + + :type train_set_x: theano.tensor.TensorType + :param train_set_x: Shared var. that contains all datapoints used for training the RBM + :type batch_size: int + :param batch_size: size of a [mini]batch + ''' + + # index to a [mini]batch + index = T.lscalar('index') # index to a minibatch + learning_rate = T.scalar('lr') # learning rate to use + + # number of batches + n_batches = train_set_x.value.shape[0] / batch_size + # begining of a batch, given `index` + batch_begin = index * batch_size + # ending of a batch given `index` + batch_end = batch_begin+batch_size + + pretrain_fns = [] + for rbm in self.rbm_layers: + + # get the cost and the updates list + # TODO: change cost function to reconstruction error + cost,updates = rbm.cd(learning_rate, persistent=None) + + # compile the theano function + fn = theano.function(inputs = [index, + theano.Param(learning_rate, default = 0.1)], + outputs = cost, + updates = updates, + givens = {self.x :train_set_x[batch_begin:batch_end]}) + # append `fn` to the list of functions + pretrain_fns.append(fn) + + return pretrain_fns + + + def build_finetune_functions(self, datasets, batch_size, learning_rate): + '''Generates a function `train` that implements one step of finetuning, a function + `validate` that computes the error on a batch from the validation set, and a function + `test` that computes the error on a batch from the testing set + + :type datasets: list of pairs of theano.tensor.TensorType + :param datasets: It is a list that contain all the datasets; the has to contain three + pairs, `train`, `valid`, `test` in this order, where each pair is formed of two Theano + variables, one for the datapoints, the other for the labels + :type batch_size: int + :param batch_size: size of a minibatch + :type learning_rate: float + :param learning_rate: learning rate used during finetune stage + ''' + + (train_set_x, train_set_y) = datasets[0] + (valid_set_x, valid_set_y) = datasets[1] + (test_set_x , test_set_y ) = datasets[2] + + # compute number of minibatches for training, validation and testing + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size + + index = 
T.lscalar('index') # index to a [mini]batch + + # compute the gradients with respect to the model parameters + gparams = T.grad(self.finetune_cost, self.params) + + # compute list of fine-tuning updates + updates = {} + for param, gparam in zip(self.params, gparams): + updates[param] = param - gparam*learning_rate + + train_fn = theano.function(inputs = [index], + outputs = self.finetune_cost, + updates = updates, + givens = { + self.x : train_set_x[index*batch_size:(index+1)*batch_size], + self.y : train_set_y[index*batch_size:(index+1)*batch_size]}) + + test_score_i = theano.function([index], self.errors, + givens = { + self.x: test_set_x[index*batch_size:(index+1)*batch_size], + self.y: test_set_y[index*batch_size:(index+1)*batch_size]}) + + valid_score_i = theano.function([index], self.errors, + givens = { + self.x: valid_set_x[index*batch_size:(index+1)*batch_size], + self.y: valid_set_y[index*batch_size:(index+1)*batch_size]}) + + # Create a function that scans the entire validation set + def valid_score(): + return [valid_score_i(i) for i in xrange(n_valid_batches)] + + # Create a function that scans the entire test set + def test_score(): + return [test_score_i(i) for i in xrange(n_test_batches)] + + return train_fn, valid_score, test_score + + + + + + +def test_DBN( finetune_lr = 0.1, pretraining_epochs = 10, \ + pretrain_lr = 0.1, training_epochs = 1000, \ + dataset='mnist.pkl.gz'): + """ + Demonstrates how to train and test a Deep Belief Network. + + This is demonstrated on MNIST. + + :type learning_rate: float + :param learning_rate: learning rate used in the finetune stage + :type pretraining_epochs: int + :param pretraining_epochs: number of epoch to do pretraining + :type pretrain_lr: float + :param pretrain_lr: learning rate to be used during pre-training + :type n_iter: int + :param n_iter: maximal number of iterations ot run the optimizer + :type dataset: string + :param dataset: path the the pickled dataset + """ + + print 'finetune_lr = ', finetune_lr + print 'pretrain_lr = ', pretrain_lr + + datasets = load_data(dataset) + + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x , test_set_y = datasets[2] + + + batch_size = 20 # size of the minibatch + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + + # numpy random generator + numpy_rng = numpy.random.RandomState(123) + print '... building the model' + # construct the Deep Belief Network + dbn = DBN(numpy_rng = numpy_rng, n_ins = 28*28, + hidden_layers_sizes = [1000,1000,1000], + n_outs = 10) + + + ######################### + # PRETRAINING THE MODEL # + ######################### + print '... getting the pretraining functions' + pretraining_fns = dbn.pretraining_functions( + train_set_x = train_set_x, + batch_size = batch_size ) + + print '... 
pre-training the model' + start_time = time.clock() + ## Pre-train layer-wise + for i in xrange(dbn.n_layers): + # go through pretraining epochs + for epoch in xrange(pretraining_epochs): + # go through the training set + c = [] + for batch_index in xrange(n_train_batches): + c.append(pretraining_fns[i](index = batch_index, + lr = pretrain_lr ) ) + print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),numpy.mean(c) + + end_time = time.clock() + + print ('Pretraining took %f minutes' %((end_time-start_time)/60.)) + + ######################## + # FINETUNING THE MODEL # + ######################## + + # get the training, validation and testing function for the model + print '... getting the finetuning functions' + train_fn, validate_model, test_model = dbn.build_finetune_functions ( + datasets = datasets, batch_size = batch_size, + learning_rate = finetune_lr) + + print '... finetunning the model' + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2. # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(n_train_batches, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + + best_params = None + best_validation_loss = float('inf') + test_score = 0. + start_time = time.clock() + + done_looping = False + epoch = 0 + + while (epoch < training_epochs) and (not done_looping): + epoch = epoch + 1 + for minibatch_index in xrange(n_train_batches): + + minibatch_avg_cost = train_fn(minibatch_index) + iter = epoch * n_train_batches + minibatch_index + + if (iter+1) % validation_frequency == 0: + + validation_losses = validate_model() + this_validation_loss = numpy.mean(validation_losses) + print('epoch %i, minibatch %i/%i, validation error %f %%' % \ + (epoch, minibatch_index+1, n_train_batches, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, iter * patience_increase) + + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = iter + + # test it on the test set + test_losses = test_model() + test_score = numpy.mean(test_losses) + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, n_train_batches, + test_score*100.)) + + + if patience <= iter : + done_looping = True + break + + end_time = time.clock() + print(('Optimization complete with best validation score of %f %%,' + 'with test performance %f %%') % + (best_validation_loss * 100., test_score*100.)) + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + + + + + +if __name__ == '__main__': + pretrain_lr = numpy.float(os.sys.argv[1]) + finetune_lr = numpy.float(os.sys.argv[2]) + test_DBN(pretrain_lr=pretrain_lr, finetune_lr=finetune_lr)
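DBN.py above pre-trains one RBM per sigmoid layer with `rbm.cd()` and then fine-tunes the whole stack with SGD. As a rough illustration of what a single CD-1 pre-training step amounts to, here is a numpy-only sketch of a contrastive-divergence update for one binary RBM; the function and variable names are illustrative and do not come from the tutorial files.

```python
# Minimal numpy sketch of a CD-1 update for one binary RBM, the kind of step
# each pretraining function above is assumed to perform on a minibatch.
# All names here are illustrative; they are not taken from the tutorial code.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def cd1_update(v0, W, hbias, vbias, lr=0.1, rng=np.random):
    """One contrastive-divergence step on a minibatch v0 of shape (batch, n_visible)."""
    # positive phase: hidden activations given the data
    h0_mean = sigmoid(v0.dot(W) + hbias)
    h0_sample = (rng.uniform(size=h0_mean.shape) < h0_mean).astype(v0.dtype)
    # negative phase: one Gibbs step back to the visibles and up again
    v1_mean = sigmoid(h0_sample.dot(W.T) + vbias)
    h1_mean = sigmoid(v1_mean.dot(W) + hbias)
    # gradient approximation: <v h>_data - <v h>_model, averaged over the minibatch
    batch = v0.shape[0]
    W += lr * (v0.T.dot(h0_mean) - v1_mean.T.dot(h1_mean)) / batch
    hbias += lr * (h0_mean - h1_mean).mean(axis=0)
    vbias += lr * (v0 - v1_mean).mean(axis=0)
    # squared reconstruction error, a cheap proxy for monitoring progress
    return ((v0 - v1_mean) ** 2).mean()

# usage on random binary data standing in for one MNIST minibatch
rng = np.random.RandomState(123)
v0 = (rng.uniform(size=(20, 784)) < 0.5).astype('float64')
W = rng.normal(scale=0.01, size=(784, 500))
hbias = np.zeros(500)
vbias = np.zeros(784)
print(cd1_update(v0, W, hbias, vbias))
```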
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/SdA.py Fri Feb 26 13:55:27 2010 -0500 @@ -0,0 +1,441 @@ +""" + This tutorial introduces stacked denoising auto-encoders (SdA) using Theano. + + Denoising autoencoders are the building blocks for SdA. + They are based on auto-encoders as the ones used in Bengio et al. 2007. + An autoencoder takes an input x and first maps it to a hidden representation + y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting + latent representation y is then mapped back to a "reconstructed" vector + z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight + matrix W' can optionally be constrained such that W' = W^T, in which case + the autoencoder is said to have tied weights. The network is trained such + that to minimize the reconstruction error (the error between x and z). + + For the denosing autoencoder, during training, first x is corrupted into + \tilde{x}, where \tilde{x} is a partially destroyed version of x by means + of a stochastic mapping. Afterwards y is computed as before (using + \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction + error is now measured between z and the uncorrupted input x, which is + computed as the cross-entropy : + - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)] + + + References : + - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and + Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103, + 2008 + - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise + Training of Deep Networks, Advances in Neural Information Processing + Systems 19, 2007 + +""" + +import numpy, time, cPickle, gzip + +import theano +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +from logistic_sgd import LogisticRegression, load_data +from mlp import HiddenLayer +from dA import dA + + + +class SdA(object): + """Stacked denoising auto-encoder class (SdA) + + A stacked denoising autoencoder model is obtained by stacking several + dAs. The hidden layer of the dA at layer `i` becomes the input of + the dA at layer `i+1`. The first layer dA gets as input the input of + the SdA, and the hidden layer of the last dA represents the output. + Note that after pretraining, the SdA is dealt with as a normal MLP, + the dAs are only used to initialize the weights. + """ + + def __init__(self, numpy_rng, theano_rng = None, n_ins = 784, + hidden_layers_sizes = [500,500], n_outs = 10, + corruption_levels = [0.1, 0.1]): + """ This class is made to support a variable number of layers. 
+ + :type numpy_rng: numpy.random.RandomState + :param numpy_rng: numpy random number generator used to draw initial + weights + + :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams + :param theano_rng: Theano random generator; if None is given one is + generated based on a seed drawn from `rng` + + :type n_ins: int + :param n_ins: dimension of the input to the sdA + + :type n_layers_sizes: list of ints + :param n_layers_sizes: intermidiate layers size, must contain + at least one value + + :type n_outs: int + :param n_outs: dimension of the output of the network + + :type corruption_levels: list of float + :param corruption_levels: amount of corruption to use for each + layer + """ + + self.sigmoid_layers = [] + self.dA_layers = [] + self.params = [] + self.n_layers = len(hidden_layers_sizes) + + assert self.n_layers > 0 + + if not theano_rng: + theano_rng = RandomStreams(numpy_rng.randint(2**30)) + # allocate symbolic variables for the data + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + + # The SdA is an MLP, for which all weights of intermidiate layers + # are shared with a different denoising autoencoders + # We will first construct the SdA as a deep multilayer perceptron, + # and when constructing each sigmoidal layer we also construct a + # denoising autoencoder that shares weights with that layer + # During pretraining we will train these autoencoders (which will + # lead to chainging the weights of the MLP as well) + # During finetunining we will finish training the SdA by doing + # stochastich gradient descent on the MLP + + for i in xrange( self.n_layers ): + # construct the sigmoidal layer + + # the size of the input is either the number of hidden units of + # the layer below or the input size if we are on the first layer + if i == 0 : + input_size = n_ins + else: + input_size = hidden_layers_sizes[i-1] + + # the input to this layer is either the activation of the hidden + # layer below or the input of the SdA if you are on the first + # layer + if i == 0 : + layer_input = self.x + else: + layer_input = self.sigmoid_layers[-1].output + + sigmoid_layer = HiddenLayer(rng = numpy_rng, + input = layer_input, + n_in = input_size, + n_out = hidden_layers_sizes[i], + activation = T.nnet.sigmoid) + # add the layer to our list of layers + self.sigmoid_layers.append(sigmoid_layer) + # its arguably a philosophical question... 
+ # but we are going to only declare that the parameters of the + # sigmoid_layers are parameters of the StackedDAA + # the visible biases in the dA are parameters of those + # dA, but not the SdA + self.params.extend(sigmoid_layer.params) + + # Construct a denoising autoencoder that shared weights with this + # layer + dA_layer = dA(numpy_rng = numpy_rng, theano_rng = theano_rng, input = layer_input, + n_visible = input_size, + n_hidden = hidden_layers_sizes[i], + W = sigmoid_layer.W, bhid = sigmoid_layer.b) + self.dA_layers.append(dA_layer) + + + # We now need to add a logistic layer on top of the MLP + self.logLayer = LogisticRegression(\ + input = self.sigmoid_layers[-1].output,\ + n_in = hidden_layers_sizes[-1], n_out = n_outs) + + self.params.extend(self.logLayer.params) + # construct a function that implements one step of finetunining + + # compute the cost for second phase of training, + # defined as the negative log likelihood + self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) + # compute the gradients with respect to the model parameters + # symbolic variable that points to the number of errors made on the + # minibatch given by self.x and self.y + self.errors = self.logLayer.errors(self.y) + + def pretraining_functions(self, train_set_x, batch_size): + ''' Generates a list of functions, each of them implementing one + step in trainnig the dA corresponding to the layer with same index. + The function will require as input the minibatch index, and to train + a dA you just need to iterate, calling the corresponding function on + all minibatch indexes. + + :type train_set_x: theano.tensor.TensorType + :param train_set_x: Shared variable that contains all datapoints used + for training the dA + + :type batch_size: int + :param batch_size: size of a [mini]batch + + :type learning_rate: float + :param learning_rate: learning rate used during training for any of + the dA layers + ''' + + # index to a [mini]batch + index = T.lscalar('index') # index to a minibatch + corruption_level = T.scalar('corruption') # amount of corruption to use + learning_rate = T.scalar('lr') # learning rate to use + # number of batches + n_batches = train_set_x.value.shape[0] / batch_size + # begining of a batch, given `index` + batch_begin = index * batch_size + # ending of a batch given `index` + batch_end = batch_begin+batch_size + + pretrain_fns = [] + for dA in self.dA_layers: + # get the cost and the updates list + cost,updates = dA.get_cost_updates( corruption_level, learning_rate) + # compile the theano function + fn = theano.function( inputs = [index, + theano.Param(corruption_level, default = 0.2), + theano.Param(learning_rate, default = 0.1)], + outputs = cost, + updates = updates, + givens = {self.x :train_set_x[batch_begin:batch_end]}) + # append `fn` to the list of functions + pretrain_fns.append(fn) + + return pretrain_fns + + + def build_finetune_functions(self, datasets, batch_size, learning_rate): + '''Generates a function `train` that implements one step of + finetuning, a function `validate` that computes the error on + a batch from the validation set, and a function `test` that + computes the error on a batch from the testing set + + :type datasets: list of pairs of theano.tensor.TensorType + :param datasets: It is a list that contain all the datasets; + the has to contain three pairs, `train`, + `valid`, `test` in this order, where each pair + is formed of two Theano variables, one for the + datapoints, the other for the labels + + :type batch_size: int + :param 
batch_size: size of a minibatch + + :type learning_rate: float + :param learning_rate: learning rate used during finetune stage + ''' + + (train_set_x, train_set_y) = datasets[0] + (valid_set_x, valid_set_y) = datasets[1] + (test_set_x , test_set_y ) = datasets[2] + + # compute number of minibatches for training, validation and testing + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size + + index = T.lscalar('index') # index to a [mini]batch + + # compute the gradients with respect to the model parameters + gparams = T.grad(self.finetune_cost, self.params) + + # compute list of fine-tuning updates + updates = {} + for param, gparam in zip(self.params, gparams): + updates[param] = param - gparam*learning_rate + + train_fn = theano.function(inputs = [index], + outputs = self.finetune_cost, + updates = updates, + givens = { + self.x : train_set_x[index*batch_size:(index+1)*batch_size], + self.y : train_set_y[index*batch_size:(index+1)*batch_size]}) + + test_score_i = theano.function([index], self.errors, + givens = { + self.x: test_set_x[index*batch_size:(index+1)*batch_size], + self.y: test_set_y[index*batch_size:(index+1)*batch_size]}) + + valid_score_i = theano.function([index], self.errors, + givens = { + self.x: valid_set_x[index*batch_size:(index+1)*batch_size], + self.y: valid_set_y[index*batch_size:(index+1)*batch_size]}) + + # Create a function that scans the entire validation set + def valid_score(): + return [valid_score_i(i) for i in xrange(n_valid_batches)] + + # Create a function that scans the entire test set + def test_score(): + return [test_score_i(i) for i in xrange(n_test_batches)] + + return train_fn, valid_score, test_score + + + + + + +def test_SdA( finetune_lr = 0.1, pretraining_epochs = 15, \ + pretrain_lr = 0.1, training_epochs = 1000, \ + dataset='mnist.pkl.gz'): + """ + Demonstrates how to train and test a stochastic denoising autoencoder. + + This is demonstrated on MNIST. + + :type learning_rate: float + :param learning_rate: learning rate used in the finetune stage + (factor for the stochastic gradient) + + :type pretraining_epochs: int + :param pretraining_epochs: number of epoch to do pretraining + + :type pretrain_lr: float + :param pretrain_lr: learning rate to be used during pre-training + + :type n_iter: int + :param n_iter: maximal number of iterations ot run the optimizer + + :type dataset: string + :param dataset: path the the pickled dataset + + """ + + datasets = load_data(dataset) + + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x , test_set_y = datasets[2] + + + batch_size = 20 # size of the minibatch + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + + # numpy random generator + numpy_rng = numpy.random.RandomState(123) + print '... building the model' + # construct the stacked denoising autoencoder class + sda = SdA( numpy_rng = numpy_rng, n_ins = 28*28, + hidden_layers_sizes = [1000,1000,1000], + n_outs = 10) + + + ######################### + # PRETRAINING THE MODEL # + ######################### + print '... getting the pretraining functions' + pretraining_fns = sda.pretraining_functions( + train_set_x = train_set_x, + batch_size = batch_size ) + + print '... 
pre-training the model' + start_time = time.clock() + ## Pre-train layer-wise + for i in xrange(sda.n_layers): + # go through pretraining epochs + for epoch in xrange(pretraining_epochs): + # go through the training set + c = [] + for batch_index in xrange(n_train_batches): + c.append( pretraining_fns[i](index = batch_index, + corruption = 0.2, lr = pretrain_lr ) ) + print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),numpy.mean(c) + + end_time = time.clock() + + print ('Pretraining took %f minutes' %((end_time-start_time)/60.)) + + ######################## + # FINETUNING THE MODEL # + ######################## + + # get the training, validation and testing function for the model + print '... getting the finetuning functions' + train_fn, validate_model, test_model = sda.build_finetune_functions ( + datasets = datasets, batch_size = batch_size, + learning_rate = finetune_lr) + + print '... finetunning the model' + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2. # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(n_train_batches, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + + best_params = None + best_validation_loss = float('inf') + test_score = 0. + start_time = time.clock() + + done_looping = False + epoch = 0 + + while (epoch < training_epochs) and (not done_looping): + epoch = epoch + 1 + for minibatch_index in xrange(n_train_batches): + + minibatch_avg_cost = train_fn(minibatch_index) + iter = epoch * n_train_batches + minibatch_index + + if (iter+1) % validation_frequency == 0: + + validation_losses = validate_model() + this_validation_loss = numpy.mean(validation_losses) + print('epoch %i, minibatch %i/%i, validation error %f %%' % \ + (epoch, minibatch_index+1, n_train_batches, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, iter * patience_increase) + + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = iter + + # test it on the test set + test_losses = test_model() + test_score = numpy.mean(test_losses) + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, n_train_batches, + test_score*100.)) + + + if patience <= iter : + done_looping = True + break + + end_time = time.clock() + print(('Optimization complete with best validation score of %f %%,' + 'with test performance %f %%') % + (best_validation_loss * 100., test_score*100.)) + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + + + + + + +if __name__ == '__main__': + test_SdA() + +
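SdA.py follows the same greedy recipe as DBN.py, except that each layer is pre-trained as a tied-weight denoising autoencoder on the output of the layers below it. The sketch below replays that stacking loop in plain numpy, assuming the zero-masking corruption and cross-entropy cost described in the module docstring and the corruption level of 0.2 used in test_SdA; all names here are illustrative rather than taken from the tutorial code.

```python
# Illustrative numpy sketch of the greedy layer-wise scheme used by SdA:
# layer i is trained as a tied-weight denoising autoencoder on the output of
# layers < i, and only the encoder parameters (W, b) are kept for the MLP.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def pretrain_da_layer(data, n_hidden, corruption=0.2, lr=0.1, epochs=5, rng=np.random):
    """Train one tied-weight denoising autoencoder and return (W, b_hidden)."""
    n_visible = data.shape[1]
    bound = np.sqrt(6.0 / (n_visible + n_hidden))
    W = rng.uniform(-bound, bound, size=(n_visible, n_hidden))
    b = np.zeros(n_hidden)
    b_prime = np.zeros(n_visible)
    for _ in range(epochs):
        # corrupt the input by zeroing a random subset of entries
        mask = rng.uniform(size=data.shape) >= corruption
        x_tilde = data * mask
        y = sigmoid(x_tilde.dot(W) + b)        # encode the corrupted input
        z = sigmoid(y.dot(W.T) + b_prime)      # decode with tied weights
        # gradient of the mean cross-entropy cost w.r.t. the parameters
        dz = (z - data) / data.shape[0]
        dy = dz.dot(W) * y * (1.0 - y)
        W -= lr * (x_tilde.T.dot(dy) + dz.T.dot(y))
        b -= lr * dy.sum(axis=0)
        b_prime -= lr * dz.sum(axis=0)
    return W, b

# stack three layers greedily, each trained on the previous layer's output
rng = np.random.RandomState(0)
rep = rng.uniform(size=(100, 784))             # stand-in for the training set
layers = []
for size in [1000, 1000, 1000]:
    W, b = pretrain_da_layer(rep, size, rng=rng)
    layers.append((W, b))
    rep = sigmoid(rep.dot(W) + b)              # becomes the next layer's input
```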
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/convolutional_mlp.py Fri Feb 26 13:55:27 2010 -0500 @@ -0,0 +1,292 @@ +""" +This tutorial introduces the LeNet5 neural network architecture using Theano. LeNet5 is a +convolutional neural network, good for classifying images. This tutorial shows how to build the +architecture, and comes with all the hyper-parameters you need to reproduce the paper's MNIST +results. + + +This implementation simplifies the model in the following ways: + + - LeNetConvPool doesn't implement location-specific gain and bias parameters + - LeNetConvPool doesn't implement pooling by average, it implements pooling by max. + - Digit classification is implemented with a logistic regression rather than an RBF network + - LeNet5 was not fully-connected convolutions at second layer + +References: + - Y. LeCun, L. Bottou, Y. Bengio and P. Haffner: Gradient-Based Learning Applied to Document + Recognition, Proceedings of the IEEE, 86(11):2278-2324, November 1998. + http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf +""" + +import numpy, time, cPickle, gzip + +import theano +import theano.tensor as T +from theano.tensor.signal import downsample +from theano.tensor.nnet import conv + +from logistic_sgd import LogisticRegression, load_data +from mlp import HiddenLayer + + +class LeNetConvPoolLayer(object): + """Pool Layer of a convolutional network """ + + def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2,2)): + """ + Allocate a LeNetConvPoolLayer with shared variable internal parameters. + + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + + :type input: theano.tensor.dtensor4 + :param input: symbolic image tensor, of shape image_shape + + :type filter_shape: tuple or list of length 4 + :param filter_shape: (number of filters, num input feature maps, + filter height,filter width) + + :type image_shape: tuple or list of length 4 + :param image_shape: (batch size, num input feature maps, + image height, image width) + + :type poolsize: tuple or list of length 2 + :param poolsize: the downsampling (pooling) factor (#rows,#cols) + """ + + assert image_shape[1]==filter_shape[1] + self.input = input + + # initialize weights to temporary values until we know the shape of the output feature + # maps + W_values = numpy.zeros(filter_shape, dtype=theano.config.floatX) + self.W = theano.shared(value = W_values) + + # the bias is a 1D tensor -- one bias per output feature map + b_values = numpy.zeros((filter_shape[0],), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + # convolve input feature maps with filters + conv_out = conv.conv2d(input = input, filters = self.W, + filter_shape=filter_shape, image_shape=image_shape) + + # there are "num input feature maps * filter height * filter width" inputs + # to each hidden unit + fan_in = numpy.prod(filter_shape[1:]) + # each unit in the lower layer receives a gradient from: + # "num output feature maps * filter height * filter width" / pooling size + fan_out = filter_shape[0] * numpy.prod(filter_shape[2:]) / numpy.prod(poolsize) + # replace weight values with random weights + W_bound = numpy.sqrt(6./(fan_in + fan_out)) + self.W.value = numpy.asarray( + rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), + dtype = theano.config.floatX) + + # downsample each feature map individually, using maxpooling + pooled_out = downsample.max_pool2D( input = conv_out, + ds = poolsize, ignore_border=True) + + # add the bias term. 
Since the bias is a vector (1D array), we first + # reshape it to a tensor of shape (1,n_filters,1,1). Each bias will thus + # be broadcasted across mini-batches and feature map width & height + self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) + + # store parameters of this layer + self.params = [self.W, self.b] + + + +def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset='mnist.pkl.gz', nkerns=[20,50]): + """ Demonstrates lenet on MNIST dataset + + :type learning_rate: float + :param learning_rate: learning rate used (factor for the stochastic + gradient) + + :type n_epochs: int + :param n_epochs: maximal number of epochs to run the optimizer + + :type dataset: string + :param dataset: path to the dataset used for training /testing (MNIST here) + + :type nkerns: list of ints + :param nkerns: number of kernels on each layer + """ + + rng = numpy.random.RandomState(23455) + + datasets = load_data(dataset) + + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x , test_set_y = datasets[2] + + + batch_size = 500 # size of the minibatch + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size + + # allocate symbolic variables for the data + index = T.lscalar() # index to a [mini]batch + x = T.matrix('x') # the data is presented as rasterized images + y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + + + ishape = (28,28) # this is the size of MNIST images + + ###################### + # BUILD ACTUAL MODEL # + ###################### + print '... building the model' + + # Reshape matrix of rasterized images of shape (batch_size,28*28) + # to a 4D tensor, compatible with our LeNetConvPoolLayer + layer0_input = x.reshape((batch_size,1,28,28)) + + # Construct the first convolutional pooling layer: + # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) + # maxpooling reduces this further to (24/2,24/2) = (12,12) + # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) + layer0 = LeNetConvPoolLayer(rng, input=layer0_input, + image_shape=(batch_size,1,28,28), + filter_shape=(nkerns[0],1,5,5), poolsize=(2,2)) + + # Construct the second convolutional pooling layer + # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) + # maxpooling reduces this further to (8/2,8/2) = (4,4) + # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) + layer1 = LeNetConvPoolLayer(rng, input=layer0.output, + image_shape=(batch_size,nkerns[0],12,12), + filter_shape=(nkerns[1],nkerns[0],5,5), poolsize=(2,2)) + + # the TanhLayer being fully-connected, it operates on 2D matrices of + # shape (batch_size,num_pixels) (i.e matrix of rasterized images). 
+ # This will generate a matrix of shape (20,32*4*4) = (20,512) + layer2_input = layer1.output.flatten(2) + + # construct a fully-connected sigmoidal layer + layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1]*4*4, + n_out=500, activation = T.tanh) + + # classify the values of the fully-connected sigmoidal layer + layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) + + # the cost we minimize during training is the NLL of the model + cost = layer3.negative_log_likelihood(y) + + # create a function to compute the mistakes that are made by the model + test_model = theano.function([index], layer3.errors(y), + givens = { + x: test_set_x[index*batch_size:(index+1)*batch_size], + y: test_set_y[index*batch_size:(index+1)*batch_size]}) + + validate_model = theano.function([index], layer3.errors(y), + givens = { + x: valid_set_x[index*batch_size:(index+1)*batch_size], + y: valid_set_y[index*batch_size:(index+1)*batch_size]}) + + # create a list of all model parameters to be fit by gradient descent + params = layer3.params+ layer2.params+ layer1.params + layer0.params + + # create a list of gradients for all model parameters + grads = T.grad(cost, params) + + # train_model is a function that updates the model parameters by SGD + # Since this model has many parameters, it would be tedious to manually + # create an update rule for each model parameter. We thus create the updates + # dictionary by automatically looping over all (params[i],grads[i]) pairs. + updates = {} + for param_i, grad_i in zip(params, grads): + updates[param_i] = param_i - learning_rate * grad_i + + train_model = theano.function([index], cost, updates=updates, + givens = { + x: train_set_x[index*batch_size:(index+1)*batch_size], + y: train_set_y[index*batch_size:(index+1)*batch_size]}) + + + ############### + # TRAIN MODEL # + ############### + print '... training' + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2 # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(n_train_batches, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + best_params = None + best_validation_loss = float('inf') + best_iter = 0 + test_score = 0. 
+ start_time = time.clock() + + epoch = 0 + done_looping = False + + while (epoch < n_epochs) and (not done_looping): + epoch = epoch + 1 + for minibatch_index in xrange(n_train_batches): + + iter = epoch * n_train_batches + minibatch_index + + if iter %100 == 0: + print 'training @ iter = ', iter + cost_ij = train_model(minibatch_index) + + if (iter+1) % validation_frequency == 0: + + # compute zero-one loss on validation set + validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] + this_validation_loss = numpy.mean(validation_losses) + print('epoch %i, minibatch %i/%i, validation error %f %%' % \ + (epoch, minibatch_index+1, n_train_batches, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, iter * patience_increase) + + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = iter + + # test it on the test set + test_losses = [test_model(i) for i in xrange(n_test_batches)] + test_score = numpy.mean(test_losses) + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, n_train_batches, + test_score*100.)) + + if patience <= iter : + done_looping = False + break + + end_time = time.clock() + print('Optimization complete.') + print('Best validation score of %f %% obtained at iteration %i,'\ + 'with test performance %f %%' % + (best_validation_loss * 100., best_iter, test_score*100.)) + print('The code ran for %f minutes' % ((end_time-start_time)/60.)) + +if __name__ == '__main__': + evaluate_lenet5() + +def experiment(state, channel): + evaluate_lenet5(state.learning_rate, dataset=state.dataset)
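The shape comments in evaluate_lenet5 can be checked with a few lines of arithmetic: a valid 5x5 convolution turns a WxW map into (W-5+1)x(W-5+1), and 2x2 max-pooling halves each side. With the default nkerns = [20, 50] this gives 12x12 maps after the first stage, 4x4 maps after the second, and nkerns[1]*4*4 = 800 inputs per example to the fully-connected layer.

```python
# Replaying the shape bookkeeping from the comments above for 28x28 MNIST
# inputs, 5x5 filters, 2x2 max-pooling and nkerns = [20, 50].
nkerns = [20, 50]
filter_size, pool = 5, 2

size = 28                                    # MNIST images are 28x28
for layer, k in enumerate(nkerns):
    size = (size - filter_size + 1) // pool  # valid convolution, then max-pool
    print("layer %d: %d maps of %dx%d" % (layer, k, size, size))

# flattened input to the fully-connected hidden layer
print("hidden layer n_in = %d" % (nkerns[-1] * size * size))
```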
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/dA.py Fri Feb 26 13:55:27 2010 -0500 @@ -0,0 +1,330 @@ +""" + This tutorial introduces denoising auto-encoders (dA) using Theano. + + Denoising autoencoders are the building blocks for SdA. + They are based on auto-encoders as the ones used in Bengio et al. 2007. + An autoencoder takes an input x and first maps it to a hidden representation + y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting + latent representation y is then mapped back to a "reconstructed" vector + z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight + matrix W' can optionally be constrained such that W' = W^T, in which case + the autoencoder is said to have tied weights. The network is trained such + that to minimize the reconstruction error (the error between x and z). + + For the denosing autoencoder, during training, first x is corrupted into + \tilde{x}, where \tilde{x} is a partially destroyed version of x by means + of a stochastic mapping. Afterwards y is computed as before (using + \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction + error is now measured between z and the uncorrupted input x, which is + computed as the cross-entropy : + - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)] + + + References : + - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and + Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103, + 2008 + - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise + Training of Deep Networks, Advances in Neural Information Processing + Systems 19, 2007 + +""" + +import numpy, time, cPickle, gzip + +import theano +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +from logistic_sgd import load_data +from utils import tile_raster_images + +import PIL.Image + + +class dA(object): + """Denoising Auto-Encoder class (dA) + + A denoising autoencoders tries to reconstruct the input from a corrupted + version of it by projecting it first in a latent space and reprojecting + it afterwards back in the input space. Please refer to Vincent et al.,2008 + for more details. If x is the input then equation (1) computes a partially + destroyed version of x by means of a stochastic mapping q_D. Equation (2) + computes the projection of the input into the latent space. Equation (3) + computes the reconstruction of the input, while equation (4) computes the + reconstruction error. + + .. math:: + + \tilde{x} ~ q_D(\tilde{x}|x) (1) + + y = s(W \tilde{x} + b) (2) + + x = s(W' y + b') (3) + + L(x,z) = -sum_{k=1}^d [x_k \log z_k + (1-x_k) \log( 1-z_k)] (4) + + """ + + def __init__(self, numpy_rng, theano_rng = None, input = None, n_visible= 784, n_hidden= 500, + W = None, bhid = None, bvis = None): + """ + Initialize the dA class by specifying the number of visible units (the + dimension d of the input ), the number of hidden units ( the dimension + d' of the latent or hidden space ) and the corruption level. The + constructor also receives symbolic variables for the input, weights and + bias. Such a symbolic variables are useful when, for example the input is + the result of some computations, or when weights are shared between the + dA and an MLP layer. When dealing with SdAs this always happens, + the dA on layer 2 gets as input the output of the dA on layer 1, + and the weights of the dA are used in the second stage of training + to construct an MLP. 
+ + :type numpy_rng: numpy.random.RandomState + :param numpy_rng: number random generator used to generate weights + + :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams + :param theano_rng: Theano random generator; if None is given one is generated + based on a seed drawn from `rng` + + :type input: theano.tensor.TensorType + :paran input: a symbolic description of the input or None for standalone + dA + + :type n_visible: int + :param n_visible: number of visible units + + :type n_hidden: int + :param n_hidden: number of hidden units + + :type W: theano.tensor.TensorType + :param W: Theano variable pointing to a set of weights that should be + shared belong the dA and another architecture; if dA should + be standalone set this to None + + :type bhid: theano.tensor.TensorType + :param bhid: Theano variable pointing to a set of biases values (for + hidden units) that should be shared belong dA and another + architecture; if dA should be standalone set this to None + + :type bvis: theano.tensor.TensorType + :param bvis: Theano variable pointing to a set of biases values (for + visible units) that should be shared belong dA and another + architecture; if dA should be standalone set this to None + + + """ + self.n_visible = n_visible + self.n_hidden = n_hidden + + # create a Theano random generator that gives symbolic random values + if not theano_rng : + theano_rng = RandomStreams(rng.randint(2**30)) + + # note : W' was written as `W_prime` and b' as `b_prime` + if not W: + # W is initialized with `initial_W` which is uniformely sampled + # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + initial_W = numpy.asarray( numpy_rng.uniform( + low = -numpy.sqrt(6./(n_hidden+n_visible)), + high = numpy.sqrt(6./(n_hidden+n_visible)), + size = (n_visible, n_hidden)), dtype = theano.config.floatX) + W = theano.shared(value = initial_W, name ='W') + + if not bvis: + bvis = theano.shared(value = numpy.zeros(n_visible, + dtype = theano.config.floatX)) + + if not bhid: + bhid = theano.shared(value = numpy.zeros(n_hidden, + dtype = theano.config.floatX)) + + + self.W = W + # b corresponds to the bias of the hidden + self.b = bhid + # b_prime corresponds to the bias of the visible + self.b_prime = bvis + # tied weights, therefore W_prime is W transpose + self.W_prime = self.W.T + self.theano_rng = theano_rng + # if no input is given, generate a variable representing the input + if input == None : + # we use a matrix because we expect a minibatch of several examples, + # each example being a row + self.x = T.dmatrix(name = 'input') + else: + self.x = input + + self.params = [self.W, self.b, self.b_prime] + + def get_corrupted_input(self, input, corruption_level): + """ This function keeps ``1-corruption_level`` entries of the inputs the same + and zero-out randomly selected subset of size ``coruption_level`` + Note : first argument of theano.rng.binomial is the shape(size) of + random numbers that it should produce + second argument is the number of trials + third argument is the probability of success of any trial + + this will produce an array of 0s and 1s where 1 has a probability of + 1 - ``corruption_level`` and 0 with ``corruption_level`` + """ + return self.theano_rng.binomial( size = input.shape, n = 1, prob = 1 - corruption_level) * input + + + def get_hidden_values(self, input): + """ Computes the values of the hidden layer """ + return 
T.nnet.sigmoid(T.dot(input, self.W) + self.b) + + def get_reconstructed_input(self, hidden ): + """ Computes the reconstructed input given the values of the hidden layer """ + return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime) + + def get_cost_updates(self, corruption_level, learning_rate): + """ This function computes the cost and the updates for one trainng + step of the dA """ + + tilde_x = self.get_corrupted_input(self.x, corruption_level) + y = self.get_hidden_values( tilde_x) + z = self.get_reconstructed_input(y) + # note : we sum over the size of a datapoint; if we are using minibatches, + # L will be a vector, with one entry per example in minibatch + L = - T.sum( self.x*T.log(z) + (1-self.x)*T.log(1-z), axis=1 ) + # note : L is now a vector, where each element is the cross-entropy cost + # of the reconstruction of the corresponding example of the + # minibatch. We need to compute the average of all these to get + # the cost of the minibatch + cost = T.mean(L) + + # compute the gradients of the cost of the `dA` with respect + # to its parameters + gparams = T.grad(cost, self.params) + # generate the list of updates + updates = {} + for param, gparam in zip(self.params, gparams): + updates[param] = param - learning_rate*gparam + + return (cost, updates) + + + + +def test_dA( learning_rate = 0.1, training_epochs = 15, dataset ='mnist.pkl.gz' ): + + """ + This demo is tested on MNIST + + :type learning_rate: float + :param learning_rate: learning rate used for training the DeNosing AutoEncoder + + :type training_epochs: int + :param training_epochs: number of epochs used for training + + :type dataset: string + :param dataset: path to the picked dataset + + """ + datasets = load_data(dataset) + train_set_x, train_set_y = datasets[0] + + batch_size = 20 # size of the minibatch + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + + # allocate symbolic variables for the data + index = T.lscalar() # index to a [mini]batch + x = T.matrix('x') # the data is presented as rasterized images + + #################################### + # BUILDING THE MODEL NO CORRUPTION # + #################################### + + rng = numpy.random.RandomState(123) + theano_rng = RandomStreams( rng.randint(2**30)) + + da = dA(numpy_rng = rng, theano_rng = theano_rng, input = x, + n_visible = 28*28, n_hidden = 500) + + cost, updates = da.get_cost_updates(corruption_level = 0., + learning_rate = learning_rate) + + + train_da = theano.function([index], cost, updates = updates, + givens = {x:train_set_x[index*batch_size:(index+1)*batch_size]}) + + start_time = time.clock() + + ############ + # TRAINING # + ############ + + # go through training epochs + for epoch in xrange(training_epochs): + # go through trainng set + c = [] + for batch_index in xrange(n_train_batches): + c.append(train_da(batch_index)) + + print 'Training epoch %d, cost '%epoch, numpy.mean(c) + + end_time = time.clock() + + training_time = (end_time - start_time) + + print ('Training took %f minutes' %(training_time/60.)) + + image = PIL.Image.fromarray(tile_raster_images( X = da.W.value.T, + img_shape = (28,28),tile_shape = (10,10), + tile_spacing=(1,1))) + image.save('filters_corruption_0.png') + + ##################################### + # BUILDING THE MODEL CORRUPTION 30% # + ##################################### + + rng = numpy.random.RandomState(123) + theano_rng = RandomStreams( rng.randint(2**30)) + + da = dA(numpy_rng = rng, theano_rng = 
theano_rng, input = x, + n_visible = 28*28, n_hidden = 500) + + cost, updates = da.get_cost_updates(corruption_level = 0.3, + learning_rate = learning_rate) + + + train_da = theano.function([index], cost, updates = updates, + givens = {x:train_set_x[index*batch_size:(index+1)*batch_size]}) + + start_time = time.clock() + + ############ + # TRAINING # + ############ + + # go through training epochs + for epoch in xrange(training_epochs): + # go through trainng set + c = [] + for batch_index in xrange(n_train_batches): + c.append(train_da(batch_index)) + + print 'Training epoch %d, cost '%epoch, numpy.mean(c) + + end_time = time.clock() + + training_time = (end_time - start_time) + + print ('Training took %f minutes' %(training_time/60.)) + + image = PIL.Image.fromarray(tile_raster_images( X = da.W.value.T, + img_shape = (28,28),tile_shape = (10,10), + tile_spacing=(1,1))) + image.save('filters_corruption_30.png') + + + +if __name__ == '__main__': + test_dA()
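dA.py finishes by saving the learned filters (the columns of W, one 28x28 image each) as a tiled PNG via utils.tile_raster_images, whose source is not shown in this excerpt of the changeset. The numpy-only helper below is a hypothetical stand-in that produces a comparable grid; it is not the implementation from utils.py.

```python
# Illustrative numpy-only stand-in for the filter tiling done at the end of
# dA.py: each column of W is treated as one 28x28 filter and placed on a
# 10x10 grid. This is not the utils.tile_raster_images implementation.
import numpy as np

def tile_columns(W, img_shape=(28, 28), tile_shape=(10, 10), spacing=1):
    h, w = img_shape
    rows, cols = tile_shape
    out = np.zeros((rows * (h + spacing) - spacing,
                    cols * (w + spacing) - spacing))
    for idx in range(min(rows * cols, W.shape[1])):
        img = W[:, idx].reshape(img_shape)
        # rescale each filter to [0, 1] so weights of either sign are visible
        img = (img - img.min()) / (img.max() - img.min() + 1e-8)
        r, c = divmod(idx, cols)
        out[r * (h + spacing): r * (h + spacing) + h,
            c * (w + spacing): c * (w + spacing) + w] = img
    return out

# e.g. with random weights standing in for the trained da.W.value
W = np.random.RandomState(0).randn(28 * 28, 500)
grid = tile_columns(W)     # a 2D array ready to be saved as a grayscale image
print(grid.shape)          # (289, 289)
```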
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/deep.py Fri Feb 26 13:55:27 2010 -0500 @@ -0,0 +1,880 @@ +""" +Draft of DBN, DAA, SDAA, RBM tutorial code + +""" +import sys +import numpy +import theano +import time +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams +from theano import shared, function + +import gzip +import cPickle +import pylearn.io.image_tiling +import PIL + +# NNET STUFF + +class LogisticRegression(object): + """Multi-class Logistic Regression Class + + The logistic regression is fully described by a weight matrix :math:`W` + and bias vector :math:`b`. Classification is done by projecting data + points onto a set of hyperplanes, the distance to which is used to + determine a class membership probability. + """ + + def __init__(self, input, n_in, n_out): + """ Initialize the parameters of the logistic regression + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + :type n_in: int + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + :type n_out: int + :param n_out: number of output units, the dimension of the space in + which the labels lie + """ + + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) + self.W = theano.shared( value=numpy.zeros((n_in,n_out), + dtype = theano.config.floatX) ) + # initialize the baises b as a vector of n_out 0s + self.b = theano.shared( value=numpy.zeros((n_out,), + dtype = theano.config.floatX) ) + # compute vector of class-membership probabilities in symbolic form + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred=T.argmax(self.p_y_given_x, axis=1) + + # list of parameters for this layer + self.params = [self.W, self.b] + + def negative_log_likelihood(self, y): + """Return the mean of the negative log-likelihood of the prediction + of this model under a given target distribution. + :param y: corresponds to a vector that gives for each example the + correct label + Note: we use the mean instead of the sum so that + the learning rate is less dependent on the batch size + """ + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + def errors(self, y): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch ; zero one + loss over the size of the minibatch + """ + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + +class SigmoidalLayer(object): + def __init__(self, rng, input, n_in, n_out): + """ + Typical hidden layer of a MLP: units are fully-connected and have + sigmoidal activation function. Weight matrix W is of shape (n_in,n_out) + and the bias vector b is of shape (n_out,). 
+ + Hidden unit activation is given by: sigmoid(dot(input,W) + b) + + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + :type input: theano.tensor.matrix + :param input: a symbolic tensor of shape (n_examples, n_in) + :type n_in: int + :param n_in: dimensionality of input + :type n_out: int + :param n_out: number of hidden units + """ + self.input = input + + W_values = numpy.asarray( rng.uniform( \ + low = -numpy.sqrt(6./(n_in+n_out)), \ + high = numpy.sqrt(6./(n_in+n_out)), \ + size = (n_in, n_out)), dtype = theano.config.floatX) + self.W = theano.shared(value = W_values) + + b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + self.output = T.nnet.sigmoid(T.dot(input, self.W) + self.b) + self.params = [self.W, self.b] + +# PRETRAINING LAYERS + +class RBM(object): + """ + *** WRITE THE ENERGY FUNCTION USE SAME LETTERS AS VARIABLE NAMES IN CODE + """ + + def __init__(self, input=None, n_visible=None, n_hidden=None, + W=None, hbias=None, vbias=None, + numpy_rng=None, theano_rng=None): + """ + RBM constructor. Defines the parameters of the model along with + basic operations for inferring hidden from visible (and vice-versa), + as well as for performing CD updates. + + :param input: None for standalone RBMs or symbolic variable if RBM is + part of a larger graph. + + :param n_visible: number of visible units (necessary when W or vbias is None) + + :param n_hidden: number of hidden units (necessary when W or hbias is None) + + :param W: weights to use for the RBM. None means that a shared variable will be + created with a randomly chosen matrix of size (n_visible, n_hidden). + + :param hbias: *** + + :param vbias: *** + + :param numpy_rng: random number generator (necessary when W is None) + + """ + + params = [] + if W is None: + # choose initial values for weight matrix of RBM + initial_W = numpy.asarray( + numpy_rng.uniform( \ + low=-numpy.sqrt(6./(n_hidden+n_visible)), \ + high=numpy.sqrt(6./(n_hidden+n_visible)), \ + size=(n_visible, n_hidden)), \ + dtype=theano.config.floatX) + W = theano.shared(value=initial_W, name='W') + params.append(W) + + if hbias is None: + # theano shared variables for hidden biases + hbias = theano.shared(value=numpy.zeros(n_hidden, + dtype=theano.config.floatX), name='hbias') + params.append(hbias) + + if vbias is None: + # theano shared variables for visible biases + vbias = theano.shared(value=numpy.zeros(n_visible, + dtype=theano.config.floatX), name='vbias') + params.append(vbias) + + if input is None: + # initialize input layer for standalone RBM or layer0 of DBN + input = T.matrix('input') + + # setup theano random number generator + if theano_rng is None: + theano_rng = RandomStreams(numpy_rng.randint(2**30)) + + self.visible = self.input = input + self.W = W + self.hbias = hbias + self.vbias = vbias + self.theano_rng = theano_rng + self.params = params + self.hidden_mean = T.nnet.sigmoid(T.dot(input, W)+hbias) + self.hidden_sample = theano_rng.binomial(self.hidden_mean.shape, 1, self.hidden_mean) + + def gibbs_k(self, v_sample, k): + ''' This function implements k steps of Gibbs sampling ''' + + # We compute the visible after k steps of Gibbs by iterating + # over ``gibs_1`` for k times; this can be done in Theano using + # the `scan op`. For a more comprehensive description of scan see + # http://deeplearning.net/software/theano/library/scan.html . 
+ + def gibbs_1(v0_sample, W, hbias, vbias): + ''' This function implements one Gibbs step ''' + + # compute the activation of the hidden units given a sample of the + # vissibles + h0_mean = T.nnet.sigmoid(T.dot(v0_sample, W) + hbias) + # get a sample of the hiddens given their activation + h0_sample = self.theano_rng.binomial(h0_mean.shape, 1, h0_mean) + # compute the activation of the visible given the hidden sample + v1_mean = T.nnet.sigmoid(T.dot(h0_sample, W.T) + vbias) + # get a sample of the visible given their activation + v1_act = self.theano_rng.binomial(v1_mean.shape, 1, v1_mean) + return [v1_mean, v1_act] + + + # DEBUGGING TO DO ALL WITHOUT SCAN + if k == 1: + return gibbs_1(v_sample, self.W, self.hbias, self.vbias) + + + # Because we require as output two values, namely the mean field + # approximation of the visible and the sample obtained after k steps, + # scan needs to know the shape of those two outputs. Scan takes + # this information from the variables containing the initial state + # of the outputs. Since we do not need a initial state of ``v_mean`` + # we provide a dummy one used only to get the correct shape + v_mean = T.zeros_like(v_sample) + + # ``outputs_taps`` is an argument of scan which describes at each + # time step what past values of the outputs the function applied + # recursively needs. This is given in the form of a dictionary, + # where the keys are outputs indexes, and values are a list of + # of the offsets used by the corresponding outputs + # In our case the function ``gibbs_1`` applied recursively, requires + # at time k the past value k-1 for the first output (index 0) and + # no past value of the second output + outputs_taps = { 0 : [-1], 1 : [] } + + v_means, v_samples = theano.scan( fn = gibbs_1, + sequences = [], + initial_states = [v_sample, v_mean], + non_sequences = [self.W, self.hbias, self.vbias], + outputs_taps = outputs_taps, + n_steps = k) + return v_means[-1], v_samples[-1] + + def free_energy(self, v_sample): + wx_b = T.dot(v_sample, self.W) + self.hbias + vbias_term = T.sum(T.dot(v_sample, self.vbias)) + hidden_term = T.sum(T.log(1+T.exp(wx_b))) + return -hidden_term - vbias_term + + def cd(self, visible = None, persistent = None, steps = 1): + """ + Return a 5-tuple of values related to contrastive divergence: (cost, + end-state of negative-phase chain, gradient on weights, gradient on + hidden bias, gradient on visible bias) + + If visible is None, it defaults to self.input + If persistent is None, it defaults to self.input + + CD aka CD1 - cd() + CD-10 - cd(steps=10) + PCD - cd(persistent=shared(numpy.asarray(initializer))) + PCD-k - cd(persistent=shared(numpy.asarray(initializer)), + steps=10) + """ + if visible is None: + visible = self.input + + if visible is None: + raise TypeError('visible argument is required when self.input is None') + + if steps is None: + steps = self.gibbs_1 + + if persistent is None: + chain_start = visible + else: + chain_start = persistent + + chain_end_mean, chain_end_sample = self.gibbs_k(chain_start, steps) + + #print >> sys.stderr, "WARNING: DEBUGGING with wrong FREE ENERGY" + #free_energy_delta = - self.free_energy(chain_end_sample) + free_energy_delta = self.free_energy(visible) - self.free_energy(chain_end_sample) + + # we will return all of these regardless of what is in self.params + all_params = [self.W, self.hbias, self.vbias] + + gparams = T.grad(free_energy_delta, all_params, + consider_constant = [chain_end_sample]) + + cross_entropy = T.mean(T.sum( + visible*T.log(chain_end_mean) + (1 
- visible)*T.log(1-chain_end_mean), + axis = 1)) + + return (cross_entropy, chain_end_sample,) + tuple(gparams) + + def cd_updates(self, lr, visible = None, persistent = None, steps = 1): + """ + Return the learning updates for the RBM parameters that are shared variables. + + Also returns an update for the persistent if it is a shared variable. + + These updates are returned as a dictionary. + + :param lr: [scalar] learning rate for contrastive divergence learning + :param visible: see `cd_grad` + :param persistent: see `cd_grad` + :param steps: see `cd_grad` + + """ + + cross_entropy, chain_end, gW, ghbias, gvbias = self.cd(visible, + persistent, steps) + + updates = {} + if hasattr(self.W, 'value'): + updates[self.W] = self.W - lr * gW + if hasattr(self.hbias, 'value'): + updates[self.hbias] = self.hbias - lr * ghbias + if hasattr(self.vbias, 'value'): + updates[self.vbias] = self.vbias - lr * gvbias + if persistent: + #if persistent is a shared var, then it means we should use + updates[persistent] = chain_end + + return updates + +# DEEP MODELS + +class DBN(object): + """ + *** WHAT IS A DBN? + """ + + def __init__(self, input_len, hidden_layers_sizes, n_classes, rng): + """ This class is made to support a variable number of layers. + + :param train_set_x: symbolic variable pointing to the training dataset + + :param train_set_y: symbolic variable pointing to the labels of the + training dataset + + :param input_len: dimension of the input to the sdA + + :param n_layers_sizes: intermidiate layers size, must contain + at least one value + + :param n_classes: dimension of the output of the network + + :param corruption_levels: amount of corruption to use for each + layer + + :param rng: numpy random number generator used to draw initial weights + + :param pretrain_lr: learning rate used during pre-trainnig stage + + :param finetune_lr: learning rate used during finetune stage + """ + + self.sigmoid_layers = [] + self.rbm_layers = [] + self.pretrain_functions = [] + self.params = [] + + theano_rng = RandomStreams(rng.randint(2**30)) + + # allocate symbolic variables for the data + index = T.lscalar() # index to a [mini]batch + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + input = self.x + + # The SdA is an MLP, for which all weights of intermidiate layers + # are shared with a different denoising autoencoders + # We will first construct the SdA as a deep multilayer perceptron, + # and when constructing each sigmoidal layer we also construct a + # denoising autoencoder that shares weights with that layer, and + # compile a training function for that denoising autoencoder + + for n_hid in hidden_layers_sizes: + # construct the sigmoidal layer + + sigmoid_layer = SigmoidalLayer(rng, input, input_len, n_hid) + self.sigmoid_layers.append(sigmoid_layer) + + self.rbm_layers.append(RBM(input=input, + W=sigmoid_layer.W, + hbias=sigmoid_layer.b, + n_visible = input_len, + n_hidden = n_hid, + numpy_rng=rng, + theano_rng=theano_rng)) + + # its arguably a philosophical question... 
+ # but we are going to only declare that the parameters of the + # sigmoid_layers are parameters of the StackedDAA + # the hidden-layer biases in the daa_layers are parameters of those + # daa_layers, but not the StackedDAA + self.params.extend(self.sigmoid_layers[-1].params) + + # get ready for the next loop iteration + input_len = n_hid + input = self.sigmoid_layers[-1].output + + # We now need to add a logistic layer on top of the MLP + self.logistic_regressor = LogisticRegression(input = input, + n_in = input_len, n_out = n_classes) + + self.params.extend(self.logistic_regressor.params) + + def pretraining_functions(self, train_set_x, batch_size, learning_rate, k=1): + if k!=1: + raise NotImplementedError() + index = T.lscalar() # index to a [mini]batch + n_train_batches = train_set_x.value.shape[0] / batch_size + batch_begin = (index % n_train_batches) * batch_size + batch_end = batch_begin+batch_size + + print 'TRAIN_SET X', train_set_x.value.shape + rval = [] + for rbm in self.rbm_layers: + # N.B. these cd() samples are independent from the + # samples used for learning + outputs = list(rbm.cd())[0:2] + rval.append(function([index], outputs, + updates = rbm.cd_updates(lr=learning_rate), + givens = {self.x: train_set_x[batch_begin:batch_end]})) + if rbm is self.rbm_layers[0]: + f = rval[-1] + AA=len(outputs) + for i, implicit_out in enumerate(f.maker.env.outputs): #[len(outputs):]: + print 'OUTPUT ', i + theano.printing.debugprint(implicit_out, file=sys.stdout) + + return rval + + def finetune(self, datasets, lr, batch_size): + + # unpack the various datasets + (train_set_x, train_set_y) = datasets[0] + (valid_set_x, valid_set_y) = datasets[1] + (test_set_x, test_set_y) = datasets[2] + + # compute number of minibatches for training, validation and testing + assert train_set_x.value.shape[0] % batch_size == 0 + assert valid_set_x.value.shape[0] % batch_size == 0 + assert test_set_x.value.shape[0] % batch_size == 0 + n_train_batches = train_set_x.value.shape[0] / batch_size + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size + + index = T.lscalar() # index to a [mini]batch + target = self.y + + train_index = index % n_train_batches + + classifier = self.logistic_regressor + cost = classifier.negative_log_likelihood(target) + # compute the gradients with respect to the model parameters + gparams = T.grad(cost, self.params) + + # compute list of fine-tuning updates + updates = [(param, param - gparam*finetune_lr) + for param,gparam in zip(self.params, gparams)] + + train_fn = theano.function([index], cost, + updates = updates, + givens = { + self.x : train_set_x[train_index*batch_size:(train_index+1)*batch_size], + target : train_set_y[train_index*batch_size:(train_index+1)*batch_size]}) + + test_score_i = theano.function([index], classifier.errors(target), + givens = { + self.x: test_set_x[index*batch_size:(index+1)*batch_size], + target: test_set_y[index*batch_size:(index+1)*batch_size]}) + + valid_score_i = theano.function([index], classifier.errors(target), + givens = { + self.x: valid_set_x[index*batch_size:(index+1)*batch_size], + target: valid_set_y[index*batch_size:(index+1)*batch_size]}) + + def test_scores(): + return [test_score_i(i) for i in xrange(n_test_batches)] + + def valid_scores(): + return [valid_score_i(i) for i in xrange(n_valid_batches)] + + return train_fn, valid_scores, test_scores + +def load_mnist(filename): + f = gzip.open(filename,'rb') + train_set, valid_set, test_set = cPickle.load(f) + 
f.close() + + def shared_dataset(data_xy): + data_x, data_y = data_xy + shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX)) + shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX)) + return shared_x, T.cast(shared_y, 'int32') + + n_train_examples = train_set[0].shape[0] + datasets = shared_dataset(train_set), shared_dataset(valid_set), shared_dataset(test_set) + + return n_train_examples, datasets + +def dbn_main(finetune_lr = 0.01, + pretraining_epochs = 10, + pretrain_lr = 0.1, + training_epochs = 1000, + batch_size = 20, + mnist_file='mnist.pkl.gz'): + """ + Demonstrate stochastic gradient descent optimization for a multilayer perceptron + + This is demonstrated on MNIST. + + :param learning_rate: learning rate used in the finetune stage + (factor for the stochastic gradient) + + :param pretraining_epochs: number of epoch to do pretraining + + :param pretrain_lr: learning rate to be used during pre-training + + :param n_iter: maximal number of iterations ot run the optimizer + + :param mnist_file: path the the pickled mnist_file + + """ + + n_train_examples, train_valid_test = load_mnist(mnist_file) + + print "Creating a Deep Belief Network" + deep_model = DBN( + input_len=28*28, + hidden_layers_sizes = [500, 150, 100], + n_classes=10, + rng = numpy.random.RandomState()) + + #### + #### Phase 1: Pre-training + #### + print "Pretraining (unsupervised learning) ..." + + pretrain_functions = deep_model.pretraining_functions( + batch_size=batch_size, + train_set_x=train_valid_test[0][0], + learning_rate=pretrain_lr, + ) + + start_time = time.clock() + for layer_idx, pretrain_fn in enumerate(pretrain_functions): + # go through pretraining epochs + print 'Pre-training layer %i'% layer_idx + for i in xrange(pretraining_epochs * n_train_examples / batch_size): + outstuff = pretrain_fn(i) + xe, negsample = outstuff[:2] + print (layer_idx, i, + n_train_examples / batch_size, + float(xe), + 'Wmin', deep_model.rbm_layers[0].W.value.min(), + 'Wmax', deep_model.rbm_layers[0].W.value.max(), + 'vmin', deep_model.rbm_layers[0].vbias.value.min(), + 'vmax', deep_model.rbm_layers[0].vbias.value.max(), + #'x>0.3', (input_i>0.3).sum(), + ) + sys.stdout.flush() + if i % 1000 == 0: + PIL.Image.fromarray( + pylearn.io.image_tiling.tile_raster_images(negsample, (28,28), (10,10), + tile_spacing=(1,1))).save('samples_%i_%i.png'%(layer_idx,i)) + + PIL.Image.fromarray( + pylearn.io.image_tiling.tile_raster_images( + deep_model.rbm_layers[0].W.value.T, + (28,28), (10,10), + tile_spacing=(1,1))).save('filters_%i_%i.png'%(layer_idx,i)) + end_time = time.clock() + print 'Pretraining took %f minutes' %((end_time - start_time)/60.) + + return + + print "Fine tuning (supervised learning) ..." + train_fn, valid_scores, test_scores =\ + deep_model.finetune_functions(train_valid_test[0][0], + learning_rate=finetune_lr, # the learning rate + batch_size = batch_size) # number of examples to use at once + + #### + #### Phase 2: Fine Tuning + #### + + patience = 10000 # look as this many examples regardless + patience_increase = 2. 
# wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(n_train_examples, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + patience_max = n_train_examples * training_epochs + + best_epoch = None + best_epoch_test_score = None + best_epoch_valid_score = float('inf') + start_time = time.clock() + + for i in xrange(patience_max): + if i >= patience: + break + + cost_i = train_fn(i) + + if i % validation_frequency == 0: + validation_i = numpy.mean([score for score in valid_scores()]) + + # if we got the best validation score until now + if validation_i < best_epoch_valid_score: + + # improve patience if loss improvement is good enough + threshold_i = best_epoch_valid_score * improvement_threshold + if validation_i < threshold_i: + patience = max(patience, i * patience_increase) + + # save best validation score and iteration number + best_epoch_valid_score = validation_i + best_epoch = i/validation_i + best_epoch_test_score = numpy.mean( + [score for score in test_scores()]) + + print('epoch %i, validation error %f %%, test error %f %%'%( + i/validation_frequency, validation_i*100., + best_epoch_test_score*100.)) + else: + print('epoch %i, validation error %f %%' % ( + i/validation_frequency, validation_i*100.)) + end_time = time.clock() + + print(('Optimization complete with best validation score of %f %%,' + 'with test performance %f %%') % + (finetune_status['best_validation_loss']*100., + finetune_status['test_score']*100.)) + print ('The code ran for %f minutes' % ((finetune_status['duration'])/60.)) + +def rbm_main(): + rbm = RBM(n_visible=20, n_hidden=30, + numpy_rng = numpy.random.RandomState(34)) + + cd_updates = rbm.cd_updates(lr=0.25) + + print cd_updates + + f = function([rbm.input], [], + updates={rbm.W:cd_updates[rbm.W]}) + + theano.printing.debugprint(f.maker.env.outputs[0], + file=sys.stdout) + + +if __name__ == '__main__': + dbn_main() + #rbm_main() + + +if 0: + class DAA(object): + def __init__(self, n_visible= 784, n_hidden= 500, corruption_level = 0.1,\ + input = None, shared_W = None, shared_b = None): + """ + Initialize the dA class by specifying the number of visible units (the + dimension d of the input ), the number of hidden units ( the dimension + d' of the latent or hidden space ) and the corruption level. The + constructor also receives symbolic variables for the input, weights and + bias. Such a symbolic variables are useful when, for example the input is + the result of some computations, or when weights are shared between the + dA and an MLP layer. When dealing with SdAs this always happens, + the dA on layer 2 gets as input the output of the dA on layer 1, + and the weights of the dA are used in the second stage of training + to construct an MLP. 
+ + :param n_visible: number of visible units + + :param n_hidden: number of hidden units + + :param input: a symbolic description of the input or None + + :param corruption_level: the corruption mechanism picks up randomly this + fraction of entries of the input and turns them to 0 + + + """ + self.n_visible = n_visible + self.n_hidden = n_hidden + + # create a Theano random generator that gives symbolic random values + theano_rng = RandomStreams() + + if shared_W != None and shared_b != None : + self.W = shared_W + self.b = shared_b + else: + # initial values for weights and biases + # note : W' was written as `W_prime` and b' as `b_prime` + + # W is initialized with `initial_W` which is uniformely sampled + # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + initial_W = numpy.asarray( numpy.random.uniform( \ + low = -numpy.sqrt(6./(n_hidden+n_visible)), \ + high = numpy.sqrt(6./(n_hidden+n_visible)), \ + size = (n_visible, n_hidden)), dtype = theano.config.floatX) + initial_b = numpy.zeros(n_hidden, dtype = theano.config.floatX) + + + # theano shared variables for weights and biases + self.W = theano.shared(value = initial_W, name = "W") + self.b = theano.shared(value = initial_b, name = "b") + + + initial_b_prime= numpy.zeros(n_visible) + # tied weights, therefore W_prime is W transpose + self.W_prime = self.W.T + self.b_prime = theano.shared(value = initial_b_prime, name = "b'") + + # if no input is given, generate a variable representing the input + if input == None : + # we use a matrix because we expect a minibatch of several examples, + # each example being a row + self.x = T.matrix(name = 'input') + else: + self.x = input + # Equation (1) + # keep 90% of the inputs the same and zero-out randomly selected subset of 10% of the inputs + # note : first argument of theano.rng.binomial is the shape(size) of + # random numbers that it should produce + # second argument is the number of trials + # third argument is the probability of success of any trial + # + # this will produce an array of 0s and 1s where 1 has a + # probability of 1 - ``corruption_level`` and 0 with + # ``corruption_level`` + self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level) * self.x + # Equation (2) + # note : y is stored as an attribute of the class so that it can be + # used later when stacking dAs. + self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b) + # Equation (3) + self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime) + # Equation (4) + # note : we sum over the size of a datapoint; if we are using minibatches, + # L will be a vector, with one entry per example in minibatch + self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) + # note : L is now a vector, where each element is the cross-entropy cost + # of the reconstruction of the corresponding example of the + # minibatch. We need to compute the average of all these to get + # the cost of the minibatch + self.cost = T.mean(self.L) + + self.params = [ self.W, self.b, self.b_prime ] + + class StackedDAA(DeepLayerwiseModel): + """Stacked denoising auto-encoder class (SdA) + + A stacked denoising autoencoder model is obtained by stacking several + dAs. The hidden layer of the dA at layer `i` becomes the input of + the dA at layer `i+1`. 
The first layer dA gets as input the input of
+        the SdA, and the hidden layer of the last dA represents the output.
+        Note that after pretraining, the SdA is dealt with as a normal MLP;
+        the dAs are only used to initialize the weights.
+        """
+
+        def __init__(self, n_ins, hidden_layers_sizes, n_outs,
+                     corruption_levels, rng):
+            """ This class is made to support a variable number of layers.
+
+            :param train_set_x: symbolic variable pointing to the training dataset
+
+            :param train_set_y: symbolic variable pointing to the labels of the
+                                training dataset
+
+            :param n_ins: dimension of the input to the sdA
+
+            :param n_layers_sizes: intermediate layers size, must contain
+                                   at least one value
+
+            :param n_outs: dimension of the output of the network
+
+            :param corruption_levels: amount of corruption to use for each
+                                      layer
+
+            :param rng: numpy random number generator used to draw initial weights
+
+            :param pretrain_lr: learning rate used during the pre-training stage
+
+            :param finetune_lr: learning rate used during the finetune stage
+            """
+
+            self.sigmoid_layers = []
+            self.daa_layers = []
+            self.pretrain_functions = []
+            self.params = []
+            self.n_layers = len(hidden_layers_sizes)
+
+            if len(hidden_layers_sizes) < 1:
+                raise Exception('You must have at least one hidden layer')
+
+            theano_rng = RandomStreams(rng.randint(2**30))
+
+            # allocate symbolic variables for the data
+            index = T.lscalar()      # index to a [mini]batch
+            self.x = T.matrix('x')   # the data is presented as rasterized images
+            self.y = T.ivector('y')  # the labels are presented as a 1D vector
+                                     # of [int] labels
+
+            # The SdA is an MLP, for which all weights of intermediate layers
+            # are shared with a different denoising autoencoder.
+            # We will first construct the SdA as a deep multilayer perceptron,
+            # and when constructing each sigmoidal layer we also construct a
+            # denoising autoencoder that shares weights with that layer, and
+            # compile a training function for that denoising autoencoder
+
+            for i in xrange(self.n_layers):
+                # construct the sigmoidal layer
+
+                sigmoid_layer = SigmoidalLayer(rng,
+                        self.sigmoid_layers[-1].output if i else self.x,
+                        hidden_layers_sizes[i-1] if i else n_ins,
+                        hidden_layers_sizes[i])
+
+                # the DAA constructor defined above takes shared_W / shared_b
+                daa_layer = DAA(n_visible = hidden_layers_sizes[i-1] if i else n_ins,
+                                n_hidden = hidden_layers_sizes[i],
+                                corruption_level = corruption_levels[i],
+                                input = sigmoid_layer.input,
+                                shared_W = sigmoid_layer.W,
+                                shared_b = sigmoid_layer.b)
+
+                # add the layer to the lists of layers
+                self.sigmoid_layers.append(sigmoid_layer)
+                self.daa_layers.append(daa_layer)
+
+                # it's arguably a philosophical question...
+
+                # but we are going to only declare that the parameters of the
+                # sigmoid_layers are parameters of the StackedDAA;
+                # the hidden-layer biases in the daa_layers are parameters of
+                # those daa_layers, but not of the StackedDAA
+                self.params.extend(sigmoid_layer.params)
+
+            # We now need to add a logistic layer on top of the MLP
+            self.logistic_regressor = LogisticRegression(
+                            input = self.sigmoid_layers[-1].output,
+                            n_in = hidden_layers_sizes[-1],
+                            n_out = n_outs)
+
+            self.params.extend(self.logistic_regressor.params)
+
+        def pretraining_functions(self, train_set_x, batch_size, pretrain_lr):
+
+            # compiles one update function per denoising autoencoder layer
+            # and collects them in self.pretrain_functions
+            index = T.lscalar()    # index to a [mini]batch
+
+            for dA_layer in self.daa_layers:
+                # compute gradients of this layer's parameters
+                gparams = T.grad(dA_layer.cost, dA_layer.params)
+                # compute the list of updates
+                updates = {}
+                for param, gparam in zip(dA_layer.params, gparams):
+                    updates[param] = param - gparam * pretrain_lr
+
+                # create a function that trains this dA
+                update_fn = theano.function([index], dA_layer.cost,
+                    updates = updates,
+                    givens = {
+                      self.x : train_set_x[index*batch_size:(index+1)*batch_size]})
+                # collect this function into the list
+                self.pretrain_functions += [update_fn]
+
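For reference: in deep.py above, cd_updates() derives the CD learning rule from T.grad of the free-energy difference free_energy(visible) - free_energy(chain_end_sample), with gibbs_k supplying the negative-phase chain. Below is a minimal plain-numpy sketch of one CD-1 step on a minibatch; cd1_update and its in-place update rule are illustrative and not part of the tutorial files, but for a binary RBM the positive/negative statistics it uses are what that free-energy gradient reduces to.

import numpy

rng = numpy.random.RandomState(123)

def sigmoid(x):
    return 1. / (1. + numpy.exp(-x))

def cd1_update(v0, W, hbias, vbias, lr=0.1):
    """One CD-1 update on a minibatch v0 of shape (batch, n_visible)."""
    # positive phase: hidden units driven by the data
    h0_mean = sigmoid(numpy.dot(v0, W) + hbias)
    h0_sample = (rng.uniform(size=h0_mean.shape) < h0_mean).astype(v0.dtype)
    # negative phase: one Gibbs step back to the visibles (as in gibbs_1)
    v1_mean = sigmoid(numpy.dot(h0_sample, W.T) + vbias)
    v1_sample = (rng.uniform(size=v1_mean.shape) < v1_mean).astype(v0.dtype)
    h1_mean = sigmoid(numpy.dot(v1_sample, W) + hbias)
    # approximate gradient: data statistics minus chain-end statistics
    batch = v0.shape[0]
    W += lr * (numpy.dot(v0.T, h0_mean) - numpy.dot(v1_sample.T, h1_mean)) / batch
    vbias += lr * (v0 - v1_sample).mean(axis=0)
    hbias += lr * (h0_mean - h1_mean).mean(axis=0)
    return W, hbias, vbias

# example usage on random binary data (shapes only, not MNIST)
W = numpy.zeros((784, 500)); hbias = numpy.zeros(500); vbias = numpy.zeros(784)
v0 = (rng.uniform(size=(20, 784)) < 0.1).astype('float64')
W, hbias, vbias = cd1_update(v0, W, hbias, vbias)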
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/logistic_cg.py Fri Feb 26 13:55:27 2010 -0500 @@ -0,0 +1,310 @@ +""" +This tutorial introduces logistic regression using Theano and conjugate +gradient descent. + +Logistic regression is a probabilistic, linear classifier. It is parametrized +by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is +done by projecting data points onto a set of hyperplanes, the distance to +which is used to determine a class membership probability. + +Mathematically, this can be written as: + +.. math:: + P(Y=i|x, W,b) &= softmax_i(W x + b) \\ + &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}} + + +The output of the model or prediction is then done by taking the argmax of +the vector whose i'th element is P(Y=i|x). + +.. math:: + + y_{pred} = argmax_i P(Y=i|x,W,b) + + +This tutorial presents a stochastic gradient descent optimization method +suitable for large datasets, and a conjugate gradient optimization method +that is suitable for smaller datasets. + + +References: + + - textbooks: "Pattern Recognition and Machine Learning" - + Christopher M. Bishop, section 4.3.2 + + +""" +__docformat__ = 'restructedtext en' + + +import numpy, time, cPickle, gzip + +import theano +import theano.tensor as T + + +class LogisticRegression(object): + """Multi-class Logistic Regression Class + + The logistic regression is fully described by a weight matrix :math:`W` + and bias vector :math:`b`. Classification is done by projecting data + points onto a set of hyperplanes, the distance to which is used to + determine a class membership probability. + """ + + + + + def __init__(self, input, n_in, n_out): + """ Initialize the parameters of the logistic regression + + :type input: theano.tensor.TensorType + :param input: symbolic variable that describes the input of the + architecture ( one minibatch) + + :type n_in: int + :param n_in: number of input units, the dimension of the space in + which the datapoint lies + + :type n_out: int + :param n_out: number of output units, the dimension of the space in + which the target lies + + """ + + # initialize theta = (W,b) with 0s; W gets the shape (n_in, n_out), + # while b is a vector of n_out elements, making theta a vector of + # n_in*n_out + n_out elements + self.theta = theano.shared( value = numpy.zeros(n_in*n_out+n_out, dtype = theano.config.floatX) ) + # W is represented by the fisr n_in*n_out elements of theta + self.W = self.theta[0:n_in*n_out].reshape((n_in,n_out)) + # b is the rest (last n_out elements) + self.b = self.theta[n_in*n_out:n_in*n_out+n_out] + + + # compute vector of class-membership probabilities in symbolic form + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred=T.argmax(self.p_y_given_x, axis=1) + + + + + + def negative_log_likelihood(self, y): + """Return the negative log-likelihood of the prediction of this model + under a given target distribution. + + .. 
math:: + + \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = + \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ + \ell (\theta=\{W,b\}, \mathcal{D}) + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label + """ + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + + + + + def errors(self, y): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example + the correct label + """ + + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + + + + + + + +def cg_optimization_mnist( n_epochs=50, mnist_pkl_gz='mnist.pkl.gz' ): + """Demonstrate conjugate gradient optimization of a log-linear model + + This is demonstrated on MNIST. + + :type n_epochs: int + :param n_epochs: number of epochs to run the optimizer + + :type mnist_pkl_gz: string + :param mnist_pkl_gz: the path of the mnist training file from + http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz + + """ + ############# + # LOAD DATA # + ############# + print '... loading data' + + # Load the dataset + f = gzip.open(mnist_pkl_gz,'rb') + train_set, valid_set, test_set = cPickle.load(f) + f.close() + + def shared_dataset(data_xy): + """ Function that loads the dataset into shared variables + + The reason we store our dataset in shared variables is to allow + Theano to copy it into the GPU memory (when code is run on GPU). + Since copying data into the GPU is slow, copying a minibatch everytime + is needed (the default behaviour if the data is not in a shared + variable) would lead to a large decrease in performance. + """ + data_x, data_y = data_xy + shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX)) + shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX)) + # When storing data on the GPU it has to be stored as floats + # therefore we will store the labels as ``floatX`` as well + # (``shared_y`` does exactly that). But during our computations + # we need them as ints (we use labels as index, and if they are + # floats it doesn't make sense) therefore instead of returning + # ``shared_y`` we will have to cast it to int. This little hack + # lets ous get around this issue + return shared_x, T.cast(shared_y, 'int32') + + + test_set_x, test_set_y = shared_dataset(test_set) + valid_set_x, valid_set_y = shared_dataset(valid_set) + train_set_x, train_set_y = shared_dataset(train_set) + + batch_size = 600 # size of the minibatch + + n_train_batches = train_set_x.value.shape[0] / batch_size + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size + + + ishape = (28,28) # this is the size of MNIST images + n_in = 28*28 # number of input units + n_out = 10 # number of output units + + + ###################### + # BUILD ACTUAL MODEL # + ###################### + print '... 
building the model' + + # allocate symbolic variables for the data + minibatch_offset = T.lscalar() # offset to the start of a [mini]batch + x = T.matrix() # the data is presented as rasterized images + y = T.ivector() # the labels are presented as 1D vector of + # [int] labels + + + # construct the logistic regression class + classifier = LogisticRegression( input=x, n_in=28*28, n_out=10) + + # the cost we minimize during training is the negative log likelihood of + # the model in symbolic format + cost = classifier.negative_log_likelihood(y).mean() + + # compile a theano function that computes the mistakes that are made by + # the model on a minibatch + test_model = theano.function([minibatch_offset], classifier.errors(y), + givens={ + x:test_set_x[minibatch_offset:minibatch_offset+batch_size], + y:test_set_y[minibatch_offset:minibatch_offset+batch_size]}) + + validate_model = theano.function([minibatch_offset],classifier.errors(y), + givens={ + x:valid_set_x[minibatch_offset:minibatch_offset+batch_size], + y:valid_set_y[minibatch_offset:minibatch_offset+batch_size]}) + + # compile a thenao function that returns the cost of a minibatch + batch_cost = theano.function([minibatch_offset], cost, + givens= { + x : train_set_x[minibatch_offset:minibatch_offset+batch_size], + y : train_set_y[minibatch_offset:minibatch_offset+batch_size]}) + + + + # compile a theano function that returns the gradient of the minibatch + # with respect to theta + batch_grad = theano.function([minibatch_offset], T.grad(cost,classifier.theta), + givens= { + x : train_set_x[minibatch_offset:minibatch_offset+batch_size], + y : train_set_y[minibatch_offset:minibatch_offset+batch_size]}) + + + # creates a function that computes the average cost on the training set + def train_fn(theta_value): + classifier.theta.value = theta_value + train_losses = [batch_cost(i*batch_size) for i in xrange(n_train_batches)] + return numpy.mean(train_losses) + + # creates a function that computes the average gradient of cost with + # respect to theta + def train_fn_grad(theta_value): + classifier.theta.value = theta_value + grad = batch_grad(0) + for i in xrange(1,n_train_batches): + grad += batch_grad(i*batch_size) + return grad/n_train_batches + + + validation_scores = [float('inf'), 0] + + # creates the validation function + def callback(theta_value): + classifier.theta.value = theta_value + #compute the validation loss + validation_losses = [validate_model(i*batch_size) for i in xrange(n_valid_batches)] + this_validation_loss = numpy.mean(validation_losses) + print('validation error %f %%' % (this_validation_loss*100.,)) + + # check if it is better then best validation score got until now + if this_validation_loss < validation_scores[0]: + # if so, replace the old one, and compute the score on the + # testing dataset + validation_scores[0] = this_validation_loss + test_loses = [test_model(i*batch_size) for i in xrange(n_test_batches)] + validation_scores[1] = numpy.mean(test_loses) + + ############### + # TRAIN MODEL # + ############### + + # using scipy conjugate gradient optimizer + import scipy.optimize + print ("Optimizing using scipy.optimize.fmin_cg...") + start_time = time.clock() + best_w_b = scipy.optimize.fmin_cg( + f = train_fn, + x0 = numpy.zeros((n_in+1)*n_out, dtype=x.dtype), + fprime = train_fn_grad, + callback = callback, + disp = 0, + maxiter = n_epochs) + end_time = time.clock() + print(('Optimization complete with best validation score of %f %%, with ' + 'test performance %f %%') % + (validation_scores[0]*100., 
validation_scores[1]*100.)) + + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + + +if __name__ == '__main__': + cg_optimization_mnist() +
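For reference: logistic_cg.py hands the whole training loop to scipy.optimize.fmin_cg, which only ever sees the flat parameter vector theta; train_fn returns the cost at a given theta, train_fn_grad its gradient, and callback is invoked once per CG iteration (here, to track validation error). A self-contained sketch of that calling convention on a toy quadratic; the names A, b, f and fprime below are illustrative, not from the tutorial.

import numpy
import scipy.optimize

# toy objective: f(theta) = 0.5 * ||A*theta - b||^2, with analytic gradient
A = numpy.array([[3., 1.], [1., 2.]])
b = numpy.array([1., 0.])

def f(theta):
    r = numpy.dot(A, theta) - b
    return 0.5 * numpy.dot(r, r)

def fprime(theta):
    return numpy.dot(A.T, numpy.dot(A, theta) - b)

def callback(theta):
    # called once per CG iteration, like the validation callback above
    print 'current cost %f' % f(theta)

best_theta = scipy.optimize.fmin_cg(f = f, x0 = numpy.zeros(2),
                                    fprime = fprime, callback = callback,
                                    disp = 0, maxiter = 50)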
--- a/code_tutoriel/logistic_sgd.py Thu Feb 25 18:53:02 2010 -0500 +++ b/code_tutoriel/logistic_sgd.py Fri Feb 26 13:55:27 2010 -0500 @@ -32,20 +32,14 @@ - textbooks: "Pattern Recognition and Machine Learning" - Christopher M. Bishop, section 4.3.2 - """ __docformat__ = 'restructedtext en' - -import numpy, cPickle, gzip - -import time +import numpy, time, cPickle, gzip import theano import theano.tensor as T -import theano.tensor.nnet - class LogisticRegression(object): """Multi-class Logistic Regression Class @@ -62,23 +56,26 @@ def __init__(self, input, n_in, n_out): """ Initialize the parameters of the logistic regression + :type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the - architecture (one minibatch) - + architecture (one minibatch) + + :type n_in: int :param n_in: number of input units, the dimension of the space in - which the datapoints lie + which the datapoints lie + :type n_out: int :param n_out: number of output units, the dimension of the space in - which the labels lie + which the labels lie """ # initialize with 0 the weights W as a matrix of shape (n_in, n_out) - self.W = theano.shared( value=numpy.zeros((n_in,n_out), - dtype = theano.config.floatX) ) + self.W = theano.shared(value=numpy.zeros((n_in,n_out), dtype = theano.config.floatX), + name='W') # initialize the baises b as a vector of n_out 0s - self.b = theano.shared( value=numpy.zeros((n_out,), - dtype = theano.config.floatX) ) + self.b = theano.shared(value=numpy.zeros((n_out,), dtype = theano.config.floatX), + name='b') # compute vector of class-membership probabilities in symbolic form @@ -88,6 +85,9 @@ # symbolic form self.y_pred=T.argmax(self.p_y_given_x, axis=1) + # parameters of the model + self.params = [self.W, self.b] + @@ -102,23 +102,30 @@ \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ \ell (\theta=\{W,b\}, \mathcal{D}) - + :type y: theano.tensor.TensorType :param y: corresponds to a vector that gives for each example the - :correct label + correct label Note: we use the mean instead of the sum so that - the learning rate is less dependent on the batch size + the learning rate is less dependent on the batch size """ + # y.shape[0] is (symbolically) the number of rows in y, i.e., number of examples (call it n) in the minibatch + # T.arange(y.shape[0]) is a symbolic vector which will contain [0,1,2,... n-1] + # T.log(self.p_y_given_x) is a matrix of Log-Probabilities (call it LP) with one row per example and one column per class + # LP[T.arange(y.shape[0]),y] is a vector v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., LP[n-1,y[n-1]]] + # and T.mean(LP[T.arange(y.shape[0]),y]) is the mean (across minibatch examples) of the elements in v, + # i.e., the mean log-likelihood across the minibatch. return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) - - - def errors(self, y): """Return a float representing the number of errors in the minibatch over the total number of examples of the minibatch ; zero one loss over the size of the minibatch + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label """ # check if y has same dimension of y_pred @@ -134,72 +141,103 @@ raise NotImplementedError() +def load_data(dataset): + ''' Loads the dataset + + :type dataset: string + :param dataset: the path to the dataset (here MNIST) + ''' + + ############# + # LOAD DATA # + ############# + print '... 
loading data' + + # Load the dataset + f = gzip.open(dataset,'rb') + train_set, valid_set, test_set = cPickle.load(f) + f.close() + + + def shared_dataset(data_xy): + """ Function that loads the dataset into shared variables + + The reason we store our dataset in shared variables is to allow + Theano to copy it into the GPU memory (when code is run on GPU). + Since copying data into the GPU is slow, copying a minibatch everytime + is needed (the default behaviour if the data is not in a shared + variable) would lead to a large decrease in performance. + """ + data_x, data_y = data_xy + shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX)) + shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX)) + # When storing data on the GPU it has to be stored as floats + # therefore we will store the labels as ``floatX`` as well + # (``shared_y`` does exactly that). But during our computations + # we need them as ints (we use labels as index, and if they are + # floats it doesn't make sense) therefore instead of returning + # ``shared_y`` we will have to cast it to int. This little hack + # lets ous get around this issue + return shared_x, T.cast(shared_y, 'int32') + + test_set_x, test_set_y = shared_dataset(test_set) + valid_set_x, valid_set_y = shared_dataset(valid_set) + train_set_x, train_set_y = shared_dataset(train_set) + + rval = [(train_set_x, train_set_y), (valid_set_x,valid_set_y), (test_set_x, test_set_y)] + return rval -def sgd_optimization_mnist( learning_rate=0.01, n_iter=100): + +def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, dataset='mnist.pkl.gz'): """ Demonstrate stochastic gradient descent optimization of a log-linear model This is demonstrated on MNIST. + :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic - gradient + gradient) - :param n_iter: maximal number of iterations ot run the optimizer + :type n_epochs: int + :param n_epochs: maximal number of epochs to run the optimizer + + :type dataset: string + :param dataset: the path of the MNIST dataset file from + http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz """ + datasets = load_data(dataset) - # Load the dataset - f = gzip.open('mnist.pkl.gz','rb') - train_set, valid_set, test_set = cPickle.load(f) - f.close() - - # make minibatches of size 20 - batch_size = 20 # sized of the minibatch + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x , test_set_y = datasets[2] - # Dealing with the training set - # get the list of training images (x) and their labels (y) - (train_set_x, train_set_y) = train_set - # initialize the list of training minibatches with empty list - train_batches = [] - for i in xrange(0, len(train_set_x), batch_size): - # add to the list of minibatches the minibatch starting at - # position i, ending at position i+batch_size - # a minibatch is a pair ; the first element of the pair is a list - # of datapoints, the second element is the list of corresponding - # labels - train_batches = train_batches + \ - [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])] + batch_size = 600 # size of the minibatch - # Dealing with the validation set - (valid_set_x, valid_set_y) = valid_set - # initialize the list of validation minibatches - valid_batches = [] - for i in xrange(0, len(valid_set_x), batch_size): - valid_batches = valid_batches + \ - [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])] - - # Dealing with the testing set - (test_set_x, 
test_set_y) = test_set - # initialize the list of testing minibatches - test_batches = [] - for i in xrange(0, len(test_set_x), batch_size): - test_batches = test_batches + \ - [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])] + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size - ishape = (28,28) # this is the size of MNIST images + ###################### + # BUILD ACTUAL MODEL # + ###################### + print '... building the model' + # allocate symbolic variables for the data - x = T.fmatrix() # the data is presented as rasterized images - y = T.lvector() # the labels are presented as 1D vector of - # [long int] labels + index = T.lscalar() # index to a [mini]batch + x = T.matrix('x') # the data is presented as rasterized images + y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels # construct the logistic regression class - classifier = LogisticRegression( \ - input=x.reshape((batch_size,28*28)), n_in=28*28, n_out=10) + # Each MNIST image has size 28*28 + classifier = LogisticRegression( input=x, n_in=28*28, n_out=10) # the cost we minimize during training is the negative log likelihood of # the model in symbolic format @@ -207,11 +245,21 @@ # compiling a Theano function that computes the mistakes that are made by # the model on a minibatch - test_model = theano.function([x,y], classifier.errors(y)) + test_model = theano.function(inputs = [index], + outputs = classifier.errors(y), + givens={ + x:test_set_x[index*batch_size:(index+1)*batch_size], + y:test_set_y[index*batch_size:(index+1)*batch_size]}) + + validate_model = theano.function( inputs = [index], + outputs = classifier.errors(y), + givens={ + x:valid_set_x[index*batch_size:(index+1)*batch_size], + y:valid_set_y[index*batch_size:(index+1)*batch_size]}) # compute the gradient of cost with respect to theta = (W,b) - g_W = T.grad(cost, classifier.W) - g_b = T.grad(cost, classifier.b) + g_W = T.grad(cost = cost, wrt = classifier.W) + g_b = T.grad(cost = cost, wrt = classifier.b) # specify how to update the parameters of the model as a dictionary updates ={classifier.W: classifier.W - learning_rate*g_W,\ @@ -220,17 +268,25 @@ # compiling a Theano function `train_model` that returns the cost, but in # the same time updates the parameter of the model based on the rules # defined in `updates` - train_model = theano.function([x, y], cost, updates = updates ) + train_model = theano.function(inputs = [index], + outputs = cost, + updates = updates, + givens={ + x:train_set_x[index*batch_size:(index+1)*batch_size], + y:train_set_y[index*batch_size:(index+1)*batch_size]}) - n_minibatches = len(train_batches) # number of minibatchers - + ############### + # TRAIN MODEL # + ############### + print '... training the model' # early-stopping parameters patience = 5000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant - validation_frequency = n_minibatches # go through this many + validation_frequency = min(n_train_batches, patience/2) + # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch @@ -239,29 +295,24 @@ best_validation_loss = float('inf') test_score = 0. 
start_time = time.clock() - # have a maximum of `n_iter` iterations through the entire dataset - for iter in xrange(n_iter* n_minibatches): - # get epoch and minibatch index - epoch = iter / n_minibatches - minibatch_index = iter % n_minibatches + done_looping = False + epoch = 0 + while (epoch < n_epochs) and (not done_looping): + epoch = epoch + 1 + for minibatch_index in xrange(n_train_batches): - # get the minibatches corresponding to `iter` modulo - # `len(train_batches)` - x,y = train_batches[ minibatch_index ] - cost_ij = train_model(x,y) + minibatch_avg_cost = train_model(minibatch_index) + # iteration number + iter = epoch * n_train_batches + minibatch_index if (iter+1) % validation_frequency == 0: # compute zero-one loss on validation set - this_validation_loss = 0. - for x,y in valid_batches: - # sum up the errors for each minibatch - this_validation_loss += test_model(x,y) - # get the average by dividing with the number of minibatches - this_validation_loss /= len(valid_batches) + validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] + this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % \ - (epoch, minibatch_index+1,n_minibatches, \ + (epoch, minibatch_index+1,n_train_batches, \ this_validation_loss*100.)) @@ -275,15 +326,15 @@ best_validation_loss = this_validation_loss # test it on the test set - test_score = 0. - for x,y in test_batches: - test_score += test_model(x,y) - test_score /= len(test_batches) + test_losses = [test_model(i) for i in xrange(n_test_batches)] + test_score = numpy.mean(test_losses) + print((' epoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % \ - (epoch, minibatch_index+1, n_minibatches,test_score*100.)) + (epoch, minibatch_index+1, n_train_batches,test_score*100.)) if patience <= iter : + done_looping = True break end_time = time.clock() @@ -292,12 +343,6 @@ (best_validation_loss * 100., test_score*100.)) print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) - - - - - - if __name__ == '__main__': sgd_optimization_mnist()
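For reference: the comment block added to negative_log_likelihood describes the T.log(self.p_y_given_x)[T.arange(y.shape[0]), y] indexing in words; the same selection is easy to verify in plain numpy. The values of LP and y below are made up for illustration.

import numpy

# log-probabilities for a minibatch of 3 examples over 4 classes
LP = numpy.log(numpy.array([[0.7, 0.1, 0.1, 0.1],
                            [0.2, 0.5, 0.2, 0.1],
                            [0.1, 0.1, 0.1, 0.7]]))
y = numpy.array([0, 1, 3])   # correct class of each example

# picks LP[0, y[0]], LP[1, y[1]], LP[2, y[2]] in one indexing operation,
# exactly what T.log(self.p_y_given_x)[T.arange(y.shape[0]), y] does
picked = LP[numpy.arange(y.shape[0]), y]
nll = -picked.mean()         # mean negative log-likelihood over the minibatch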
--- a/code_tutoriel/mlp.py Thu Feb 25 18:53:02 2010 -0500 +++ b/code_tutoriel/mlp.py Fri Feb 26 13:55:27 2010 -0500 @@ -17,22 +17,65 @@ - textbooks: "Pattern Recognition and Machine Learning" - Christopher M. Bishop, section 5 -TODO: recommended preprocessing, lr ranges, regularization ranges (explain - to do lr first, then add regularization) - """ __docformat__ = 'restructedtext en' -import numpy, cPickle, gzip - +import numpy, time, cPickle, gzip import theano import theano.tensor as T -import time + +from logistic_sgd import LogisticRegression, load_data + + +class HiddenLayer(object): + def __init__(self, rng, input, n_in, n_out, activation = T.tanh): + """ + Typical hidden layer of a MLP: units are fully-connected and have + sigmoidal activation function. Weight matrix W is of shape (n_in,n_out) + and the bias vector b is of shape (n_out,). + + NOTE : The nonlinearity used here is tanh + + Hidden unit activation is given by: tanh(dot(input,W) + b) + + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + + :type input: theano.tensor.dmatrix + :param input: a symbolic tensor of shape (n_examples, n_in) + + :type n_in: int + :param n_in: dimensionality of input -import theano.tensor.nnet + :type n_out: int + :param n_out: number of hidden units + + :type activation: theano.Op or function + :param activation: Non linearity to be applied in the hidden + layer + """ + self.input = input + + # `W` is initialized with `W_values` which is uniformely sampled + # from -6./sqrt(n_in+n_hidden) and 6./sqrt(n_in+n_hidden) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + W_values = numpy.asarray( rng.uniform( \ + low = -numpy.sqrt(6./(n_in+n_out)), \ + high = numpy.sqrt(6./(n_in+n_out)), \ + size = (n_in, n_out)), dtype = theano.config.floatX) + self.W = theano.shared(value = W_values) + + b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + self.output = activation(T.dot(input, self.W) + self.b) + # parameters of the model + self.params = [self.W, self.b] + class MLP(object): """Multi-Layer Perceptron Class @@ -40,188 +83,132 @@ A multilayer perceptron is a feedforward artificial neural network model that has one layer or more of hidden units and nonlinear activations. Intermidiate layers usually have as activation function thanh or the - sigmoid function while the top layer is a softamx layer. + sigmoid function (defined here by a ``SigmoidalLayer`` class) while the + top layer is a softamx layer (defined here by a ``LogisticRegression`` + class). """ - def __init__(self, input, n_in, n_hidden, n_out): + def __init__(self, rng, input, n_in, n_hidden, n_out): """Initialize the parameters for the multilayer perceptron + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + + :type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the architecture (one minibatch) + :type n_in: int :param n_in: number of input units, the dimension of the space in which the datapoints lie + :type n_hidden: int :param n_hidden: number of hidden units + :type n_out: int :param n_out: number of output units, the dimension of the space in which the labels lie """ - # initialize the parameters theta = (W1,b1,W2,b2) ; note that this - # example contains only one hidden layer, but one can have as many - # layers as he/she wishes, making the network deeper. 
The only - # problem making the network deep this way is during learning, - # backpropagation being unable to move the network from the starting - # point towards; this is where pre-training helps, giving a good - # starting point for backpropagation, but more about this in the - # other tutorials - - # `W1` is initialized with `W1_values` which is uniformely sampled - # from -6./sqrt(n_in+n_hidden) and 6./sqrt(n_in+n_hidden) - # the output of uniform if converted using asarray to dtype - # theano.config.floatX so that the code is runable on GPU - W1_values = numpy.asarray( numpy.random.uniform( \ - low = -numpy.sqrt(6./(n_in+n_hidden)), \ - high = numpy.sqrt(6./(n_in+n_hidden)), \ - size = (n_in, n_hidden)), dtype = theano.config.floatX) - # `W2` is initialized with `W2_values` which is uniformely sampled - # from -6./sqrt(n_hidden+n_out) and 6./sqrt(n_hidden+n_out) - # the output of uniform if converted using asarray to dtype - # theano.config.floatX so that the code is runable on GPU - W2_values = numpy.asarray( numpy.random.uniform( - low = -numpy.sqrt(6./(n_hidden+n_out)), \ - high= numpy.sqrt(6./(n_hidden+n_out)),\ - size= (n_hidden, n_out)), dtype = theano.config.floatX) + # Since we are dealing with a one hidden layer MLP, this will + # translate into a TanhLayer connected to the LogisticRegression + # layer; this can be replaced by a SigmoidalLayer, or a layer + # implementing any other nonlinearity + self.hiddenLayer = HiddenLayer(rng = rng, input = input, + n_in = n_in, n_out = n_hidden, + activation = T.tanh) - self.W1 = theano.shared( value = W1_values ) - self.b1 = theano.shared( value = numpy.zeros((n_hidden,), - dtype= theano.config.floatX)) - self.W2 = theano.shared( value = W2_values ) - self.b2 = theano.shared( value = numpy.zeros((n_out,), - dtype= theano.config.floatX)) + # The logistic regression layer gets as input the hidden units + # of the hidden layer + self.logRegressionLayer = LogisticRegression( + input = self.hiddenLayer.output, + n_in = n_hidden, + n_out = n_out) - # symbolic expression computing the values of the hidden layer - self.hidden = T.tanh(T.dot(input, self.W1)+ self.b1) - - # symbolic expression computing the values of the top layer - self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden, self.W2)+self.b2) - - # compute prediction as class whose probability is maximal in - # symbolic form - self.y_pred = T.argmax( self.p_y_given_x, axis =1) - # L1 norm ; one regularization option is to enforce L1 norm to # be small - self.L1 = abs(self.W1).sum() + abs(self.W2).sum() + self.L1 = abs(self.hiddenLayer.W).sum() \ + + abs(self.logRegressionLayer.W).sum() # square of L2 norm ; one regularization option is to enforce # square of L2 norm to be small - self.L2_sqr = (self.W1**2).sum() + (self.W2**2).sum() - - - - def negative_log_likelihood(self, y): - """Return the mean of the negative log-likelihood of the prediction - of this model under a given target distribution. - - .. 
 math::
+        self.L2_sqr = (self.hiddenLayer.W**2).sum() \
+                    + (self.logRegressionLayer.W**2).sum()
-            \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
-            \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
-            \ell (\theta=\{W,b\}, \mathcal{D})
-
+        # negative log likelihood of the MLP is given by the negative
+        # log likelihood of the output of the model, computed in the
+        # logistic regression layer
+        self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood
+        # the same holds for the function computing the number of errors
+        self.errors = self.logRegressionLayer.errors
-        :param y: corresponds to a vector that gives for each example the
-        :correct label
-        """
-        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
-
+        # the parameters of the model are the parameters of the two layers it
+        # is made out of
+        self.params = self.hiddenLayer.params + self.logRegressionLayer.params
-
-    def errors(self, y):
-        """Return a float representing the number of errors in the minibatch
-        over the total number of examples of the minibatch
-        """
-
-        # check if y has same dimension of y_pred
-        if y.ndim != self.y_pred.ndim:
-            raise TypeError('y should have the same shape as self.y_pred',
-                ('y', target.type, 'y_pred', self.y_pred.type))
-        # check if y is of the correct datatype
-        if y.dtype.startswith('int'):
-            # the T.neq operator returns a vector of 0s and 1s, where 1
-            # represents a mistake in prediction
-            return T.mean(T.neq(self.y_pred, y))
-        else:
-            raise NotImplementedError()
-
-
-
-def sgd_optimization_mnist( learning_rate=0.01, L1_reg = 0.00, \
-                            L2_reg = 0.0001, n_iter=100):
+def test_mlp( learning_rate=0.01, L1_reg = 0.00, L2_reg = 0.0001, n_epochs=1000,
+              dataset = 'mnist.pkl.gz'):
     """
     Demonstrate stochastic gradient descent optimization for a multilayer
     perceptron

     This is demonstrated on MNIST.

+    :type learning_rate: float
     :param learning_rate: learning rate used (factor for the stochastic gradient)
+    :type L1_reg: float
     :param L1_reg: L1-norm's weight when added to the cost (see regularization)
+    :type L2_reg: float
     :param L2_reg: L2-norm's weight when added to the cost (see regularization)
-    :param n_iter: maximal number of iterations ot run the optimizer
+    :type n_epochs: int
+    :param n_epochs: maximal number of epochs to run the optimizer
+
+    :type dataset: string
+    :param dataset: the path of the MNIST dataset file from
+                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
+    """
-
-    # Load the dataset
-    f = gzip.open('mnist.pkl.gz','rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    f.close()
-
-    # make minibatches of size 20
-    batch_size = 20    # sized of the minibatch
+    datasets = load_data(dataset)

-    # Dealing with the training set
-    # get the list of training images (x) and their labels (y)
-    (train_set_x, train_set_y) = train_set
-    # initialize the list of training minibatches with empty list
-    train_batches = []
-    for i in xrange(0, len(train_set_x), batch_size):
-        # add to the list of minibatches the minibatch starting at
-        # position i, ending at position i+batch_size
-        # a minibatch is a pair ; the first element of the pair is a list
-        # of datapoints, the second element is the list of corresponding
-        # labels
-        train_batches = train_batches + \
-               [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
+    train_set_x, train_set_y = datasets[0]
+    valid_set_x, valid_set_y = datasets[1]
+    test_set_x , test_set_y  = datasets[2]

-    # Dealing with the validation set
-    (valid_set_x, valid_set_y) = valid_set
-    # initialize the list of validation minibatches
-    valid_batches = []
-    for i in xrange(0, len(valid_set_x), batch_size):
-        valid_batches = valid_batches + \
-               [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
-
-    # Dealing with the testing set
-    (test_set_x, test_set_y) = test_set
-    # initialize the list of testing minibatches
-    test_batches = []
-    for i in xrange(0, len(test_set_x), batch_size):
-        test_batches = test_batches + \
-              [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
-    ishape     = (28,28) # this is the size of MNIST images
+    batch_size = 20    # size of the minibatch
+
+    # compute number of minibatches for training, validation and testing
+    n_train_batches = train_set_x.value.shape[0] / batch_size
+    n_valid_batches = valid_set_x.value.shape[0] / batch_size
+    n_test_batches  = test_set_x.value.shape[0]  / batch_size
+
+    ######################
+    # BUILD ACTUAL MODEL #
+    ######################
+    print '... building the model'

     # allocate symbolic variables for the data
-    x = T.fmatrix()  # the data is presented as rasterized images
-    y = T.lvector()  # the labels are presented as 1D vector of
-                     # [long int] labels
+    index = T.lscalar()    # index to a [mini]batch
+    x     = T.matrix('x')  # the data is presented as rasterized images
+    y     = T.ivector('y') # the labels are presented as 1D vector of
+                           # [int] labels

-    # construct the logistic regression class
-    classifier = MLP( input=x.reshape((batch_size,28*28)),\
-                      n_in=28*28, n_hidden = 500, n_out=10)
+    rng = numpy.random.RandomState(1234)
+
+    # construct the MLP class
+    classifier = MLP( rng = rng, input=x, n_in=28*28, n_hidden = 500, n_out=10)

     # the cost we minimize during training is the negative log likelihood of
     # the model plus the regularization terms (L1 and L2); cost is expressed
@@ -230,36 +217,59 @@
          + L1_reg * classifier.L1 \
         + L2_reg * classifier.L2_sqr

-    # compiling a theano function that computes the mistakes that are made by
-    # the model on a minibatch
-    test_model = theano.function([x,y], classifier.errors(y))
+    # compiling a Theano function that computes the mistakes that are made
+    # by the model on a minibatch
+    test_model = theano.function(inputs = [index],
+            outputs = classifier.errors(y),
+            givens={
+                x:test_set_x[index*batch_size:(index+1)*batch_size],
+                y:test_set_y[index*batch_size:(index+1)*batch_size]})

-    # compute the gradient of cost with respect to theta = (W1, b1, W2, b2)
-    g_W1 = T.grad(cost, classifier.W1)
-    g_b1 = T.grad(cost, classifier.b1)
-    g_W2 = T.grad(cost, classifier.W2)
-    g_b2 = T.grad(cost, classifier.b2)
+    validate_model = theano.function(inputs = [index],
+            outputs = classifier.errors(y),
+            givens={
+                x:valid_set_x[index*batch_size:(index+1)*batch_size],
+                y:valid_set_y[index*batch_size:(index+1)*batch_size]})
+
+    # compute the gradient of cost with respect to theta (stored in params);
+    # the resulting gradients will be stored in a list gparams
+    gparams = []
+    for param in classifier.params:
+        gparam = T.grad(cost, param)
+        gparams.append(gparam)

     # specify how to update the parameters of the model as a dictionary
-    updates = \
-        { classifier.W1: classifier.W1 - learning_rate*g_W1 \
-        , classifier.b1: classifier.b1 - learning_rate*g_b1 \
-        , classifier.W2: classifier.W2 - learning_rate*g_W2 \
-        , classifier.b2: classifier.b2 - learning_rate*g_b2 }
+    updates = {}
+    # given two lists A = [a1,a2,a3,a4] and B = [b1,b2,b3,b4] of the same
+    # length, zip generates a list C of the same size, where each element
+    # is a pair formed from the two lists :
+    #    C = [ (a1,b1), (a2,b2), (a3,b3), (a4,b4) ]
+    for param, gparam in zip(classifier.params, gparams):
+        updates[param] = param - learning_rate*gparam

-    # compiling a theano function `train_model` that returns the cost, but in
-    # the same time updates the parameter of the model based on the rules
+    # compiling a Theano function `train_model` that returns the cost and,
+    # at the same time, updates the parameters of the model based on the rules
     # defined in `updates`
-    train_model = theano.function([x, y], cost, updates = updates )
-    n_minibatches = len(train_batches)
-
+    train_model = theano.function( inputs = [index], outputs = cost,
+            updates = updates,
+            givens={
+                x:train_set_x[index*batch_size:(index+1)*batch_size],
+                y:train_set_y[index*batch_size:(index+1)*batch_size]})
+
+    ###############
+    # TRAIN MODEL #
+    ###############
+    print '... training'
+
     # early-stopping parameters
     patience              = 10000 # look at this many examples regardless
     patience_increase     = 2     # wait this much longer when a new best is
                                   # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
-    validation_frequency = n_minibatches  # go through this many
+    validation_frequency = min(n_train_batches, patience/2)
+                                  # go through this many
                                   # minibatches before checking the network
                                   # on the validation set; in this case we
                                   # check every epoch
@@ -270,56 +280,49 @@
     best_iter = 0
     test_score = 0.
     start_time = time.clock()
-    # have a maximum of `n_iter` iterations through the entire dataset
-    for iter in xrange(n_iter* n_minibatches):
+
+    epoch = 0
+    done_looping = False

-        # get epoch and minibatch index
-        epoch           = iter / n_minibatches
-        minibatch_index = iter % n_minibatches
+    while (epoch < n_epochs) and (not done_looping):
+        epoch = epoch + 1
+        for minibatch_index in xrange(n_train_batches):

-        # get the minibatches corresponding to `iter` modulo
-        # `len(train_batches)`
-        x,y = train_batches[ minibatch_index ]
-        cost_ij = train_model(x,y)
+            minibatch_avg_cost = train_model(minibatch_index)
+            # iteration number
+            iter = epoch * n_train_batches + minibatch_index

             if (iter+1) % validation_frequency == 0:
                 # compute zero-one loss on validation set
-                this_validation_loss = 0.
-                for x,y in valid_batches:
-                    # sum up the errors for each minibatch
-                    this_validation_loss += test_model(x,y)
-                # get the average by dividing with the number of minibatches
-                this_validation_loss /= len(valid_batches)
+                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
+                this_validation_loss = numpy.mean(validation_losses)

                 print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                    (epoch, minibatch_index+1, n_minibatches, \
-                        this_validation_loss*100.))
+                    (epoch, minibatch_index+1, n_train_batches, \
+                        this_validation_loss*100.))

                 # if we got the best validation score until now
                 if this_validation_loss < best_validation_loss:
-
                     # improve patience if loss improvement is good enough
                     if this_validation_loss < best_validation_loss * \
                            improvement_threshold :
                         patience = max(patience, iter * patience_increase)
-
                     # save best validation score and iteration number
                     best_validation_loss = this_validation_loss
-                    best_iter = iter
-                    # test it on the test set
-                    test_score = 0.
-                    for x,y in test_batches:
-                        test_score += test_model(x,y)
-                    test_score /= len(test_batches)
-                    print(('     epoch %i, minibatch %i/%i, test error of best '
-                           'model %f %%') %
-                        (epoch, minibatch_index+1, n_minibatches,
-                         test_score*100.))
+
+                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
+                    test_score = numpy.mean(test_losses)
+
+                    print(('     epoch %i, minibatch %i/%i, test error of best '
+                           'model %f %%') % \
+                        (epoch, minibatch_index+1, n_train_batches, test_score*100.))

             if patience <= iter :
-                break
+                done_looping = True
+                break
+
     end_time = time.clock()
     print(('Optimization complete. Best validation score of %f %% '
@@ -329,5 +332,5 @@

 if __name__ == '__main__':
-    sgd_optimization_mnist()
+    test_mlp()
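The reworked `test_mlp` above replaces the four per-parameter gradient variables with a generic loop over `classifier.params` and adds patience-based early stopping. For readers who want the two patterns in isolation, here is a minimal NumPy sketch on an invented quadratic toy loss; `target`, `best_loss` and the hyper-parameter values are made up for the illustration and are not part of the tutorial code::

    import numpy

    # toy quadratic loss L(w) = 0.5*||w - target||^2, so the gradient is (w - target)
    target = numpy.array([1.0, -2.0, 0.5])
    params = [numpy.zeros(3)]          # a single "parameter", kept in a list
    learning_rate = 0.1

    best_loss = numpy.inf
    patience, patience_increase = 20, 2
    improvement_threshold = 0.995

    for iteration in xrange(1000):
        # analogue of the zip(classifier.params, gparams) update loop
        gparams = [params[0] - target]
        for i, (param, gparam) in enumerate(zip(params, gparams)):
            params[i] = param - learning_rate * gparam

        loss = 0.5 * numpy.sum((params[0] - target) ** 2)
        if loss < best_loss:
            # improve patience if the improvement is large enough
            if loss < best_loss * improvement_threshold:
                patience = max(patience, iteration * patience_increase)
            best_loss = loss
        if patience <= iteration:
            break

    print 'stopped at iteration %i, loss %f' % (iteration, best_loss)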
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code_tutoriel/rbm.py	Fri Feb 26 13:55:27 2010 -0500
@@ -0,0 +1,360 @@
+"""This tutorial introduces restricted Boltzmann machines (RBM) using Theano.
+
+Boltzmann Machines (BMs) are a particular form of energy-based model which
+contain hidden variables. Restricted Boltzmann Machines further restrict BMs
+to those without visible-visible and hidden-hidden connections.
+"""
+
+
+import numpy, time, cPickle, gzip, PIL.Image
+
+import theano
+import theano.tensor as T
+import os
+
+from theano.tensor.shared_randomstreams import RandomStreams
+
+from utils import tile_raster_images
+from logistic_sgd import load_data
+
+class RBM(object):
+    """Restricted Boltzmann Machine (RBM) """
+    def __init__(self, input=None, n_visible=784, n_hidden=500, \
+        W = None, hbias = None, vbias = None, numpy_rng = None,
+        theano_rng = None):
+        """
+        RBM constructor. Defines the parameters of the model along with
+        basic operations for inferring hidden from visible (and vice-versa),
+        as well as for performing CD updates.
+
+        :param input: None for standalone RBMs or symbolic variable if RBM is
+        part of a larger graph.
+
+        :param n_visible: number of visible units
+
+        :param n_hidden: number of hidden units
+
+        :param W: None for standalone RBMs or symbolic variable pointing to a
+        shared weight matrix in case RBM is part of a DBN network; in a DBN,
+        the weights are shared between RBMs and layers of an MLP
+
+        :param hbias: None for standalone RBMs or symbolic variable pointing
+        to a shared hidden units bias vector in case RBM is part of a
+        different network
+
+        :param vbias: None for standalone RBMs or a symbolic variable
+        pointing to a shared visible units bias
+        """
+
+        self.n_visible = n_visible
+        self.n_hidden  = n_hidden
+
+
+        if W is None :
+            # W is initialized with `initial_W` which is uniformly sampled
+            # from -6./sqrt(n_visible+n_hidden) to 6./sqrt(n_hidden+n_visible);
+            # the output of uniform is converted using asarray to dtype
+            # theano.config.floatX so that the code is runnable on GPU
+            initial_W = numpy.asarray( numpy.random.uniform(
+                      low  = -numpy.sqrt(6./(n_hidden+n_visible)),
+                      high =  numpy.sqrt(6./(n_hidden+n_visible)),
+                      size = (n_visible, n_hidden)),
+                      dtype = theano.config.floatX)
+            # theano shared variables for weights and biases
+            W = theano.shared(value = initial_W, name = 'W')
+
+        if hbias is None :
+            # create shared variable for hidden units bias
+            hbias = theano.shared(value = numpy.zeros(n_hidden,
+                        dtype = theano.config.floatX), name='hbias')
+
+        if vbias is None :
+            # create shared variable for visible units bias
+            vbias = theano.shared(value = numpy.zeros(n_visible,
+                        dtype = theano.config.floatX), name='vbias')
+
+        if numpy_rng is None:
+            # create a numpy random number generator
+            numpy_rng = numpy.random.RandomState(1234)
+
+        if theano_rng is None :
+            theano_rng = RandomStreams(numpy_rng.randint(2**30))
+
+
+        # initialize input layer for standalone RBM or layer0 of DBN
+        self.input = input if input else T.dmatrix('input')
+
+        self.W          = W
+        self.hbias      = hbias
+        self.vbias      = vbias
+        self.theano_rng = theano_rng
+        # **** WARNING: It is not a good idea to put things in this list
+        # other than shared variables created in this function.
+        self.params     = [self.W, self.hbias, self.vbias]
+        self.batch_size = self.input.shape[0]
+
+    def free_energy(self, v_sample):
+        ''' Function to compute the free energy '''
+        wx_b = T.dot(v_sample, self.W) + self.hbias
+        vbias_term  = T.sum(T.dot(v_sample, self.vbias))
+        hidden_term = T.sum(T.log(1+T.exp(wx_b)))
+        return -hidden_term - vbias_term
+
+    def sample_h_given_v(self, v0_sample):
+        ''' This function infers the state of the hidden units given the visible units '''
+        # compute the activation of the hidden units given a sample of the visibles
+        h1_mean = T.nnet.sigmoid(T.dot(v0_sample, self.W) + self.hbias)
+        # get a sample of the hiddens given their activation
+        h1_sample = self.theano_rng.binomial(size = h1_mean.shape, n = 1, prob = h1_mean)
+        return [h1_mean, h1_sample]
+
+    def sample_v_given_h(self, h0_sample):
+        ''' This function infers the state of the visible units given the hidden units '''
+        # compute the activation of the visibles given the hidden sample
+        v1_mean = T.nnet.sigmoid(T.dot(h0_sample, self.W.T) + self.vbias)
+        # get a sample of the visibles given their activation
+        v1_sample = self.theano_rng.binomial(size = v1_mean.shape, n = 1, prob = v1_mean)
+        return [v1_mean, v1_sample]
+
+    def gibbs_hvh(self, h0_sample):
+        ''' This function implements one step of Gibbs sampling,
+            starting from the hidden state'''
+        v1_mean, v1_sample = self.sample_v_given_h(h0_sample)
+        h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
+        return [v1_mean, v1_sample, h1_mean, h1_sample]
+
+    def gibbs_vhv(self, v0_sample):
+        ''' This function implements one step of Gibbs sampling,
+            starting from the visible state'''
+        h1_mean, h1_sample = self.sample_h_given_v(v0_sample)
+        v1_mean, v1_sample = self.sample_v_given_h(h1_sample)
+        return [h1_mean, h1_sample, v1_mean, v1_sample]
+
+    def cd(self, lr = 0.1, persistent=None):
+        """
+        This function implements one step of CD-1 or PCD-1
+
+        :param lr: learning rate used to train the RBM
+        :param persistent: None for CD. For PCD, shared variable containing the old state
+        of the Gibbs chain. This must be a shared variable of size (batch size, number of
+        hidden units).
+
+        Returns the updates dictionary. The dictionary contains the update rules for weights
+        and biases but also an update of the shared variable used to store the persistent
+        chain, if one is used.
+        """
+
+        # compute positive phase
+        ph_mean, ph_sample = self.sample_h_given_v(self.input)
+
+        # decide how to initialize the persistent chain:
+        # for CD, we use the newly generated hidden sample
+        # for PCD, we initialize from the old state of the chain
+        if persistent is None:
+            chain_start = ph_sample
+        else:
+            chain_start = persistent
+
+        # perform actual negative phase
+        [nv_mean, nv_sample, nh_mean, nh_sample] = self.gibbs_hvh(chain_start)
+
+        # determine gradients on RBM parameters
+        g_vbias = T.sum( self.input - nv_mean, axis = 0)/self.batch_size
+        g_hbias = T.sum( ph_mean - nh_mean, axis = 0)/self.batch_size
+        g_W = T.dot(ph_mean.T, self.input )/ self.batch_size - \
+              T.dot(nh_mean.T, nv_mean )/ self.batch_size
+
+        gparams = [g_W.T, g_hbias, g_vbias]
+
+        # construct the update dictionary
+        updates = {}
+        for gparam, param in zip(gparams, self.params):
+            updates[param] = param + gparam * lr
+
+        if persistent:
+            # Note that this works only if persistent is a shared variable
+            updates[persistent] = T.cast(nh_sample, dtype=theano.config.floatX)
+            # pseudo-likelihood is a better proxy for PCD
+            cost = self.get_pseudo_likelihood_cost(updates)
+        else:
+            # reconstruction cross-entropy is a better proxy for CD
+            cost = self.get_reconstruction_cost(updates, nv_mean)
+
+        return cost, updates
+
+    def get_pseudo_likelihood_cost(self, updates):
+        """Stochastic approximation to the pseudo-likelihood"""
+
+        # index of bit i in expression p(x_i | x_{\i})
+        bit_i_idx = theano.shared(value=0, name = 'bit_i_idx')
+
+        # binarize the input image by rounding to nearest integer
+        xi = T.iround(self.input)
+
+        # calculate free energy for the given bit configuration
+        fe_xi = self.free_energy(xi)
+
+        # flip bit x_i of matrix xi and preserve all other bits x_{\i}
+        # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx]
+        # NB: slice(start,stop,step) is the python object used for
+        # slicing, e.g. to index matrix x as follows: x[start:stop:step]
+        xi_flip = T.setsubtensor(xi, 1-xi[:, bit_i_idx],
+                                 idx_list=(slice(None,None,None), bit_i_idx))
+
+        # calculate free energy with bit flipped
+        fe_xi_flip = self.free_energy(xi_flip)
+
+        # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i})))
+        cost = self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi))
+
+        # increment bit_i_idx % number as part of updates
+        updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible
+
+        return cost
+
+    def get_reconstruction_cost(self, updates, nv_mean):
+        """Approximation to the reconstruction error"""
+
+        cross_entropy = T.mean(
+                T.sum(self.input*T.log(nv_mean) +
+                (1 - self.input)*T.log(1-nv_mean), axis = 1))
+
+        return cross_entropy
+
+
+
+def test_rbm(learning_rate=0.1, training_epochs = 15,
+             dataset='mnist.pkl.gz'):
+    """
+    Demonstrate how to train and afterwards sample from an RBM.
+
+    This is demonstrated on MNIST.
+
+    :param learning_rate: learning rate used for training the RBM
+
+    :param training_epochs: number of epochs used for training
+
+    :param dataset: path to the pickled dataset
+
+    """
+    datasets = load_data(dataset)
+
+    train_set_x, train_set_y = datasets[0]
+    test_set_x , test_set_y  = datasets[2]
+
+
+    batch_size = 20    # size of the minibatch
+
+    # compute number of minibatches for training, validation and testing
+    n_train_batches = train_set_x.value.shape[0] / batch_size
+
+    # allocate symbolic variables for the data
+    index = T.lscalar()    # index to a [mini]batch
+    x     = T.matrix('x')  # the data is presented as rasterized images
+
+    rng        = numpy.random.RandomState(123)
+    theano_rng = RandomStreams( rng.randint(2**30))
+
+    # initialize storage for the persistent chain (state = hidden layer of chain)
+    persistent_chain = theano.shared(numpy.zeros((batch_size, 500)))
+
+    # construct the RBM class
+    rbm = RBM( input = x, n_visible=28*28, \
+               n_hidden = 500, numpy_rng = rng, theano_rng = theano_rng)
+
+    # get the cost and the gradient corresponding to one step of CD
+    cost, updates = rbm.cd(lr=learning_rate, persistent=persistent_chain)
+
+
+    #################################
+    #     Training the RBM          #
+    #################################
+    dirname = 'lr=%.5f'%learning_rate
+    os.makedirs(dirname)
+    os.chdir(dirname)
+
+    # it is ok for a theano function to have no output;
+    # the purpose of train_rbm is solely to update the RBM parameters
+    train_rbm = theano.function([index], cost,
+           updates = updates,
+           givens  = { x: train_set_x[index*batch_size:(index+1)*batch_size]})
+
+    plotting_time = 0.
+    start_time = time.clock()
+
+
+    # go through training epochs
+    for epoch in xrange(training_epochs):
+
+        # go through the training set
+        mean_cost = []
+        for batch_index in xrange(n_train_batches):
+            mean_cost += [train_rbm(batch_index)]
+
+        print 'Training epoch %d, cost is '%epoch, numpy.mean(mean_cost)
+
+        # Plot filters after each training epoch
+        plotting_start = time.clock()
+        # Construct image from the weight matrix
+        image = PIL.Image.fromarray(tile_raster_images( X = rbm.W.value.T,
+                 img_shape = (28,28), tile_shape = (10,10),
+                 tile_spacing = (1,1)))
+        image.save('filters_at_epoch_%i.png'%epoch)
+        plotting_stop = time.clock()
+        plotting_time += (plotting_stop - plotting_start)
+
+    end_time = time.clock()
+
+    pretraining_time = (end_time - start_time) - plotting_time
+
+    print ('Training took %f minutes' %(pretraining_time/60.))
+
+
+    #################################
+    #     Sampling from the RBM     #
+    #################################
+
+    # find out the number of test samples
+    number_of_test_samples = test_set_x.value.shape[0]
+
+    # pick random test examples, with which to initialize the persistent chain
+    test_idx = rng.randint(number_of_test_samples-20)
+    persistent_vis_chain = theano.shared(test_set_x.value[test_idx:test_idx+20])
+
+    # define one step of Gibbs sampling (mf = mean-field)
+    [hid_mf, hid_sample, vis_mf, vis_sample] = rbm.gibbs_vhv(persistent_vis_chain)
+
+    # the visible sample returned by ``gibbs_vhv`` at the end of the chain is
+    # computed as a binomial draw, therefore it is formed of ints (0 and 1)
+    # and needs to be converted to the same dtype as ``persistent_vis_chain``
+    vis_sample = T.cast(vis_sample, dtype=theano.config.floatX)
+
+    # construct the function that implements our persistent chain;
+    # we generate the "mean field" activations for plotting and the actual samples for
+    # reinitializing the state of our persistent chain
+
+    sample_fn = theano.function([], [vis_mf, vis_sample],
+                      updates = { persistent_vis_chain:vis_sample})
+
+    # sample the RBM, plotting every `plot_every`-th sample; do this
+    # until you plot at least `n_samples`
+    n_samples  = 10
+    plot_every = 1000
+
+    for idx in xrange(n_samples):
+
+        # do `plot_every` intermediate samplings that we do not care about
+        for jdx in xrange(plot_every):
+            vis_mf, vis_sample = sample_fn()
+
+        # construct image
+        image = PIL.Image.fromarray(tile_raster_images(
+                     X = vis_mf,
+                     img_shape = (28,28),
+                     tile_shape = (10,10),
+                     tile_spacing = (1,1) ) )
+        print ' ... plotting sample ', idx
+        image.save('sample_%i_step_%i.png'%(idx, idx*jdx))
+
+if __name__ == '__main__':
+    test_rbm()
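For readers who want to see the CD-1 statistics of `rbm.cd()` with concrete arrays rather than symbolic Theano expressions, here is a minimal NumPy sketch of a single update on a tiny RBM. The layer sizes, learning rate and the random minibatch are invented for the example; only the positive-minus-negative structure mirrors the code above::

    import numpy

    rng = numpy.random.RandomState(0)

    def sigmoid(x):
        return 1.0 / (1.0 + numpy.exp(-x))

    # tiny RBM: 6 visible units, 3 hidden units (sizes are arbitrary for the sketch)
    n_visible, n_hidden, lr = 6, 3, 0.1
    W     = rng.uniform(-0.1, 0.1, size=(n_visible, n_hidden))
    hbias = numpy.zeros(n_hidden)
    vbias = numpy.zeros(n_visible)

    v0 = rng.binomial(1, 0.5, size=(20, n_visible)).astype(float)  # a fake minibatch
    batch_size = v0.shape[0]

    # positive phase: p(h|v0) and a sample from it
    ph_mean = sigmoid(numpy.dot(v0, W) + hbias)
    h0      = rng.binomial(1, ph_mean)

    # one Gibbs step down and back up again (negative phase of CD-1)
    nv_mean = sigmoid(numpy.dot(h0, W.T) + vbias)
    nv      = rng.binomial(1, nv_mean)
    nh_mean = sigmoid(numpy.dot(nv, W) + hbias)

    # approximate gradients, mirroring rbm.cd(): positive minus negative statistics
    g_W     = (numpy.dot(v0.T, ph_mean) - numpy.dot(nv_mean.T, nh_mean)) / batch_size
    g_vbias = (v0 - nv_mean).mean(axis=0)
    g_hbias = (ph_mean - nh_mean).mean(axis=0)

    # gradient ascent on the log-likelihood proxy, as in the updates dictionary above
    W     += lr * g_W
    vbias += lr * g_vbias
    hbias += lr * g_hbias

    print 'W after one CD-1 update:', W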
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code_tutoriel/test.py	Fri Feb 26 13:55:27 2010 -0500
@@ -0,0 +1,18 @@
+#import convolutional_mlp, dbn, logistic_cg, logistic_sgd, mlp, rbm, SdA_loops, SdA
+import convolutional_mlp, logistic_cg, logistic_sgd, mlp, SdA
+from nose.plugins.skip import SkipTest
+#TODO: dbn, rbm, SdA, SdA_loops, convolutional_mlp
+def test_logistic_sgd():
+    logistic_sgd.sgd_optimization_mnist(n_epochs=10)
+def test_logistic_cg():
+    logistic_cg.cg_optimization_mnist(n_epochs=10)
+def test_mlp():
+    mlp.test_mlp(n_epochs=5)
+def test_convolutional_mlp():
+    convolutional_mlp.evaluate_lenet5(n_epochs=5, nkerns=[5,5])
+def test_dbn():
+    raise SkipTest('Implementation not finished')
+def test_rbm():
+    raise SkipTest('Implementation not finished')
+def test_SdA():
+    SdA.test_SdA(pretraining_epochs = 2, training_epochs = 3)
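The file above is an ordinary nose test module, so the usual way to run the quick checks is ``nosetests test.py`` from the code_tutoriel directory. An equivalent programmatic invocation, shown only as a usage hint and assuming nose is installed and the tutorial files are on the path, would be::

    import nose
    # runs the shortened test functions defined in test.py with verbose output
    nose.run(argv=['nosetests', '-v', 'test.py'])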
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/code_tutoriel/utils.py	Fri Feb 26 13:55:27 2010 -0500
@@ -0,0 +1,125 @@
+""" This file contains different utility functions that are not connected
+in any way to the networks presented in the tutorials, but rather help in
+processing the outputs into a more understandable form.
+
+For example ``tile_raster_images`` helps in generating an easy-to-grasp
+image from a set of samples or weights.
+"""
+
+
+import numpy
+
+
+def scale_to_unit_interval(ndar, eps=1e-8):
+    """ Scales all values in the ndarray ndar to be between 0 and 1 """
+    ndar  = ndar.copy()
+    ndar -= ndar.min()
+    ndar *= 1.0 / (ndar.max()+eps)
+    return ndar
+
+
+def tile_raster_images(X, img_shape, tile_shape, tile_spacing = (0,0),
+                       scale_rows_to_unit_interval = True, output_pixel_vals = True):
+    """
+    Transform an array with one flattened image per row into an array in
+    which the images are reshaped and laid out like tiles on a floor.
+
+    This function is useful for visualizing datasets whose rows are images,
+    and also columns of matrices for transforming those rows
+    (such as the first layer of a neural net).
+
+    :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
+    be 2-D ndarrays or None;
+    :param X: a 2-D array in which every row is a flattened image.
+
+    :type img_shape: tuple; (height, width)
+    :param img_shape: the original shape of each image
+
+    :type tile_shape: tuple; (rows, cols)
+    :param tile_shape: the number of images to tile (rows, cols)
+
+    :param output_pixel_vals: if output should be pixel values (i.e. int8
+    values) or floats
+
+    :param scale_rows_to_unit_interval: if the values need to be scaled to
+    [0,1] before being plotted or not
+
+
+    :returns: array suitable for viewing as an image.
+    (See: `PIL.Image.fromarray`.)
+    :rtype: a 2-d array with same dtype as X.
+
+    """
+
+    assert len(img_shape)    == 2
+    assert len(tile_shape)   == 2
+    assert len(tile_spacing) == 2
+
+    # The expression below can be re-written in a more C style as
+    # follows :
+    #
+    # out_shape    = [0,0]
+    # out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] -
+    #                tile_spacing[0]
+    # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] -
+    #                tile_spacing[1]
+    out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp
+                 in zip(img_shape, tile_shape, tile_spacing)]
+
+    if isinstance(X, tuple):
+        assert len(X) == 4
+        # Create an output numpy ndarray to store the image
+        if output_pixel_vals:
+            out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype='uint8')
+        else:
+            out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype=X.dtype)
+
+        # colors default to 0, alpha defaults to 1 (opaque)
+        if output_pixel_vals:
+            channel_defaults = [0, 0, 0, 255]
+        else:
+            channel_defaults = [0., 0., 0., 1.]
+
+        for i in xrange(4):
+            if X[i] is None:
+                # if the channel is None, fill it with zeros of the correct
+                # dtype
+                out_array[:,:,i] = numpy.zeros(out_shape,
+                        dtype='uint8' if output_pixel_vals else out_array.dtype
+                        ) + channel_defaults[i]
+            else:
+                # use a recursive call to compute the channel and store it
+                # in the output
+                out_array[:,:,i] = tile_raster_images(X[i], img_shape, tile_shape, tile_spacing, scale_rows_to_unit_interval, output_pixel_vals)
+        return out_array
+
+    else:
+        # if we are dealing with only one channel
+        H, W   = img_shape
+        Hs, Ws = tile_spacing
+
+        # generate a matrix to store the output
+        out_array = numpy.zeros(out_shape, dtype='uint8' if output_pixel_vals else X.dtype)
+
+
+        for tile_row in xrange(tile_shape[0]):
+            for tile_col in xrange(tile_shape[1]):
+                if tile_row * tile_shape[1] + tile_col < X.shape[0]:
+                    if scale_rows_to_unit_interval:
+                        # if we should scale values to be between 0 and 1,
+                        # do this by calling the `scale_to_unit_interval`
+                        # function
+                        this_img = scale_to_unit_interval(X[tile_row * tile_shape[1] + tile_col].reshape(img_shape))
+                    else:
+                        this_img = X[tile_row * tile_shape[1] + tile_col].reshape(img_shape)
+                    # add the slice to the corresponding position in the
+                    # output array
+                    out_array[
+                        tile_row * (H+Hs):tile_row*(H+Hs)+H,
+                        tile_col * (W+Ws):tile_col*(W+Ws)+W
+                        ] \
+                        = this_img * (255 if output_pixel_vals else 1)
+        return out_array
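Since ``tile_raster_images`` is what rbm.py uses for its filter plots, a short stand-alone usage sketch may help. The random ``fake_W`` matrix below is made up purely to exercise the function and is not part of the tutorial; the call itself mirrors the one in rbm.py::

    import numpy
    import PIL.Image
    from utils import tile_raster_images

    # 100 random 28x28 "filters", one flattened filter per row
    rng = numpy.random.RandomState(0)
    fake_W = rng.randn(100, 28*28)

    # tile them into a single 10x10 grid with 1-pixel spacing and save as PNG;
    # with the default arguments each row is rescaled to [0,1] and the result
    # is a 2-D uint8 array that PIL can display directly
    image = PIL.Image.fromarray(tile_raster_images(X=fake_W,
                                                   img_shape=(28, 28),
                                                   tile_shape=(10, 10),
                                                   tile_spacing=(1, 1)))
    image.save('fake_filters.png')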