comparison code_tutoriel/mlp.py @ 165:4bc5eeec6394

Updating the tutorial code to the latest revisions.
author Dumitru Erhan <dumitru.erhan@gmail.com>
date Fri, 26 Feb 2010 13:55:27 -0500
parents 827de2cc34f8
children
comparing 164:e3de934a98b6 with 165:4bc5eeec6394

@@ -15,319 +15,322 @@
 References:
 
     - textbooks: "Pattern Recognition and Machine Learning" -
                  Christopher M. Bishop, section 5
 
-TODO: recommended preprocessing, lr ranges, regularization ranges (explain
-      to do lr first, then add regularization)
-
 """
 __docformat__ = 'restructedtext en'
 
 
-import numpy, cPickle, gzip
-
+import numpy, time, cPickle, gzip
 
 import theano
 import theano.tensor as T
 
-import time
-
-import theano.tensor.nnet
+
+from logistic_sgd import LogisticRegression, load_data
+
+
+class HiddenLayer(object):
+    def __init__(self, rng, input, n_in, n_out, activation = T.tanh):
+        """
+        Typical hidden layer of a MLP: units are fully-connected and have
+        sigmoidal activation function. Weight matrix W is of shape (n_in,n_out)
+        and the bias vector b is of shape (n_out,).
+
+        NOTE : The nonlinearity used here is tanh
+
+        Hidden unit activation is given by: tanh(dot(input,W) + b)
+
+        :type rng: numpy.random.RandomState
+        :param rng: a random number generator used to initialize weights
+
+        :type input: theano.tensor.dmatrix
+        :param input: a symbolic tensor of shape (n_examples, n_in)
+
+        :type n_in: int
+        :param n_in: dimensionality of input
+
+        :type n_out: int
+        :param n_out: number of hidden units
+
+        :type activation: theano.Op or function
+        :param activation: Non linearity to be applied in the hidden
+                           layer
+        """
+        self.input = input
+
+        # `W` is initialized with `W_values` which is uniformely sampled
+        # from -6./sqrt(n_in+n_hidden) and 6./sqrt(n_in+n_hidden)
+        # the output of uniform if converted using asarray to dtype
+        # theano.config.floatX so that the code is runable on GPU
+        W_values = numpy.asarray( rng.uniform( \
+              low  = -numpy.sqrt(6./(n_in+n_out)), \
+              high =  numpy.sqrt(6./(n_in+n_out)), \
+              size = (n_in, n_out)), dtype = theano.config.floatX)
+        self.W = theano.shared(value = W_values)
+
+        b_values = numpy.zeros((n_out,), dtype= theano.config.floatX)
+        self.b = theano.shared(value= b_values)
+
+        self.output = activation(T.dot(input, self.W) + self.b)
+        # parameters of the model
+        self.params = [self.W, self.b]
+
 
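
Aside (illustration only, not part of the changeset): the new ``HiddenLayer`` computes ``activation(T.dot(input, W) + b)`` with ``W`` drawn uniformly from the interval +/- sqrt(6./(n_in+n_out)). A minimal NumPy-only sketch of the same computation, with hypothetical names, can be used to check shapes without Theano::

    import numpy

    def hidden_forward(x, rng, n_in, n_out):
        # same initialization interval as HiddenLayer:
        # U(-sqrt(6/(n_in+n_out)), +sqrt(6/(n_in+n_out)))
        bound = numpy.sqrt(6. / (n_in + n_out))
        W = rng.uniform(low=-bound, high=bound, size=(n_in, n_out))
        b = numpy.zeros(n_out)
        # hidden unit activation: tanh(dot(x, W) + b)
        return numpy.tanh(numpy.dot(x, W) + b)

    rng = numpy.random.RandomState(1234)
    x = rng.uniform(size=(20, 28 * 28))                  # one minibatch of 20 flattened images
    print(hidden_forward(x, rng, 28 * 28, 500).shape)    # (20, 500)
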
 class MLP(object):
     """Multi-Layer Perceptron Class
 
     A multilayer perceptron is a feedforward artificial neural network model
     that has one layer or more of hidden units and nonlinear activations.
     Intermidiate layers usually have as activation function thanh or the
-    sigmoid function while the top layer is a softamx layer.
+    sigmoid function (defined here by a ``SigmoidalLayer`` class) while the
+    top layer is a softamx layer (defined here by a ``LogisticRegression``
+    class).
     """
 
 
 
-    def __init__(self, input, n_in, n_hidden, n_out):
+    def __init__(self, rng, input, n_in, n_hidden, n_out):
         """Initialize the parameters for the multilayer perceptron
 
+        :type rng: numpy.random.RandomState
+        :param rng: a random number generator used to initialize weights
+
+        :type input: theano.tensor.TensorType
         :param input: symbolic variable that describes the input of the
                       architecture (one minibatch)
 
+        :type n_in: int
         :param n_in: number of input units, the dimension of the space in
                      which the datapoints lie
 
+        :type n_hidden: int
         :param n_hidden: number of hidden units
 
+        :type n_out: int
         :param n_out: number of output units, the dimension of the space in
                       which the labels lie
 
         """
 
-        # initialize the parameters theta = (W1,b1,W2,b2) ; note that this
-        # example contains only one hidden layer, but one can have as many
-        # layers as he/she wishes, making the network deeper. The only
-        # problem making the network deep this way is during learning,
-        # backpropagation being unable to move the network from the starting
-        # point towards; this is where pre-training helps, giving a good
-        # starting point for backpropagation, but more about this in the
-        # other tutorials
-
-        # `W1` is initialized with `W1_values` which is uniformely sampled
-        # from -6./sqrt(n_in+n_hidden) and 6./sqrt(n_in+n_hidden)
-        # the output of uniform if converted using asarray to dtype
-        # theano.config.floatX so that the code is runable on GPU
-        W1_values = numpy.asarray( numpy.random.uniform( \
-              low = -numpy.sqrt(6./(n_in+n_hidden)), \
-              high = numpy.sqrt(6./(n_in+n_hidden)), \
-              size = (n_in, n_hidden)), dtype = theano.config.floatX)
-        # `W2` is initialized with `W2_values` which is uniformely sampled
-        # from -6./sqrt(n_hidden+n_out) and 6./sqrt(n_hidden+n_out)
-        # the output of uniform if converted using asarray to dtype
-        # theano.config.floatX so that the code is runable on GPU
-        W2_values = numpy.asarray( numpy.random.uniform(
-              low = -numpy.sqrt(6./(n_hidden+n_out)), \
-              high= numpy.sqrt(6./(n_hidden+n_out)),\
-              size= (n_hidden, n_out)), dtype = theano.config.floatX)
-
-        self.W1 = theano.shared( value = W1_values )
-        self.b1 = theano.shared( value = numpy.zeros((n_hidden,),
-                                                dtype= theano.config.floatX))
-        self.W2 = theano.shared( value = W2_values )
-        self.b2 = theano.shared( value = numpy.zeros((n_out,),
-                                                dtype= theano.config.floatX))
-
-        # symbolic expression computing the values of the hidden layer
-        self.hidden = T.tanh(T.dot(input, self.W1)+ self.b1)
-
-        # symbolic expression computing the values of the top layer
-        self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden, self.W2)+self.b2)
-
-        # compute prediction as class whose probability is maximal in
-        # symbolic form
-        self.y_pred = T.argmax( self.p_y_given_x, axis =1)
-
+        # Since we are dealing with a one hidden layer MLP, this will
+        # translate into a TanhLayer connected to the LogisticRegression
+        # layer; this can be replaced by a SigmoidalLayer, or a layer
+        # implementing any other nonlinearity
+        self.hiddenLayer = HiddenLayer(rng = rng, input = input,
+                                       n_in = n_in, n_out = n_hidden,
+                                       activation = T.tanh)
+
+        # The logistic regression layer gets as input the hidden units
+        # of the hidden layer
+        self.logRegressionLayer = LogisticRegression(
+                                    input = self.hiddenLayer.output,
+                                    n_in  = n_hidden,
+                                    n_out = n_out)
+
         # L1 norm ; one regularization option is to enforce L1 norm to
         # be small
-        self.L1 = abs(self.W1).sum() + abs(self.W2).sum()
+        self.L1 = abs(self.hiddenLayer.W).sum() \
+                + abs(self.logRegressionLayer.W).sum()
 
         # square of L2 norm ; one regularization option is to enforce
         # square of L2 norm to be small
-        self.L2_sqr = (self.W1**2).sum() + (self.W2**2).sum()
-
+        self.L2_sqr = (self.hiddenLayer.W**2).sum() \
+                    + (self.logRegressionLayer.W**2).sum()
 
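
Aside (illustration only, not part of the changeset): the cost minimized below is the mean negative log-likelihood of the softmax output plus ``L1_reg`` times the L1 term and ``L2_reg`` times the squared-L2 term defined above. A NumPy sketch of that cost, using a hypothetical ``mlp_cost`` helper and random stand-in parameters::

    import numpy

    def mlp_cost(x, y, W_h, b_h, W_out, b_out, L1_reg=0.00, L2_reg=0.0001):
        # forward pass: tanh hidden layer followed by a softmax output layer
        hidden = numpy.tanh(numpy.dot(x, W_h) + b_h)
        scores = numpy.dot(hidden, W_out) + b_out
        scores = scores - scores.max(axis=1, keepdims=True)      # numerical stability
        p_y_given_x = numpy.exp(scores)
        p_y_given_x = p_y_given_x / p_y_given_x.sum(axis=1, keepdims=True)
        # mean negative log-likelihood of the correct labels
        nll = -numpy.mean(numpy.log(p_y_given_x[numpy.arange(y.shape[0]), y]))
        # the two regularization terms defined on the weight matrices
        L1 = abs(W_h).sum() + abs(W_out).sum()
        L2_sqr = (W_h ** 2).sum() + (W_out ** 2).sum()
        return nll + L1_reg * L1 + L2_reg * L2_sqr

    rng = numpy.random.RandomState(1234)
    x = rng.uniform(size=(20, 784))
    y = rng.randint(0, 10, size=20)
    W_h = rng.uniform(-0.1, 0.1, size=(784, 500))
    W_out = rng.uniform(-0.1, 0.1, size=(500, 10))
    print(mlp_cost(x, y, W_h, numpy.zeros(500), W_out, numpy.zeros(10)))
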
-
-    def negative_log_likelihood(self, y):
-        """Return the mean of the negative log-likelihood of the prediction
-        of this model under a given target distribution.
-
-        .. math::
-
-            \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
-            \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
-                \ell (\theta=\{W,b\}, \mathcal{D})
-
+        # negative log likelihood of the MLP is given by the negative
+        # log likelihood of the output of the model, computed in the
+        # logistic regression layer
+        self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood
+        # same holds for the function computing the number of errors
+        self.errors = self.logRegressionLayer.errors
+
+        # the parameters of the model are the parameters of the two layer it is
+        # made out of
+        self.params = self.hiddenLayer.params + self.logRegressionLayer.params
+
 
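
Aside (illustration only, not part of the changeset): the ``errors`` attribute that the MLP now takes from ``LogisticRegression`` computes the same zero-one loss as the method removed just below, i.e. the mean of ``y_pred != y``. In plain NumPy::

    import numpy

    def zero_one_loss(y_pred, y):
        # fraction of minibatch examples whose predicted label differs from
        # the true label; T.neq followed by T.mean does the same symbolically
        return numpy.mean(y_pred != y)

    print(zero_one_loss(numpy.array([1, 2, 3, 4]), numpy.array([1, 0, 3, 0])))   # 0.5
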
-        :param y: corresponds to a vector that gives for each example the
-        :correct label
-        """
-        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
-
-
-
-
-    def errors(self, y):
-        """Return a float representing the number of errors in the minibatch
-        over the total number of examples of the minibatch
-        """
-
-        # check if y has same dimension of y_pred
-        if y.ndim != self.y_pred.ndim:
-            raise TypeError('y should have the same shape as self.y_pred',
-                ('y', target.type, 'y_pred', self.y_pred.type))
-        # check if y is of the correct datatype
-        if y.dtype.startswith('int'):
-            # the T.neq operator returns a vector of 0s and 1s, where 1
-            # represents a mistake in prediction
-            return T.mean(T.neq(self.y_pred, y))
-        else:
-            raise NotImplementedError()
-
-
-
-def sgd_optimization_mnist( learning_rate=0.01, L1_reg = 0.00, \
-                            L2_reg = 0.0001, n_iter=100):
+def test_mlp( learning_rate=0.01, L1_reg = 0.00, L2_reg = 0.0001, n_epochs=1000,
+              dataset = 'mnist.pkl.gz'):
     """
     Demonstrate stochastic gradient descent optimization for a multilayer
     perceptron
 
     This is demonstrated on MNIST.
 
+    :type learning_rate: float
     :param learning_rate: learning rate used (factor for the stochastic
                           gradient
 
+    :type L1_reg: float
     :param L1_reg: L1-norm's weight when added to the cost (see
                    regularization)
 
+    :type L2_reg: float
     :param L2_reg: L2-norm's weight when added to the cost (see
                    regularization)
 
-    :param n_iter: maximal number of iterations ot run the optimizer
+    :type n_epochs: int
+    :param n_epochs: maximal number of epochs to run the optimizer
+
+    :type dataset: string
+    :param dataset: the path of the MNIST dataset file from
+                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
+
 
     """
-
-    # Load the dataset
-    f = gzip.open('mnist.pkl.gz','rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    f.close()
+    datasets = load_data(dataset)
+
+    train_set_x, train_set_y = datasets[0]
+    valid_set_x, valid_set_y = datasets[1]
+    test_set_x , test_set_y  = datasets[2]
 
-    # make minibatches of size 20
-    batch_size = 20    # sized of the minibatch
-
-    # Dealing with the training set
-    # get the list of training images (x) and their labels (y)
-    (train_set_x, train_set_y) = train_set
-    # initialize the list of training minibatches with empty list
-    train_batches = []
-    for i in xrange(0, len(train_set_x), batch_size):
-        # add to the list of minibatches the minibatch starting at
-        # position i, ending at position i+batch_size
-        # a minibatch is a pair ; the first element of the pair is a list
-        # of datapoints, the second element is the list of corresponding
-        # labels
-        train_batches = train_batches + \
-               [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
-
-    # Dealing with the validation set
-    (valid_set_x, valid_set_y) = valid_set
-    # initialize the list of validation minibatches
-    valid_batches = []
-    for i in xrange(0, len(valid_set_x), batch_size):
-        valid_batches = valid_batches + \
-               [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
-
-    # Dealing with the testing set
-    (test_set_x, test_set_y) = test_set
-    # initialize the list of testing minibatches
-    test_batches = []
-    for i in xrange(0, len(test_set_x), batch_size):
-        test_batches = test_batches + \
-               [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
-
-
-    ishape = (28,28) # this is the size of MNIST images
+
+
+    batch_size = 20    # size of the minibatch
+
+    # compute number of minibatches for training, validation and testing
+    n_train_batches = train_set_x.value.shape[0] / batch_size
+    n_valid_batches = valid_set_x.value.shape[0] / batch_size
+    n_test_batches  = test_set_x.value.shape[0]  / batch_size
+
+    ######################
+    # BUILD ACTUAL MODEL #
+    ######################
+    print '... building the model'
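
Aside (illustration only, not part of the changeset): the explicit lists of minibatches built above are replaced by an integer count of minibatches plus index arithmetic; minibatch ``index`` is just the slice ``[index * batch_size : (index + 1) * batch_size]``, which is what the ``givens`` clauses below substitute on the shared datasets. A NumPy sketch with stand-in arrays::

    import numpy

    batch_size = 20
    # stand-ins for the MNIST training arrays (50000 images of 28*28 pixels)
    train_x = numpy.zeros((50000, 28 * 28))
    train_y = numpy.zeros(50000, dtype='int32')

    n_train_batches = train_x.shape[0] // batch_size    # 2500 minibatches

    def get_minibatch(index):
        # the same slice that the `givens` substitutions perform
        lo, hi = index * batch_size, (index + 1) * batch_size
        return train_x[lo:hi], train_y[lo:hi]

    x_batch, y_batch = get_minibatch(3)     # examples 60..79
    print(x_batch.shape, y_batch.shape)     # (20, 784) (20,)
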
 
     # allocate symbolic variables for the data
-    x = T.fmatrix()  # the data is presented as rasterized images
-    y = T.lvector()  # the labels are presented as 1D vector of
-                     # [long int] labels
-
-    # construct the logistic regression class
-    classifier = MLP( input=x.reshape((batch_size,28*28)),\
-                      n_in=28*28, n_hidden = 500, n_out=10)
+    index = T.lscalar()    # index to a [mini]batch
+    x     = T.matrix('x')  # the data is presented as rasterized images
+    y     = T.ivector('y') # the labels are presented as 1D vector of
+                           # [int] labels
+
+    rng = numpy.random.RandomState(1234)
+
+    # construct the MLP class
+    classifier = MLP( rng = rng, input=x, n_in=28*28, n_hidden = 500, n_out=10)
 
     # the cost we minimize during training is the negative log likelihood of
     # the model plus the regularization terms (L1 and L2); cost is expressed
     # here symbolically
     cost = classifier.negative_log_likelihood(y) \
          + L1_reg * classifier.L1 \
          + L2_reg * classifier.L2_sqr
 
-    # compiling a theano function that computes the mistakes that are made by
-    # the model on a minibatch
-    test_model = theano.function([x,y], classifier.errors(y))
-
-    # compute the gradient of cost with respect to theta = (W1, b1, W2, b2)
-    g_W1 = T.grad(cost, classifier.W1)
-    g_b1 = T.grad(cost, classifier.b1)
-    g_W2 = T.grad(cost, classifier.W2)
-    g_b2 = T.grad(cost, classifier.b2)
+    # compiling a Theano function that computes the mistakes that are made
+    # by the model on a minibatch
+    test_model = theano.function(inputs = [index],
+            outputs = classifier.errors(y),
+            givens={
+                x:test_set_x[index*batch_size:(index+1)*batch_size],
+                y:test_set_y[index*batch_size:(index+1)*batch_size]})
+
+    validate_model = theano.function(inputs = [index],
+            outputs = classifier.errors(y),
+            givens={
+                x:valid_set_x[index*batch_size:(index+1)*batch_size],
+                y:valid_set_y[index*batch_size:(index+1)*batch_size]})
+
+    # compute the gradient of cost with respect to theta (sotred in params)
+    # the resulting gradients will be stored in a list gparams
+    gparams = []
+    for param in classifier.params:
+        gparam = T.grad(cost, param)
+        gparams.append(gparam)
+
 
     # specify how to update the parameters of the model as a dictionary
-    updates = \
-        { classifier.W1: classifier.W1 - learning_rate*g_W1 \
-        , classifier.b1: classifier.b1 - learning_rate*g_b1 \
-        , classifier.W2: classifier.W2 - learning_rate*g_W2 \
-        , classifier.b2: classifier.b2 - learning_rate*g_b2 }
-
-    # compiling a theano function `train_model` that returns the cost, but in
-    # the same time updates the parameter of the model based on the rules
+    updates = {}
+    # given two list the zip A = [ a1,a2,a3,a4] and B = [b1,b2,b3,b4] of
+    # same length, zip generates a list C of same size, where each element
+    # is a pair formed from the two lists :
+    #    C = [ (a1,b1), (a2,b2), (a3,b3) , (a4,b4) ]
+    for param, gparam in zip(classifier.params, gparams):
+        updates[param] = param - learning_rate*gparam
+
+    # compiling a Theano function `train_model` that returns the cost, but
+    # in the same time updates the parameter of the model based on the rules
     # defined in `updates`
-    train_model = theano.function([x, y], cost, updates = updates )
-    n_minibatches = len(train_batches)
-
+    train_model =theano.function( inputs = [index], outputs = cost,
+            updates = updates,
+            givens={
+                x:train_set_x[index*batch_size:(index+1)*batch_size],
+                y:train_set_y[index*batch_size:(index+1)*batch_size]})
+
+    ###############
+    # TRAIN MODEL #
+    ###############
+    print '... training'
+
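
Aside (illustration only, not part of the changeset): the ``updates`` dictionary built from ``zip(classifier.params, gparams)`` expresses plain gradient descent, param := param - learning_rate * gradient, once per parameter. The same loop over hypothetical NumPy stand-ins::

    import numpy

    learning_rate = 0.01
    # stand-ins for the model parameters and their gradients (tiny shapes
    # for readability; the real W and b are much larger)
    params  = [numpy.ones((3, 2)), numpy.zeros(2)]
    gparams = [numpy.full((3, 2), 0.1), numpy.full(2, 0.1)]

    # pair each parameter with its gradient and apply the SGD rule
    for param, gparam in zip(params, gparams):
        param -= learning_rate * gparam

    print(params[0][0, 0], params[1][0])    # 0.999 and -0.001
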
     # early-stopping parameters
     patience = 10000  # look as this many examples regardless
     patience_increase = 2  # wait this much longer when a new best is
                            # found
     improvement_threshold = 0.995  # a relative improvement of this much is
                                    # considered significant
-    validation_frequency = n_minibatches  # go through this many
+    validation_frequency = min(n_train_batches,patience/2)
+                                  # go through this many
                                   # minibatche before checking the network
                                   # on the validation set; in this case we
                                   # check every epoch
 
 
     best_params = None
     best_validation_loss = float('inf')
     best_iter = 0
     test_score = 0.
     start_time = time.clock()
-    # have a maximum of `n_iter` iterations through the entire dataset
-    for iter in xrange(n_iter* n_minibatches):
-
-        # get epoch and minibatch index
-        epoch = iter / n_minibatches
-        minibatch_index = iter % n_minibatches
-
-        # get the minibatches corresponding to `iter` modulo
-        # `len(train_batches)`
-        x,y = train_batches[ minibatch_index ]
-        cost_ij = train_model(x,y)
+
+    epoch = 0
+    done_looping = False
+
+    while (epoch < n_epochs) and (not done_looping):
+        epoch = epoch + 1
+        for minibatch_index in xrange(n_train_batches):
+
+            minibatch_avg_cost = train_model(minibatch_index)
+            # iteration number
+            iter = epoch * n_train_batches + minibatch_index
 
             if (iter+1) % validation_frequency == 0:
                 # compute zero-one loss on validation set
-                this_validation_loss = 0.
-                for x,y in valid_batches:
-                    # sum up the errors for each minibatch
-                    this_validation_loss += test_model(x,y)
-                # get the average by dividing with the number of minibatches
-                this_validation_loss /= len(valid_batches)
+                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
+                this_validation_loss = numpy.mean(validation_losses)
 
                 print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                      (epoch, minibatch_index+1, n_minibatches, \
+                      (epoch, minibatch_index+1,n_train_batches, \
                        this_validation_loss*100.))
 
 
                 # if we got the best validation score until now
                 if this_validation_loss < best_validation_loss:
-
                     #improve patience if loss improvement is good enough
                     if this_validation_loss < best_validation_loss * \
                            improvement_threshold :
                         patience = max(patience, iter * patience_increase)
 
-                    # save best validation score and iteration number
                     best_validation_loss = this_validation_loss
-                    best_iter = iter
-
                     # test it on the test set
-                    test_score = 0.
-                    for x,y in test_batches:
-                        test_score += test_model(x,y)
-                    test_score /= len(test_batches)
+
+                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
+                    test_score = numpy.mean(test_losses)
+
                     print(('     epoch %i, minibatch %i/%i, test error of best '
-                           'model %f %%') %
-                          (epoch, minibatch_index+1, n_minibatches,
-                           test_score*100.))
+                           'model %f %%') % \
+                          (epoch, minibatch_index+1, n_train_batches,test_score*100.))
 
             if patience <= iter :
-                break
+                done_looping = True
+                break
+
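
Aside (illustration only, not part of the changeset): the loop above implements patience-based early stopping. Training runs for at least ``patience`` minibatch updates, ``patience`` is raised to ``iter * patience_increase`` whenever the validation error improves by more than the relative ``improvement_threshold``, and the loop stops once ``iter`` exceeds the current patience. With the MNIST sizes used here, ``validation_frequency`` equals ``n_train_batches``, so validation happens once per epoch. A stripped-down sketch of just that control flow, with a hypothetical validation curve standing in for ``validate_model``::

    def early_stopping_demo(n_epochs=1000, n_train_batches=2500):
        patience = 10000                  # minimum number of minibatch updates to run
        patience_increase = 2
        improvement_threshold = 0.995
        validation_frequency = min(n_train_batches, patience / 2)
        best_validation_loss = float('inf')

        # hypothetical validation curve: decays, then flattens out
        validation_error = lambda t: 0.2 + 0.8 * (0.99 ** t)

        epoch = 0
        done_looping = False
        while (epoch < n_epochs) and (not done_looping):
            epoch += 1
            for minibatch_index in range(n_train_batches):
                iter = epoch * n_train_batches + minibatch_index
                if (iter + 1) % validation_frequency == 0:
                    this_loss = validation_error(iter // validation_frequency)
                    if this_loss < best_validation_loss * improvement_threshold:
                        # significant improvement: allow training to run longer
                        patience = max(patience, iter * patience_increase)
                    best_validation_loss = min(best_validation_loss, this_loss)
                if patience <= iter:
                    done_looping = True
                    break
        return epoch, best_validation_loss

    print(early_stopping_demo())
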
 
     end_time = time.clock()
     print(('Optimization complete. Best validation score of %f %% '
            'obtained at iteration %i, with test performance %f %%') %
           (best_validation_loss * 100., best_iter, test_score*100.))
     print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
 
 
 if __name__ == '__main__':
-    sgd_optimization_mnist()
+    test_mlp()
 