# HG changeset patch
# User Dumitru Erhan
# Date 1267212528 18000
# Node ID 89a725d332aecdd7a188f6cf0b337c9044b6b630
# Parent d37c944133c3d79fa6f3a0917840f9960d0dfb2a
moved deepmlp code into baseline/deep_mlp

diff -r d37c944133c3 -r 89a725d332ae baseline/deep_mlp/deepmlp.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/deep_mlp/deepmlp.py	Fri Feb 26 14:28:48 2010 -0500
@@ -0,0 +1,310 @@
+#
+
+import numpy, cPickle, gzip
+
+
+import theano
+import theano.tensor as T
+
+import time
+
+import theano.tensor.nnet
+
+class MLP(object):
+    """Multi-Layer Perceptron Class
+
+    A multilayer perceptron is a feedforward artificial neural network model
+    that has one or more layers of hidden units with nonlinear activations.
+    Intermediate layers usually use the tanh or sigmoid function as their
+    activation, while the top layer is a softmax layer.
+    """
+
+    def __init__(self, input, n_in, n_hidden, n_out):
+        """Initialize the parameters of the multilayer perceptron.
+
+        :param input: symbolic variable that describes the input of the
+        architecture (one minibatch)
+
+        :param n_in: number of input units, the dimension of the space in
+        which the datapoints lie
+
+        :param n_hidden: list giving the number of units of each hidden
+        layer
+
+        :param n_out: number of output units, the dimension of the space in
+        which the labels lie
+
+        """
+
+        # initialize the parameters theta = (W,b); here W and b are lists
+        # where W[i] and b[i] are the weight matrix and the bias vector of
+        # the i-th layer
+        n_layer = len(n_hidden)
+        W_values = []
+        self.W = []
+        self.b = []
+
+        # We first initialize W[0] and b[0], the parameters from the input
+        # to the first hidden layer
+        W_values.append(numpy.asarray(numpy.random.uniform(
+            low=-numpy.sqrt(6. / (n_in + n_hidden[0])),
+            high=numpy.sqrt(6. / (n_in + n_hidden[0])),
+            size=(n_in, n_hidden[0])), dtype=theano.config.floatX))
+        self.W.append(theano.shared(value=W_values[0]))
+        self.b.append(theano.shared(value=numpy.zeros((n_hidden[0],),
+            dtype=theano.config.floatX)))
+
+        # We initialize the parameters between all consecutive hidden layers
+        for i in range(1, n_layer):
+            # Each `W[i]` is initialized with `W_values[i]`, which is
+            # uniformly sampled from the interval
+            # -sqrt(6./(n_hidden[i-1]+n_hidden[i])) to
+            # sqrt(6./(n_hidden[i-1]+n_hidden[i]));
+            # the output of uniform is converted using asarray to dtype
+            # theano.config.floatX so that the code is runnable on GPU
+            W_values.append(numpy.asarray(numpy.random.uniform(
+                low=-numpy.sqrt(6. / (n_hidden[i - 1] + n_hidden[i])),
+                high=numpy.sqrt(6. / (n_hidden[i - 1] + n_hidden[i])),
+                size=(n_hidden[i - 1], n_hidden[i])), dtype=theano.config.floatX))
+            self.W.append(theano.shared(value=W_values[i]))
+            self.b.append(theano.shared(value=numpy.zeros((n_hidden[i],),
+                dtype=theano.config.floatX)))
+
+        # We initialize W[n_layer] and b[n_layer], the parameters from the
+        # last hidden layer to the output layer, using the same uniform
+        # sampling.
+        W_values.append(numpy.asarray(numpy.random.uniform(
+            low=-numpy.sqrt(6. / (n_hidden[n_layer - 1] + n_out)),
+            high=numpy.sqrt(6. / (n_hidden[n_layer - 1] + n_out)),
+            size=(n_hidden[n_layer - 1], n_out)), dtype=theano.config.floatX))
+        self.W.append(theano.shared(value=W_values[n_layer]))
+        self.b.append(theano.shared(value=numpy.zeros((n_out,),
+            dtype=theano.config.floatX)))
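+
+        # Note on the scaling above: drawing each weight matrix uniformly
+        # from +/- sqrt(6./(fan_in+fan_out)) is a heuristic that keeps the
+        # variance of activations and back-propagated gradients roughly
+        # constant across tanh layers. As a rough worked example, with the
+        # defaults used below (n_in=784, n_hidden[0]=200) the first-layer
+        # weights are drawn from about [-0.078, 0.078], since
+        # sqrt(6./984) ~= 0.078.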
+
+        # list of the symbolic expressions computing the values of each
+        # hidden layer
+        self.hidden = []
+
+        # symbolic expression of the first hidden layer
+        self.hidden.append(T.tanh(T.dot(input, self.W[0]) + self.b[0]))
+        for i in range(1, n_layer):
+            # symbolic expression of the i-th hidden layer
+            self.hidden.append(T.tanh(T.dot(self.hidden[i - 1], self.W[i]) + self.b[i]))
+
+        # symbolic expression computing the values of the top (softmax) layer
+        self.p_y_given_x = T.nnet.softmax(T.dot(self.hidden[n_layer - 1], self.W[n_layer]) + self.b[n_layer])
+
+        # compute the prediction as the class whose probability is maximal,
+        # in symbolic form
+        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
+
+        # L1 norm; one regularization option is to enforce the L1 norm to
+        # be small
+        self.L1 = abs(self.W[0]).sum()
+        for i in range(1, n_layer + 1):
+            self.L1 += abs(self.W[i]).sum()
+
+        # square of the L2 norm; one regularization option is to enforce
+        # the square of the L2 norm to be small
+        self.L2_sqr = (self.W[0] ** 2).sum()
+        for i in range(1, n_layer + 1):
+            self.L2_sqr += (self.W[i] ** 2).sum()
+
+    def negative_log_likelihood(self, y):
+        """Return the mean of the negative log-likelihood of the
+        predictions of this model under the target labels `y`.
+        """
+        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
+
+    def errors(self, y):
+        """Return a float representing the number of errors in the
+        minibatch over the total number of examples of the minibatch.
+        """
+
+        # check if y has the same dimension as y_pred
+        if y.ndim != self.y_pred.ndim:
+            raise TypeError('y should have the same shape as self.y_pred',
+                ('y', y.type, 'y_pred', self.y_pred.type))
+        # check if y is of the correct datatype
+        if y.dtype.startswith('int'):
+            # the T.neq operator returns a vector of 0s and 1s, where 1
+            # represents a mistake in prediction
+            return T.mean(T.neq(self.y_pred, y))
+        else:
+            raise NotImplementedError()
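+
+# A minimal usage sketch of the class above (illustrative only; the names
+# below are placeholders, not part of this module):
+#
+#     x = T.fmatrix()
+#     clf = MLP(input=x, n_in=28 * 28, n_hidden=[500, 500], n_out=10)
+#     predict = theano.function([x], clf.y_pred)
+#
+# `predict` then maps a float32 matrix of rasterized images (one row per
+# example) to one class label per row.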
+
+def sgd_optimization_mnist(learning_rate=0.01, L1_reg=0.00,
+        L2_reg=0.0001, n_iter=100, n_hidden=[200, 100, 90, 80, 70]):
+    """
+    Demonstrate stochastic gradient descent optimization of a multilayer
+    perceptron.
+
+    This is demonstrated on MNIST.
+
+    :param learning_rate: learning rate used (factor for the stochastic
+    gradient)
+
+    :param L1_reg: weight of the L1-norm penalty when added to the cost
+    (see regularization)
+
+    :param L2_reg: weight of the L2-norm penalty when added to the cost
+    (see regularization)
+
+    :param n_iter: maximal number of iterations to run the optimizer
+
+    :param n_hidden: list giving the number of units of each hidden layer
+
+    """
+
+    # Load the dataset
+    f = gzip.open('mnist.pkl.gz', 'rb')
+    train_set, valid_set, test_set = cPickle.load(f)
+    f.close()
+
+    # make minibatches of size 20
+    batch_size = 20    # size of the minibatch
+
+    # Dealing with the training set
+    # get the list of training images (x) and their labels (y)
+    (train_set_x, train_set_y) = train_set
+
+    # initialize the list of training minibatches with the empty list
+    train_batches = []
+    for i in xrange(0, len(train_set_x), batch_size):
+        # add to the list of minibatches the minibatch starting at
+        # position i and ending at position i+batch_size;
+        # a minibatch is a pair: the first element is a list of datapoints,
+        # the second element is the list of corresponding labels
+        train_batches = train_batches + \
+            [(train_set_x[i:i + batch_size], train_set_y[i:i + batch_size])]
+
+    # Dealing with the validation set
+    (valid_set_x, valid_set_y) = valid_set
+    # initialize the list of validation minibatches
+    valid_batches = []
+    for i in xrange(0, len(valid_set_x), batch_size):
+        valid_batches = valid_batches + \
+            [(valid_set_x[i:i + batch_size], valid_set_y[i:i + batch_size])]
+
+    # Dealing with the testing set
+    (test_set_x, test_set_y) = test_set
+    # initialize the list of testing minibatches
+    test_batches = []
+    for i in xrange(0, len(test_set_x), batch_size):
+        test_batches = test_batches + \
+            [(test_set_x[i:i + batch_size], test_set_y[i:i + batch_size])]
+
+
+    ishape = (28, 28)    # this is the size of MNIST images
+
+    # allocate symbolic variables for the data
+    x = T.fmatrix()    # the data is presented as rasterized images
+    y = T.lvector()    # the labels are presented as a 1D vector of
+                       # [long int] labels
+
+    # construct the MLP classifier
+    classifier = MLP(input=x.reshape((batch_size, 28 * 28)),
+        n_in=28 * 28, n_hidden=n_hidden, n_out=10)
+
+    # the cost we minimize during training is the negative log likelihood
+    # of the model plus the regularization terms (L1 and L2); the cost is
+    # expressed here symbolically
+    cost = classifier.negative_log_likelihood(y) \
+        + L1_reg * classifier.L1 \
+        + L2_reg * classifier.L2_sqr
+
+    # compiling a theano function that computes the mistakes that are made
+    # by the model on a minibatch
+    test_model = theano.function([x, y], classifier.errors(y))
+
+    # compute the gradient of the cost with respect to each parameter in
+    # theta = (W[0], b[0], ..., W[n_layer], b[n_layer])
+    g_W = []
+    g_b = []
+    for i in range(len(n_hidden) + 1):
+        g_W.append(T.grad(cost, classifier.W[i]))
+        g_b.append(T.grad(cost, classifier.b[i]))
+
+
+    # specify how to update the parameters of the model as a dictionary
+    updates = {}
+    for i in range(len(n_hidden) + 1):
+        updates[classifier.W[i]] = classifier.W[i] - learning_rate * g_W[i]
+        updates[classifier.b[i]] = classifier.b[i] - learning_rate * g_b[i]
+
+    # compiling a theano function `train_model` that returns the cost and
+    # at the same time updates the parameters of the model based on the
+    # rules defined in `updates`
+    train_model = theano.function([x, y], cost, updates=updates)
+
+    n_minibatches = len(train_batches)
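+
+    # The `updates` dictionary compiled into `train_model` implements plain
+    # stochastic gradient descent: each call to `train_model` replaces
+    # every shared parameter p by p - learning_rate * dcost/dp. In numpy
+    # terms a single step amounts to (illustrative only, not executed here):
+    #
+    #     W[i] -= learning_rate * g_W[i]
+    #     b[i] -= learning_rate * g_b[i]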
+
+    # early-stopping parameters
+    patience = 10000    # look at this many examples regardless
+    patience_increase = 2    # wait this much longer when a new best is
+                             # found
+    improvement_threshold = 0.995    # a relative improvement of this much
+                                     # is considered significant
+    validation_frequency = n_minibatches    # go through this many
+                                            # minibatches before checking
+                                            # the network on the validation
+                                            # set; in this case we check
+                                            # every epoch
+
+
+    best_params = None
+    best_validation_loss = float('inf')
+    best_iter = 0
+    test_score = 0.
+    start_time = time.clock()
+    # have a maximum of `n_iter` iterations through the entire dataset
+    for iter in xrange(n_iter * n_minibatches):
+
+        # get the epoch and the minibatch index
+        epoch = iter / n_minibatches
+        minibatch_index = iter % n_minibatches
+
+        # get the minibatch corresponding to `iter` modulo
+        # `len(train_batches)`
+        x, y = train_batches[minibatch_index]
+        cost_ij = train_model(x, y)
+
+        if (iter + 1) % validation_frequency == 0:
+            # compute the zero-one loss on the validation set
+            this_validation_loss = 0.
+            for x, y in valid_batches:
+                # sum up the errors for each minibatch
+                this_validation_loss += test_model(x, y)
+            # get the average by dividing by the number of minibatches
+            this_validation_loss /= len(valid_batches)
+
+            print('epoch %i, minibatch %i/%i, validation error %f %%' % \
+                (epoch, minibatch_index + 1, n_minibatches, \
+                this_validation_loss * 100.))
+
+
+            # if we got the best validation score so far
+            if this_validation_loss < best_validation_loss:
+
+                # improve patience if the loss improvement is good enough
+                if this_validation_loss < best_validation_loss * \
+                        improvement_threshold:
+                    patience = max(patience, iter * patience_increase)
+
+                # save the best validation score and iteration number
+                best_validation_loss = this_validation_loss
+                best_iter = iter
+
+                # test it on the test set
+                test_score = 0.
+                for x, y in test_batches:
+                    test_score += test_model(x, y)
+                test_score /= len(test_batches)
+                print(('     epoch %i, minibatch %i/%i, test error of best '
+                    'model %f %%') %
+                    (epoch, minibatch_index + 1, n_minibatches,
+                    test_score * 100.))
+
+        if patience <= iter:
+            break
+
+    end_time = time.clock()
+    print(('Optimization complete. Best validation score of %f %% '
+        'obtained at iteration %i, with test performance %f %%') %
+        (best_validation_loss * 100., best_iter, test_score * 100.))
+    print('The code ran for %f minutes' % ((end_time - start_time) / 60.))
+    # test on NIST (you need pylearn and access to NIST to do that)
+
+if __name__ == '__main__':
+    sgd_optimization_mnist()
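+
+# To run this demo (assuming `mnist.pkl.gz`, as loaded above, is available
+# in the working directory):
+#
+#     python baseline/deep_mlp/deepmlp.py
+#
+# or, from an interpreter, with a smaller hypothetical architecture:
+#
+#     from deepmlp import sgd_optimization_mnist
+#     sgd_optimization_mnist(n_hidden=[500], n_iter=10)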
- """ - - - - def __init__(self, input, n_in, n_hidden, n_out): - """Initialize the parameters for the multilayer perceptron - - :param input: symbolic variable that describes the input of the - architecture (one minibatch) - - :param n_in: number of input units, the dimension of the space in - which the datapoints lie - - :param n_hidden: List representing the number of units for each - hidden layer - - #:param n_layer: Number of hidden layers - - :param n_out: number of output units, the dimension of the space in - which the labels lie - - """ - - # initialize the parameters theta = (W,b) ; Here W and b are lists - # where W[i] and b[i] represent the parameters and the bias vector - # of the i-th layer. - n_layer=len(n_hidden) - W_values=[] - b_values=[] - self.W=[] - self.b=[] - - # We first initialize the matrix W[0] and b[0] that represent the parameters - # from the input to the first hidden layer - W_values.append(numpy.asarray( numpy.random.uniform( \ - low = -numpy.sqrt(6./(n_in+n_hidden[0])), \ - high = numpy.sqrt(6./(n_in+n_hidden[0])), \ - size = (n_in, n_hidden[0])), dtype = theano.config.floatX)) - self.W.append(theano.shared( value = W_values[0] )) - self.b.append(theano.shared( value = numpy.zeros((n_hidden[0],), - dtype= theano.config.floatX))) - - # We initialize the parameters between all consecutive hidden layers - for i in range(1,n_layer): - # Each `W[i]` is initialized with `W_values[i]` which is uniformely sampled - # from -6./sqrt(n_hidden[i]+n_hidden[i+1]) and 6./sqrt(n_hidden[i]+n_hidden[i+1]) - # the output of uniform if converted using asarray to dtype - # theano.config.floatX so that the code is runable on GPU - W_values.append(numpy.asarray( numpy.random.uniform( \ - low = -numpy.sqrt(6./(n_hidden[i-1]+n_hidden[i])), \ - high = numpy.sqrt(6./(n_hidden[i-1]+n_hidden[i])), \ - size = (n_hidden[i-1], n_hidden[i])), dtype = theano.config.floatX)) - self.W.append(theano.shared( value = W_values[i] )) - self.b.append(theano.shared( value = numpy.zeros((n_hidden[i],), - dtype= theano.config.floatX))) - - # We initialize the matrix W[n_layer] and b[n_layer] that represent - # the parameters from the last hidden layer to the output layer using the - # same uniform sampling. 
- W_values.append(numpy.asarray( numpy.random.uniform( - low = -numpy.sqrt(6./(n_hidden[n_layer-1]+n_out)), \ - high= numpy.sqrt(6./(n_hidden[n_layer-1]+n_out)),\ - size= (n_hidden[n_layer-1], n_out)), dtype = theano.config.floatX)) - self.W.append(theano.shared( value = W_values[n_layer])) - self.b.append(theano.shared( value = numpy.zeros((n_out,), - dtype= theano.config.floatX))) - - # List of the symbolic expressions computing the values each hidden layer - self.hidden = [] - - # Symbolic expression of the first hidden layer - self.hidden.append(T.tanh(T.dot(input, self.W[0])+ self.b[0])) - for i in range(1,n_layer): - # Symbolic expression of the i-th hidden layer - self.hidden.append(T.tanh(T.dot(self.hidden[i-1], self.W[i])+ self.b[i])) - - # symbolic expression computing the values of the top layer - self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden[n_layer-1], self.W[n_layer])+self.b[n_layer]) - - # compute prediction as class whose probability is maximal in - # symbolic form - self.y_pred = T.argmax( self.p_y_given_x, axis =1) - - # L1 norm ; one regularization option is to enforce L1 norm to - # be small - self.L1=abs(self.W[0]).sum() - self.L2_sqr=abs(self.W[0]).sum() - for i in range(1,n_layer+1): - self.L1 += abs(self.W[i]).sum() - # square of L2 norm ; one regularization option is to enforce - # square of L2 norm to be small - for i in range(n_layer+1): - self.L2_sqr += abs(self.W[i]**2).sum() - - def negative_log_likelihood(self, y): - return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) - - def errors(self, y): - """Return a float representing the number of errors in the minibatch - over the total number of examples of the minibatch - """ - - # check if y has same dimension of y_pred - if y.ndim != self.y_pred.ndim: - raise TypeError('y should have the same shape as self.y_pred', - ('y', target.type, 'y_pred', self.y_pred.type)) - # check if y is of the correct datatype - if y.dtype.startswith('int'): - # the T.neq operator returns a vector of 0s and 1s, where 1 - # represents a mistake in prediction - return T.mean(T.neq(self.y_pred, y)) - else: - raise NotImplementedError() -def sgd_optimization_mnist( learning_rate=0.01, L1_reg = 0.00, \ - L2_reg = 0.0001, n_iter=100,n_hidden=[200,100,90,80,70]): - """ - Demonstrate stochastic gradient descent optimization for a multilayer - perceptron - - This is demonstrated on MNIST. 
- - :param learning_rate: learning rate used (factor for the stochastic - gradient - - :param L1_reg: L1-norm's weight when added to the cost (see - regularization) - - :param L2_reg: L2-norm's weight when added to the cost (see - regularization) - - :param n_iter: maximal number of iterations ot run the optimizer - - """ - - # Load the dataset - f = gzip.open('mnist.pkl.gz','rb') - train_set, valid_set, test_set = cPickle.load(f) - f.close() - - # make minibatches of size 20 - batch_size = 20 # sized of the minibatch - - # Dealing with the training set - # get the list of training images (x) and their labels (y) - (train_set_x, train_set_y) = train_set - - # initialize the list of training minibatches with empty list - train_batches = [] - for i in xrange(0, len(train_set_x), batch_size): - # add to the list of minibatches the minibatch starting at - # position i, ending at position i+batch_size - # a minibatch is a pair ; the first element of the pair is a list - # of datapoints, the second element is the list of corresponding - # labels - train_batches = train_batches + \ - [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])] - - # Dealing with the validation set - (valid_set_x, valid_set_y) = valid_set - # initialize the list of validation minibatches - valid_batches = [] - for i in xrange(0, len(valid_set_x), batch_size): - valid_batches = valid_batches + \ - [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])] - - # Dealing with the testing set - (test_set_x, test_set_y) = test_set - # initialize the list of testing minibatches - test_batches = [] - for i in xrange(0, len(test_set_x), batch_size): - test_batches = test_batches + \ - [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])] - - - ishape = (28,28) # this is the size of MNIST images - - # allocate symbolic variables for the data - x = T.fmatrix() # the data is presented as rasterized images - y = T.lvector() # the labels are presented as 1D vector of - # [long int] labels - - # construct the logistic regression class - classifier = MLP( input=x.reshape((batch_size,28*28)),\ - n_in=28*28, n_hidden=n_hidden, n_out=10) - - # the cost we minimize during training is the negative log likelihood of - # the model plus the regularization terms (L1 and L2); cost is expressed - # here symbolically - cost = classifier.negative_log_likelihood(y) \ - + L1_reg * classifier.L1 \ - + L2_reg * classifier.L2_sqr - - # compiling a theano function that computes the mistakes that are made by - # the model on a minibatch - test_model = theano.function([x,y], classifier.errors(y)) - g_W=[] - g_b=[] - # compute the gradient of cost with respect to theta = (W1, b1, W2, b2) - for i in range(len(n_hidden)+1): - g_W.append(T.grad(cost, classifier.W[i])) - g_b.append(T.grad(cost, classifier.b[i])) - - - # specify how to update the parameters of the model as a dictionary - updates={} - for i in range(len(n_hidden)+1): - updates[classifier.W[i]]= classifier.W[i] - learning_rate*g_W[i] - updates[classifier.b[i]]= classifier.b[i] - learning_rate*g_b[i] - # compiling a theano function `train_model` that returns the cost, but in - # the same time updates the parameter of the model based on the rules - # defined in `updates` - train_model = theano.function([x, y], cost, updates = updates ) - n_minibatches = len(train_batches) - - # early-stopping parameters - patience = 10000 # look as this many examples regardless - patience_increase = 2 # wait this much longer when a new best is - # found - improvement_threshold = 0.995 # a relative 
improvement of this much is - # considered significant - validation_frequency = n_minibatches # go through this many - # minibatche before checking the network - # on the validation set; in this case we - # check every epoch - - - best_params = None - best_validation_loss = float('inf') - best_iter = 0 - test_score = 0. - start_time = time.clock() - # have a maximum of `n_iter` iterations through the entire dataset - for iter in xrange(n_iter* n_minibatches): - - # get epoch and minibatch index - epoch = iter / n_minibatches - minibatch_index = iter % n_minibatches - - # get the minibatches corresponding to `iter` modulo - # `len(train_batches)` - x,y = train_batches[ minibatch_index ] - cost_ij = train_model(x,y) - - if (iter+1) % validation_frequency == 0: - # compute zero-one loss on validation set - this_validation_loss = 0. - for x,y in valid_batches: - # sum up the errors for each minibatch - this_validation_loss += test_model(x,y) - # get the average by dividing with the number of minibatches - this_validation_loss /= len(valid_batches) - - print('epoch %i, minibatch %i/%i, validation error %f %%' % \ - (epoch, minibatch_index+1, n_minibatches, \ - this_validation_loss*100.)) - - - # if we got the best validation score until now - if this_validation_loss < best_validation_loss: - - #improve patience if loss improvement is good enough - if this_validation_loss < best_validation_loss * \ - improvement_threshold : - patience = max(patience, iter * patience_increase) - - # save best validation score and iteration number - best_validation_loss = this_validation_loss - best_iter = iter - - # test it on the test set - test_score = 0. - for x,y in test_batches: - test_score += test_model(x,y) - test_score /= len(test_batches) - print((' epoch %i, minibatch %i/%i, test error of best ' - 'model %f %%') % - (epoch, minibatch_index+1, n_minibatches, - test_score*100.)) - - if patience <= iter : - break - - end_time = time.clock() - print(('Optimization complete. Best validation score of %f %% ' - 'obtained at iteration %i, with test performance %f %%') % - (best_validation_loss * 100., best_iter, test_score*100.)) - print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) - #test on NIST (you need pylearn and access to NIST to do that) -if __name__ == '__main__': - sgd_optimization_mnist() -