# HG changeset patch
# User Dumitru Erhan
# Date 1264390469 18000
# Node ID bcc87d3e33a342a42a0469e6ad95002663b135f7
# Parent  0fda55a7de99c9b08ce0f36acd56b9f00f2d857c
adding latest tutorial code

diff -r 0fda55a7de99 -r bcc87d3e33a3 code_tutoriel/logistic_cg.py
--- a/code_tutoriel/logistic_cg.py	Sun Jan 24 22:33:33 2010 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,282 +0,0 @@
-"""
-This tutorial introduces logistic regression using Theano and conjugate
-gradient descent.
-
-Logistic regression is a probabilistic, linear classifier. It is parametrized
-by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
-done by projecting data points onto a set of hyperplanes, the distance to
-which is used to determine a class membership probability.
-
-Mathematically, this can be written as:
-
-.. math::
-    P(Y=i|x, W,b) &= softmax_i(W x + b) \\
-                  &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}
-
-
-The model's prediction is then the class with maximal probability, i.e. the
-argmax of the vector whose i-th element is P(Y=i|x).
-
-.. math::
-
-    y_{pred} = argmax_i P(Y=i|x,W,b)
-
-
-This tutorial presents a stochastic gradient descent optimization method
-suitable for large datasets, and a conjugate gradient optimization method
-that is suitable for smaller datasets.
-
-
-References:
-
-    - textbooks: "Pattern Recognition and Machine Learning",
-      Christopher M. Bishop, section 4.3.2
-
-"""
-__docformat__ = 'restructedtext en'
-
-
-import numpy, cPickle, gzip
-
-import time
-
-import theano
-import theano.tensor as T
-import theano.tensor.nnet
-
-
-class LogisticRegression(object):
-    """Multi-class Logistic Regression Class
-
-    The logistic regression is fully described by a weight matrix :math:`W`
-    and bias vector :math:`b`. Classification is done by projecting data
-    points onto a set of hyperplanes, the distance to which is used to
-    determine a class membership probability.
-    """
-
-    def __init__(self, input, n_in, n_out):
-        """ Initialize the parameters of the logistic regression
-
-        :param input: symbolic variable that describes the input of the
-                      architecture (one minibatch)
-
-        :param n_in: number of input units, the dimension of the space in
-                     which the datapoints lie
-
-        :param n_out: number of output units, the dimension of the space in
-                      which the targets lie
-
-        """
-
-        # initialize theta = (W,b) with 0s; W gets the shape (n_in, n_out),
-        # while b is a vector of n_out elements, making theta a vector of
-        # n_in*n_out + n_out elements
-        self.theta = theano.shared(value=numpy.zeros(n_in*n_out + n_out))
-        # W is represented by the first n_in*n_out elements of theta
-        self.W = self.theta[0:n_in*n_out].reshape((n_in, n_out))
-        # b is the rest (last n_out elements)
-        self.b = self.theta[n_in*n_out:n_in*n_out + n_out]
-
-        # compute the vector of class-membership probabilities in symbolic form
-        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
-
-        # compute the prediction as the class whose probability is maximal,
-        # in symbolic form
-        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
-
-    def negative_log_likelihood(self, y):
-        """Return the negative log-likelihood of the prediction of this model
-        under a given target distribution.
-
-        .. math::
-
-            \ell (\theta=\{W,b\}, \mathcal{D}) =
-                - \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
-                  \log P(Y=y^{(i)}|x^{(i)}, W, b)
-
-        :param y: corresponds to a vector that gives, for each example,
-                  the correct label
-        """
-        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
-
-    def errors(self, y):
-        """Return a float representing the number of errors in the minibatch
-        over the total number of examples of the minibatch
-        """
-
-        # check if y has the same dimension as y_pred
-        if y.ndim != self.y_pred.ndim:
-            raise TypeError('y should have the same shape as self.y_pred',
-                            ('y', y.type, 'y_pred', self.y_pred.type))
-        # check if y is of the correct datatype
-        if y.dtype.startswith('int'):
-            # the T.neq operator returns a vector of 0s and 1s, where 1
-            # represents a mistake in prediction
-            return T.mean(T.neq(self.y_pred, y))
-        else:
-            raise NotImplementedError()
-
-
-def cg_optimization_mnist(n_iter=50):
-    """Demonstrate conjugate gradient optimization of a log-linear model
-
-    This is demonstrated on MNIST.
-
-    :param n_iter: number of iterations to run the optimizer
-
-    """
-
-    # Load the dataset
-    f = gzip.open('mnist.pkl.gz', 'rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    f.close()
-
-    # make minibatches of size 20
-    batch_size = 20    # size of the minibatch
-
-    # Dealing with the training set
-    # get the list of training images (x) and their labels (y)
-    (train_set_x, train_set_y) = train_set
-    # initialize the list of training minibatches with an empty list
-    train_batches = []
-    for i in xrange(0, len(train_set_x), batch_size):
-        # add to the list of minibatches the minibatch starting at
-        # position i and ending at position i+batch_size; a minibatch is
-        # a pair whose first element is a list of datapoints and whose
-        # second element is the list of corresponding labels
-        train_batches = train_batches + \
-            [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
-
-    # Dealing with the validation set
-    (valid_set_x, valid_set_y) = valid_set
-    # initialize the list of validation minibatches
-    valid_batches = []
-    for i in xrange(0, len(valid_set_x), batch_size):
-        valid_batches = valid_batches + \
-            [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
-
-    # Dealing with the testing set
-    (test_set_x, test_set_y) = test_set
-    # initialize the list of testing minibatches
-    test_batches = []
-    for i in xrange(0, len(test_set_x), batch_size):
-        test_batches = test_batches + \
-            [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
-
-    ishape = (28, 28)    # this is the size of MNIST images
-    n_in = 28*28         # number of input units
-    n_out = 10           # number of output units
-
-    # allocate symbolic variables for the data
-    x = T.fmatrix()    # the data is presented as rasterized images
-    y = T.lvector()    # the labels are presented as a 1D vector of
-                       # [long int] labels
-
-    # construct the logistic regression class
-    classifier = LogisticRegression( \
-        input=x.reshape((batch_size, 28*28)), n_in=28*28, n_out=10)
-
-    # the cost we minimize during training is the negative log likelihood of
-    # the model, in symbolic form
-    cost = classifier.negative_log_likelihood(y).mean()
-
-    # compile a theano function that computes the mistakes that are made by
-    # the model on a minibatch
-    test_model = theano.function([x, y], classifier.errors(y))
-    # compile a theano function that returns the gradient of the minibatch
-    # cost with respect to theta
-    batch_grad = theano.function([x, y], T.grad(cost, classifier.theta))
-    # compile a theano function that returns the cost of a minibatch
-    batch_cost = theano.function([x, y], cost)
-
-    # creates a function that computes the average cost on the training set
-    def train_fn(theta_value):
-        classifier.theta.value = theta_value
-        cost = 0.
-        for x, y in train_batches:
-            cost += batch_cost(x, y)
-        return cost / len(train_batches)
-
-    # creates a function that computes the average gradient of the cost
-    # with respect to theta
-    def train_fn_grad(theta_value):
-        classifier.theta.value = theta_value
-        grad = numpy.zeros(n_in * n_out + n_out)
-        for x, y in train_batches:
-            grad += batch_grad(x, y)
-        return grad / len(train_batches)
-
-    validation_scores = [float('inf'), 0]
-
-    # creates the validation function
-    def callback(theta_value):
-        classifier.theta.value = theta_value
-        # compute the validation loss
-        this_validation_loss = 0.
-        for x, y in valid_batches:
-            this_validation_loss += test_model(x, y)
-
-        this_validation_loss /= len(valid_batches)
-
-        print('validation error %f %%' % (this_validation_loss*100.,))
-
-        # check if it is better than the best validation score so far
-        if this_validation_loss < validation_scores[0]:
-            # if so, replace the old one and compute the score on the
-            # testing dataset
-            validation_scores[0] = this_validation_loss
-            test_score = 0.
-            for x, y in test_batches:
-                test_score += test_model(x, y)
-            validation_scores[1] = test_score / len(test_batches)
-
-    # using the scipy conjugate gradient optimizer
-    import scipy.optimize
-    print("Optimizing using scipy.optimize.fmin_cg...")
-    start_time = time.clock()
-    best_w_b = scipy.optimize.fmin_cg(
-        f=train_fn,
-        x0=numpy.zeros((n_in+1)*n_out, dtype=x.dtype),
-        fprime=train_fn_grad,
-        callback=callback,
-        disp=0,
-        maxiter=n_iter)
-    end_time = time.clock()
-    print(('Optimization complete with best validation score of %f %%, with '
-           'test performance %f %%') %
-          (validation_scores[0]*100., validation_scores[1]*100.))
-
-    print('The code ran for %f minutes' % ((end_time-start_time)/60.))
-
-
-if __name__ == '__main__':
-    cg_optimization_mnist()
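The hunk above removes the conjugate-gradient tutorial in this revision. For readers following the patch, the sketch below shows the same training pattern in plain numpy/scipy, without Theano: all parameters live in one flat theta vector, and scipy.optimize.fmin_cg is given a cost function and a gradient function over that vector. This is an illustration only, not code from the repository; the synthetic dataset and helper names are assumptions.

# Minimal sketch (not from the changeset): softmax regression trained with
# scipy.optimize.fmin_cg, mirroring the flattened-theta layout used above.
# The synthetic data and all names below are illustrative only.
import numpy
import scipy.optimize

rng = numpy.random.RandomState(0)
n_in, n_out, n_examples = 20, 3, 300
X = rng.randn(n_examples, n_in)
y = rng.randint(0, n_out, size=n_examples)

def unpack(theta):
    # first n_in*n_out entries are W, the remaining n_out entries are b
    W = theta[:n_in * n_out].reshape((n_in, n_out))
    b = theta[n_in * n_out:]
    return W, b

def neg_log_likelihood(theta):
    W, b = unpack(theta)
    scores = X.dot(W) + b
    scores -= scores.max(axis=1, keepdims=True)   # for numerical stability
    log_p = scores - numpy.log(numpy.exp(scores).sum(axis=1, keepdims=True))
    return -log_p[numpy.arange(n_examples), y].mean()

def gradient(theta):
    W, b = unpack(theta)
    scores = X.dot(W) + b
    scores -= scores.max(axis=1, keepdims=True)
    p = numpy.exp(scores)
    p /= p.sum(axis=1, keepdims=True)
    p[numpy.arange(n_examples), y] -= 1.0          # d(cost)/d(scores)
    p /= n_examples
    # gradient w.r.t. W (flattened), then gradient w.r.t. b
    return numpy.concatenate([X.T.dot(p).ravel(), p.sum(axis=0)])

theta0 = numpy.zeros(n_in * n_out + n_out)
best_theta = scipy.optimize.fmin_cg(f=neg_log_likelihood, x0=theta0,
                                    fprime=gradient, maxiter=50, disp=0)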
diff -r 0fda55a7de99 -r bcc87d3e33a3 code_tutoriel/logistic_sgd.py
--- a/code_tutoriel/logistic_sgd.py	Sun Jan 24 22:33:33 2010 -0500
+++ b/code_tutoriel/logistic_sgd.py	Sun Jan 24 22:34:29 2010 -0500
@@ -147,7 +147,7 @@
     :param learning_rate: learning rate used (factor for the stochastic
     gradient)
 
-    :param n_iter: number of iterations to run the optimizer
+    :param n_iter: maximal number of iterations to run the optimizer
 
     """
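The logistic_sgd.py hunk only rewords the n_iter docstring, but the wording matters: with early stopping, n_iter is an upper bound, and training normally halts once the validation error stops improving. The self-contained sketch below illustrates the patience-based early-stopping loop that the tutorials use, with n_iter acting only as a maximum. It is not code from the repository; the "validation loss" is simulated with a toy curve so the snippet runs on its own.

# Minimal, self-contained sketch (not from the changeset) of patience-based
# early stopping, where n_iter is only a maximal number of passes.
import numpy

n_iter = 50                       # maximal number of passes over the data
n_train_batches = 100
patience = 500                    # examine at least this many minibatches
patience_increase = 2             # wait this much longer after an improvement
improvement_threshold = 0.995     # relative improvement considered significant
validation_frequency = n_train_batches

rng = numpy.random.RandomState(0)
best_validation_loss = float('inf')
best_iter = 0

for iter in range(n_iter * n_train_batches):
    # a real loop would train on minibatch `iter % n_train_batches` here
    if (iter + 1) % validation_frequency == 0:
        # toy validation loss: decays, then flattens out with noise
        this_validation_loss = 0.5 * numpy.exp(-iter / 2000.) + 0.05 * rng.rand()
        if this_validation_loss < best_validation_loss:
            if this_validation_loss < best_validation_loss * improvement_threshold:
                patience = max(patience, iter * patience_increase)
            best_validation_loss = this_validation_loss
            best_iter = iter
    if patience <= iter:
        break                     # stop well before n_iter passes if stalled

print('stopped at minibatch %i, best validation loss %f (iteration %i)' %
      (iter, best_validation_loss, best_iter))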
diff -r 0fda55a7de99 -r bcc87d3e33a3 code_tutoriel/mlp.py
--- a/code_tutoriel/mlp.py	Sun Jan 24 22:33:33 2010 -0500
+++ b/code_tutoriel/mlp.py	Sun Jan 24 22:34:29 2010 -0500
@@ -71,18 +71,20 @@
         # other tutorials
 
         # `W1` is initialized with `W1_values` which is uniformly sampled
-        # from -1/sqrt(n_in) and 1/sqrt(n_in)
+        # from -sqrt(6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden))
         # the output of uniform is converted using asarray to dtype
         # theano.config.floatX so that the code is runnable on GPU
         W1_values = numpy.asarray( numpy.random.uniform( \
-              low = -numpy.sqrt(6./(n_in+n_hidden)), high = numpy.sqrt(6./(n_in+n_hidden)), \
+              low = -numpy.sqrt(6./(n_in+n_hidden)), \
+              high = numpy.sqrt(6./(n_in+n_hidden)), \
              size = (n_in, n_hidden)), dtype = theano.config.floatX)
         # `W2` is initialized with `W2_values` which is uniformly sampled
-        # from -1/sqrt(n_hidden) and 1/sqrt(n_hidden)
+        # from -sqrt(6./(n_hidden+n_out)) and sqrt(6./(n_hidden+n_out))
         # the output of uniform is converted using asarray to dtype
         # theano.config.floatX so that the code is runnable on GPU
         W2_values = numpy.asarray( numpy.random.uniform(
-              low = numpy.sqrt(6./(n_hidden+n_out)), high= numpy.sqrt(6./(n_hidden+n_out)),\
+              low = -numpy.sqrt(6./(n_hidden+n_out)), \
+              high = numpy.sqrt(6./(n_hidden+n_out)), \
              size = (n_hidden, n_out)), dtype = theano.config.floatX)
 
         self.W1 = theano.shared( value = W1_values )
@@ -161,14 +163,15 @@
     :param learning_rate: learning rate used (factor for the stochastic
     gradient)
 
-    :param n_iter: number of iterations to run the optimizer
-
     :param L1_reg: L1-norm's weight when added to the cost (see regularization)
 
     :param L2_reg: L2-norm's weight when added to the cost (see regularization)
-    """
+
+    :param n_iter: maximal number of iterations to run the optimizer
+
+    """
@@ -264,6 +267,7 @@
 
     best_params = None
     best_validation_loss = float('inf')
+    best_iter = 0
     test_score = 0.
     start_time = time.clock()
     # have a maximum of `n_iter` iterations through the entire dataset
@@ -300,9 +304,11 @@
                            improvement_threshold :
                         patience = max(patience, iter * patience_increase)
 
+                    # save best validation score and iteration number
                     best_validation_loss = this_validation_loss
+                    best_iter = iter
+
                     # test it on the test set
-
                     test_score = 0.
                     for x,y in test_batches:
                         test_score += test_model(x,y)
@@ -313,19 +319,15 @@
                            test_score*100.))
 
             if patience <= iter :
-                    break
+                break
 
     end_time = time.clock()
-    print(('Optimization complete with best validation score of %f %%,'
-           'with test performance %f %%') %
-          (best_validation_loss * 100., test_score*100.))
+    print(('Optimization complete. Best validation score of %f %% '
+           'obtained at iteration %i, with test performance %f %%') %
+          (best_validation_loss * 100., best_iter, test_score*100.))
 
     print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
-
-
-
-
 
 
 if __name__ == '__main__':
     sgd_optimization_mnist()
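The first mlp.py hunk documents the weight initialization used by the tutorial: each weight matrix is sampled uniformly from the range +/- sqrt(6/(fan_in + fan_out)). The short numpy sketch below illustrates that rule on its own; it is not part of the changeset, the layer sizes are illustrative, and 'float32' stands in for theano.config.floatX.

# Minimal sketch (not from the changeset) of the uniform weight initialization
# documented in the mlp.py hunk above.
import numpy

def init_weights(fan_in, fan_out, rng):
    bound = numpy.sqrt(6. / (fan_in + fan_out))
    # note the negative lower bound: sampling from [+bound, +bound] would
    # yield only non-negative weights
    return numpy.asarray(rng.uniform(low=-bound, high=bound,
                                     size=(fan_in, fan_out)),
                         dtype='float32')

rng = numpy.random.RandomState(1234)
W1 = init_weights(28 * 28, 500, rng)   # input -> hidden, MNIST-sized input
W2 = init_weights(500, 10, rng)        # hidden -> 10 output classes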