diff code_tutoriel/logistic_sgd.py @ 165:4bc5eeec6394
Updating the tutorial code to the latest revisions.
author   | Dumitru Erhan <dumitru.erhan@gmail.com>
date     | Fri, 26 Feb 2010 13:55:27 -0500
parents  | bcc87d3e33a3
children |
--- a/code_tutoriel/logistic_sgd.py	Thu Feb 25 18:53:02 2010 -0500
+++ b/code_tutoriel/logistic_sgd.py	Fri Feb 26 13:55:27 2010 -0500
@@ -32,20 +32,14 @@
     - textbooks: "Pattern Recognition and Machine Learning" -
                  Christopher M. Bishop, section 4.3.2
-
 """
 __docformat__ = 'restructedtext en'
-
-import numpy, cPickle, gzip
-
-import time
+import numpy, time, cPickle, gzip
 
 import theano
 import theano.tensor as T
-import theano.tensor.nnet
-
 
 class LogisticRegression(object):
     """Multi-class Logistic Regression Class
@@ -62,23 +56,26 @@
     def __init__(self, input, n_in, n_out):
         """ Initialize the parameters of the logistic regression
+
+        :type input: theano.tensor.TensorType
         :param input: symbolic variable that describes the input of the
-                      architecture (one minibatch)
-
+                      architecture (one minibatch)
+
+        :type n_in: int
         :param n_in: number of input units, the dimension of the space in
-                     which the datapoints lie
+                     which the datapoints lie
 
+        :type n_out: int
         :param n_out: number of output units, the dimension of the space in
-                      which the labels lie
+                      which the labels lie
 
         """
 
         # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
-        self.W = theano.shared( value=numpy.zeros((n_in,n_out),
-                                                  dtype = theano.config.floatX) )
+        self.W = theano.shared(value=numpy.zeros((n_in,n_out), dtype = theano.config.floatX),
+                               name='W')
         # initialize the biases b as a vector of n_out 0s
-        self.b = theano.shared( value=numpy.zeros((n_out,),
-                                                  dtype = theano.config.floatX) )
+        self.b = theano.shared(value=numpy.zeros((n_out,), dtype = theano.config.floatX),
+                               name='b')
 
         # compute vector of class-membership probabilities in
@@ -88,6 +85,9 @@
         # symbolic form
         self.y_pred=T.argmax(self.p_y_given_x, axis=1)
 
+        # parameters of the model
+        self.params = [self.W, self.b]
+
@@ -102,23 +102,30 @@
             \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
                 \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
             \ell (\theta=\{W,b\}, \mathcal{D})
-
+        :type y: theano.tensor.TensorType
         :param y: corresponds to a vector that gives for each example the
-                  :correct label
+                  correct label
 
         Note: we use the mean instead of the sum so that
-        the learning rate is less dependent on the batch size
+              the learning rate is less dependent on the batch size
         """
+        # y.shape[0] is (symbolically) the number of rows in y, i.e.,
+        # the number of examples (call it n) in the minibatch
+        # T.arange(y.shape[0]) is a symbolic vector which will contain
+        # [0,1,2,... n-1]
+        # T.log(self.p_y_given_x) is a matrix of Log-Probabilities (call it LP)
+        # with one row per example and one column per class
+        # LP[T.arange(y.shape[0]),y] is a vector v containing
+        # [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., LP[n-1,y[n-1]]]
+        # and T.mean(LP[T.arange(y.shape[0]),y]) is the mean (across minibatch
+        # examples) of the elements in v, i.e., the mean log-likelihood
+        # across the minibatch.
         return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
 
-
-
-
     def errors(self, y):
         """Return a float representing the number of errors in the minibatch
         over the total number of examples of the minibatch ; zero one
         loss over the size of the minibatch
+
+        :type y: theano.tensor.TensorType
+        :param y: corresponds to a vector that gives for each example the
+                  correct label
         """
 
         # check if y has same dimension of y_pred
@@ -134,72 +141,103 @@
             raise NotImplementedError()
 
 
+def load_data(dataset):
+    ''' Loads the dataset
+
+    :type dataset: string
+    :param dataset: the path to the dataset (here MNIST)
+    '''
+
+    #############
+    # LOAD DATA #
+    #############
+    print '... loading data'
+
+    # Load the dataset
+    f = gzip.open(dataset,'rb')
+    train_set, valid_set, test_set = cPickle.load(f)
+    f.close()
+
+
+    def shared_dataset(data_xy):
+        """ Function that loads the dataset into shared variables
+
+        The reason we store our dataset in shared variables is to allow
+        Theano to copy it into the GPU memory (when code is run on GPU).
+        Since copying data into the GPU is slow, copying a minibatch every
+        time one is needed (the default behaviour if the data is not in a
+        shared variable) would lead to a large decrease in performance.
+        """
+        data_x, data_y = data_xy
+        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX))
+        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
+        # When storing data on the GPU it has to be stored as floats,
+        # therefore we will store the labels as ``floatX`` as well
+        # (``shared_y`` does exactly that). But during our computations
+        # we need them as ints (we use labels as indices, and if they are
+        # floats that doesn't make sense), therefore instead of returning
+        # ``shared_y`` we will have to cast it to int. This little hack
+        # lets us get around the issue
+        return shared_x, T.cast(shared_y, 'int32')
+
+    test_set_x,  test_set_y  = shared_dataset(test_set)
+    valid_set_x, valid_set_y = shared_dataset(valid_set)
+    train_set_x, train_set_y = shared_dataset(train_set)
+
+    rval = [(train_set_x, train_set_y), (valid_set_x,valid_set_y), (test_set_x, test_set_y)]
+    return rval
 
-def sgd_optimization_mnist( learning_rate=0.01, n_iter=100):
+
+def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, dataset='mnist.pkl.gz'):
     """
     Demonstrate stochastic gradient descent optimization of a log-linear
     model
 
     This is demonstrated on MNIST.
 
+    :type learning_rate: float
     :param learning_rate: learning rate used (factor for the stochastic
-                          gradient
+                          gradient)
 
-    :param n_iter: maximal number of iterations ot run the optimizer
+    :type n_epochs: int
+    :param n_epochs: maximal number of epochs to run the optimizer
+
+    :type dataset: string
+    :param dataset: the path of the MNIST dataset file from
+                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
 
     """
+    datasets = load_data(dataset)
 
-    # Load the dataset
-    f = gzip.open('mnist.pkl.gz','rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    f.close()
-
-    # make minibatches of size 20
-    batch_size = 20    # sized of the minibatch
+    train_set_x, train_set_y = datasets[0]
+    valid_set_x, valid_set_y = datasets[1]
+    test_set_x , test_set_y  = datasets[2]
 
-    # Dealing with the training set
-    # get the list of training images (x) and their labels (y)
-    (train_set_x, train_set_y) = train_set
-    # initialize the list of training minibatches with empty list
-    train_batches = []
-    for i in xrange(0, len(train_set_x), batch_size):
-        # add to the list of minibatches the minibatch starting at
-        # position i, ending at position i+batch_size
-        # a minibatch is a pair ; the first element of the pair is a list
-        # of datapoints, the second element is the list of corresponding
-        # labels
-        train_batches = train_batches + \
-               [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
+    batch_size = 600    # size of the minibatch
 
-    # Dealing with the validation set
-    (valid_set_x, valid_set_y) = valid_set
-    # initialize the list of validation minibatches
-    valid_batches = []
-    for i in xrange(0, len(valid_set_x), batch_size):
-        valid_batches = valid_batches + \
-               [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
-
-    # Dealing with the testing set
-    (test_set_x, test_set_y) = test_set
-    # initialize the list of testing minibatches
-    test_batches = []
-    for i in xrange(0, len(test_set_x), batch_size):
-        test_batches = test_batches + \
-              [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
+    # compute number of minibatches for training, validation and testing
+    n_train_batches = train_set_x.value.shape[0] / batch_size
+    n_valid_batches = valid_set_x.value.shape[0] / batch_size
+    n_test_batches  = test_set_x.value.shape[0]  / batch_size
 
-    ishape = (28,28)    # this is the size of MNIST images
+    ######################
+    # BUILD ACTUAL MODEL #
+    ######################
+    print '... building the model'
 
     # allocate symbolic variables for the data
-    x = T.fmatrix()  # the data is presented as rasterized images
-    y = T.lvector()  # the labels are presented as 1D vector of
-                     # [long int] labels
+    index = T.lscalar()    # index to a [mini]batch
+    x = T.matrix('x')      # the data is presented as rasterized images
+    y = T.ivector('y')     # the labels are presented as 1D vector of
+                           # [int] labels
 
    # construct the logistic regression class
-    classifier = LogisticRegression( \
-                   input=x.reshape((batch_size,28*28)), n_in=28*28, n_out=10)
+    # Each MNIST image has size 28*28
+    classifier = LogisticRegression( input=x, n_in=28*28, n_out=10)
 
     # the cost we minimize during training is the negative log likelihood of
     # the model in symbolic format
@@ -207,11 +245,21 @@
 
     # compiling a Theano function that computes the mistakes that are made by
     # the model on a minibatch
-    test_model = theano.function([x,y], classifier.errors(y))
+    test_model = theano.function(inputs = [index],
+            outputs = classifier.errors(y),
+            givens={
+                x:test_set_x[index*batch_size:(index+1)*batch_size],
+                y:test_set_y[index*batch_size:(index+1)*batch_size]})
+
+    validate_model = theano.function( inputs = [index],
+            outputs = classifier.errors(y),
+            givens={
+                x:valid_set_x[index*batch_size:(index+1)*batch_size],
+                y:valid_set_y[index*batch_size:(index+1)*batch_size]})
 
     # compute the gradient of cost with respect to theta = (W,b)
-    g_W = T.grad(cost, classifier.W)
-    g_b = T.grad(cost, classifier.b)
+    g_W = T.grad(cost = cost, wrt = classifier.W)
+    g_b = T.grad(cost = cost, wrt = classifier.b)
 
     # specify how to update the parameters of the model as a dictionary
     updates ={classifier.W: classifier.W - learning_rate*g_W,\
@@ -220,17 +268,25 @@
     # compiling a Theano function `train_model` that returns the cost, but at
     # the same time updates the parameters of the model based on the rules
     # defined in `updates`
-    train_model = theano.function([x, y], cost, updates = updates )
+    train_model = theano.function(inputs = [index],
+            outputs = cost,
+            updates = updates,
+            givens={
+                x:train_set_x[index*batch_size:(index+1)*batch_size],
+                y:train_set_y[index*batch_size:(index+1)*batch_size]})
 
-    n_minibatches = len(train_batches)  # number of minibatchers
-
+    ###############
+    # TRAIN MODEL #
+    ###############
+    print '... training the model'
+
     # early-stopping parameters
     patience              = 5000  # look at this many examples regardless
     patience_increase     = 2     # wait this much longer when a new best is
                                   # found
     improvement_threshold = 0.995 # a relative improvement of this much is
                                   # considered significant
-    validation_frequency  = n_minibatches  # go through this many
+    validation_frequency  = min(n_train_batches, patience/2)
+                                  # go through this many
                                   # minibatches before checking the network
                                   # on the validation set; in this case we
                                   # check every epoch
@@ -239,29 +295,24 @@
 
     best_validation_loss = float('inf')
     test_score = 0.
     start_time = time.clock()
 
-    # have a maximum of `n_iter` iterations through the entire dataset
-    for iter in xrange(n_iter* n_minibatches):
-
-        # get epoch and minibatch index
-        epoch           = iter / n_minibatches
-        minibatch_index = iter % n_minibatches
+    done_looping = False
+    epoch = 0
+    while (epoch < n_epochs) and (not done_looping):
+        epoch = epoch + 1
+        for minibatch_index in xrange(n_train_batches):
 
-        # get the minibatches corresponding to `iter` modulo
-        # `len(train_batches)`
-        x,y = train_batches[ minibatch_index ]
-        cost_ij = train_model(x,y)
+            minibatch_avg_cost = train_model(minibatch_index)
+            # iteration number
+            iter = epoch * n_train_batches + minibatch_index
 
             if (iter+1) % validation_frequency == 0:
                 # compute zero-one loss on validation set
-                this_validation_loss = 0.
-                for x,y in valid_batches:
-                    # sum up the errors for each minibatch
-                    this_validation_loss += test_model(x,y)
-                # get the average by dividing with the number of minibatches
-                this_validation_loss /= len(valid_batches)
+                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
+                this_validation_loss = numpy.mean(validation_losses)
 
                 print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                     (epoch, minibatch_index+1,n_minibatches, \
+                     (epoch, minibatch_index+1,n_train_batches, \
                       this_validation_loss*100.))
@@ -275,15 +326,15 @@
                     best_validation_loss = this_validation_loss
                     # test it on the test set
 
-                    test_score = 0.
-                    for x,y in test_batches:
-                        test_score += test_model(x,y)
-                    test_score /= len(test_batches)
+                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
+                    test_score = numpy.mean(test_losses)
+
                     print(('     epoch %i, minibatch %i/%i, test error of best '
                            'model %f %%') % \
-                        (epoch, minibatch_index+1, n_minibatches,test_score*100.))
+                        (epoch, minibatch_index+1, n_train_batches,test_score*100.))
 
             if patience <= iter :
+                done_looping = True
                 break
 
    end_time = time.clock()
@@ -292,12 +343,6 @@
                  (best_validation_loss * 100., test_score*100.))
     print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
 
-
-
-
-
-
-
 if __name__ == '__main__':
     sgd_optimization_mnist()
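The comments added to `negative_log_likelihood` describe a fancy-indexing trick that is easy to check in plain NumPy. Below is a minimal sketch (illustration only, not part of the changeset; all values are made up): `LP[arange(n), y]` picks, for each row i, the log-probability of that row's correct class y[i].

    import numpy

    # 4 examples, 3 classes; each row of p_y_given_x sums to 1
    p_y_given_x = numpy.array([[0.7, 0.2, 0.1],
                               [0.1, 0.8, 0.1],
                               [0.3, 0.3, 0.4],
                               [0.2, 0.5, 0.3]])
    y = numpy.array([0, 1, 2, 1])          # correct label of each example

    LP = numpy.log(p_y_given_x)            # (n, k) matrix of log-probabilities
    n = y.shape[0]
    v = LP[numpy.arange(n), y]             # [LP[0,y[0]], ..., LP[n-1,y[n-1]]]
    nll = -numpy.mean(v)                   # mean negative log-likelihood
    print(nll)                             # ~0.547 for the values above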
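The new `shared_dataset` helper stores labels as `floatX` but returns an `int32` cast. Here is that hack in isolation, a small sketch on synthetic labels (assuming the same `theano.shared`/`T.cast` calls the patch itself uses): the storage dtype stays `floatX`, while the symbolic view used for indexing is integer.

    import numpy
    import theano
    import theano.tensor as T

    labels = numpy.array([3, 1, 4, 1, 5])

    # storage is floatX so the shared variable can live in GPU memory...
    shared_y = theano.shared(numpy.asarray(labels, dtype=theano.config.floatX))
    # ...but the labels are used as indices, so expose a symbolic int32 view
    y_as_int = T.cast(shared_y, 'int32')

    print(shared_y.dtype)   # floatX, e.g. 'float64' or 'float32'
    print(y_as_int.dtype)   # 'int32'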
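The rewritten `test_model`/`validate_model`/`train_model` take only a minibatch `index`: the `givens` mapping substitutes slices of the shared dataset for the symbolic `x` and `y`, so no minibatch is copied from Python at call time. A stripped-down sketch of the same pattern on synthetic data (illustration only; `batch_stat` is a made-up stand-in for `classifier.errors(y)`):

    import numpy
    import theano
    import theano.tensor as T

    batch_size = 5
    data_x = numpy.random.rand(20, 3).astype(theano.config.floatX)
    data_y = numpy.random.randint(0, 2, size=(20,)).astype('int32')
    shared_x = theano.shared(data_x)
    shared_y = theano.shared(data_y)

    index = T.lscalar()      # minibatch index, the function's only input
    x = T.matrix('x')
    y = T.ivector('y')

    # any expression of x and y stands in for the real cost/error here
    batch_stat = theano.function(inputs=[index],
            outputs=x.sum() + y.sum(),
            givens={x: shared_x[index * batch_size:(index + 1) * batch_size],
                    y: shared_y[index * batch_size:(index + 1) * batch_size]})

    print(batch_stat(0))     # statistic of minibatch 0
    print(batch_stat(3))     # statistic of minibatch 3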
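Stripped of Theano, the rewritten training loop is an early-stopping schedule: validate every `validation_frequency` minibatches, extend `patience` whenever validation improves by more than `improvement_threshold`, and stop once `iter` reaches the patience. A plain-Python skeleton of that control flow (the `train_one`/`validate` callables are hypothetical; `iter` is counted from zero here, whereas the patch counts from the first full epoch):

    def early_stopping_sgd(train_one, validate, n_train_batches, n_epochs=1000):
        patience = 5000                # look at this many minibatch steps at least
        patience_increase = 2          # extend patience this much on improvement
        improvement_threshold = 0.995  # relative improvement deemed significant
        validation_frequency = min(n_train_batches, patience // 2)

        best_validation_loss = float('inf')
        done_looping = False
        epoch = 0
        while (epoch < n_epochs) and (not done_looping):
            epoch += 1
            for minibatch_index in range(n_train_batches):
                train_one(minibatch_index)
                # number of minibatch steps taken so far, counted from zero
                iter = (epoch - 1) * n_train_batches + minibatch_index
                if (iter + 1) % validation_frequency == 0:
                    this_loss = validate()
                    if this_loss < best_validation_loss:
                        if this_loss < best_validation_loss * improvement_threshold:
                            patience = max(patience, iter * patience_increase)
                        best_validation_loss = this_loss
                if patience <= iter:
                    done_looping = True
                    break
        return best_validation_loss

With `train_one = train_model` and `validate = lambda: numpy.mean([validate_model(i) for i in xrange(n_valid_batches)])`, this reduces to the loop the changeset introduces.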