comparison code_tutoriel/logistic_sgd.py @ 165:4bc5eeec6394

Updating the tutorial code to the latest revisions.
author Dumitru Erhan <dumitru.erhan@gmail.com>
date Fri, 26 Feb 2010 13:55:27 -0500
parents bcc87d3e33a3
children
comparing 164:e3de934a98b6 to 165:4bc5eeec6394
@@ -30 +30 @@
 References:

     - textbooks: "Pattern Recognition and Machine Learning" -
                  Christopher M. Bishop, section 4.3.2

-
 """
 __docformat__ = 'restructedtext en'

-
-import numpy, cPickle, gzip
-
-import time
+import numpy, time, cPickle, gzip

 import theano
 import theano.tensor as T
-
-import theano.tensor.nnet


 class LogisticRegression(object):
     """Multi-class Logistic Regression Class

@@ -60 +54 @@


     def __init__(self, input, n_in, n_out):
         """ Initialize the parameters of the logistic regression

+        :type input: theano.tensor.TensorType
         :param input: symbolic variable that describes the input of the
                       architecture (one minibatch)

+        :type n_in: int
         :param n_in: number of input units, the dimension of the space in
                      which the datapoints lie

+        :type n_out: int
         :param n_out: number of output units, the dimension of the space in
                       which the labels lie

         """

         # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
-        self.W = theano.shared( value=numpy.zeros((n_in,n_out),
-                                dtype = theano.config.floatX) )
+        self.W = theano.shared(value=numpy.zeros((n_in,n_out), dtype = theano.config.floatX),
+                               name='W')
         # initialize the biases b as a vector of n_out 0s
-        self.b = theano.shared( value=numpy.zeros((n_out,),
-                                dtype = theano.config.floatX) )
+        self.b = theano.shared(value=numpy.zeros((n_out,), dtype = theano.config.floatX),
+                               name='b')


         # compute vector of class-membership probabilities in symbolic form
         self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b)

         # compute prediction as class whose probability is maximal in
         # symbolic form
         self.y_pred=T.argmax(self.p_y_given_x, axis=1)

+        # parameters of the model
+        self.params = [self.W, self.b]
+




     def negative_log_likelihood(self, y):
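As an aside to the comparison: the lines above that define `p_y_given_x` and `y_pred` are the whole model. A minimal plain-numpy sketch of the same computation (illustrative only, not part of the changeset; shapes chosen to match the MNIST setup used later):

    import numpy

    def softmax_rows(a):
        # subtract the row-wise max for numerical stability, then normalise
        e = numpy.exp(a - a.max(axis=1, keepdims=True))
        return e / e.sum(axis=1, keepdims=True)

    rng = numpy.random.RandomState(0)
    x = rng.rand(5, 28 * 28)         # a toy minibatch of 5 rasterized "images"
    W = numpy.zeros((28 * 28, 10))   # same zero initialization as self.W
    b = numpy.zeros(10)              # same zero initialization as self.b

    p_y_given_x = softmax_rows(numpy.dot(x, W) + b)  # (5, 10) class probabilities
    y_pred = p_y_given_x.argmax(axis=1)              # most probable class per row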
@@ -100 +100 @@

             \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
             \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
                 \ell (\theta=\{W,b\}, \mathcal{D})

-
+        :type y: theano.tensor.TensorType
         :param y: corresponds to a vector that gives for each example the
-        :correct label
+                  correct label

         Note: we use the mean instead of the sum so that
               the learning rate is less dependent on the batch size
         """
+        # y.shape[0] is (symbolically) the number of rows in y, i.e., number of examples (call it n) in the minibatch
+        # T.arange(y.shape[0]) is a symbolic vector which will contain [0,1,2,... n-1]
+        # T.log(self.p_y_given_x) is a matrix of Log-Probabilities (call it LP) with one row per example and one column per class
+        # LP[T.arange(y.shape[0]),y] is a vector v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., LP[n-1,y[n-1]]]
+        # and T.mean(LP[T.arange(y.shape[0]),y]) is the mean (across minibatch examples) of the elements in v,
+        # i.e., the mean log-likelihood across the minibatch.
         return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
-
-
-


     def errors(self, y):
         """Return a float representing the number of errors in the minibatch
         over the total number of examples of the minibatch ; zero one
         loss over the size of the minibatch
+
+        :type y: theano.tensor.TensorType
+        :param y: corresponds to a vector that gives for each example the
+                  correct label
         """

         # check if y has the same dimension as y_pred
         if y.ndim != self.y_pred.ndim:
             raise TypeError('y should have the same shape as self.y_pred',
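The comments added above `return -T.mean(...)` describe an advanced-indexing trick; a small numpy illustration of the same selection, with made-up numbers (not taken from the tutorial), may help:

    import numpy

    # toy log-probability matrix LP for n = 3 examples and 4 classes
    LP = numpy.log(numpy.array([[0.1, 0.2, 0.3, 0.4],
                                [0.7, 0.1, 0.1, 0.1],
                                [0.25, 0.25, 0.25, 0.25]]))
    y = numpy.array([3, 0, 1])        # correct label of each example

    # LP[arange(n), y] picks LP[0, y[0]], LP[1, y[1]], LP[2, y[2]]
    picked = LP[numpy.arange(y.shape[0]), y]
    nll = -picked.mean()              # mean rather than sum, so the learning
                                      # rate is less tied to the batch size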
@@ -132 +139 @@
             return T.mean(T.neq(self.y_pred, y))
         else:
             raise NotImplementedError()


-
-
-
-def sgd_optimization_mnist( learning_rate=0.01, n_iter=100):
+def load_data(dataset):
+    ''' Loads the dataset
+
+    :type dataset: string
+    :param dataset: the path to the dataset (here MNIST)
+    '''
+
+    #############
+    # LOAD DATA #
+    #############
+    print '... loading data'
+
+    # Load the dataset
+    f = gzip.open(dataset,'rb')
+    train_set, valid_set, test_set = cPickle.load(f)
+    f.close()
+
+
+    def shared_dataset(data_xy):
+        """ Function that loads the dataset into shared variables
+
+        The reason we store our dataset in shared variables is to allow
+        Theano to copy it into the GPU memory (when code is run on GPU).
+        Since copying data into the GPU is slow, copying a minibatch every time
+        it is needed (the default behaviour if the data is not in a shared
+        variable) would lead to a large decrease in performance.
+        """
+        data_x, data_y = data_xy
+        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX))
+        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
+        # When storing data on the GPU it has to be stored as floats
+        # therefore we will store the labels as ``floatX`` as well
+        # (``shared_y`` does exactly that). But during our computations
+        # we need them as ints (we use labels as indices, and if they are
+        # floats it doesn't make sense) therefore instead of returning
+        # ``shared_y`` we will have to cast it to int. This little hack
+        # lets us get around this issue
+        return shared_x, T.cast(shared_y, 'int32')
+
+    test_set_x,  test_set_y  = shared_dataset(test_set)
+    valid_set_x, valid_set_y = shared_dataset(valid_set)
+    train_set_x, train_set_y = shared_dataset(train_set)
+
+    rval = [(train_set_x, train_set_y), (valid_set_x,valid_set_y), (test_set_x, test_set_y)]
+    return rval
+
+
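One note on the new `shared_dataset` helper: its comment explains why labels are stored as `floatX` and then cast back to `int32`. A tiny numpy aside (illustrative, not from the file) on why the cast matters once the labels are used as indices, as in `negative_log_likelihood`:

    import numpy

    data_y = numpy.asarray([5, 0, 4], dtype='float64')  # labels kept as floats,
                                                        # like floatX storage would keep them
    LP = numpy.zeros((3, 10))                           # toy log-probability matrix

    # LP[numpy.arange(3), data_y] would fail: float arrays cannot be used as indices,
    # which is why shared_dataset returns T.cast(shared_y, 'int32')
    labels = data_y.astype('int32')
    picked = LP[numpy.arange(3), labels]                # works once cast to int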
+def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, dataset='mnist.pkl.gz'):
     """
     Demonstrate stochastic gradient descent optimization of a log-linear
     model

     This is demonstrated on MNIST.

+    :type learning_rate: float
     :param learning_rate: learning rate used (factor for the stochastic
-    gradient
+                          gradient)

-    :param n_iter: maximal number of iterations ot run the optimizer
+    :type n_epochs: int
+    :param n_epochs: maximal number of epochs to run the optimizer
+
+    :type dataset: string
+    :param dataset: the path of the MNIST dataset file from
+                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

     """
-
-    # Load the dataset
-    f = gzip.open('mnist.pkl.gz','rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    f.close()
+    datasets = load_data(dataset)
+
+    train_set_x, train_set_y = datasets[0]
+    valid_set_x, valid_set_y = datasets[1]
+    test_set_x , test_set_y  = datasets[2]

-    # make minibatches of size 20
-    batch_size = 20    # sized of the minibatch
-
-    # Dealing with the training set
-    # get the list of training images (x) and their labels (y)
-    (train_set_x, train_set_y) = train_set
-    # initialize the list of training minibatches with empty list
-    train_batches = []
-    for i in xrange(0, len(train_set_x), batch_size):
-        # add to the list of minibatches the minibatch starting at
-        # position i, ending at position i+batch_size
-        # a minibatch is a pair ; the first element of the pair is a list
-        # of datapoints, the second element is the list of corresponding
-        # labels
-        train_batches = train_batches + \
-            [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
-
-    # Dealing with the validation set
-    (valid_set_x, valid_set_y) = valid_set
-    # initialize the list of validation minibatches
-    valid_batches = []
-    for i in xrange(0, len(valid_set_x), batch_size):
-        valid_batches = valid_batches + \
-            [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
-
-    # Dealing with the testing set
-    (test_set_x, test_set_y) = test_set
-    # initialize the list of testing minibatches
-    test_batches = []
-    for i in xrange(0, len(test_set_x), batch_size):
-        test_batches = test_batches + \
-            [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
-
-    ishape = (28,28) # this is the size of MNIST images
+    batch_size = 600    # size of the minibatch
+
+    # compute number of minibatches for training, validation and testing
+    n_train_batches = train_set_x.value.shape[0] / batch_size
+    n_valid_batches = valid_set_x.value.shape[0] / batch_size
+    n_test_batches  = test_set_x.value.shape[0]  / batch_size
+
+
+    ######################
+    # BUILD ACTUAL MODEL #
+    ######################
+    print '... building the model'

     # allocate symbolic variables for the data
-    x = T.fmatrix()  # the data is presented as rasterized images
-    y = T.lvector()  # the labels are presented as 1D vector of
-                     # [long int] labels
+    index = T.lscalar()    # index to a [mini]batch
+    x     = T.matrix('x')  # the data is presented as rasterized images
+    y     = T.ivector('y') # the labels are presented as 1D vector of
+                           # [int] labels

     # construct the logistic regression class
-    classifier = LogisticRegression( \
-                   input=x.reshape((batch_size,28*28)), n_in=28*28, n_out=10)
+    # Each MNIST image has size 28*28
+    classifier = LogisticRegression( input=x, n_in=28*28, n_out=10)

     # the cost we minimize during training is the negative log likelihood of
     # the model in symbolic format
     cost = classifier.negative_log_likelihood(y)

     # compiling a Theano function that computes the mistakes that are made by
     # the model on a minibatch
-    test_model = theano.function([x,y], classifier.errors(y))
+    test_model = theano.function(inputs = [index],
+            outputs = classifier.errors(y),
+            givens={
+                x:test_set_x[index*batch_size:(index+1)*batch_size],
+                y:test_set_y[index*batch_size:(index+1)*batch_size]})
+
+    validate_model = theano.function( inputs = [index],
+            outputs = classifier.errors(y),
+            givens={
+                x:valid_set_x[index*batch_size:(index+1)*batch_size],
+                y:valid_set_y[index*batch_size:(index+1)*batch_size]})

     # compute the gradient of cost with respect to theta = (W,b)
-    g_W = T.grad(cost, classifier.W)
-    g_b = T.grad(cost, classifier.b)
+    g_W = T.grad(cost = cost, wrt = classifier.W)
+    g_b = T.grad(cost = cost, wrt = classifier.b)

     # specify how to update the parameters of the model as a dictionary
     updates ={classifier.W: classifier.W - learning_rate*g_W,\
               classifier.b: classifier.b - learning_rate*g_b}

     # compiling a Theano function `train_model` that returns the cost, but at
     # the same time updates the parameters of the model based on the rules
     # defined in `updates`
-    train_model = theano.function([x, y], cost, updates = updates )
-
-    n_minibatches = len(train_batches) # number of minibatchers
-
+    train_model = theano.function(inputs = [index],
+            outputs = cost,
+            updates = updates,
+            givens={
+                x:train_set_x[index*batch_size:(index+1)*batch_size],
+                y:train_set_y[index*batch_size:(index+1)*batch_size]})

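The `updates` dictionary compiled into `train_model` is plain gradient descent: each call moves the parameters one step against the gradient, while `givens` substitutes the minibatch selected by `index`. Roughly, in numpy terms (an illustrative sketch with stand-in gradients, not code from the file):

    import numpy

    learning_rate = 0.13
    W = numpy.zeros((28 * 28, 10))
    b = numpy.zeros(10)
    g_W = 0.01 * numpy.ones_like(W)   # stand-ins for the gradients Theano computes
    g_b = 0.01 * numpy.ones_like(b)

    # same rule as  updates = {W: W - learning_rate*g_W, b: b - learning_rate*g_b}
    W = W - learning_rate * g_W
    b = b - learning_rate * g_b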
+    ###############
+    # TRAIN MODEL #
+    ###############
+    print '... training the model'
     # early-stopping parameters
     patience              = 5000  # look at this many examples regardless
     patience_increase     = 2     # wait this much longer when a new best is
                                   # found
     improvement_threshold = 0.995 # a relative improvement of this much is
                                   # considered significant
-    validation_frequency  = n_minibatches  # go through this many
+    validation_frequency  = min(n_train_batches, patience/2)
+                                  # go through this many
                                   # minibatches before checking the network
                                   # on the validation set; in this case we
                                   # check every epoch

     best_params          = None
     best_validation_loss = float('inf')
     test_score           = 0.
     start_time = time.clock()

-    # have a maximum of `n_iter` iterations through the entire dataset
-    for iter in xrange(n_iter* n_minibatches):
-
-        # get epoch and minibatch index
-        epoch           = iter / n_minibatches
-        minibatch_index =  iter % n_minibatches
-
-        # get the minibatches corresponding to `iter` modulo
-        # `len(train_batches)`
-        x,y = train_batches[ minibatch_index ]
-        cost_ij = train_model(x,y)
+    done_looping = False
+    epoch = 0
+    while (epoch < n_epochs) and (not done_looping):
+        epoch = epoch + 1
+        for minibatch_index in xrange(n_train_batches):
+
+            minibatch_avg_cost = train_model(minibatch_index)
+            # iteration number
+            iter = epoch * n_train_batches + minibatch_index

             if (iter+1) % validation_frequency == 0:
                 # compute zero-one loss on validation set
-                this_validation_loss = 0.
-                for x,y in valid_batches:
-                    # sum up the errors for each minibatch
-                    this_validation_loss += test_model(x,y)
-                # get the average by dividing with the number of minibatches
-                this_validation_loss /= len(valid_batches)
+                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
+                this_validation_loss = numpy.mean(validation_losses)

                 print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                     (epoch, minibatch_index+1,n_minibatches, \
+                     (epoch, minibatch_index+1,n_train_batches, \
                      this_validation_loss*100.))


                 # if we got the best validation score until now
                 if this_validation_loss < best_validation_loss:
@@ -273 +324 @@
                         patience = max(patience, iter * patience_increase)

                     best_validation_loss = this_validation_loss
                     # test it on the test set

-                    test_score = 0.
-                    for x,y in test_batches:
-                        test_score += test_model(x,y)
-                    test_score /= len(test_batches)
+                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
+                    test_score = numpy.mean(test_losses)
+
                     print((' epoch %i, minibatch %i/%i, test error of best '
                            'model %f %%') % \
-                        (epoch, minibatch_index+1, n_minibatches,test_score*100.))
+                        (epoch, minibatch_index+1, n_train_batches,test_score*100.))

             if patience <= iter :
+                done_looping = True
                 break

     end_time = time.clock()
     print(('Optimization complete with best validation score of %f %%,'
            'with test performance %f %%') %
           (best_validation_loss * 100., test_score*100.))
     print ('The code ran for %f minutes' % ((end_time-start_time)/60.))

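The early-stopping variables (`patience`, `patience_increase`, `improvement_threshold`, `validation_frequency`) drive the loop above. A compact standalone sketch of the same idea, with made-up validation losses (not results from the tutorial):

    patience = 5000                   # run at least this many minibatch updates
    patience_increase = 2
    improvement_threshold = 0.995
    best = float('inf')

    fake_losses = {1000: 0.12, 2000: 0.10, 3000: 0.099, 4000: 0.0989}  # made up
    for it in sorted(fake_losses):
        loss = fake_losses[it]
        if loss < best * improvement_threshold:               # significant improvement:
            patience = max(patience, it * patience_increase)  # keep training longer
        best = min(best, loss)
        if patience <= it:                                    # patience exhausted
            break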
 if __name__ == '__main__':
     sgd_optimization_mnist()

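Finally, a usage note. With `mnist.pkl.gz` in the working directory, the updated script can be run as-is through the `__main__` guard above, or the driver can be imported and called with other hyperparameters (the values below are illustrative, not part of the changeset):

    # python logistic_sgd.py          # runs with the defaults set in this revision

    from logistic_sgd import sgd_optimization_mnist

    sgd_optimization_mnist(learning_rate=0.1,
                           n_epochs=50,
                           dataset='mnist.pkl.gz')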