diff code_tutoriel/logistic_sgd.py @ 165:4bc5eeec6394

Updating the tutorial code to the latest revisions.
author Dumitru Erhan <dumitru.erhan@gmail.com>
date Fri, 26 Feb 2010 13:55:27 -0500
parents bcc87d3e33a3
children
--- a/code_tutoriel/logistic_sgd.py	Thu Feb 25 18:53:02 2010 -0500
+++ b/code_tutoriel/logistic_sgd.py	Fri Feb 26 13:55:27 2010 -0500
@@ -32,20 +32,14 @@
     - textbooks: "Pattern Recognition and Machine Learning" - 
                  Christopher M. Bishop, section 4.3.2
 
-
 """
 __docformat__ = 'restructuredtext en'
 
-
-import numpy, cPickle, gzip
-
-import time
+import numpy, time, cPickle, gzip
 
 import theano
 import theano.tensor as T
 
-import theano.tensor.nnet
-
 
 class LogisticRegression(object):
     """Multi-class Logistic Regression Class
@@ -62,23 +56,26 @@
     def __init__(self, input, n_in, n_out):
         """ Initialize the parameters of the logistic regression
 
+        :type input: theano.tensor.TensorType
         :param input: symbolic variable that describes the input of the 
-        architecture (one minibatch)
-
+                      architecture (one minibatch)
+        
+        :type n_in: int
         :param n_in: number of input units, the dimension of the space in 
-        which the datapoints lie
+                     which the datapoints lie
 
+        :type n_out: int
         :param n_out: number of output units, the dimension of the space in 
-        which the labels lie
+                      which the labels lie
 
         """ 
 
         # initialize with 0 the weights W as a matrix of shape (n_in, n_out) 
-        self.W = theano.shared( value=numpy.zeros((n_in,n_out),
-                                            dtype = theano.config.floatX) )
+        self.W = theano.shared(value=numpy.zeros((n_in,n_out), dtype = theano.config.floatX),
+                                name='W')
         # initialize the biases b as a vector of n_out 0s
-        self.b = theano.shared( value=numpy.zeros((n_out,), 
-                                            dtype = theano.config.floatX) )
+        self.b = theano.shared(value=numpy.zeros((n_out,), dtype = theano.config.floatX),
+                               name='b')
 
 
         # compute vector of class-membership probabilities in symbolic form
@@ -88,6 +85,9 @@
         # symbolic form
         self.y_pred=T.argmax(self.p_y_given_x, axis=1)
 
+        # parameters of the model
+        self.params = [self.W, self.b]
+
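
A minimal usage sketch (matching the MNIST settings used later in this file, and the Theano-era shared-variable `.value` access used below): after construction, `W` and `b` are zero-initialized shared variables and `params` collects them for the gradient step.

    import theano.tensor as T

    x = T.matrix('x')                     # symbolic minibatch of rasterized images
    classifier = LogisticRegression(input=x, n_in=28*28, n_out=10)
    print classifier.W.value.shape        # (784, 10), all zeros before training
    print classifier.b.value.shape        # (10,)
    print classifier.params               # [W, b]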
 
 
 
@@ -102,23 +102,30 @@
             \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
                 \ell (\theta=\{W,b\}, \mathcal{D})
 
-
+        :type y: theano.tensor.TensorType
         :param y: corresponds to a vector that gives for each example the
-        :correct label
+                  correct label
 
         Note: we use the mean instead of the sum so that
-        the learning rate is less dependent on the batch size
+              the learning rate is less dependent on the batch size
         """
+        # y.shape[0] is (symbolically) the number of rows in y, i.e., number of examples (call it n) in the minibatch
+        # T.arange(y.shape[0]) is a symbolic vector which will contain [0,1,2,... n-1]
+        # T.log(self.p_y_given_x) is a matrix of Log-Probabilities (call it LP) with one row per example and one column per class 
+        # LP[T.arange(y.shape[0]),y] is a vector v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., LP[n-1,y[n-1]]]
+        # and T.mean(LP[T.arange(y.shape[0]),y]) is the mean (across minibatch examples) of the elements in v,
+        # i.e., the mean log-likelihood across the minibatch.
         return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
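
The same indexing trick, traced with concrete values: numpy fancy indexing behaves just like the symbolic expression above.

    import numpy

    LP = numpy.log([[0.7, 0.2, 0.1],
                    [0.1, 0.8, 0.1]])         # log-probs: 2 examples, 3 classes
    y  = numpy.array([0, 1])                  # correct labels
    v  = LP[numpy.arange(y.shape[0]), y]      # [LP[0,0], LP[1,1]]
    print -v.mean()                           # scalar NLL: -(log 0.7 + log 0.8)/2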
 
 
-
-
-
     def errors(self, y):
         """Return a float representing the number of errors in the minibatch 
         over the total number of examples of the minibatch ; zero one
         loss over the size of the minibatch
+
+        :type y: theano.tensor.TensorType
+        :param y: corresponds to a vector that gives for each example the 
+                  correct label
         """
 
         # check if y has the same dimension as y_pred
@@ -134,72 +141,103 @@
             raise NotImplementedError()
 
 
+def load_data(dataset):
+    ''' Loads the dataset
+
+    :type dataset: string
+    :param dataset: the path to the dataset (here MNIST)
+    '''
+
+    #############
+    # LOAD DATA #
+    #############
+    print '... loading data'
+
+    # Load the dataset 
+    f = gzip.open(dataset,'rb')
+    train_set, valid_set, test_set = cPickle.load(f)
+    f.close()
+
+
+    def shared_dataset(data_xy):
+        """ Function that loads the dataset into shared variables
+        
+        The reason we store our dataset in shared variables is to allow 
+        Theano to copy it into the GPU memory (when code is run on GPU). 
+        Since copying data into the GPU is slow, copying a minibatch every
+        time it is needed (the default behaviour if the data is not in a
+        shared variable) would lead to a large decrease in performance.
+        """
+        data_x, data_y = data_xy
+        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX))
+        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
+        # When storing data on the GPU it has to be stored as floats
+        # therefore we will store the labels as ``floatX`` as well
+        # (``shared_y`` does exactly that). But during our computations
+        # we need them as ints (we use labels as index, and if they are 
+        # floats it doesn't make sense) therefore instead of returning 
+        # ``shared_y`` we will have to cast it to int. This little hack
+        # lets us get around this issue
+        return shared_x, T.cast(shared_y, 'int32')
+
+    test_set_x,  test_set_y  = shared_dataset(test_set)
+    valid_set_x, valid_set_y = shared_dataset(valid_set)
+    train_set_x, train_set_y = shared_dataset(train_set)
+
+    rval = [(train_set_x, train_set_y), (valid_set_x,valid_set_y), (test_set_x, test_set_y)]
+    return rval
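
A quick sketch of calling `load_data`, assuming `mnist.pkl.gz` (the standard 50,000/10,000/10,000 split) sits in the working directory:

    datasets = load_data('mnist.pkl.gz')
    train_set_x, train_set_y = datasets[0]
    print train_set_x.value.shape     # (50000, 784) -- images live in a shared variable
    print train_set_y                 # symbolic int32 cast of the shared label vector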
 
 
 
-def sgd_optimization_mnist( learning_rate=0.01, n_iter=100):
+
+def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, dataset='mnist.pkl.gz'):
     """
     Demonstrate stochastic gradient descent optimization of a log-linear 
     model
 
     This is demonstrated on MNIST.
     
+    :type learning_rate: float
     :param learning_rate: learning rate used (factor for the stochastic 
-    gradient
+                          gradient)
 
-    :param n_iter: maximal number of iterations ot run the optimizer 
+    :type n_epochs: int
+    :param n_epochs: maximal number of epochs to run the optimizer 
+
+    :type dataset: string
+    :param dataset: the path of the MNIST dataset file from 
+                         http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
 
     """
+    datasets = load_data(dataset)
 
-    # Load the dataset 
-    f = gzip.open('mnist.pkl.gz','rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    f.close()
-
-    # make minibatches of size 20 
-    batch_size = 20    # sized of the minibatch
+    train_set_x, train_set_y = datasets[0]
+    valid_set_x, valid_set_y = datasets[1]
+    test_set_x , test_set_y  = datasets[2]
 
-    # Dealing with the training set
-    # get the list of training images (x) and their labels (y)
-    (train_set_x, train_set_y) = train_set
-    # initialize the list of training minibatches with empty list
-    train_batches = []
-    for i in xrange(0, len(train_set_x), batch_size):
-        # add to the list of minibatches the minibatch starting at 
-        # position i, ending at position i+batch_size
-        # a minibatch is a pair ; the first element of the pair is a list 
-        # of datapoints, the second element is the list of corresponding 
-        # labels
-        train_batches = train_batches + \
-               [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
+    batch_size = 600    # size of the minibatch
 
-    # Dealing with the validation set
-    (valid_set_x, valid_set_y) = valid_set
-    # initialize the list of validation minibatches 
-    valid_batches = []
-    for i in xrange(0, len(valid_set_x), batch_size):
-        valid_batches = valid_batches + \
-               [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
-
-    # Dealing with the testing set
-    (test_set_x, test_set_y) = test_set
-    # initialize the list of testing minibatches 
-    test_batches = []
-    for i in xrange(0, len(test_set_x), batch_size):
-        test_batches = test_batches + \
-              [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
+    # compute number of minibatches for training, validation and testing
+    n_train_batches = train_set_x.value.shape[0] / batch_size
+    n_valid_batches = valid_set_x.value.shape[0] / batch_size
+    n_test_batches  = test_set_x.value.shape[0]  / batch_size
 
 
-    ishape     = (28,28) # this is the size of MNIST images
+    ######################
+    # BUILD ACTUAL MODEL #
+    ######################
+    print '... building the model'
+
 
     # allocate symbolic variables for the data
-    x = T.fmatrix()  # the data is presented as rasterized images
-    y = T.lvector()  # the labels are presented as 1D vector of 
-                     # [long int] labels
+    index = T.lscalar()    # index to a [mini]batch 
+    x     = T.matrix('x')  # the data is presented as rasterized images
+    y     = T.ivector('y') # the labels are presented as 1D vector of 
+                           # [int] labels
 
     # construct the logistic regression class
-    classifier = LogisticRegression( \
-                   input=x.reshape((batch_size,28*28)), n_in=28*28, n_out=10)
+    # Each MNIST image has size 28*28
+    classifier = LogisticRegression(input=x, n_in=28*28, n_out=10)
 
     # the cost we minimize during training is the negative log likelihood of 
     # the model in symbolic format
@@ -207,11 +245,21 @@
 
     # compiling a Theano function that computes the mistakes that are made by 
     # the model on a minibatch
-    test_model = theano.function([x,y], classifier.errors(y))
+    test_model = theano.function(inputs = [index], 
+            outputs = classifier.errors(y),
+            givens={
+                x:test_set_x[index*batch_size:(index+1)*batch_size],
+                y:test_set_y[index*batch_size:(index+1)*batch_size]})
+
+    validate_model = theano.function( inputs = [index], 
+            outputs = classifier.errors(y),
+            givens={
+                x:valid_set_x[index*batch_size:(index+1)*batch_size],
+                y:valid_set_y[index*batch_size:(index+1)*batch_size]})
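
What `givens` does, shown as a self-contained toy sketch: at compile time the symbolic input is replaced by a slice of shared (possibly GPU-resident) data, so each call ships only the integer `index` to the compiled function rather than a full minibatch.

    import numpy, theano
    import theano.tensor as T

    data  = theano.shared(numpy.asarray([1., 2., 3., 4.], dtype=theano.config.floatX))
    i     = T.lscalar()
    v     = T.vector('v')
    twice = theano.function(inputs=[i], outputs=v * 2,
                            givens={v: data[i:i + 2]})
    print twice(0)                    # [ 2.  4.] -- v was bound to data[0:2]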
 
     # compute the gradient of cost with respect to theta = (W,b) 
-    g_W = T.grad(cost, classifier.W)
-    g_b = T.grad(cost, classifier.b)
+    g_W = T.grad(cost = cost, wrt = classifier.W)
+    g_b = T.grad(cost = cost, wrt = classifier.b)
 
     # specify how to update the parameters of the model as a dictionary
     updates = {classifier.W: classifier.W - learning_rate*g_W,\
@@ -220,17 +268,25 @@
     # compiling a Theano function `train_model` that returns the cost, but in 
     # the same time updates the parameter of the model based on the rules 
     # defined in `updates`
-    train_model = theano.function([x, y], cost, updates = updates )
+    train_model = theano.function(inputs = [index], 
+            outputs = cost, 
+            updates = updates,
+            givens={
+                x:train_set_x[index*batch_size:(index+1)*batch_size],
+                y:train_set_y[index*batch_size:(index+1)*batch_size]})
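
One sketched call: `train_model(0)` evaluates the negative log-likelihood on minibatch 0 and, as a side effect of `updates`, replaces W by W - learning_rate*g_W and b by b - learning_rate*g_b (the output is computed before the update is applied).

    minibatch_avg_cost = train_model(0)
    print minibatch_avg_cost          # NLL of minibatch 0, under the pre-update parameters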
 
-    n_minibatches        = len(train_batches) # number of minibatchers
- 
+    ###############
+    # TRAIN MODEL #
+    ###############
+    print '... training the model'
     # early-stopping parameters
     patience              = 5000  # look at this many examples regardless
     patience_increase     = 2     # wait this much longer when a new best is 
                                   # found
     improvement_threshold = 0.995 # a relative improvement of this much is 
                                   # considered significant
-    validation_frequency  = n_minibatches  # go through this many 
+    validation_frequency  = min(n_train_batches, patience/2)  
+                                  # go through this many 
                                   # minibatches before checking the network 
                                   # on the validation set; in this case we 
                                   # check every epoch 
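
Worked through with the numbers above, on the standard 50,000-example MNIST training set: n_train_batches = 50000 / 600 = 83, so validation_frequency = min(83, 5000/2) = 83, i.e. validation indeed runs once per epoch, matching the comment.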
@@ -239,29 +295,24 @@
     best_validation_loss = float('inf')
     test_score           = 0.
     start_time = time.clock()
-    # have a maximum of `n_iter` iterations through the entire dataset
-    for iter in xrange(n_iter* n_minibatches):
 
-        # get epoch and minibatch index
-        epoch           = iter / n_minibatches
-        minibatch_index =  iter % n_minibatches
+    done_looping = False 
+    epoch = 0  
+    while (epoch < n_epochs) and (not done_looping):
+      epoch = epoch + 1
+      for minibatch_index in xrange(n_train_batches):
 
-        # get the minibatches corresponding to `iter` modulo
-        # `len(train_batches)`
-        x,y = train_batches[ minibatch_index ]
-        cost_ij = train_model(x,y)
+        minibatch_avg_cost = train_model(minibatch_index)
+        # iteration number, counted from the start of training
+        # (epoch was already incremented to 1 for the first pass)
+        iter = (epoch - 1) * n_train_batches + minibatch_index
 
         if (iter+1) % validation_frequency == 0: 
             # compute zero-one loss on validation set 
-            this_validation_loss = 0.
-            for x,y in valid_batches:
-                # sum up the errors for each minibatch
-                this_validation_loss += test_model(x,y)
-            # get the average by dividing with the number of minibatches
-            this_validation_loss /= len(valid_batches)
+            validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
+            this_validation_loss = numpy.mean(validation_losses)
 
             print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                 (epoch, minibatch_index+1,n_minibatches, \
+                 (epoch, minibatch_index+1,n_train_batches, \
                   this_validation_loss*100.))
 
 
@@ -275,15 +326,15 @@
                 best_validation_loss = this_validation_loss
                 # test it on the test set
 
-                test_score = 0.
-                for x,y in test_batches:
-                    test_score += test_model(x,y)
-                test_score /= len(test_batches)
+                test_losses = [test_model(i) for i in xrange(n_test_batches)]
+                test_score  = numpy.mean(test_losses)
+
                 print(('     epoch %i, minibatch %i/%i, test error of best ' 
                        'model %f %%') % \
-                  (epoch, minibatch_index+1, n_minibatches,test_score*100.))
+                  (epoch, minibatch_index+1, n_train_batches,test_score*100.))
 
         if patience <= iter:
+            done_looping = True
             break
 
     end_time = time.clock()
@@ -292,12 +343,6 @@
                  (best_validation_loss * 100., test_score*100.))
     print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
 
-
-
-
-
-
-
 if __name__ == '__main__':
     sgd_optimization_mnist()