diff code_tutoriel/mlp.py @ 165:4bc5eeec6394

Updating the tutorial code to the latest revisions.
author Dumitru Erhan <dumitru.erhan@gmail.com>
date Fri, 26 Feb 2010 13:55:27 -0500
parents 827de2cc34f8
children
--- a/code_tutoriel/mlp.py	Thu Feb 25 18:53:02 2010 -0500
+++ b/code_tutoriel/mlp.py	Fri Feb 26 13:55:27 2010 -0500
@@ -17,22 +17,65 @@
     - textbooks: "Pattern Recognition and Machine Learning" - 
                  Christopher M. Bishop, section 5
 
-TODO: recommended preprocessing, lr ranges, regularization ranges (explain 
-      to do lr first, then add regularization)
-
 """
 __docformat__ = 'restructuredtext en'
 
 
-import numpy, cPickle, gzip
-
+import numpy, time, cPickle, gzip
 
 import theano
 import theano.tensor as T
 
-import time 
+
+from logistic_sgd import LogisticRegression, load_data
+
+
+class HiddenLayer(object):
+    def __init__(self, rng, input, n_in, n_out, activation = T.tanh):
+        """
+        Typical hidden layer of an MLP: units are fully-connected and have
+        a sigmoidal activation function. Weight matrix W is of shape (n_in,n_out)
+        and the bias vector b is of shape (n_out,).
+
+        NOTE : The nonlinearity used here is tanh
+        
+        Hidden unit activation is given by: tanh(dot(input,W) + b)
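+
+        For example, with the MNIST defaults used in test_mlp below, a
+        minibatch of shape (20, 784) and n_out = 500 give an ``output``
+        of shape (20, 500), each entry squashed into (-1, 1) by the tanh.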
+
+        :type rng: numpy.random.RandomState
+        :param rng: a random number generator used to initialize weights
+
+        :type input: theano.tensor.dmatrix
+        :param input: a symbolic tensor of shape (n_examples, n_in)
+
+        :type n_in: int
+        :param n_in: dimensionality of input
 
-import theano.tensor.nnet
+        :type n_out: int
+        :param n_out: number of hidden units
+
+        :type activation: theano.Op or function
+        :param activation: Non-linearity to be applied in the hidden layer
+        """
+        self.input = input
+
+        # `W` is initialized with `W_values`, which is uniformly sampled
+        # from -sqrt(6./(n_in+n_out)) to sqrt(6./(n_in+n_out));
+        # the output of uniform is converted using asarray to dtype
+        # theano.config.floatX so that the code is runnable on GPU
+        W_values = numpy.asarray( rng.uniform( \
+              low = -numpy.sqrt(6./(n_in+n_out)), \
+              high = numpy.sqrt(6./(n_in+n_out)), \
+              size = (n_in, n_out)), dtype = theano.config.floatX)
+        self.W = theano.shared(value = W_values)
+
+        b_values = numpy.zeros((n_out,), dtype= theano.config.floatX)
+        self.b = theano.shared(value= b_values)
+
+        self.output = activation(T.dot(input, self.W) + self.b)
+        # parameters of the model
+        self.params = [self.W, self.b]
+
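+# A hypothetical smoke test for HiddenLayer (a sketch, not part of the
+# tutorial proper):
+#
+#   rng   = numpy.random.RandomState(1234)
+#   x     = T.matrix('x')
+#   layer = HiddenLayer(rng, x, n_in = 784, n_out = 500)
+#   f     = theano.function([x], layer.output)
+#   print f(numpy.ones((2, 784), dtype = theano.config.floatX)).shape
+#   # prints (2, 500)
+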
 
 class MLP(object):
     """Multi-Layer Perceptron Class
@@ -40,188 +83,132 @@
     A multilayer perceptron is a feedforward artificial neural network model 
     that has one or more layers of hidden units and nonlinear activations. 
     Intermediate layers usually have as activation function tanh or the 
-    sigmoid function  while the top layer is a softamx layer. 
+    sigmoid function (defined here by the ``HiddenLayer`` class) while the 
+    top layer is a softmax layer (defined here by the ``LogisticRegression`` 
+    class). 
     """
 
 
 
-    def __init__(self, input, n_in, n_hidden, n_out):
+    def __init__(self, rng, input, n_in, n_hidden, n_out):
         """Initialize the parameters for the multilayer perceptron
 
+        :type rng: numpy.random.RandomState
+        :param rng: a random number generator used to initialize weights
+
+        :type input: theano.tensor.TensorType
         :param input: symbolic variable that describes the input of the 
         architecture (one minibatch)
 
+        :type n_in: int
         :param n_in: number of input units, the dimension of the space in 
         which the datapoints lie
 
+        :type n_hidden: int
         :param n_hidden: number of hidden units 
 
+        :type n_out: int
         :param n_out: number of output units, the dimension of the space in 
         which the labels lie
 
         """
 
-        # initialize the parameters theta = (W1,b1,W2,b2) ; note that this 
-        # example contains only one hidden layer, but one can have as many 
-        # layers as he/she wishes, making the network deeper. The only 
-        # problem making the network deep this way is during learning, 
-        # backpropagation being unable to move the network from the starting
-        # point towards; this is where pre-training helps, giving a good 
-        # starting point for backpropagation, but more about this in the 
-        # other tutorials
-        
-        # `W1` is initialized with `W1_values` which is uniformely sampled
-        # from -6./sqrt(n_in+n_hidden) and 6./sqrt(n_in+n_hidden)
-        # the output of uniform if converted using asarray to dtype 
-        # theano.config.floatX so that the code is runable on GPU
-        W1_values = numpy.asarray( numpy.random.uniform( \
-              low = -numpy.sqrt(6./(n_in+n_hidden)), \
-              high = numpy.sqrt(6./(n_in+n_hidden)), \
-              size = (n_in, n_hidden)), dtype = theano.config.floatX)
-        # `W2` is initialized with `W2_values` which is uniformely sampled 
-        # from -6./sqrt(n_hidden+n_out) and 6./sqrt(n_hidden+n_out)
-        # the output of uniform if converted using asarray to dtype 
-        # theano.config.floatX so that the code is runable on GPU
-        W2_values = numpy.asarray( numpy.random.uniform( 
-              low = -numpy.sqrt(6./(n_hidden+n_out)), \
-              high= numpy.sqrt(6./(n_hidden+n_out)),\
-              size= (n_hidden, n_out)), dtype = theano.config.floatX)
+        # Since we are dealing with a one-hidden-layer MLP, this will
+        # translate into a HiddenLayer with a tanh activation function
+        # connected to the LogisticRegression layer; the tanh can be
+        # replaced by a sigmoid or any other nonlinearity
+        self.hiddenLayer = HiddenLayer(rng = rng, input = input, 
+                                 n_in = n_in, n_out = n_hidden,
+                                 activation = T.tanh)
 
-        self.W1 = theano.shared( value = W1_values )
-        self.b1 = theano.shared( value = numpy.zeros((n_hidden,), 
-                                                dtype= theano.config.floatX))
-        self.W2 = theano.shared( value = W2_values )
-        self.b2 = theano.shared( value = numpy.zeros((n_out,), 
-                                                dtype= theano.config.floatX))
+        # The logistic regression layer gets as input the hidden units 
+        # of the hidden layer
+        self.logRegressionLayer = LogisticRegression( 
+                                    input = self.hiddenLayer.output,
+                                    n_in  = n_hidden,
+                                    n_out = n_out)
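+        # taken together the graph reads: input (batch, n_in) -> tanh
+        # hidden layer (batch, n_hidden) -> softmax output (batch, n_out);
+        # with the defaults in test_mlp below that is 784 -> 500 -> 10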
 
-        # symbolic expression computing the values of the hidden layer
-        self.hidden = T.tanh(T.dot(input, self.W1)+ self.b1)
-
-        # symbolic expression computing the values of the top layer 
-        self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden, self.W2)+self.b2)
-
-        # compute prediction as class whose probability is maximal in 
-        # symbolic form
-        self.y_pred = T.argmax( self.p_y_given_x, axis =1)
-        
         # L1 norm ; one regularization option is to enforce L1 norm to 
         # be small 
-        self.L1     = abs(self.W1).sum() + abs(self.W2).sum()
+        self.L1 = abs(self.hiddenLayer.W).sum() \
+                + abs(self.logRegressionLayer.W).sum()
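+        # only the weight matrices enter the penalty; leaving the biases
+        # unregularized is the usual convention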
 
         # square of L2 norm ; one regularization option is to enforce 
         # square of L2 norm to be small
-        self.L2_sqr = (self.W1**2).sum() + (self.W2**2).sum()
-
-
-
-    def negative_log_likelihood(self, y):
-        """Return the mean of the negative log-likelihood of the prediction
-        of this model under a given target distribution.
-
-        .. math::
+        self.L2_sqr = (self.hiddenLayer.W**2).sum() \
+                    + (self.logRegressionLayer.W**2).sum()
 
-            \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = 
-            \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
-                \ell (\theta=\{W,b\}, \mathcal{D}) 
-
+        # negative log likelihood of the MLP is given by the negative 
+        # log likelihood of the output of the model, computed in the 
+        # logistic regression layer
+        self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood
+        # same holds for the function computing the number of errors
+        self.errors = self.logRegressionLayer.errors
 
-        :param y: corresponds to a vector that gives for each example the
-        :correct label
-        """
-        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
-
+        # the parameters of the model are the parameters of the two layers
+        # it is made out of
+        self.params = self.hiddenLayer.params + self.logRegressionLayer.params
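+        # assuming ``LogisticRegression.params`` is [W, b] as in
+        # logistic_sgd.py, the combined list reads
+        # [W_hidden, b_hidden, W_logreg, b_logreg]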
 
 
-
-    def errors(self, y):
-        """Return a float representing the number of errors in the minibatch 
-        over the total number of examples of the minibatch 
-        """
-
-        # check if y has same dimension of y_pred 
-        if y.ndim != self.y_pred.ndim:
-            raise TypeError('y should have the same shape as self.y_pred', 
-                ('y', target.type, 'y_pred', self.y_pred.type))
-        # check if y is of the correct datatype        
-        if y.dtype.startswith('int'):
-            # the T.neq operator returns a vector of 0s and 1s, where 1
-            # represents a mistake in prediction
-            return T.mean(T.neq(self.y_pred, y))
-        else:
-            raise NotImplementedError()
-
-
-
-def sgd_optimization_mnist( learning_rate=0.01, L1_reg = 0.00, \
-                            L2_reg = 0.0001, n_iter=100):
+def test_mlp( learning_rate=0.01, L1_reg = 0.00, L2_reg = 0.0001, n_epochs=1000,
+            dataset = 'mnist.pkl.gz'):
     """
     Demonstrate stochastic gradient descent optimization for a multilayer 
     perceptron
 
     This is demonstrated on MNIST.
 
+    :type learning_rate: float
     :param learning_rate: learning rate used (factor for the stochastic 
     gradient)
 
+    :type L1_reg: float
     :param L1_reg: L1-norm's weight when added to the cost (see 
     regularization)
 
+    :type L2_reg: float
     :param L2_reg: L2-norm's weight when added to the cost (see 
     regularization)
  
-    :param n_iter: maximal number of iterations ot run the optimizer 
+    :type n_epochs: int
+    :param n_epochs: maximal number of epochs to run the optimizer 
+
+    :type dataset: string
+    :param dataset: the path of the MNIST dataset file from 
+                         http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
+
 
    """
-
-    # Load the dataset 
-    f = gzip.open('mnist.pkl.gz','rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    f.close()
-
-    # make minibatches of size 20 
-    batch_size = 20    # sized of the minibatch
+    datasets = load_data(dataset)
 
-    # Dealing with the training set
-    # get the list of training images (x) and their labels (y)
-    (train_set_x, train_set_y) = train_set
-    # initialize the list of training minibatches with empty list
-    train_batches = []
-    for i in xrange(0, len(train_set_x), batch_size):
-        # add to the list of minibatches the minibatch starting at 
-        # position i, ending at position i+batch_size
-        # a minibatch is a pair ; the first element of the pair is a list 
-        # of datapoints, the second element is the list of corresponding 
-        # labels
-        train_batches = train_batches + \
-               [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
+    train_set_x, train_set_y = datasets[0]
+    valid_set_x, valid_set_y = datasets[1]
+    test_set_x , test_set_y  = datasets[2]
 
-    # Dealing with the validation set
-    (valid_set_x, valid_set_y) = valid_set
-    # initialize the list of validation minibatches 
-    valid_batches = []
-    for i in xrange(0, len(valid_set_x), batch_size):
-        valid_batches = valid_batches + \
-               [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
-
-    # Dealing with the testing set
-    (test_set_x, test_set_y) = test_set
-    # initialize the list of testing minibatches 
-    test_batches = []
-    for i in xrange(0, len(test_set_x), batch_size):
-        test_batches = test_batches + \
-              [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
 
 
-    ishape     = (28,28) # this is the size of MNIST images
+    batch_size = 20    # size of the minibatch
+
+    # compute number of minibatches for training, validation and testing
+    n_train_batches = train_set_x.value.shape[0] / batch_size
+    n_valid_batches = valid_set_x.value.shape[0] / batch_size
+    n_test_batches  = test_set_x.value.shape[0]  / batch_size
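+    # with the standard MNIST split of 50000/10000/10000 examples this
+    # gives 2500/500/500 minibatches; note that the integer division
+    # silently drops any incomplete final minibatch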
+
+    ######################
+    # BUILD ACTUAL MODEL #
+    ###################### 
+    print '... building the model'
 
     # allocate symbolic variables for the data
-    x = T.fmatrix()  # the data is presented as rasterized images
-    y = T.lvector()  # the labels are presented as 1D vector of 
-                          # [long int] labels
+    index = T.lscalar()    # index to a [mini]batch 
+    x     = T.matrix('x')  # the data is presented as rasterized images
+    y     = T.ivector('y') # the labels are presented as 1D vector of 
+                           # [int] labels
 
-    # construct the logistic regression class
-    classifier = MLP( input=x.reshape((batch_size,28*28)),\
-                      n_in=28*28, n_hidden = 500, n_out=10)
+    rng = numpy.random.RandomState(1234)
+
+    # construct the MLP class
+    classifier = MLP( rng = rng, input=x, n_in=28*28, n_hidden = 500, n_out=10)
 
     # the cost we minimize during training is the negative log likelihood of 
     # the model plus the regularization terms (L1 and L2); cost is expressed
@@ -230,36 +217,59 @@
          + L1_reg * classifier.L1 \
          + L2_reg * classifier.L2_sqr 
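+    # with the default settings (L1_reg = 0.0, L2_reg = 0.0001) only the
+    # L2 term is active, so cost = mean NLL + 0.0001 * (sum of squared
+    # entries of both weight matrices)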
 
-    # compiling a theano function that computes the mistakes that are made by 
-    # the model on a minibatch
-    test_model = theano.function([x,y], classifier.errors(y))
+    # compiling a Theano function that computes the mistakes that are made
+    # by the model on a minibatch
+    test_model = theano.function(inputs = [index], 
+            outputs = classifier.errors(y),
+            givens={
+                x:test_set_x[index*batch_size:(index+1)*batch_size],
+                y:test_set_y[index*batch_size:(index+1)*batch_size]})
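+    # the ``givens`` dictionary substitutes slices of the shared dataset
+    # variables for the symbolic x and y before compilation, so a call
+    # like test_model(0) evaluates the error on the first test minibatch
+    # without passing the data itself as an argument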
 
-    # compute the gradient of cost with respect to theta = (W1, b1, W2, b2) 
-    g_W1 = T.grad(cost, classifier.W1)
-    g_b1 = T.grad(cost, classifier.b1)
-    g_W2 = T.grad(cost, classifier.W2)
-    g_b2 = T.grad(cost, classifier.b2)
+    validate_model = theano.function(inputs = [index], 
+            outputs = classifier.errors(y),
+            givens={
+                x:valid_set_x[index*batch_size:(index+1)*batch_size],
+                y:valid_set_y[index*batch_size:(index+1)*batch_size]})
+
+    # compute the gradient of cost with respect to theta (stored in params)
+    # the resulting gradients will be stored in a list gparams
+    gparams = []
+    for param in classifier.params:
+        gparam  = T.grad(cost, param)
+        gparams.append(gparam)
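+    # gparams lines up with classifier.params, i.e. it holds
+    # [d cost/d W_hidden, d cost/d b_hidden, d cost/d W_logreg, d cost/d b_logreg]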
+
 
     # specify how to update the parameters of the model as a dictionary
-    updates = \
-        { classifier.W1: classifier.W1 - learning_rate*g_W1 \
-        , classifier.b1: classifier.b1 - learning_rate*g_b1 \
-        , classifier.W2: classifier.W2 - learning_rate*g_W2 \
-        , classifier.b2: classifier.b2 - learning_rate*g_b2 }
+    updates = {}
+    # given two lists of the same length, A = [a1, a2, a3, a4] and
+    # B = [b1, b2, b3, b4], zip generates a list C of the same length,
+    # where each element is a pair formed from the two lists:
+    #    C = [ (a1, b1), (a2, b2), (a3, b3), (a4, b4) ]
+    for param, gparam in zip(classifier.params, gparams):
+        updates[param] = param - learning_rate*gparam
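+    # schematically each entry reads
+    #   updates[W] = W - learning_rate * d cost / d W
+    # for every shared variable W in classifier.params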
 
-    # compiling a theano function `train_model` that returns the cost, but in 
-    # the same time updates the parameter of the model based on the rules 
+    # compiling a Theano function `train_model` that returns the cost, but
+    # at the same time updates the parameters of the model based on the rules
     # defined in `updates`
-    train_model = theano.function([x, y], cost, updates = updates )
-    n_minibatches        = len(train_batches) 
- 
+    train_model = theano.function( inputs = [index], outputs = cost, 
+            updates = updates,
+            givens={
+                x:train_set_x[index*batch_size:(index+1)*batch_size],
+                y:train_set_y[index*batch_size:(index+1)*batch_size]})
+
+    ###############
+    # TRAIN MODEL #
+    ###############
+    print '... training'
+
     # early-stopping parameters
     patience              = 10000 # look at this many examples regardless
     patience_increase     = 2     # wait this much longer when a new best is 
                                   # found
     improvement_threshold = 0.995 # a relative improvement of this much is 
                                   # considered significant
-    validation_frequency  = n_minibatches  # go through this many 
+    validation_frequency  = min(n_train_batches,patience/2)  
+                                  # go through this many 
                                   # minibatches before checking the network 
                                   # on the validation set; in this case we 
                                   # check every epoch 
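+                                  # (with patience = 10000 and, on the
+                                  # standard MNIST split, n_train_batches
+                                  # = 2500, this is min(2500, 5000) = 2500,
+                                  # i.e. one validation per epoch)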
@@ -270,56 +280,49 @@
     best_iter            = 0
     test_score           = 0.
     start_time = time.clock()
-    # have a maximum of `n_iter` iterations through the entire dataset
-    for iter in xrange(n_iter* n_minibatches):
+
+    epoch = 0
+    done_looping = False
 
-        # get epoch and minibatch index
-        epoch           = iter / n_minibatches
-        minibatch_index =  iter % n_minibatches
+    while (epoch < n_epochs) and (not done_looping):
+      epoch = epoch + 1
+      for minibatch_index in xrange(n_train_batches):
 
-        # get the minibatches corresponding to `iter` modulo
-        # `len(train_batches)`
-        x,y = train_batches[ minibatch_index ]
-        cost_ij = train_model(x,y)
+        minibatch_avg_cost = train_model(minibatch_index)
+        # iteration number; (epoch - 1) because epoch was incremented at
+        # the top of the while loop, so iter counts the minibatches
+        # processed so far, starting from 0
+        iter = (epoch - 1) * n_train_batches + minibatch_index
 
         if (iter+1) % validation_frequency == 0: 
             # compute zero-one loss on validation set 
-            this_validation_loss = 0.
-            for x,y in valid_batches:
-                # sum up the errors for each minibatch
-                this_validation_loss += test_model(x,y)
-            # get the average by dividing with the number of minibatches
-            this_validation_loss /= len(valid_batches)
+            validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
+            this_validation_loss = numpy.mean(validation_losses)
 
             print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                   (epoch, minibatch_index+1, n_minibatches, \
-                    this_validation_loss*100.))
+                 (epoch, minibatch_index+1,n_train_batches, \
+                  this_validation_loss*100.))
 
 
             # if we got the best validation score until now
             if this_validation_loss < best_validation_loss:
-
                 #improve patience if loss improvement is good enough
                 if this_validation_loss < best_validation_loss *  \
                        improvement_threshold :
                     patience = max(patience, iter * patience_increase)
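+                    # e.g. a sufficiently good improvement found at
+                    # iter 6000 raises patience to max(10000, 12000)
+                    # = 12000 iterations, postponing early stopping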
 
-                # save best validation score and iteration number
                 best_validation_loss = this_validation_loss
-                best_iter = iter
-
                 # test it on the test set
-                test_score = 0.
-                for x,y in test_batches:
-                    test_score += test_model(x,y)
-                test_score /= len(test_batches)
-                print(('     epoch %i, minibatch %i/%i, test error of best '
-                      'model %f %%') % 
-                             (epoch, minibatch_index+1, n_minibatches,
-                              test_score*100.))
+
+                test_losses = [test_model(i) for i in xrange(n_test_batches)]
+                test_score = numpy.mean(test_losses)
+
+                print(('     epoch %i, minibatch %i/%i, test error of best ' 
+                       'model %f %%') % \
+                  (epoch, minibatch_index+1, n_train_batches,test_score*100.))
 
         if patience <= iter :
-            break
+            done_looping = True
+            break
+
 
     end_time = time.clock()
     print(('Optimization complete. Best validation score of %f %% '
@@ -329,5 +332,5 @@
 
 
 if __name__ == '__main__':
-    sgd_optimization_mnist()
+    test_mlp()