changeset 159:e81241cfc2de

merge
author Myriam Cote <cotemyri@iro.umontreal.ca>
date Thu, 25 Feb 2010 09:05:48 -0500
parents d1bb6e06497a (diff) 221799d79188 (current diff)
children 68160fd149fe
files
diffstat 1 files changed, 437 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline_algorithms/log_reg/log_reg.py	Thu Feb 25 09:05:48 2010 -0500
@@ -0,0 +1,437 @@
+"""
+This tutorial introduces logistic regression using Theano and stochastic 
+gradient descent.  
+
+Logistic regression is a probabilistic, linear classifier. It is parametrized
+by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
+done by projecting data points onto a set of hyperplanes, the distance to
+which is used to determine a class membership probability. 
+
+Mathematically, this can be written as:
+
+.. math::
+  P(Y=i|x, W,b) &= softmax_i(W x + b) \\
+                &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}
+
+
+The model's prediction is obtained by taking the argmax of the vector whose
+i-th element is P(Y=i|x).
+
+.. math::
+
+  y_{pred} = argmax_i P(Y=i|x,W,b)
+
+
+This tutorial presents a stochastic gradient descent optimization method 
+suitable for large datasets.
+
+
+References:
+
+    - textbooks: "Pattern Recognition and Machine Learning" - 
+                 Christopher M. Bishop, section 4.3.2
+
+"""
+__docformat__ = 'restructuredtext en'
+
+import numpy, time, cPickle, gzip
+
+import theano
+import theano.tensor as T
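+
+# Purely illustrative NumPy sketch of the prediction rule from the docstring
+# above (softmax followed by argmax). The helper name ``numpy_predict`` is
+# hypothetical and is not used by the Theano code below; ``W`` is assumed to
+# have shape (n_in, n_out) and ``b`` shape (n_out,).
+def numpy_predict( x, W, b ):
+    a = numpy.dot( x, W ) + b                        # class scores, shape (n_examples, n_out)
+    e = numpy.exp( a - a.max( axis = 1 )[:, None] )  # numerically stable exponentials
+    p_y_given_x = e / e.sum( axis = 1 )[:, None]     # softmax: P(Y=i|x) per row
+    return numpy.argmax( p_y_given_x, axis = 1 )     # y_pred = argmax_i P(Y=i|x)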
+
+
+class LogisticRegression(object):
+    """Multi-class Logistic Regression Class
+
+    The logistic regression is fully described by a weight matrix :math:`W` 
+    and bias vector :math:`b`. Classification is done by projecting data 
+    points onto a set of hyperplanes, the distance to which is used to 
+    determine a class membership probability. 
+    """
+
+
+    def __init__( self, input, n_in, n_out ):
+        """ Initialize the parameters of the logistic regression
+
+        :type input: theano.tensor.TensorType
+        :param input: symbolic variable that describes the input of the 
+                      architecture (one minibatch)
+        
+        :type n_in: int
+        :param n_in: number of input units, the dimension of the space in 
+                     which the datapoints lie
+
+        :type n_out: int
+        :param n_out: number of output units, the dimension of the space in 
+                      which the labels lie
+
+        """ 
+
+        # initialize the weights W as a zero matrix of shape (n_in, n_out)
+        self.W = theano.shared( value = numpy.zeros(( n_in, n_out ), dtype = theano.config.floatX ),
+                                name = 'W')
+        # initialize the biases b as a vector of n_out zeros
+        self.b = theano.shared( value = numpy.zeros(( n_out, ), dtype = theano.config.floatX ),
+                               name = 'b')
+
+
+        # compute vector of class-membership probabilities in symbolic form
+        self.p_y_given_x = T.nnet.softmax( T.dot( input, self.W ) + self.b )
+
+        # compute prediction as class whose probability is maximal in 
+        # symbolic form
+        self.y_pred = T.argmax( self.p_y_given_x, axis = 1 )
+
+        # parameters of the model
+        self.params = [ self.W, self.b ]
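+
+        # Example construction (mirroring the usage in ``log_reg`` below), for
+        # 28*28 MNIST images and 10 digit classes:
+        #   x = T.matrix('x')
+        #   classifier = LogisticRegression( input = x, n_in = 28 * 28, n_out = 10 )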
+
+
+    def negative_log_likelihood( self, y ):
+        """Return the mean of the negative log-likelihood of the prediction
+        of this model under a given target distribution.
+
+        .. math::
+
+            \ell (\theta=\{W,b\}, \mathcal{D}) = \frac{1}{|\mathcal{D}|}
+                \sum_{i=0}^{|\mathcal{D}|-1} \log(P(Y=y^{(i)}|x^{(i)}, W, b))
+
+        :type y: theano.tensor.TensorType
+        :param y: corresponds to a vector that gives for each example the
+                  correct label
+
+        Note: we use the mean instead of the sum so that
+              the learning rate is less dependent on the batch size
+        """
+        # y.shape[0] is (symbolically) the number of rows in y, i.e., number of examples (call it n) in the minibatch
+        # T.arange(y.shape[0]) is a symbolic vector which will contain [0,1,2,... n-1]
+        # T.log(self.p_y_given_x) is a matrix of Log-Probabilities (call it LP) with one row per example and one column per class 
+        # LP[T.arange(y.shape[0]),y] is a vector v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., LP[n-1,y[n-1]]]
+        # and T.mean(LP[T.arange(y.shape[0]),y]) is the mean (across minibatch examples) of the elements in v,
+        # i.e., the mean log-likelihood across the minibatch.
+        return -T.mean( T.log( self.p_y_given_x )[ T.arange( y.shape[0] ), y ] )
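+        # A hypothetical NumPy analogue of the indexing above, for intuition
+        # (``p`` is an (n, n_classes) array of probabilities):
+        #   LP  = numpy.log( p )
+        #   v   = LP[ numpy.arange( n ), y ]   # one log-probability per example
+        #   nll = -v.mean()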
+
+
+    def errors( self, y ):
+        """Return a float representing the number of errors in the minibatch 
+        over the total number of examples of the minibatch ; zero one
+        loss over the size of the minibatch
+
+        :type y: theano.tensor.TensorType
+        :param y: corresponds to a vector that gives for each example the 
+                  correct label
+        """
+
+        # check if y has same dimension of y_pred 
+        if y.ndim != self.y_pred.ndim:
+            raise TypeError( 'y should have the same shape as self.y_pred',
+                ( 'y', y.type, 'y_pred', self.y_pred.type ) )
+        # check if y is of the correct datatype        
+        if y.dtype.startswith('int'):
+            # the T.neq operator returns a vector of 0s and 1s, where 1
+            # represents a mistake in prediction
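+            # (a hypothetical NumPy analogue: numpy.mean( y_pred != y ))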
+            return T.mean( T.neq( self.y_pred, y ) )
+        else:
+            raise NotImplementedError()
+        
+def shared_dataset( data_xy ):
+    """ Function that loads the dataset into shared variables
+
+    The reason we store our dataset in shared variables is to allow
+    Theano to copy it into the GPU memory (when code is run on GPU).
+    Since copying data into the GPU is slow, copying a minibatch every time
+    it is needed (the default behaviour if the data is not in a shared
+    variable) would lead to a large decrease in performance.
+    """
+    data_x, data_y = data_xy
+    shared_x = theano.shared( numpy.asarray( data_x, dtype = theano.config.floatX ) )
+    shared_y = theano.shared( numpy.asarray( data_y, dtype = theano.config.floatX ) )
+    # When storing data on the GPU it has to be stored as floats,
+    # therefore we store the labels as ``floatX`` as well
+    # (``shared_y`` does exactly that). But during our computations
+    # we need them as ints (we use the labels as indices, and if they are
+    # floats that does not make sense), therefore instead of returning
+    # ``shared_y`` we cast it to int. This little hack
+    # lets us get around the issue.
+    return shared_x, T.cast( shared_y, 'int32' )
+
+def load_data_pkl_gz( dataset ):
+    ''' Loads the dataset
+
+    :type dataset: string
+    :param dataset: the path to the dataset (here MNIST)
+    '''
+
+    #--------------------------------------------------------------------------------------------------------------------
+    # Load Data
+    #--------------------------------------------------------------------------------------------------------------------
+
+
+    print '... loading data'
+
+    # Load the dataset 
+    f = gzip.open(dataset,'rb')
+    train_set, valid_set, test_set = cPickle.load(f)
+    f.close()
+    
+    test_set_x,  test_set_y  = shared_dataset( test_set )
+    valid_set_x, valid_set_y = shared_dataset( valid_set )
+    train_set_x, train_set_y = shared_dataset( train_set )
+
+    rval = [ ( train_set_x, train_set_y ), ( valid_set_x,valid_set_y ), ( test_set_x, test_set_y ) ]
+    return rval
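+
+# Example usage (assuming ``mnist.pkl.gz`` is available on disk):
+#   datasets = load_data_pkl_gz( 'mnist.pkl.gz' )
+#   ( train_set_x, train_set_y ) = datasets[0]
+#   ( valid_set_x, valid_set_y ) = datasets[1]
+#   ( test_set_x,  test_set_y  ) = datasets[2]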
+
+##def load_data_ft( verbose = False,
+##                  data_path = '/data/lisa/data/nist/by_class/',
+##                  train_data = 'all/all_train_data.ft',
+##                  train_labels = 'all/all_train_labels.ft',
+##                  test_data = 'all/all_test_data.ft',
+##                  test_labels = 'all/all_test_labels.ft'):
+##
+##    train_data_file   = open(data_path + train_data)
+##    train_labels_file = open(data_path + train_labels)
+##    test_data_file    = open(data_path + test_data)
+##    test_labels_file  = open(data_path + test_labels)
+##
+##    raw_train_data   = ft.read(train_data_file)
+##    raw_train_labels = ft.read(train_labels_file)
+##    raw_test_data    = ft.read(test_data_file)
+##    raw_test_labels  = ft.read(test_labels_file)
+##
+##    train_data_file.close()
+##    train_labels_file.close()
+##    test_data_file.close()
+##    test_labels_file.close()
+##    
+##    
+##    test_set_x,  test_set_y  = shared_dataset(test_set)
+##    valid_set_x, valid_set_y = shared_dataset(valid_set)
+##    train_set_x, train_set_y = shared_dataset(train_set)
+##
+##    rval = [(train_set_x, train_set_y), (valid_set_x,valid_set_y), (test_set_x, test_set_y)]
+##    return rval
+##    #create a validation set the same size as the test size
+##    #use the end of the training array for this purpose
+##    #discard the last remaining so we get a %batch_size number
+##    test_size=len(raw_test_labels)
+##    test_size = int(test_size/batch_size)
+##    test_size*=batch_size
+##    train_size = len(raw_train_data)
+##    train_size = int(train_size/batch_size)
+##    train_size*=batch_size
+##    validation_size =test_size 
+##    offset = train_size-test_size
+##    if verbose == True:
+##        print 'train size = %d' %train_size
+##        print 'test size = %d' %test_size
+##        print 'valid size = %d' %validation_size
+##        print 'offset = %d' %offset
+##    
+##    
+
+#--------------------------------------------------------------------------------------------------------------------
+# MAIN
+#--------------------------------------------------------------------------------------------------------------------
+
+def log_reg( learning_rate = 0.13, nb_max_examples = 1000000, batch_size = 50,
+             dataset_name = 'mnist.pkl.gz', image_size = 28 * 28, nb_class = 10,
+             patience = 5000, patience_increase = 2, improvement_threshold = 0.995 ):
+    
+    """
+    Demonstrate stochastic gradient descent optimization of a log-linear 
+    model
+
+    This is demonstrated on MNIST.
+    
+    :type learning_rate: float
+    :param learning_rate: learning rate used (factor for the stochastic 
+                          gradient)
+
+    :type nb_max_examples: int
+    :param nb_max_examples: maximal number of training examples to process
+                            (used to derive the number of epochs)
+
+    :type batch_size: int
+    :param batch_size: size of the minibatch
+
+    :type dataset_name: string
+    :param dataset_name: the path of the MNIST dataset file from
+                         http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
+                        
+    :type image_size: int
+    :param image_size: size of the input image in pixels (width * height)
+    
+    :type nb_class: int
+    :param nb_class: number of classes
+    
+    :type patience: int
+    :param patience: look at this many examples regardless
+    
+    :type patience_increase: int
+    :param patience_increase: wait this much longer when a new best is found
+    
+    :type improvement_threshold: float
+    :param improvement_threshold: a relative improvement of this much is considered significant
+
+
+    """
+    datasets = load_data_pkl_gz( dataset_name )
+
+    train_set_x, train_set_y = datasets[0]
+    valid_set_x, valid_set_y = datasets[1]
+    test_set_x , test_set_y   = datasets[2]
+
+    # compute number of minibatches for training, validation and testing
+    n_train_batches = train_set_x.value.shape[0] / batch_size
+    n_valid_batches = valid_set_x.value.shape[0] / batch_size
+    n_test_batches  = test_set_x.value.shape[0]  / batch_size
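+    # e.g. the MNIST pickle provides 50000 training, 10000 validation and
+    # 10000 test examples, so with batch_size = 50 this gives 1000, 200 and
+    # 200 minibatches respectively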
+
+    #--------------------------------------------------------------------------------------------------------------------
+    # Build actual model
+    #--------------------------------------------------------------------------------------------------------------------
+    
+    print '... building the model'
+
+    # allocate symbolic variables for the data
+    index = T.lscalar()    # index to a [mini]batch
+    x     = T.matrix('x')  # the data is presented as rasterized images
+    y     = T.ivector('y') # the labels are presented as a 1D vector of
+                           # [int] labels
+
+    # construct the logistic regression class
+    
+    classifier = LogisticRegression( input = x, n_in = image_size, n_out = nb_class )
+
+    # the cost we minimize during training is the negative log likelihood of 
+    # the model in symbolic format
+    cost = classifier.negative_log_likelihood( y ) 
+
+    # compiling a Theano function that computes the mistakes that are made by 
+    # the model on a minibatch
+    test_model = theano.function( inputs = [ index ], 
+            outputs = classifier.errors( y ),
+            givens = {
+                x:test_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
+                y:test_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )
+
+    validate_model = theano.function( inputs = [ index ], 
+            outputs = classifier.errors( y ),
+            givens = {
+                x:valid_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
+                y:valid_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )
+
+    # compute the gradient of cost with respect to theta = ( W, b ) 
+    g_W = T.grad( cost = cost, wrt = classifier.W )
+    g_b  = T.grad( cost = cost, wrt = classifier.b )
+
+    # specify how to update the parameters of the model as a dictionary
+    updates = { classifier.W: classifier.W - learning_rate * g_W,
+                classifier.b: classifier.b - learning_rate * g_b }
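+    # i.e. the usual SGD rule: W <- W - learning_rate * dcost/dW and
+    # b <- b - learning_rate * dcost/db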
+
+    # compiling a Theano function `train_model` that returns the cost and, at
+    # the same time, updates the parameters of the model based on the rules
+    # defined in `updates`
+    train_model = theano.function( inputs = [ index ], 
+            outputs = cost, 
+            updates = updates,
+            givens = {
+                x: train_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
+                y: train_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )
+
+    #--------------------------------------------------------------------------------------------------------------------
+    # Train model
+    #--------------------------------------------------------------------------------------------------------------------
+   
+    print '... training the model'
+    # early-stopping parameters
+    # ``patience``, ``patience_increase`` and ``improvement_threshold`` are
+    # taken from the function arguments above, so they are not redefined here
+    # (a local redefinition would silently override the caller's values)
+    validation_frequency  = min( n_train_batches, patience * 0.5 )
+                                  # go through this many
+                                  # minibatches before checking the network
+                                  # on the validation set; in this case we
+                                  # check every epoch
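+    # e.g. with patience = 5000 and patience_increase = 2, an improvement found
+    # at iteration 4000 raises patience to max( 5000, 4000 * 2 ) = 8000, so the
+    # optimizer keeps looking at minibatches until iteration 8000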
+
+    best_params          = None
+    best_validation_loss = float('inf')
+    test_score           = 0.
+    start_time           = time.clock()
+
+    done_looping = False
+    n_epochs     = nb_max_examples / train_set_x.value.shape[0]
+    epoch        = 0
+    
+    while ( epoch < n_epochs ) and ( not done_looping ):
+        
+      epoch = epoch + 1
+      for minibatch_index in xrange( n_train_batches ):
+
+        minibatch_avg_cost = train_model( minibatch_index )
+        # iteration number
+        iter = epoch * n_train_batches + minibatch_index
+
+        if ( iter + 1 ) % validation_frequency == 0: 
+            # compute zero-one loss on validation set 
+            validation_losses     = [ validate_model( i ) for i in xrange( n_valid_batches ) ]
+            this_validation_loss = numpy.mean( validation_losses )
+
+            print('epoch %i, minibatch %i/%i, validation error %f %%' % \
+                 ( epoch, minibatch_index + 1,n_train_batches, \
+                  this_validation_loss*100. ) )
+
+
+            # if we got the best validation score until now
+            if this_validation_loss < best_validation_loss:
+                #improve patience if loss improvement is good enough
+                if this_validation_loss < best_validation_loss *  \
+                       improvement_threshold :
+                    patience = max( patience, iter * patience_increase )
+
+                best_validation_loss = this_validation_loss
+                # test it on the test set
+
+                test_losses = [test_model(i) for i in xrange(n_test_batches)]
+                test_score  = numpy.mean(test_losses)
+
+                print(('     epoch %i, minibatch %i/%i, test error of best ' 
+                       'model %f %%') % \
+                  (epoch, minibatch_index+1, n_train_batches,test_score*100.))
+
+        if patience <= iter:
+            done_looping = True
+            break
+
+    end_time = time.clock()
+    print(('Optimization complete with best validation score of %f %%, '
+           'with test performance %f %%') %
+          ( best_validation_loss * 100., test_score * 100.))
+    print ('The code ran for %f minutes' % ((end_time-start_time) / 60.))
+    
+    # values expected by ``jobman_log_reg`` below: validation error, test
+    # error, number of training examples seen, and run time in minutes
+    return best_validation_loss, test_score, ( iter + 1 ) * batch_size, ( end_time - start_time ) / 60.
+
+if __name__ == '__main__':
+    log_reg()
+    
+ 
+def jobman_log_reg(state, channel):
+    ( validation_error, test_error, nb_exemples, time ) = log_reg(
+            learning_rate   = state.learning_rate,
+            nb_max_examples = state.nb_max_examples,
+            batch_size      = state.batch_size,
+            dataset_name    = state.dataset_name,
+            image_size      = state.image_size,
+            nb_class        = state.nb_class )
+
+    state.validation_error = validation_error
+    state.test_error = test_error
+    state.nb_exemples = nb_exemples
+    state.time = time
+    return channel.COMPLETE
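+
+# ``jobman_log_reg`` assumes ``state`` is a jobman state object exposing the
+# hyper-parameters read above (learning_rate, nb_max_examples, batch_size,
+# dataset_name, image_size, nb_class) and that returning ``channel.COMPLETE``
+# marks the job as finished; the results are written back onto ``state``.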