ift6266 Mercurial repository
comparison: baseline/log_reg/log_reg.py @ 169:d37c944133c3
commit message: directory name change
author:   Dumitru Erhan <dumitru.erhan@gmail.com>
date:     Fri, 26 Feb 2010 14:24:11 -0500
parents:  baseline_algorithms/log_reg/log_reg.py@d1bb6e06497a
children: 5d88ed99c0af
comparing revisions 168:5e0e5f1860ec and 169:d37c944133c3

"""
This tutorial introduces logistic regression using Theano and stochastic
gradient descent.

Logistic regression is a probabilistic, linear classifier. It is parametrized
by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
done by projecting data points onto a set of hyperplanes, the distance to
which is used to determine a class membership probability.

Mathematically, this can be written as:

.. math::
  P(Y=i|x, W,b) &= softmax_i(W x + b) \\
                &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}


The output of the model or prediction is then done by taking the argmax of
the vector whose i'th element is P(Y=i|x).

.. math::

  y_{pred} = argmax_i P(Y=i|x,W,b)
24 | |
25 This tutorial presents a stochastic gradient descent optimization method | |
26 suitable for large datasets, and a conjugate gradient optimization method | |
27 that is suitable for smaller datasets. | |
28 | |
29 | |
30 References: | |
31 | |
32 - textbooks: "Pattern Recognition and Machine Learning" - | |
33 Christopher M. Bishop, section 4.3.2 | |
34 | |
35 """ | |
__docformat__ = 'restructuredtext en'

import numpy, time, cPickle, gzip

import theano
import theano.tensor as T


class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """


    def __init__( self, input, n_in, n_out ):
        """ Initialize the parameters of the logistic regression

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
        self.W = theano.shared( value = numpy.zeros(( n_in, n_out ), dtype = theano.config.floatX ),
                                name = 'W' )
        # initialize the biases b as a vector of n_out 0s
        self.b = theano.shared( value = numpy.zeros(( n_out, ), dtype = theano.config.floatX ),
                                name = 'b' )


        # compute vector of class-membership probabilities in symbolic form
        self.p_y_given_x = T.nnet.softmax( T.dot( input, self.W ) + self.b )

        # compute prediction as class whose probability is maximal in
        # symbolic form
        self.y_pred = T.argmax( self.p_y_given_x, axis = 1 )

        # parameters of the model
        self.params = [ self.W, self.b ]


    def negative_log_likelihood( self, y ):
        """Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        .. math::

            \ell (\theta=\{W,b\}, \mathcal{D}) = - \frac{1}{|\mathcal{D}|}
            \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b))

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label

        Note: we use the mean instead of the sum so that
              the learning rate is less dependent on the batch size
        """
        # y.shape[0] is (symbolically) the number of rows in y, i.e., number of examples (call it n) in the minibatch
        # T.arange(y.shape[0]) is a symbolic vector which will contain [0,1,2,... n-1]
        # T.log(self.p_y_given_x) is a matrix of Log-Probabilities (call it LP) with one row per example and one column per class
        # LP[T.arange(y.shape[0]),y] is a vector v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., LP[n-1,y[n-1]]]
        # and T.mean(LP[T.arange(y.shape[0]),y]) is the mean (across minibatch examples) of the elements in v,
        # i.e., the mean log-likelihood across the minibatch.
        return -T.mean( T.log( self.p_y_given_x )[ T.arange( y.shape[0] ), y ] )


    def errors( self, y ):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch ; zero one
        loss over the size of the minibatch

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label
        """

        # check if y has same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError( 'y should have the same shape as self.y_pred',
                             ( 'y', y.type, 'y_pred', self.y_pred.type ) )
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean( T.neq( self.y_pred, y ) )
        else:
            raise NotImplementedError()

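# Illustrative usage sketch (not part of the original file): it shows how the
# LogisticRegression class above can be wired into a compiled Theano
# prediction function. The input size, class count and dummy batch of zeros
# are arbitrary assumptions made purely for the example.
def example_logistic_regression_usage():
    x = T.matrix('x')
    classifier = LogisticRegression( input = x, n_in = 28 * 28, n_out = 10 )

    # compile a function mapping a minibatch of images to predicted labels
    predict = theano.function( inputs = [ x ], outputs = classifier.y_pred )

    # with W and b initialized to zeros every class is equally likely, so the
    # argmax returns class 0 for this dummy batch of five blank images
    dummy_batch = numpy.zeros( ( 5, 28 * 28 ), dtype = theano.config.floatX )
    return predict( dummy_batch )
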
def shared_dataset( data_xy ):
    """ Function that loads the dataset into shared variables

    The reason we store our dataset in shared variables is to allow
    Theano to copy it into the GPU memory (when code is run on GPU).
    Since copying data into the GPU is slow, copying a minibatch every time
    it is needed (the default behaviour if the data is not in a shared
    variable) would lead to a large decrease in performance.
    """
    data_x, data_y = data_xy
    shared_x = theano.shared( numpy.asarray( data_x, dtype = theano.config.floatX ) )
    shared_y = theano.shared( numpy.asarray( data_y, dtype = theano.config.floatX ) )
    # When storing data on the GPU it has to be stored as floats
    # therefore we will store the labels as ``floatX`` as well
    # (``shared_y`` does exactly that). But during our computations
    # we need them as ints (we use labels as indices, and if they are
    # floats it doesn't make sense) therefore instead of returning
    # ``shared_y`` we will have to cast it to int. This little hack
    # lets us get around this issue.
    return shared_x, T.cast( shared_y, 'int32' )

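# Illustrative sketch (not part of the original file): calling shared_dataset
# on a tiny made-up dataset. The shapes and label values are arbitrary
# assumptions chosen only to show the (data, labels) tuple format and the
# int32 cast applied to the returned labels.
def example_shared_dataset_usage():
    toy_x = numpy.random.rand( 4, 784 )       # four fake 28*28 images
    toy_y = numpy.array( [ 0, 3, 7, 9 ] )     # four fake labels
    shared_x, casted_y = shared_dataset( ( toy_x, toy_y ) )
    # shared_x is a float shared variable; casted_y is a symbolic int32 view
    # of the float shared variable that actually stores the labels
    return shared_x, casted_y
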
def load_data_pkl_gz( dataset ):
    ''' Loads the dataset

    :type dataset: string
    :param dataset: the path to the dataset (here MNIST)
    '''

    #--------------------------------------------------------------------------------------------------------------------
    # Load Data
    #--------------------------------------------------------------------------------------------------------------------


    print '... loading data'

    # Load the dataset
    f = gzip.open(dataset,'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    test_set_x, test_set_y = shared_dataset( test_set )
    valid_set_x, valid_set_y = shared_dataset( valid_set )
    train_set_x, train_set_y = shared_dataset( train_set )

    rval = [ ( train_set_x, train_set_y ), ( valid_set_x, valid_set_y ), ( test_set_x, test_set_y ) ]
    return rval

##def load_data_ft( verbose = False,\
##                  data_path = '/data/lisa/data/nist/by_class/'\
##                  train_data = 'all/all_train_data.ft',\
##                  train_labels = 'all/all_train_labels.ft',\
##                  test_data = 'all/all_test_data.ft',\
##                  test_labels = 'all/all_test_labels.ft'):
##
##    train_data_file = open(data_path + train_data)
##    train_labels_file = open(data_path + train_labels)
##    test_labels_file = open(data_path + test_data)
##    test_data_file = open(data_path + test_labels)
##
##    raw_train_data = ft.read( train_data_file)
##    raw_train_labels = ft.read(train_labels_file)
##    raw_test_data = ft.read( test_labels_file)
##    raw_test_labels = ft.read( test_data_file)
##
##    f.close()
##    g.close()
##    i.close()
##    h.close()
##
##
##    test_set_x, test_set_y = shared_dataset(test_set)
##    valid_set_x, valid_set_y = shared_dataset(valid_set)
##    train_set_x, train_set_y = shared_dataset(train_set)
##
##    rval = [(train_set_x, train_set_y), (valid_set_x,valid_set_y), (test_set_x, test_set_y)]
##    return rval
##    #create a validation set the same size as the test size
##    #use the end of the training array for this purpose
##    #discard the last remaining so we get a %batch_size number
##    test_size = len(raw_test_labels)
##    test_size = int(test_size/batch_size)
##    test_size *= batch_size
##    train_size = len(raw_train_data)
##    train_size = int(train_size/batch_size)
##    train_size *= batch_size
##    validation_size = test_size
##    offset = train_size - test_size
##    if verbose == True:
##        print 'train size = %d' %train_size
##        print 'test size = %d' %test_size
##        print 'valid size = %d' %validation_size
##        print 'offset = %d' %offset
##
##

#--------------------------------------------------------------------------------------------------------------------
# MAIN
#--------------------------------------------------------------------------------------------------------------------

def log_reg( learning_rate = 0.13, nb_max_examples = 1000000, batch_size = 50, \
             dataset_name = 'mnist.pkl.gz', image_size = 28 * 28, nb_class = 10, \
             patience = 5000, patience_increase = 2, improvement_threshold = 0.995 ):

    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type nb_max_examples: int
    :param nb_max_examples: maximum number of training examples to process
                            (the number of epochs is derived from it)

    :type batch_size: int
    :param batch_size: size of the minibatch

    :type dataset_name: string
    :param dataset_name: the path of the MNIST dataset file from
                         http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    :type image_size: int
    :param image_size: size of the input image in pixels (width * height)

    :type nb_class: int
    :param nb_class: number of classes

    :type patience: int
    :param patience: look at this many examples regardless

    :type patience_increase: int
    :param patience_increase: wait this much longer when a new best is found

    :type improvement_threshold: float
    :param improvement_threshold: a relative improvement of this much is
                                  considered significant


    """
    datasets = load_data_pkl_gz( dataset_name )

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x,  test_set_y  = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.value.shape[0] / batch_size
    n_valid_batches = valid_set_x.value.shape[0] / batch_size
    n_test_batches  = test_set_x.value.shape[0]  / batch_size

    #--------------------------------------------------------------------------------------------------------------------
    # Build actual model
    #--------------------------------------------------------------------------------------------------------------------

    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar( )    # index to a [mini]batch
    x = T.matrix('x')       # the data is presented as rasterized images
    y = T.ivector('y')      # the labels are presented as 1D vector of
                            # [int] labels

    # construct the logistic regression class

    classifier = LogisticRegression( input = x, n_in = image_size, n_out = nb_class )

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood( y )

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function( inputs = [ index ],
            outputs = classifier.errors( y ),
            givens = {
                x: test_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
                y: test_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )

    validate_model = theano.function( inputs = [ index ],
            outputs = classifier.errors( y ),
            givens = {
                x: valid_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
                y: valid_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )

    # compute the gradient of cost with respect to theta = ( W, b )
    g_W = T.grad( cost = cost, wrt = classifier.W )
    g_b = T.grad( cost = cost, wrt = classifier.b )

    # specify how to update the parameters of the model as a dictionary
    # (plain SGD step: theta <- theta - learning_rate * gradient)
    updates = { classifier.W: classifier.W - learning_rate * g_W,
                classifier.b: classifier.b - learning_rate * g_b }

    # compiling a Theano function `train_model` that returns the cost, but at
    # the same time updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function( inputs = [ index ],
            outputs = cost,
            updates = updates,
            givens = {
                x: train_set_x[ index * batch_size: ( index + 1 ) * batch_size ],
                y: train_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } )

    #--------------------------------------------------------------------------------------------------------------------
    # Train model
    #--------------------------------------------------------------------------------------------------------------------

    print '... training the model'

    # early-stopping parameters: `patience`, `patience_increase` and
    # `improvement_threshold` are taken from the function arguments above
    validation_frequency = min( n_train_batches, patience * 0.5 )
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = float('inf')
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    n_epochs = nb_max_examples / train_set_x.value.shape[0]
    epoch = 0

    while ( epoch < n_epochs ) and ( not done_looping ):

        epoch = epoch + 1
        for minibatch_index in xrange( n_train_batches ):

            minibatch_avg_cost = train_model( minibatch_index )
            # iteration number
            iter = epoch * n_train_batches + minibatch_index

            if ( iter + 1 ) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [ validate_model( i ) for i in xrange( n_valid_batches ) ]
                this_validation_loss = numpy.mean( validation_losses )

                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                      ( epoch, minibatch_index + 1, n_train_batches, \
                        this_validation_loss * 100. ) )


                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                           improvement_threshold:
                        patience = max( patience, iter * patience_increase )

                    best_validation_loss = this_validation_loss
                    # test it on the test set

                    test_losses = [ test_model(i) for i in xrange( n_test_batches ) ]
                    test_score = numpy.mean( test_losses )

                    print((' epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') % \
                          ( epoch, minibatch_index + 1, n_train_batches, test_score * 100. ))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete with best validation score of %f %%, '
           'with test performance %f %%') %
          ( best_validation_loss * 100., test_score * 100. ))
    print ('The code ran for %f minutes' % ( ( end_time - start_time ) / 60. ))

    # return the quantities that jobman_log_reg below expects; the number of
    # examples seen is approximated from the number of epochs run, and the
    # time is reported in minutes, matching the line printed above
    return ( best_validation_loss * 100., test_score * 100.,
             epoch * n_train_batches * batch_size,
             ( end_time - start_time ) / 60. )

if __name__ == '__main__':
    log_reg()


def jobman_log_reg(state, channel):
    (validation_error, test_error, nb_exemples, time) = log_reg( learning_rate = state.learning_rate, \
                                                                 nb_max_examples = state.nb_max_examples, \
                                                                 batch_size = state.batch_size, \
                                                                 dataset_name = state.dataset_name, \
                                                                 image_size = state.image_size, \
                                                                 nb_class = state.nb_class )

    state.validation_error = validation_error
    state.test_error = test_error
    state.nb_exemples = nb_exemples
    state.time = time
    return channel.COMPLETE

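
#--------------------------------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): driving jobman_log_reg
# without Jobman itself, using stand-in objects for `state` and `channel`.
# The attribute values below simply mirror the defaults of log_reg and are
# assumptions made for the example.
#--------------------------------------------------------------------------------------------------------------------

class ExampleChannel(object):
    COMPLETE = 'COMPLETE'

class ExampleState(object):
    learning_rate = 0.13
    nb_max_examples = 1000000
    batch_size = 50
    dataset_name = 'mnist.pkl.gz'
    image_size = 28 * 28
    nb_class = 10

def example_jobman_log_reg_usage():
    state = ExampleState()
    channel = ExampleChannel()
    # runs a full training job and stores the results on the state object
    return jobman_log_reg( state, channel )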