# HG changeset patch # User Arnaud Bergeron # Date 1267571809 18000 # Node ID 5d88ed99c0af2cde30d490f388358c127f091946 # Parent 9116cfe8e4ab6407f2ec597c12793a7e12c96611 Modify the log_reg.py tutorial code to use the datasets module. diff -r 9116cfe8e4ab -r 5d88ed99c0af baseline/log_reg/log_reg.py --- a/baseline/log_reg/log_reg.py Tue Mar 02 18:03:42 2010 -0500 +++ b/baseline/log_reg/log_reg.py Tue Mar 02 18:16:49 2010 -0500 @@ -35,11 +35,11 @@ """ __docformat__ = 'restructedtext en' -import numpy, time, cPickle, gzip +import numpy, time import theano import theano.tensor as T - +from ift6266 import datasets class LogisticRegression(object): """Multi-class Logistic Regression Class @@ -135,107 +135,12 @@ else: raise NotImplementedError() -def shared_dataset( data_xy ): - """ Function that loads the dataset into shared variables - - The reason we store our dataset in shared variables is to allow - Theano to copy it into the GPU memory (when code is run on GPU). - Since copying data into the GPU is slow, copying a minibatch everytime - is needed (the default behaviour if the data is not in a shared - variable) would lead to a large decrease in performance. - """ - data_x, data_y = data_xy - shared_x = theano.shared( numpy.asarray( data_x, dtype = theano.config.floatX ) ) - shared_y = theano.shared( numpy.asarray( data_y, dtype = theano.config.floatX ) ) - # When storing data on the GPU it has to be stored as floats - # therefore we will store the labels as ``floatX`` as well - # (``shared_y`` does exactly that). But during our computations - # we need them as ints (we use labels as index, and if they are - # floats it doesn't make sense) therefore instead of returning - # ``shared_y`` we will have to cast it to int. This little hack - # lets ous get around this issue - return shared_x, T.cast( shared_y, 'int32' ) - -def load_data_pkl_gz( dataset ): - ''' Loads the dataset - - :type dataset: string - :param dataset: the path to the dataset (here MNIST) - ''' - - #-------------------------------------------------------------------------------------------------------------------- - # Load Data - #-------------------------------------------------------------------------------------------------------------------- - - - print '... loading data' - - # Load the dataset - f = gzip.open(dataset,'rb') - train_set, valid_set, test_set = cPickle.load(f) - f.close() - - test_set_x, test_set_y = shared_dataset( test_set ) - valid_set_x, valid_set_y = shared_dataset( valid_set ) - train_set_x, train_set_y = shared_dataset( train_set ) - - rval = [ ( train_set_x, train_set_y ), ( valid_set_x,valid_set_y ), ( test_set_x, test_set_y ) ] - return rval - -##def load_data_ft( verbose = False,\ -## data_path = '/data/lisa/data/nist/by_class/'\ -## train_data = 'all/all_train_data.ft',\ -## train_labels = 'all/all_train_labels.ft',\ -## test_data = 'all/all_test_data.ft',\ -## test_labels = 'all/all_test_labels.ft'): -## -## train_data_file = open(data_path + train_data) -## train_labels_file = open(data_path + train_labels) -## test_labels_file = open(data_path + test_data) -## test_data_file = open(data_path + test_labels) -## -## raw_train_data = ft.read( train_data_file) -## raw_train_labels = ft.read(train_labels_file) -## raw_test_data = ft.read( test_labels_file) -## raw_test_labels = ft.read( test_data_file) -## -## f.close() -## g.close() -## i.close() -## h.close() -## -## -## test_set_x, test_set_y = shared_dataset(test_set) -## valid_set_x, valid_set_y = shared_dataset(valid_set) -## train_set_x, train_set_y = shared_dataset(train_set) -## -## rval = [(train_set_x, train_set_y), (valid_set_x,valid_set_y), (test_set_x, test_set_y)] -## return rval -## #create a validation set the same size as the test size -## #use the end of the training array for this purpose -## #discard the last remaining so we get a %batch_size number -## test_size=len(raw_test_labels) -## test_size = int(test_size/batch_size) -## test_size*=batch_size -## train_size = len(raw_train_data) -## train_size = int(train_size/batch_size) -## train_size*=batch_size -## validation_size =test_size -## offset = train_size-test_size -## if verbose == True: -## print 'train size = %d' %train_size -## print 'test size = %d' %test_size -## print 'valid size = %d' %validation_size -## print 'offset = %d' %offset -## -## - #-------------------------------------------------------------------------------------------------------------------- # MAIN #-------------------------------------------------------------------------------------------------------------------- def log_reg( learning_rate = 0.13, nb_max_examples =1000000, batch_size = 50, \ - dataset_name = 'mnist.pkl.gz', image_size = 28 * 28, nb_class = 10, \ + dataset=datasets.nist_digits, image_size = 32 * 32, nb_class = 10, \ patience = 5000, patience_increase = 2, improvement_threshold = 0.995): """ @@ -254,9 +159,8 @@ :type batch_size: int :param batch_size: size of the minibatch - :type dataset_name: string - :param dataset: the path of the MNIST dataset file from - http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz + :type dataset: dataset + :param dataset: a dataset instance from ift6266.datasets :type image_size: int :param image_size: size of the input image in pixels (width * height) @@ -275,17 +179,6 @@ """ - datasets = load_data_pkl_gz( dataset_name ) - - train_set_x, train_set_y = datasets[0] - valid_set_x, valid_set_y = datasets[1] - test_set_x , test_set_y = datasets[2] - - # compute number of minibatches for training, validation and testing - n_train_batches = train_set_x.value.shape[0] / batch_size - n_valid_batches = valid_set_x.value.shape[0] / batch_size - n_test_batches = test_set_x.value.shape[0] / batch_size - #-------------------------------------------------------------------------------------------------------------------- # Build actual model #-------------------------------------------------------------------------------------------------------------------- @@ -308,17 +201,11 @@ # compiling a Theano function that computes the mistakes that are made by # the model on a minibatch - test_model = theano.function( inputs = [ index ], - outputs = classifier.errors( y ), - givens = { - x:test_set_x[ index * batch_size: ( index + 1 ) * batch_size ], - y:test_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } ) + test_model = theano.function( inputs = [ x, y ], + outputs = classifier.errors( y )) - validate_model = theano.function( inputs = [ index ], - outputs = classifier.errors( y ), - givens = { - x:valid_set_x[ index * batch_size: ( index + 1 ) * batch_size ], - y:valid_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } ) + validate_model = theano.function( inputs = [ x, y ], + outputs = classifier.errors( y )) # compute the gradient of cost with respect to theta = ( W, b ) g_W = T.grad( cost = cost, wrt = classifier.W ) @@ -331,12 +218,9 @@ # compiling a Theano function `train_model` that returns the cost, but in # the same time updates the parameter of the model based on the rules # defined in `updates` - train_model = theano.function( inputs = [ index ], + train_model = theano.function( inputs = [ x, y ], outputs = cost, - updates = updates, - givens = { - x: train_set_x[ index * batch_size: ( index + 1 ) * batch_size ], - y: train_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } ) + updates = updates) #-------------------------------------------------------------------------------------------------------------------- # Train model @@ -349,38 +233,38 @@ # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant - validation_frequency = min( n_train_batches, patience * 0.5 ) + validation_frequency = patience * 0.5 # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch - best_params = None + best_params = None best_validation_loss = float('inf') - test_score = 0. - start_time = time.clock() + test_score = 0. + start_time = time.clock() done_looping = False - n_epochs = nb_max_examples / train_set_x.value.shape[0] - epoch = 0 + n_iters = nb_max_examples / batch_size + epoch = 0 + iter = 0 - while ( epoch < n_epochs ) and ( not done_looping ): + while ( iter < n_iters ) and ( not done_looping ): epoch = epoch + 1 - for minibatch_index in xrange( n_train_batches ): + for x, y in dataset.train(batch_size): - minibatch_avg_cost = train_model( minibatch_index ) + minibatch_avg_cost = train_model( x, y ) # iteration number - iter = epoch * n_train_batches + minibatch_index + iter += 1 - if ( iter + 1 ) % validation_frequency == 0: + if iter % validation_frequency == 0: # compute zero-one loss on validation set - validation_losses = [ validate_model( i ) for i in xrange( n_valid_batches ) ] + validation_losses = [ validate_model( xv, yv ) for xv, yv in dataset.valid(batch_size) ] this_validation_loss = numpy.mean( validation_losses ) - print('epoch %i, minibatch %i/%i, validation error %f %%' % \ - ( epoch, minibatch_index + 1,n_train_batches, \ - this_validation_loss*100. ) ) + print('epoch %i, iter %i, validation error %f %%' % \ + ( epoch, iter, this_validation_loss*100. ) ) # if we got the best validation score until now @@ -393,12 +277,12 @@ best_validation_loss = this_validation_loss # test it on the test set - test_losses = [test_model(i) for i in xrange(n_test_batches)] + test_losses = [test_model(xt, yt) for xt, yt in dataset.test(batch_size)] test_score = numpy.mean(test_losses) - print((' epoch %i, minibatch %i/%i, test error of best ' + print((' epoch %i, iter %i, test error of best ' 'model %f %%') % \ - (epoch, minibatch_index+1, n_train_batches,test_score*100.)) + (epoch, iter, test_score*100.)) if patience <= iter : done_looping = True