Mercurial > ift6266
changeset 365:22919039f7ab
Fixing a wrong commit and committing more files.
author | humel |
---|---|
date | Thu, 22 Apr 2010 19:57:05 -0400 |
parents | c05680f8c92f (diff) 7bc555cc9aab (current diff) |
children | 64fa85d68923 |
files | scripts/setup_batches.py |
diffstat | 14 files changed, 2168 insertions(+), 68 deletions(-) [+] |
line wrap: on
line diff
--- a/baseline/mlp/mlp_nist.py Mon Apr 19 07:09:44 2010 -0400 +++ b/baseline/mlp/mlp_nist.py Thu Apr 22 19:57:05 2010 -0400 @@ -23,6 +23,7 @@ """ __docformat__ = 'restructedtext en' +import sys import pdb import numpy import pylab @@ -268,6 +269,8 @@ dataset=datasets.nist_all() elif data_set==1: dataset=datasets.nist_P07() + elif data_set==2: + dataset=datasets.PNIST07() @@ -370,8 +373,9 @@ - if verbose == 1: - print 'starting training' + + print 'starting training' + sys.stdout.flush() while(minibatch_index*batch_size<nb_max_exemples): for x, y in dataset.train(batch_size): @@ -389,9 +393,7 @@ learning_rate_list.append(classifier.lr.value) divergence_flag_list.append(divergence_flag) - #save temp results to check during training - numpy.savez('temp_results.npy',config=configuration,total_validation_error_list=total_validation_error_list,\ - learning_rate_list=learning_rate_list, divergence_flag_list=divergence_flag_list) + # compute the validation error this_validation_loss = 0. @@ -404,10 +406,15 @@ this_validation_loss /= temp #save the validation loss total_validation_error_list.append(this_validation_loss) - if verbose == 1: - print(('epoch %i, minibatch %i, learning rate %f current validation error %f ') % - (epoch, minibatch_index+1,classifier.lr.value, - this_validation_loss*100.)) + + print(('epoch %i, minibatch %i, learning rate %f current validation error %f ') % + (epoch, minibatch_index+1,classifier.lr.value, + this_validation_loss*100.)) + sys.stdout.flush() + + #save temp results to check during training + numpy.savez('temp_results.npy',config=configuration,total_validation_error_list=total_validation_error_list,\ + learning_rate_list=learning_rate_list, divergence_flag_list=divergence_flag_list) # if we got the best validation score until now if this_validation_loss < best_validation_loss: @@ -429,11 +436,12 @@ test_score += test_model(xt,yt) temp = temp+1 test_score /= temp - if verbose == 1: - print(('epoch %i, minibatch %i, test error of best ' - 'model %f %%') % - (epoch, minibatch_index+1, - test_score*100.)) + + print(('epoch %i, minibatch %i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, + test_score*100.)) + sys.stdout.flush() # if the validation error is going up, we are overfitting (or oscillating) # check if we are allowed to continue and if we will adjust the learning rate @@ -459,12 +467,13 @@ test_score += test_model(xt,yt) temp=temp+1 test_score /= temp - if verbose == 1: - print ' validation error is going up, possibly stopping soon' - print((' epoch %i, minibatch %i, test error of best ' - 'model %f %%') % - (epoch, minibatch_index+1, - test_score*100.)) + + print ' validation error is going up, possibly stopping soon' + print((' epoch %i, minibatch %i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, + test_score*100.)) + sys.stdout.flush() @@ -489,12 +498,13 @@ # we have finished looping through the training set epoch = epoch+1 end_time = time.clock() - if verbose == 1: - print(('Optimization complete. Best validation score of %f %% ' - 'obtained at iteration %i, with test performance %f %%') % - (best_validation_loss * 100., best_iter, test_score*100.)) - print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) - print minibatch_index + + print(('Optimization complete. Best validation score of %f %% ' + 'obtained at iteration %i, with test performance %f %%') % + (best_validation_loss * 100., best_iter, test_score*100.)) + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + print minibatch_index + sys.stdout.flush() #save the model and the weights numpy.savez('model.npy', config=configuration, W1=classifier.W1.value,W2=classifier.W2.value, b1=classifier.b1.value,b2=classifier.b2.value)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/baseline/mlp/ratio_classes/mlp_nist_ratio.py Thu Apr 22 19:57:05 2010 -0400 @@ -0,0 +1,438 @@ +# -*- coding: utf-8 -*- +""" +This tutorial introduces the multilayer perceptron using Theano. + + A multilayer perceptron is a logistic regressor where +instead of feeding the input to the logistic regression you insert a +intermidiate layer, called the hidden layer, that has a nonlinear +activation function (usually tanh or sigmoid) . One can use many such +hidden layers making the architecture deep. The tutorial will also tackle +the problem of MNIST digit classification. + +.. math:: + + f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))), + +References: + + - textbooks: "Pattern Recognition and Machine Learning" - + Christopher M. Bishop, section 5 + +TODO: recommended preprocessing, lr ranges, regularization ranges (explain + to do lr first, then add regularization) + +""" +__docformat__ = 'restructedtext en' + +import ift6266 +from scripts import setup_batches +import pdb +import numpy + +import theano +import theano.tensor as T +import time +import theano.tensor.nnet +import pylearn +import theano,pylearn.version +from pylearn.io import filetensor as ft + +data_path = '/data/lisa/data/nist/by_class/' + +class MLP(object): + """Multi-Layer Perceptron Class + + A multilayer perceptron is a feedforward artificial neural network model + that has one layer or more of hidden units and nonlinear activations. + Intermidiate layers usually have as activation function thanh or the + sigmoid function while the top layer is a softamx layer. + """ + + + + def __init__(self, input, n_in, n_hidden, n_out,learning_rate): + """Initialize the parameters for the multilayer perceptron + + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + + :param n_hidden: number of hidden units + + :param n_out: number of output units, the dimension of the space in + which the labels lie + + """ + + # initialize the parameters theta = (W1,b1,W2,b2) ; note that this + # example contains only one hidden layer, but one can have as many + # layers as he/she wishes, making the network deeper. The only + # problem making the network deep this way is during learning, + # backpropagation being unable to move the network from the starting + # point towards; this is where pre-training helps, giving a good + # starting point for backpropagation, but more about this in the + # other tutorials + + # `W1` is initialized with `W1_values` which is uniformely sampled + # from -6./sqrt(n_in+n_hidden) and 6./sqrt(n_in+n_hidden) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + W1_values = numpy.asarray( numpy.random.uniform( \ + low = -numpy.sqrt(6./(n_in+n_hidden)), \ + high = numpy.sqrt(6./(n_in+n_hidden)), \ + size = (n_in, n_hidden)), dtype = theano.config.floatX) + # `W2` is initialized with `W2_values` which is uniformely sampled + # from -6./sqrt(n_hidden+n_out) and 6./sqrt(n_hidden+n_out) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + W2_values = numpy.asarray( numpy.random.uniform( + low = -numpy.sqrt(6./(n_hidden+n_out)), \ + high= numpy.sqrt(6./(n_hidden+n_out)),\ + size= (n_hidden, n_out)), dtype = theano.config.floatX) + + self.W1 = theano.shared( value = W1_values ) + self.b1 = theano.shared( value = numpy.zeros((n_hidden,), + dtype= theano.config.floatX)) + self.W2 = theano.shared( value = W2_values ) + self.b2 = theano.shared( value = numpy.zeros((n_out,), + dtype= theano.config.floatX)) + + #include the learning rate in the classifer so + #we can modify it on the fly when we want + lr_value=learning_rate + self.lr=theano.shared(value=lr_value) + # symbolic expression computing the values of the hidden layer + self.hidden = T.tanh(T.dot(input, self.W1)+ self.b1) + + + + # symbolic expression computing the values of the top layer + self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden, self.W2)+self.b2) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred = T.argmax( self.p_y_given_x, axis =1) + self.y_pred_num = T.argmax( self.p_y_given_x[0:9], axis =1) + + + + + # L1 norm ; one regularization option is to enforce L1 norm to + # be small + self.L1 = abs(self.W1).sum() + abs(self.W2).sum() + + # square of L2 norm ; one regularization option is to enforce + # square of L2 norm to be small + self.L2_sqr = (self.W1**2).sum() + (self.W2**2).sum() + + + + def negative_log_likelihood(self, y): + """Return the mean of the negative log-likelihood of the prediction + of this model under a given target distribution. + + .. math:: + + \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = + \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ + \ell (\theta=\{W,b\}, \mathcal{D}) + + + :param y: corresponds to a vector that gives for each example the + :correct label + """ + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + + + + def errors(self, y): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch + """ + + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + + +def mlp_full_nist( verbose = False,\ + adaptive_lr = 1,\ + train_data = 'all/all_train_data.ft',\ + train_labels = 'all/all_train_labels.ft',\ + test_data = 'all/all_test_data.ft',\ + test_labels = 'all/all_test_labels.ft',\ + learning_rate=0.5,\ + L1_reg = 0.00,\ + L2_reg = 0.0001,\ + nb_max_exemples=1000000,\ + batch_size=20,\ + nb_hidden = 500,\ + nb_targets = 62,\ + tau=1e6,\ + main_class="d",\ + start_ratio=1,\ + end_ratio=1): + + + configuration = [learning_rate,nb_max_exemples,nb_hidden,adaptive_lr] + + #save initial learning rate if classical adaptive lr is used + initial_lr=learning_rate + + total_validation_error_list = [] + total_train_error_list = [] + learning_rate_list=[] + best_training_error=float('inf'); + + # set up batches + batches = setup_batches.Batches() + batches.set_batches(main_class, start_ratio,end_ratio,batch_size,verbose) + + train_batches = batches.get_train_batches() + test_batches = batches.get_test_batches() + validation_batches = batches.get_validation_batches() + + ishape = (32,32) # this is the size of NIST images + + # allocate symbolic variables for the data + x = T.fmatrix() # the data is presented as rasterized images + y = T.lvector() # the labels are presented as 1D vector of + # [long int] labels + + if verbose==True: + print 'finished parsing the data' + # construct the logistic regression class + classifier = MLP( input=x.reshape((batch_size,32*32)),\ + n_in=32*32,\ + n_hidden=nb_hidden,\ + n_out=nb_targets, + learning_rate=learning_rate) + + + + + # the cost we minimize during training is the negative log likelihood of + # the model plus the regularization terms (L1 and L2); cost is expressed + # here symbolically + cost = classifier.negative_log_likelihood(y) \ + + L1_reg * classifier.L1 \ + + L2_reg * classifier.L2_sqr + + # compiling a theano function that computes the mistakes that are made by + # the model on a minibatch + test_model = theano.function([x,y], classifier.errors(y)) + + # compute the gradient of cost with respect to theta = (W1, b1, W2, b2) + g_W1 = T.grad(cost, classifier.W1) + g_b1 = T.grad(cost, classifier.b1) + g_W2 = T.grad(cost, classifier.W2) + g_b2 = T.grad(cost, classifier.b2) + + # specify how to update the parameters of the model as a dictionary + updates = \ + { classifier.W1: classifier.W1 - classifier.lr*g_W1 \ + , classifier.b1: classifier.b1 - classifier.lr*g_b1 \ + , classifier.W2: classifier.W2 - classifier.lr*g_W2 \ + , classifier.b2: classifier.b2 - classifier.lr*g_b2 } + + # compiling a theano function `train_model` that returns the cost, but in + # the same time updates the parameter of the model based on the rules + # defined in `updates` + train_model = theano.function([x, y], cost, updates = updates ) + n_minibatches = len(train_batches) + + + + + + + #conditions for stopping the adaptation: + #1) we have reached nb_max_exemples (this is rounded up to be a multiple of the train size) + #2) validation error is going up twice in a row(probable overfitting) + + # This means we no longer stop on slow convergence as low learning rates stopped + # too fast. + + # no longer relevant + patience =nb_max_exemples/batch_size + patience_increase = 2 # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = n_minibatches/4 + + + + + best_params = None + best_validation_loss = float('inf') + best_iter = 0 + test_score = 0. + start_time = time.clock() + n_iter = nb_max_exemples/batch_size # nb of max times we are allowed to run through all exemples + n_iter = n_iter/n_minibatches + 1 #round up + n_iter=max(1,n_iter) # run at least once on short debug call + time_n=0 #in unit of exemples + + + + if verbose == True: + print 'looping at most %d times through the data set' %n_iter + for iter in xrange(n_iter* n_minibatches): + + # get epoch and minibatch index + epoch = iter / n_minibatches + minibatch_index = iter % n_minibatches + + + if adaptive_lr==2: + classifier.lr.value = tau*initial_lr/(tau+time_n) + + # get the minibatches corresponding to `iter` modulo + # `len(train_batches)` + x,y = train_batches[ minibatch_index ] + # convert to float + x_float = x/255.0 + cost_ij = train_model(x_float,y) + + if (iter+1) % validation_frequency == 0: + # compute zero-one loss on validation set + + this_validation_loss = 0. + for x,y in validation_batches: + # sum up the errors for each minibatch + x_float = x/255.0 + this_validation_loss += test_model(x_float,y) + # get the average by dividing with the number of minibatches + this_validation_loss /= len(validation_batches) + #save the validation loss + total_validation_error_list.append(this_validation_loss) + + #get the training error rate + this_train_loss=0 + for x,y in train_batches: + # sum up the errors for each minibatch + x_float = x/255.0 + this_train_loss += test_model(x_float,y) + # get the average by dividing with the number of minibatches + this_train_loss /= len(train_batches) + #save the validation loss + total_train_error_list.append(this_train_loss) + if(this_train_loss<best_training_error): + best_training_error=this_train_loss + + if verbose == True: + print('epoch %i, minibatch %i/%i, validation error %f, training error %f %%' % \ + (epoch, minibatch_index+1, n_minibatches, \ + this_validation_loss*100.,this_train_loss*100)) + print 'learning rate = %f' %classifier.lr.value + print 'time = %i' %time_n + + + #save the learning rate + learning_rate_list.append(classifier.lr.value) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = iter + # reset patience if we are going down again + # so we continue exploring + patience=nb_max_exemples/batch_size + # test it on the test set + test_score = 0. + for x,y in test_batches: + x_float=x/255.0 + test_score += test_model(x_float,y) + test_score /= len(test_batches) + if verbose == True: + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, n_minibatches, + test_score*100.)) + + # if the validation error is going up, we are overfitting (or oscillating) + # stop converging but run at least to next validation + # to check overfitting or ocsillation + # the saved weights of the model will be a bit off in that case + elif this_validation_loss >= best_validation_loss: + #calculate the test error at this point and exit + # test it on the test set + # however, if adaptive_lr is true, try reducing the lr to + # get us out of an oscilliation + if adaptive_lr==1: + classifier.lr.value=classifier.lr.value/2.0 + + test_score = 0. + #cap the patience so we are allowed one more validation error + #calculation before aborting + patience = iter+validation_frequency+1 + for x,y in test_batches: + x_float=x/255.0 + test_score += test_model(x_float,y) + test_score /= len(test_batches) + if verbose == True: + print ' validation error is going up, possibly stopping soon' + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, n_minibatches, + test_score*100.)) + + + + + if iter>patience: + print 'we have diverged' + break + + + time_n= time_n + batch_size + end_time = time.clock() + if verbose == True: + print(('Optimization complete. Best validation score of %f %% ' + 'obtained at iteration %i, with test performance %f %%') % + (best_validation_loss * 100., best_iter, test_score*100.)) + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + print iter + + #save the model and the weights + numpy.savez('model.npy', config=configuration, W1=classifier.W1.value,W2=classifier.W2.value, b1=classifier.b1.value,b2=classifier.b2.value) + numpy.savez('results.npy',config=configuration,total_train_error_list=total_train_error_list,total_validation_error_list=total_validation_error_list,\ + learning_rate_list=learning_rate_list) + + return (best_training_error*100.0,best_validation_loss * 100.,test_score*100.,best_iter*batch_size,(end_time-start_time)/60) + + +if __name__ == '__main__': + mlp_full_nist(True) + +def jobman_mlp_full_nist(state,channel): + (train_error,validation_error,test_error,nb_exemples,time)=mlp_full_nist(learning_rate=state.learning_rate,\ + nb_max_exemples=state.nb_max_exemples,\ + nb_hidden=state.nb_hidden,\ + adaptive_lr=state.adaptive_lr,\ + tau=state.tau,\ + main_class=state.main_class,\ + start_ratio=state.start_ratio,\ + end_ratio=state.end_ratio) + state.train_error=train_error + state.validation_error=validation_error + state.test_error=test_error + state.nb_exemples=nb_exemples + state.time=time + return channel.COMPLETE + + \ No newline at end of file
--- a/datasets/defs.py Mon Apr 19 07:09:44 2010 -0400 +++ b/datasets/defs.py Thu Apr 22 19:57:05 2010 -0400 @@ -1,5 +1,5 @@ __all__ = ['nist_digits', 'nist_lower', 'nist_upper', 'nist_all', 'ocr', - 'nist_P07', 'mnist'] + 'nist_P07', 'PNIST07', 'mnist'] from ftfile import FTDataSet from gzpklfile import GzpklDataSet @@ -52,6 +52,15 @@ valid_data = [os.path.join(DATA_PATH,'data/P07_valid_data.ft')], valid_lbl = [os.path.join(DATA_PATH,'data/P07_valid_labels.ft')], indtype=theano.config.floatX, inscale=255., maxsize=maxsize) + +#Added PNIST07 +PNIST07 = lambda maxsize=None, min_file=0, max_file=100: FTDataSet(train_data = [os.path.join(DATA_PATH,'data/PNIST07_train'+str(i)+'_data.ft') for i in range(min_file, max_file)], + train_lbl = [os.path.join(DATA_PATH,'data/PNIST07_train'+str(i)+'_labels.ft') for i in range(min_file, max_file)], + test_data = [os.path.join(DATA_PATH,'data/PNIST07_test_data.ft')], + test_lbl = [os.path.join(DATA_PATH,'data/PNIST07_test_labels.ft')], + valid_data = [os.path.join(DATA_PATH,'data/PNIST07_valid_data.ft')], + valid_lbl = [os.path.join(DATA_PATH,'data/PNIST07_valid_labels.ft')], + indtype=theano.config.floatX, inscale=255., maxsize=maxsize) mnist = lambda maxsize=None: GzpklDataSet(os.path.join(DATA_PATH,'mnist.pkl.gz'), maxsize=maxsize)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/convolutional_dae/salah_exp/config.py Thu Apr 22 19:57:05 2010 -0400 @@ -0,0 +1,177 @@ +''' +These are parameters used by nist_sda.py. They'll end up as globals in there. + +Rename this file to config.py and configure as needed. +DON'T add the renamed file to the repository, as others might use it +without realizing it, with dire consequences. +''' + +# Set this to True when you want to run cluster tests, ie. you want +# to run on the cluster, many jobs, but want to reduce the training +# set size and the number of epochs, so you know everything runs +# fine on the cluster. +# Set this PRIOR to inserting your test jobs in the DB. +TEST_CONFIG = False + +NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all' +NIST_ALL_TRAIN_SIZE = 649081 +# valid et test =82587 82587 + +# change "sandbox" when you're ready +JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_db/rifaisal_csda' +EXPERIMENT_PATH = "ift6266.deep.convolutional_dae.salah_exp.nist_csda.jobman_entrypoint" + +##Pour lancer des travaux sur le cluster: (il faut etre ou se trouve les fichiers) +##python nist_sda.py jobman_insert +##dbidispatch --condor --repeat_jobs=2 jobman sql 'postgres://ift6266h10@gershwin/ift6266h10_db/pannetis_finetuningSDA0' . #C'est le path dans config.py + +# reduce training set to that many examples +REDUCE_TRAIN_TO = None +# that's a max, it usually doesn't get to that point +MAX_FINETUNING_EPOCHS = 1000 +# number of minibatches before taking means for valid error etc. +REDUCE_EVERY = 100 +#Set the finetune dataset +FINETUNE_SET=1 +#Set the pretrain dataset used. 0: NIST, 1:P07 +PRETRAIN_CHOICE=1 + + +if TEST_CONFIG: + REDUCE_TRAIN_TO = 1000 + MAX_FINETUNING_EPOCHS = 2 + REDUCE_EVERY = 10 + + +# This is to configure insertion of jobs on the cluster. +# Possible values the hyperparameters can take. These are then +# combined with produit_cartesien_jobs so we get a list of all +# possible combinations, each one resulting in a job inserted +# in the jobman DB. + + +JOB_VALS = {'pretraining_lr': [0.01],#, 0.001],#, 0.0001], + 'pretraining_epochs_per_layer': [10], + 'kernels' : [[[52,5,5], [32,3,3]], [[52,7,7], [52,3,3]]], + 'mlp_size' : [[1000],[500]], + 'imgshp' : [[32,32]], + 'max_pool_layers' : [[[2,2],[2,2]]], + 'corruption_levels': [[0.2,0.1]], + 'minibatch_size': [100], + 'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS], + 'max_finetuning_epochs_P07':[1000], + 'finetuning_lr':[0.1,0.01], #0.001 was very bad, so we leave it out + 'num_hidden_layers':[2], + 'finetune_set':[1], + 'pretrain_choice':[1] + } + +DEFAULT_HP_NIST = {'pretraining_lr': 0.01, + 'pretraining_epochs_per_layer': 1, + 'kernels' : [[4,5,5], [2,3,3]], + 'mlp_size' : [10], + 'imgshp' : [32,32], + 'max_pool_layers' : [[2,2],[2,2]], + 'corruption_levels': [0.1,0.2], + 'minibatch_size': 20, + 'max_finetuning_epochs':MAX_FINETUNING_EPOCHS, + 'max_finetuning_epochs_P07':1000, + 'finetuning_lr':0.1, #0.001 was very bad, so we leave it out + 'num_hidden_layers':2, + 'finetune_set':1, + 'pretrain_choice':1, + #'reduce_train_to':1000, + } + + + +##[pannetis@ceylon test]$ python nist_sda.py test_jobman_entrypoint +##WARNING: untracked file /u/pannetis/IFT6266/ift6266/deep/stacked_dae/v_sylvain/TMP_DBI/configobj.py +##WARNING: untracked file /u/pannetis/IFT6266/ift6266/deep/stacked_dae/v_sylvain/TMP_DBI/utils.py +##WARNING: untracked file /u/pannetis/IFT6266/ift6266/deep/stacked_dae/v_sylvain/config.py +##WARNING: untracked file /u/pannetis/IFT6266/ift6266/deep/stacked_dae/v_sylvain/config2.py +##Creating optimizer with state, DD{'reduce_train_to': 11000, 'pretraining_epochs_per_layer': 2, 'hidden_layers_sizes': 300, 'num_hidden_layers': 2, 'corruption_levels': 0.20000000000000001, 'finetuning_lr': 0.10000000000000001, 'pretrain_choice': 0, 'max_finetuning_epochs': 2, 'version_pylearn': '08b37147dec1', 'finetune_set': -1, 'pretraining_lr': 0.10000000000000001, 'version_ift6266': 'a6b6b1140de9', 'version_theano': 'fb6c3a06cb65', 'minibatch_size': 20} +##SdaSgdOptimizer, max_minibatches = 11000 +##C##n_outs 62 +##pretrain_lr 0.1 +##finetune_lr 0.1 +##---- +## +##pretraining with NIST +## +##STARTING PRETRAINING, time = 2010-03-29 15:07:43.945981 +##Pre-training layer 0, epoch 0, cost 113.562562494 +##Pre-training layer 0, epoch 1, cost 113.410032944 +##Pre-training layer 1, epoch 0, cost 98.4539954687 +##Pre-training layer 1, epoch 1, cost 97.8658966686 +##Pretraining took 9.011333 minutes +## +##SERIE OF 3 DIFFERENT FINETUNINGS +## +## +##finetune with NIST +## +## +##STARTING FINETUNING, time = 2010-03-29 15:16:46.512235 +##epoch 1, minibatch 4999, validation error on P07 : 29.511250 % +## epoch 1, minibatch 4999, test error on dataset NIST (train data) of best model 40.408509 % +## epoch 1, minibatch 4999, test error on dataset P07 of best model 96.700000 % +##epoch 1, minibatch 9999, validation error on P07 : 25.560000 % +## epoch 1, minibatch 9999, test error on dataset NIST (train data) of best model 34.778969 % +## epoch 1, minibatch 9999, test error on dataset P07 of best model 97.037500 % +## +##Optimization complete with best validation score of 25.560000 %,with test performance 34.778969 % on dataset NIST +##The test score on the P07 dataset is 97.037500 +##The finetuning ran for 3.281833 minutes +## +## +##finetune with P07 +## +## +##STARTING FINETUNING, time = 2010-03-29 15:20:06.346009 +##epoch 1, minibatch 4999, validation error on NIST : 65.226250 % +## epoch 1, minibatch 4999, test error on dataset P07 (train data) of best model 84.465000 % +## epoch 1, minibatch 4999, test error on dataset NIST of best model 65.965237 % +##epoch 1, minibatch 9999, validation error on NIST : 58.745000 % +## epoch 1, minibatch 9999, test error on dataset P07 (train data) of best model 80.405000 % +## epoch 1, minibatch 9999, test error on dataset NIST of best model 61.341923 % +## +##Optimization complete with best validation score of 58.745000 %,with test performance 80.405000 % on dataset P07 +##The test score on the NIST dataset is 61.341923 +##The finetuning ran for 3.299500 minutes +## +## +##finetune with NIST (done earlier) followed by P07 (written here) +## +## +##STARTING FINETUNING, time = 2010-03-29 15:23:27.947374 +##epoch 1, minibatch 4999, validation error on NIST : 83.975000 % +## epoch 1, minibatch 4999, test error on dataset P07 (train data) of best model 83.872500 % +## epoch 1, minibatch 4999, test error on dataset NIST of best model 43.170010 % +##epoch 1, minibatch 9999, validation error on NIST : 79.775000 % +## epoch 1, minibatch 9999, test error on dataset P07 (train data) of best model 80.971250 % +## epoch 1, minibatch 9999, test error on dataset NIST of best model 49.017468 % +## +##Optimization complete with best validation score of 79.775000 %,with test performance 80.971250 % on dataset P07 +##The test score on the NIST dataset is 49.017468 +##The finetuning ran for 2.851500 minutes +## +## +##finetune with NIST only on the logistic regression on top. +## All hidden units output are input of the logistic regression +## +## +##STARTING FINETUNING, time = 2010-03-29 15:26:21.430557 +##epoch 1, minibatch 4999, validation error on P07 : 95.223750 % +## epoch 1, minibatch 4999, test error on dataset NIST (train data) of best model 93.268765 % +## epoch 1, minibatch 4999, test error on dataset P07 of best model 96.535000 % +##epoch 1, minibatch 9999, validation error on P07 : 95.223750 % +## +##Optimization complete with best validation score of 95.223750 %,with test performance 93.268765 % on dataset NIST +##The test score on the P07 dataset is 96.535000 +##The finetuning ran for 2.013167 minutes +##Closing remaining open files: /u/pannetis/IFT6266/test/series.h5... done +##[pannetis@ceylon test]$ + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/convolutional_dae/salah_exp/nist_csda.py Thu Apr 22 19:57:05 2010 -0400 @@ -0,0 +1,261 @@ +#!/usr/bin/python +# coding: utf-8 + +import ift6266 +import pylearn + +import numpy +import theano +import time + +import pylearn.version +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +import copy +import sys +import os +import os.path + +from jobman import DD +import jobman, jobman.sql +from pylearn.io import filetensor + +from utils import produit_cartesien_jobs +from copy import copy + +from sgd_optimization_new import CSdASgdOptimizer + +#from ift6266.utils.scalar_series import * +from ift6266.utils.seriestables import * +import tables + +from ift6266 import datasets +from config import * + +''' +Function called by jobman upon launching each job +Its path is the one given when inserting jobs: see EXPERIMENT_PATH +''' +def jobman_entrypoint(state, channel): + # record mercurial versions of each package + pylearn.version.record_versions(state,[theano,ift6266,pylearn]) + # TODO: remove this, bad for number of simultaneous requests on DB + channel.save() + + # For test runs, we don't want to use the whole dataset so + # reduce it to fewer elements if asked to. + rtt = None + #REDUCE_TRAIN_TO = 40000 + if state.has_key('reduce_train_to'): + rtt = state['reduce_train_to'] + elif REDUCE_TRAIN_TO: + rtt = REDUCE_TRAIN_TO + + if state.has_key('decrease_lr'): + decrease_lr = state['decrease_lr'] + else : + decrease_lr = 0 + + n_ins = 32*32 + n_outs = 62 # 10 digits, 26*2 (lower, capitals) + + examples_per_epoch = 100000#NIST_ALL_TRAIN_SIZE + + #To be sure variables will not be only in the if statement + PATH = '' + nom_reptrain = '' + nom_serie = "" + if state['pretrain_choice'] == 0: + nom_serie="series_NIST.h5" + elif state['pretrain_choice'] == 1: + nom_serie="series_P07.h5" + + series = create_series(state.num_hidden_layers,nom_serie) + + + print "Creating optimizer with state, ", state + + optimizer = CSdASgdOptimizer(dataset=datasets.nist_all(), + hyperparameters=state, \ + n_ins=n_ins, n_outs=n_outs,\ + examples_per_epoch=examples_per_epoch, \ + series=series, + max_minibatches=rtt) + + parameters=[] + #Number of files of P07 used for pretraining + nb_file=0 + if state['pretrain_choice'] == 0: + print('\n\tpretraining with NIST\n') + optimizer.pretrain(datasets.nist_all()) + elif state['pretrain_choice'] == 1: + #To know how many file will be used during pretraining + nb_file = int(state['pretraining_epochs_per_layer']) + state['pretraining_epochs_per_layer'] = 1 #Only 1 time over the dataset + if nb_file >=100: + sys.exit("The code does not support this much pretraining epoch (99 max with P07).\n"+ + "You have to correct the code (and be patient, P07 is huge !!)\n"+ + "or reduce the number of pretraining epoch to run the code (better idea).\n") + print('\n\tpretraining with P07') + optimizer.pretrain(datasets.nist_P07(min_file=0,max_file=nb_file)) + channel.save() + + #Set some of the parameters used for the finetuning + if state.has_key('finetune_set'): + finetune_choice=state['finetune_set'] + else: + finetune_choice=FINETUNE_SET + + if state.has_key('max_finetuning_epochs'): + max_finetune_epoch_NIST=state['max_finetuning_epochs'] + else: + max_finetune_epoch_NIST=MAX_FINETUNING_EPOCHS + + if state.has_key('max_finetuning_epochs_P07'): + max_finetune_epoch_P07=state['max_finetuning_epochs_P07'] + else: + max_finetune_epoch_P07=max_finetune_epoch_NIST + + #Decide how the finetune is done + + if finetune_choice == 0: + print('\n\n\tfinetune with NIST\n\n') + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1,decrease=decrease_lr) + channel.save() + if finetune_choice == 1: + print('\n\n\tfinetune with P07\n\n') + optimizer.finetune(datasets.nist_P07(),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0,decrease=decrease_lr) + channel.save() + if finetune_choice == 2: + print('\n\n\tfinetune with P07 followed by NIST\n\n') + optimizer.finetune(datasets.nist_P07(),datasets.nist_all(),max_finetune_epoch_P07,ind_test=20,decrease=decrease_lr) + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=21,decrease=decrease_lr) + channel.save() + if finetune_choice == 3: + print('\n\n\tfinetune with NIST only on the logistic regression on top (but validation on P07).\n\ + All hidden units output are input of the logistic regression\n\n') + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1,special=1,decrease=decrease_lr) + + + if finetune_choice==-1: + print('\nSERIE OF 4 DIFFERENT FINETUNINGS') + print('\n\n\tfinetune with NIST\n\n') + sys.stdout.flush() + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1,decrease=decrease_lr) + channel.save() + print('\n\n\tfinetune with P07\n\n') + sys.stdout.flush() + optimizer.reload_parameters('params_pretrain.txt') + optimizer.finetune(datasets.nist_P07(),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0,decrease=decrease_lr) + channel.save() + print('\n\n\tfinetune with P07 (done earlier) followed by NIST (written here)\n\n') + sys.stdout.flush() + optimizer.reload_parameters('params_finetune_P07.txt') + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=21,decrease=decrease_lr) + channel.save() + print('\n\n\tfinetune with NIST only on the logistic regression on top.\n\ + All hidden units output are input of the logistic regression\n\n') + sys.stdout.flush() + optimizer.reload_parameters('params_pretrain.txt') + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1,special=1,decrease=decrease_lr) + channel.save() + + channel.save() + + return channel.COMPLETE + +# These Series objects are used to save various statistics +# during the training. +def create_series(num_hidden_layers, nom_serie): + + # Replace series we don't want to save with DummySeries, e.g. + # series['training_error'] = DummySeries() + + series = {} + + basedir = os.getcwd() + + h5f = tables.openFile(os.path.join(basedir, nom_serie), "w") + + #REDUCE_EVERY=10 + # reconstruction + reconstruction_base = \ + ErrorSeries(error_name="reconstruction_error", + table_name="reconstruction_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Reconstruction error (mean over "+str(REDUCE_EVERY)+" minibatches)") + series['reconstruction_error'] = \ + AccumulatorSeriesWrapper(base_series=reconstruction_base, + reduce_every=REDUCE_EVERY) + + # train + training_base = \ + ErrorSeries(error_name="training_error", + table_name="training_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Training error (mean over "+str(REDUCE_EVERY)+" minibatches)") + series['training_error'] = \ + AccumulatorSeriesWrapper(base_series=training_base, + reduce_every=REDUCE_EVERY) + + # valid and test are not accumulated/mean, saved directly + series['validation_error'] = \ + ErrorSeries(error_name="validation_error", + table_name="validation_error", + hdf5_file=h5f, + index_names=('epoch','minibatch')) + + series['test_error'] = \ + ErrorSeries(error_name="test_error", + table_name="test_error", + hdf5_file=h5f, + index_names=('epoch','minibatch')) + + param_names = [] + for i in range(num_hidden_layers): + param_names += ['layer%d_W'%i, 'layer%d_b'%i, 'layer%d_bprime'%i] + param_names += ['logreg_layer_W', 'logreg_layer_b'] + + # comment out series we don't want to save + series['params'] = SharedParamsStatisticsWrapper( + new_group_name="params", + base_group="/", + arrays_names=param_names, + hdf5_file=h5f, + index_names=('epoch',)) + + return series + +# Perform insertion into the Postgre DB based on combination +# of hyperparameter values above +# (see comment for produit_cartesien_jobs() to know how it works) +def jobman_insert_nist(): + jobs = produit_cartesien_jobs(JOB_VALS) + + db = jobman.sql.db(JOBDB) + for job in jobs: + job.update({jobman.sql.EXPERIMENT: EXPERIMENT_PATH}) + jobman.sql.insert_dict(job, db) + + print "inserted" + +if __name__ == '__main__': + + args = sys.argv[1:] + + #if len(args) > 0 and args[0] == 'load_nist': + # test_load_nist() + + if len(args) > 0 and args[0] == 'jobman_insert': + jobman_insert_nist() + + elif len(args) > 0 and args[0] == 'test_jobman_entrypoint': + chanmock = DD({'COMPLETE':0,'save':(lambda:None)}) + jobman_entrypoint(DD(DEFAULT_HP_NIST), chanmock) + + else: + print "Bad arguments" +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/convolutional_dae/salah_exp/sgd_optimization_new.py Thu Apr 22 19:57:05 2010 -0400 @@ -0,0 +1,394 @@ +#!/usr/bin/python +# coding: utf-8 + +import numpy +import theano +import time +import datetime +import theano.tensor as T +import sys +import pickle + +from jobman import DD +import jobman, jobman.sql +from copy import copy + +from stacked_convolutional_dae_uit import CSdA + +from ift6266.utils.seriestables import * + +buffersize=1000 + +default_series = { \ + 'reconstruction_error' : DummySeries(), + 'training_error' : DummySeries(), + 'validation_error' : DummySeries(), + 'test_error' : DummySeries(), + 'params' : DummySeries() + } + +def itermax(iter, max): + for i,it in enumerate(iter): + if i >= max: + break + yield it +def get_conv_shape(kernels,imgshp,batch_size,max_pool_layers): + # Returns the dimension at the output of the convoluational net + # and a list of Image and kernel shape for every + # Convolutional layer + conv_layers=[] + init_layer = [ [ kernels[0][0],1,kernels[0][1],kernels[0][2] ],\ + [ batch_size , 1, imgshp[0], imgshp[1] ], + max_pool_layers[0] ] + conv_layers.append(init_layer) + + conv_n_out = int((32-kernels[0][2]+1)/max_pool_layers[0][0]) + + for i in range(1,len(kernels)): + layer = [ [ kernels[i][0],kernels[i-1][0],kernels[i][1],kernels[i][2] ],\ + [ batch_size, kernels[i-1][0],conv_n_out,conv_n_out ], + max_pool_layers[i] ] + conv_layers.append(layer) + conv_n_out = int( (conv_n_out - kernels[i][2]+1)/max_pool_layers[i][0]) + conv_n_out=kernels[-1][0]*conv_n_out**2 + return conv_n_out,conv_layers + + + + + +class CSdASgdOptimizer: + def __init__(self, dataset, hyperparameters, n_ins, n_outs, + examples_per_epoch, series=default_series, max_minibatches=None): + self.dataset = dataset + self.hp = hyperparameters + self.n_ins = n_ins + self.n_outs = n_outs + self.parameters_pre=[] + + self.max_minibatches = max_minibatches + print "CSdASgdOptimizer, max_minibatches =", max_minibatches + + self.ex_per_epoch = examples_per_epoch + self.mb_per_epoch = examples_per_epoch / self.hp.minibatch_size + + self.series = series + + self.rng = numpy.random.RandomState(1234) + self.init_classifier() + + sys.stdout.flush() + + def init_classifier(self): + print "Constructing classifier" + + n_ins,convlayers = get_conv_shape(self.hp.kernels,self.hp.imgshp,self.hp.minibatch_size,self.hp.max_pool_layers) + + self.classifier = CSdA(n_ins_mlp = n_ins, + batch_size = self.hp.minibatch_size, + conv_hidden_layers_sizes = convlayers, + mlp_hidden_layers_sizes = self.hp.mlp_size, + corruption_levels = self.hp.corruption_levels, + rng = self.rng, + n_out = self.n_outs, + pretrain_lr = self.hp.pretraining_lr, + finetune_lr = self.hp.finetuning_lr) + + + + #theano.printing.pydotprint(self.classifier.pretrain_functions[0], "function.graph") + + sys.stdout.flush() + + def train(self): + self.pretrain(self.dataset) + self.finetune(self.dataset) + + def pretrain(self,dataset): + print "STARTING PRETRAINING, time = ", datetime.datetime.now() + sys.stdout.flush() + + un_fichier=int(819200.0/self.hp.minibatch_size) #Number of batches in a P07 file + + start_time = time.clock() + ## Pre-train layer-wise + for i in xrange(self.classifier.n_layers): + # go through pretraining epochs + for epoch in xrange(self.hp.pretraining_epochs_per_layer): + # go through the training set + batch_index=0 + count=0 + num_files=0 + for x,y in dataset.train(self.hp.minibatch_size): + if x.shape[0] != self.hp.minibatch_size: + continue + c = self.classifier.pretrain_functions[i](x) + count +=1 + + self.series["reconstruction_error"].append((epoch, batch_index), c) + batch_index+=1 + + #if batch_index % 100 == 0: + # print "100 batches" + + # useful when doing tests + if self.max_minibatches and batch_index >= self.max_minibatches: + break + + #When we pass through the data only once (the case with P07) + #There is approximately 800*1024=819200 examples per file (1k per example and files are 800M) + if self.hp.pretraining_epochs_per_layer == 1 and count%un_fichier == 0: + print 'Pre-training layer %i, epoch %d, cost '%(i,num_files),c + num_files+=1 + sys.stdout.flush() + self.series['params'].append((num_files,), self.classifier.all_params) + + #When NIST is used + if self.hp.pretraining_epochs_per_layer > 1: + print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c + sys.stdout.flush() + + self.series['params'].append((epoch,), self.classifier.all_params) + end_time = time.clock() + + print ('Pretraining took %f minutes' %((end_time-start_time)/60.)) + self.hp.update({'pretraining_time': end_time-start_time}) + + sys.stdout.flush() + + #To be able to load them later for tests on finetune + self.parameters_pre=[copy(x.value) for x in self.classifier.params] + f = open('params_pretrain.txt', 'w') + pickle.dump(self.parameters_pre,f) + f.close() + def finetune(self,dataset,dataset_test,num_finetune,ind_test,special=0,decrease=0): + + if special != 0 and special != 1: + sys.exit('Bad value for variable special. Must be in {0,1}') + print "STARTING FINETUNING, time = ", datetime.datetime.now() + + minibatch_size = self.hp.minibatch_size + if ind_test == 0 or ind_test == 20: + nom_test = "NIST" + nom_train="P07" + else: + nom_test = "P07" + nom_train = "NIST" + + + # create a function to compute the mistakes that are made by the model + # on the validation set, or testing set + test_model = \ + theano.function( + [self.classifier.x,self.classifier.y], self.classifier.errors) + # givens = { + # self.classifier.x: ensemble_x, + # self.classifier.y: ensemble_y]}) + + validate_model = \ + theano.function( + [self.classifier.x,self.classifier.y], self.classifier.errors) + # givens = { + # self.classifier.x: , + # self.classifier.y: ]}) + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2. # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(self.mb_per_epoch, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + if self.max_minibatches and validation_frequency > self.max_minibatches: + validation_frequency = self.max_minibatches / 2 + best_params = None + best_validation_loss = float('inf') + test_score = 0. + start_time = time.clock() + + done_looping = False + epoch = 0 + + total_mb_index = 0 + minibatch_index = 0 + parameters_finetune=[] + learning_rate = self.hp.finetuning_lr + + + while (epoch < num_finetune) and (not done_looping): + epoch = epoch + 1 + + for x,y in dataset.train(minibatch_size,bufsize=buffersize): + + minibatch_index += 1 + + if x.shape[0] != self.hp.minibatch_size: + print 'bim' + continue + + cost_ij = self.classifier.finetune(x,y)#,learning_rate) + total_mb_index += 1 + + self.series["training_error"].append((epoch, minibatch_index), cost_ij) + + if (total_mb_index+1) % validation_frequency == 0: + #minibatch_index += 1 + #The validation set is always NIST (we want the model to be good on NIST) + + iter=dataset_test.valid(minibatch_size,bufsize=buffersize) + + + if self.max_minibatches: + iter = itermax(iter, self.max_minibatches) + + validation_losses = [] + + for x,y in iter: + if x.shape[0] != self.hp.minibatch_size: + print 'bim' + continue + validation_losses.append(validate_model(x,y)) + + this_validation_loss = numpy.mean(validation_losses) + + self.series["validation_error"].\ + append((epoch, minibatch_index), this_validation_loss*100.) + + print('epoch %i, minibatch %i, validation error on NIST : %f %%' % \ + (epoch, minibatch_index+1, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, total_mb_index * patience_increase) + + # save best validation score, iteration number and parameters + best_validation_loss = this_validation_loss + best_iter = total_mb_index + parameters_finetune=[copy(x.value) for x in self.classifier.params] + + # test it on the test set + iter = dataset.test(minibatch_size,bufsize=buffersize) + if self.max_minibatches: + iter = itermax(iter, self.max_minibatches) + test_losses = [] + test_losses2 = [] + for x,y in iter: + if x.shape[0] != self.hp.minibatch_size: + print 'bim' + continue + test_losses.append(test_model(x,y)) + + test_score = numpy.mean(test_losses) + + #test it on the second test set + iter2 = dataset_test.test(minibatch_size,bufsize=buffersize) + if self.max_minibatches: + iter2 = itermax(iter2, self.max_minibatches) + for x,y in iter2: + if x.shape[0] != self.hp.minibatch_size: + continue + test_losses2.append(test_model(x,y)) + + test_score2 = numpy.mean(test_losses2) + + self.series["test_error"].\ + append((epoch, minibatch_index), test_score*100.) + + print((' epoch %i, minibatch %i, test error on dataset %s (train data) of best ' + 'model %f %%') % + (epoch, minibatch_index+1,nom_train, + test_score*100.)) + + print((' epoch %i, minibatch %i, test error on dataset %s of best ' + 'model %f %%') % + (epoch, minibatch_index+1,nom_test, + test_score2*100.)) + + if patience <= total_mb_index: + done_looping = True + break #to exit the FOR loop + + sys.stdout.flush() + + # useful when doing tests + if self.max_minibatches and minibatch_index >= self.max_minibatches: + break + + if decrease == 1: + learning_rate /= 2 #divide the learning rate by 2 for each new epoch + + self.series['params'].append((epoch,), self.classifier.all_params) + + if done_looping == True: #To exit completly the fine-tuning + break #to exit the WHILE loop + + end_time = time.clock() + self.hp.update({'finetuning_time':end_time-start_time,\ + 'best_validation_error':best_validation_loss,\ + 'test_score':test_score, + 'num_finetuning_epochs':epoch}) + + print(('\nOptimization complete with best validation score of %f %%,' + 'with test performance %f %% on dataset %s ') % + (best_validation_loss * 100., test_score*100.,nom_train)) + print(('The test score on the %s dataset is %f')%(nom_test,test_score2*100.)) + + print ('The finetuning ran for %f minutes' % ((end_time-start_time)/60.)) + + sys.stdout.flush() + + #Save a copy of the parameters in a file to be able to get them in the future + + if special == 1: #To keep a track of the value of the parameters + f = open('params_finetune_stanford.txt', 'w') + pickle.dump(parameters_finetune,f) + f.close() + + elif ind_test == 0 | ind_test == 20: #To keep a track of the value of the parameters + f = open('params_finetune_P07.txt', 'w') + pickle.dump(parameters_finetune,f) + f.close() + + + elif ind_test== 1: #For the run with 2 finetunes. It will be faster. + f = open('params_finetune_NIST.txt', 'w') + pickle.dump(parameters_finetune,f) + f.close() + + elif ind_test== 21: #To keep a track of the value of the parameters + f = open('params_finetune_P07_then_NIST.txt', 'w') + pickle.dump(parameters_finetune,f) + f.close() + #Set parameters like they where right after pre-train or finetune + def reload_parameters(self,which): + + #self.parameters_pre=pickle.load('params_pretrain.txt') + f = open(which) + self.parameters_pre=pickle.load(f) + f.close() + for idx,x in enumerate(self.parameters_pre): + if x.dtype=='float64': + self.classifier.params[idx].value=theano._asarray(copy(x),dtype=theano.config.floatX) + else: + self.classifier.params[idx].value=copy(x) + + def training_error(self,dataset): + # create a function to compute the mistakes that are made by the model + # on the validation set, or testing set + test_model = \ + theano.function( + [self.classifier.x,self.classifier.y], self.classifier.errors) + + iter2 = dataset.train(self.hp.minibatch_size,bufsize=buffersize) + train_losses2 = [test_model(x,y) for x,y in iter2] + train_score2 = numpy.mean(train_losses2) + print "Training error is: " + str(train_score2)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/convolutional_dae/salah_exp/stacked_convolutional_dae_uit.py Thu Apr 22 19:57:05 2010 -0400 @@ -0,0 +1,243 @@ +import numpy +import theano +import time +import sys +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams +import theano.sandbox.softsign +import copy +from theano.tensor.signal import downsample +from theano.tensor.nnet import conv + +sys.path.append('../../') +#import ift6266.datasets +import ift6266.datasets +from ift6266.baseline.log_reg.log_reg import LogisticRegression + +from theano.tensor.xlogx import xlogx, xlogy0 +# it's target*log(output) +def binary_cross_entropy(target, output, sum_axis=1): + XE = xlogy0(target, output) + xlogy0((1 - target), (1 - output)) + return -T.sum(XE, axis=sum_axis) + + + +class SigmoidalLayer(object): + def __init__(self, rng, input, n_in, n_out): + + self.input = input + + W_values = numpy.asarray( rng.uniform( \ + low = -numpy.sqrt(6./(n_in+n_out)), \ + high = numpy.sqrt(6./(n_in+n_out)), \ + size = (n_in, n_out)), dtype = theano.config.floatX) + self.W = theano.shared(value = W_values) + + b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + self.output = T.tanh(T.dot(input, self.W) + self.b) + self.params = [self.W, self.b] + +class dA_conv(object): + + def __init__(self, input, filter_shape, corruption_level = 0.1, + shared_W = None, shared_b = None, image_shape = None, num = 0,batch_size=20): + + theano_rng = RandomStreams() + + fan_in = numpy.prod(filter_shape[1:]) + fan_out = filter_shape[0] * numpy.prod(filter_shape[2:]) + + center = theano.shared(value = 1, name="center") + scale = theano.shared(value = 2, name="scale") + + if shared_W != None and shared_b != None : + self.W = shared_W + self.b = shared_b + else: + initial_W = numpy.asarray( numpy.random.uniform( + low = -numpy.sqrt(6./(fan_in+fan_out)), + high = numpy.sqrt(6./(fan_in+fan_out)), + size = filter_shape), dtype = theano.config.floatX) + initial_b = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) + self.W = theano.shared(value = initial_W, name = "W") + self.b = theano.shared(value = initial_b, name = "b") + + + initial_b_prime= numpy.zeros((filter_shape[1],),dtype=theano.config.floatX) + + self.b_prime = theano.shared(value = initial_b_prime, name = "b_prime") + + self.x = input + + self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level,dtype=theano.config.floatX) * self.x + + conv1_out = conv.conv2d(self.tilde_x, self.W, filter_shape=filter_shape, + image_shape=image_shape, + unroll_kern=4,unroll_batch=4, + border_mode='valid') + + + self.y = T.tanh(conv1_out + self.b.dimshuffle('x', 0, 'x', 'x')) + + + da_filter_shape = [ filter_shape[1], filter_shape[0], filter_shape[2],\ + filter_shape[3] ] + da_image_shape = [ batch_size, filter_shape[0], image_shape[2]-filter_shape[2]+1, + image_shape[3]-filter_shape[3]+1 ] + #import pdb; pdb.set_trace() + initial_W_prime = numpy.asarray( numpy.random.uniform( \ + low = -numpy.sqrt(6./(fan_in+fan_out)), \ + high = numpy.sqrt(6./(fan_in+fan_out)), \ + size = da_filter_shape), dtype = theano.config.floatX) + self.W_prime = theano.shared(value = initial_W_prime, name = "W_prime") + + conv2_out = conv.conv2d(self.y, self.W_prime, + filter_shape = da_filter_shape,\ + image_shape = da_image_shape, \ + unroll_kern=4,unroll_batch=4, \ + border_mode='full') + + self.z = (T.tanh(conv2_out + self.b_prime.dimshuffle('x', 0, 'x', 'x'))+center) / scale + + if num != 0 : + scaled_x = (self.x + center) / scale + else: + scaled_x = self.x + self.L = - T.sum( scaled_x*T.log(self.z) + (1-scaled_x)*T.log(1-self.z), axis=1 ) + + self.cost = T.mean(self.L) + + self.params = [ self.W, self.b, self.b_prime ] + +class LeNetConvPoolLayer(object): + + def __init__(self, rng, input, filter_shape, image_shape=None, poolsize=(2,2)): + self.input = input + + W_values = numpy.zeros(filter_shape, dtype=theano.config.floatX) + self.W = theano.shared(value=W_values) + + b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) + self.b = theano.shared(value=b_values) + + conv_out = conv.conv2d(input, self.W, + filter_shape=filter_shape, image_shape=image_shape, + unroll_kern=4,unroll_batch=4) + + + fan_in = numpy.prod(filter_shape[1:]) + fan_out = filter_shape[0] * numpy.prod(filter_shape[2:]) / numpy.prod(poolsize) + + W_bound = numpy.sqrt(6./(fan_in + fan_out)) + self.W.value = numpy.asarray( + rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), + dtype = theano.config.floatX) + + + pooled_out = downsample.max_pool2D(conv_out, poolsize, ignore_border=True) + + self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) + self.params = [self.W, self.b] + + +class CSdA(): + def __init__(self, n_ins_mlp,batch_size, conv_hidden_layers_sizes, + mlp_hidden_layers_sizes, corruption_levels, rng, n_out, + pretrain_lr, finetune_lr): + + # Just to make sure those are not modified somewhere else afterwards + hidden_layers_sizes = copy.deepcopy(mlp_hidden_layers_sizes) + corruption_levels = copy.deepcopy(corruption_levels) + + #update_locals(self, locals()) + + + + self.layers = [] + self.pretrain_functions = [] + self.params = [] + self.n_layers = len(conv_hidden_layers_sizes) + self.mlp_n_layers = len(mlp_hidden_layers_sizes) + + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + + for i in xrange( self.n_layers ): + filter_shape=conv_hidden_layers_sizes[i][0] + image_shape=conv_hidden_layers_sizes[i][1] + max_poolsize=conv_hidden_layers_sizes[i][2] + + if i == 0 : + layer_input=self.x.reshape((batch_size, 1, 32, 32)) + else: + layer_input=self.layers[-1].output + + layer = LeNetConvPoolLayer(rng, input=layer_input, + image_shape=image_shape, + filter_shape=filter_shape, + poolsize=max_poolsize) + print 'Convolutional layer', str(i+1), 'created' + + self.layers += [layer] + self.params += layer.params + + da_layer = dA_conv(corruption_level = corruption_levels[0], + input = layer_input, + shared_W = layer.W, shared_b = layer.b, + filter_shape=filter_shape, + image_shape = image_shape, num=i , batch_size=batch_size) + + gparams = T.grad(da_layer.cost, da_layer.params) + + updates = {} + for param, gparam in zip(da_layer.params, gparams): + updates[param] = param - gparam * pretrain_lr + + update_fn = theano.function([self.x], da_layer.cost, updates = updates) + + self.pretrain_functions += [update_fn] + + for i in xrange( self.mlp_n_layers ): + if i == 0 : + input_size = n_ins_mlp + else: + input_size = mlp_hidden_layers_sizes[i-1] + + if i == 0 : + if len( self.layers ) == 0 : + layer_input=self.x + else : + layer_input = self.layers[-1].output.flatten(2) + else: + layer_input = self.layers[-1].output + + layer = SigmoidalLayer(rng, layer_input, input_size, + mlp_hidden_layers_sizes[i] ) + + self.layers += [layer] + self.params += layer.params + + print 'MLP layer', str(i+1), 'created' + + self.logLayer = LogisticRegression(input=self.layers[-1].output, \ + n_in=mlp_hidden_layers_sizes[-1], n_out=n_out) + + + self.params += self.logLayer.params + self.all_params = self.params + cost = self.logLayer.negative_log_likelihood(self.y) + + gparams = T.grad(cost, self.params) + + updates = {} + for param,gparam in zip(self.params, gparams): + updates[param] = param - gparam*finetune_lr + + self.finetune = theano.function([self.x, self.y], cost, updates = updates) + + self.errors = self.logLayer.errors(self.y) + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/convolutional_dae/salah_exp/utils.py Thu Apr 22 19:57:05 2010 -0400 @@ -0,0 +1,69 @@ +#!/usr/bin/python +# coding: utf-8 + +from __future__ import with_statement + +from jobman import DD + +# from pylearn codebase +# useful in __init__(param1, param2, etc.) to save +# values in self.param1, self.param2... just call +# update_locals(self, locals()) +def update_locals(obj, dct): + if 'self' in dct: + del dct['self'] + obj.__dict__.update(dct) + +# from a dictionary of possible values for hyperparameters, e.g. +# hp_values = {'learning_rate':[0.1, 0.01], 'num_layers': [1,2]} +# create a list of other dictionaries representing all the possible +# combinations, thus in this example creating: +# [{'learning_rate': 0.1, 'num_layers': 1}, ...] +# (similarly for combinations (0.1, 2), (0.01, 1), (0.01, 2)) +def produit_cartesien_jobs(val_dict): + job_list = [DD()] + all_keys = val_dict.keys() + + for key in all_keys: + possible_values = val_dict[key] + new_job_list = [] + for val in possible_values: + for job in job_list: + to_insert = job.copy() + to_insert.update({key: val}) + new_job_list.append(to_insert) + job_list = new_job_list + + return job_list + +def test_produit_cartesien_jobs(): + vals = {'a': [1,2], 'b': [3,4,5]} + print produit_cartesien_jobs(vals) + + +# taken from http://stackoverflow.com/questions/276052/how-to-get-current-cpu-and-ram-usage-in-python +"""Simple module for getting amount of memory used by a specified user's +processes on a UNIX system. +It uses UNIX ps utility to get the memory usage for a specified username and +pipe it to awk for summing up per application memory usage and return the total. +Python's Popen() from subprocess module is used for spawning ps and awk. + +""" + +import subprocess + +class MemoryMonitor(object): + + def __init__(self, username): + """Create new MemoryMonitor instance.""" + self.username = username + + def usage(self): + """Return int containing memory used by user's processes.""" + self.process = subprocess.Popen("ps -u %s -o rss | awk '{sum+=$1} END {print sum}'" % self.username, + shell=True, + stdout=subprocess.PIPE, + ) + self.stdout_list = self.process.communicate()[0].split('\n') + return int(self.stdout_list[0]) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/rbm/mnistrbm.py Thu Apr 22 19:57:05 2010 -0400 @@ -0,0 +1,215 @@ +import sys +import os, os.path + +import numpy as N + +import theano +import theano.tensor as T + +from crbm import CRBM, ConvolutionParams + +from pylearn.datasets import MNIST +from pylearn.io.image_tiling import tile_raster_images + +import Image + +from pylearn.io.seriestables import * +import tables + +IMAGE_OUTPUT_DIR = 'img/' + +REDUCE_EVERY = 100 + +def filename_from_time(suffix): + import datetime + return str(datetime.datetime.now()) + suffix + ".png" + +# Just a shortcut for a common case where we need a few +# related Error (float) series + +def get_accumulator_series_array( \ + hdf5_file, group_name, series_names, + reduce_every, + index_names=('epoch','minibatch'), + stdout_too=True, + skip_hdf5_append=False): + all_series = [] + + hdf5_file.createGroup('/', group_name) + + other_targets = [] + if stdout_too: + other_targets = [StdoutAppendTarget()] + + for sn in series_names: + series_base = \ + ErrorSeries(error_name=sn, + table_name=sn, + hdf5_file=hdf5_file, + hdf5_group='/'+group_name, + index_names=index_names, + other_targets=other_targets, + skip_hdf5_append=skip_hdf5_append) + + all_series.append( \ + AccumulatorSeriesWrapper( \ + base_series=series_base, + reduce_every=reduce_every)) + + ret_wrapper = SeriesArrayWrapper(all_series) + + return ret_wrapper + +class ExperienceRbm(object): + def __init__(self): + self.mnist = MNIST.full()#first_10k() + + + datasets = load_data(dataset) + + train_set_x, train_set_y = datasets[0] + test_set_x , test_set_y = datasets[2] + + + batch_size = 100 # size of the minibatch + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + + # allocate symbolic variables for the data + index = T.lscalar() # index to a [mini]batch + x = T.matrix('x') # the data is presented as rasterized images + + rng = numpy.random.RandomState(123) + theano_rng = RandomStreams( rng.randint(2**30)) + + # initialize storage fot the persistent chain (state = hidden layer of chain) + persistent_chain = theano.shared(numpy.zeros((batch_size, 500))) + + # construct the RBM class + self.rbm = RBM( input = x, n_visible=28*28, \ + n_hidden = 500,numpy_rng = rng, theano_rng = theano_rng) + + # get the cost and the gradient corresponding to one step of CD + + + self.init_series() + + def init_series(self): + + series = {} + + basedir = os.getcwd() + + h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w") + + cd_series_names = self.rbm.cd_return_desc + series['cd'] = \ + get_accumulator_series_array( \ + h5f, 'cd', cd_series_names, + REDUCE_EVERY, + stdout_too=True) + + + + # so first we create the names for each table, based on + # position of each param in the array + params_stdout = StdoutAppendTarget("\n------\nParams") + series['params'] = SharedParamsStatisticsWrapper( + new_group_name="params", + base_group="/", + arrays_names=['W','b_h','b_x'], + hdf5_file=h5f, + index_names=('epoch','minibatch'), + other_targets=[params_stdout]) + + self.series = series + + def train(self, persistent, learning_rate): + + training_epochs = 15 + + #get the cost and the gradient corresponding to one step of CD + if persistant: + persistent_chain = theano.shared(numpy.zeros((batch_size, self.rbm.n_hidden))) + cost, updates = self.rbm.cd(lr=learning_rate, persistent=persistent_chain) + + else: + cost, updates = self.rbm.cd(lr=learning_rate) + + dirname = 'lr=%.5f'%self.rbm.learning_rate + os.makedirs(dirname) + os.chdir(dirname) + + # the purpose of train_rbm is solely to update the RBM parameters + train_rbm = theano.function([index], cost, + updates = updates, + givens = { x: train_set_x[index*batch_size:(index+1)*batch_size]}) + + plotting_time = 0. + start_time = time.clock() + + + # go through training epochs + for epoch in xrange(training_epochs): + + # go through the training set + mean_cost = [] + for batch_index in xrange(n_train_batches): + mean_cost += [train_rbm(batch_index)] + + + pretraining_time = (end_time - start_time) + + + + + def sample_from_rbm(self, gibbs_steps, test_set_x): + + # find out the number of test samples + number_of_test_samples = test_set_x.value.shape[0] + + # pick random test examples, with which to initialize the persistent chain + test_idx = rng.randint(number_of_test_samples-20) + persistent_vis_chain = theano.shared(test_set_x.value[test_idx:test_idx+20]) + + # define one step of Gibbs sampling (mf = mean-field) + [hid_mf, hid_sample, vis_mf, vis_sample] = self.rbm.gibbs_vhv(persistent_vis_chain) + + # the sample at the end of the channel is returned by ``gibbs_1`` as + # its second output; note that this is computed as a binomial draw, + # therefore it is formed of ints (0 and 1) and therefore needs to + # be converted to the same dtype as ``persistent_vis_chain`` + vis_sample = T.cast(vis_sample, dtype=theano.config.floatX) + + # construct the function that implements our persistent chain + # we generate the "mean field" activations for plotting and the actual samples for + # reinitializing the state of our persistent chain + sample_fn = theano.function([], [vis_mf, vis_sample], + updates = { persistent_vis_chain:vis_sample}) + + # sample the RBM, plotting every `plot_every`-th sample; do this + # until you plot at least `n_samples` + n_samples = 10 + plot_every = 1000 + + for idx in xrange(n_samples): + + # do `plot_every` intermediate samplings of which we do not care + for jdx in xrange(plot_every): + vis_mf, vis_sample = sample_fn() + + # construct image + image = PIL.Image.fromarray(tile_raster_images( + X = vis_mf, + img_shape = (28,28), + tile_shape = (10,10), + tile_spacing = (1,1) ) ) + + image.save('sample_%i_step_%i.png'%(idx,idx*jdx)) + + +if __name__ == '__main__': + mc = ExperienceRbm() + mc.train() +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/rbm/rbm.py Thu Apr 22 19:57:05 2010 -0400 @@ -0,0 +1,226 @@ +"""This tutorial introduces restricted boltzmann machines (RBM) using Theano. + +Boltzmann Machines (BMs) are a particular form of energy-based model which +contain hidden variables. Restricted Boltzmann Machines further restrict BMs +to those without visible-visible and hidden-hidden connections. +""" + + +import numpy, time, cPickle, gzip, PIL.Image + +import theano +import theano.tensor as T +import os + +from theano.tensor.shared_randomstreams import RandomStreams + +from utils import tile_raster_images +from logistic_sgd import load_data + + +class RBM(object): + """Restricted Boltzmann Machine (RBM) """ + def __init__(self, input=None, n_visible=784, n_hidden=1000, \ + W = None, hbias = None, vbias = None, numpy_rng = None, + theano_rng = None): + """ + RBM constructor. Defines the parameters of the model along with + basic operations for inferring hidden from visible (and vice-versa), + as well as for performing CD updates. + + :param input: None for standalone RBMs or symbolic variable if RBM is + part of a larger graph. + + :param n_visible: number of visible units + + :param n_hidden: number of hidden units + + :param W: None for standalone RBMs or symbolic variable pointing to a + shared weight matrix in case RBM is part of a DBN network; in a DBN, + the weights are shared between RBMs and layers of a MLP + + :param hbias: None for standalone RBMs or symbolic variable pointing + to a shared hidden units bias vector in case RBM is part of a + different network + + :param vbias: None for standalone RBMs or a symbolic variable + pointing to a shared visible units bias + """ + + self.n_visible = n_visible + self.n_hidden = n_hidden + + + if W is None : + # W is initialized with `initial_W` which is uniformely sampled + # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + initial_W = numpy.asarray( numpy.random.uniform( + low = -numpy.sqrt(6./(n_hidden+n_visible)), + high = numpy.sqrt(6./(n_hidden+n_visible)), + size = (n_visible, n_hidden)), + dtype = theano.config.floatX) + # theano shared variables for weights and biases + W = theano.shared(value = initial_W, name = 'W') + + if hbias is None : + # create shared variable for hidden units bias + hbias = theano.shared(value = numpy.zeros(n_hidden, + dtype = theano.config.floatX), name='hbias') + + if vbias is None : + # create shared variable for visible units bias + vbias = theano.shared(value =numpy.zeros(n_visible, + dtype = theano.config.floatX),name='vbias') + + if numpy_rng is None: + # create a number generator + numpy_rng = numpy.random.RandomState(1234) + + if theano_rng is None : + theano_rng = RandomStreams(numpy_rng.randint(2**30)) + + + # initialize input layer for standalone RBM or layer0 of DBN + self.input = input if input else T.dmatrix('input') + + self.W = W + self.hbias = hbias + self.vbias = vbias + self.theano_rng = theano_rng + # **** WARNING: It is not a good idea to put things in this list + # other than shared variables created in this function. + self.params = [self.W, self.hbias, self.vbias] + self.batch_size = self.input.shape[0] + + def free_energy(self, v_sample): + ''' Function to compute the free energy ''' + wx_b = T.dot(v_sample, self.W) + self.hbias + vbias_term = T.sum(T.dot(v_sample, self.vbias)) + hidden_term = T.sum(T.log(1+T.exp(wx_b))) + return -hidden_term - vbias_term + + def sample_h_given_v(self, v0_sample): + ''' This function infers state of hidden units given visible units ''' + # compute the activation of the hidden units given a sample of the visibles + h1_mean = T.nnet.sigmoid(T.dot(v0_sample, self.W) + self.hbias) + # get a sample of the hiddens given their activation + h1_sample = self.theano_rng.binomial(size = h1_mean.shape, n = 1, prob = h1_mean) + return [h1_mean, h1_sample] + + def sample_v_given_h(self, h0_sample): + ''' This function infers state of visible units given hidden units ''' + # compute the activation of the visible given the hidden sample + v1_mean = T.nnet.sigmoid(T.dot(h0_sample, self.W.T) + self.vbias) + # get a sample of the visible given their activation + v1_sample = self.theano_rng.binomial(size = v1_mean.shape,n = 1,prob = v1_mean) + return [v1_mean, v1_sample] + + def gibbs_hvh(self, h0_sample): + ''' This function implements one step of Gibbs sampling, + starting from the hidden state''' + v1_mean, v1_sample = self.sample_v_given_h(h0_sample) + h1_mean, h1_sample = self.sample_h_given_v(v1_sample) + return [v1_mean, v1_sample, h1_mean, h1_sample] + + def gibbs_vhv(self, v0_sample): + ''' This function implements one step of Gibbs sampling, + starting from the visible state''' + h1_mean, h1_sample = self.sample_h_given_v(v0_sample) + v1_mean, v1_sample = self.sample_v_given_h(h1_sample) + return [h1_mean, h1_sample, v1_mean, v1_sample] + + def cd(self, lr = 0.1, persistent=None): + """ + This functions implements one step of CD-1 or PCD-1 + + :param lr: learning rate used to train the RBM + :param persistent: None for CD. For PCD, shared variable containing old state + of Gibbs chain. This must be a shared variable of size (batch size, number of + hidden units). + + Returns the updates dictionary. The dictionary contains the update rules for weights + and biases but also an update of the shared variable used to store the persistent + chain, if one is used. + """ + + # compute positive phase + ph_mean, ph_sample = self.sample_h_given_v(self.input) + + # decide how to initialize persistent chain: + # for CD, we use the newly generate hidden sample + # for PCD, we initialize from the old state of the chain + if persistent is None: + chain_start = ph_sample + else: + chain_start = persistent + + # perform actual negative phase + [nv_mean, nv_sample, nh_mean, nh_sample] = self.gibbs_hvh(chain_start) + + # determine gradients on RBM parameters + g_vbias = T.sum( self.input - nv_mean, axis = 0)/self.batch_size + g_hbias = T.sum( ph_mean - nh_mean, axis = 0)/self.batch_size + g_W = T.dot(ph_mean.T, self.input )/ self.batch_size - \ + T.dot(nh_mean.T, nv_mean )/ self.batch_size + + gparams = [g_W.T, g_hbias, g_vbias] + + # constructs the update dictionary + updates = {} + for gparam, param in zip(gparams, self.params): + updates[param] = param + gparam * lr + + if persistent: + # Note that this works only if persistent is a shared variable + updates[persistent] = T.cast(nh_sample, dtype=theano.config.floatX) + # pseudo-likelihood is a better proxy for PCD + cost = self.get_pseudo_likelihood_cost(updates) + else: + # reconstruction cross-entropy is a better proxy for CD + cost = self.get_reconstruction_cost(updates, nv_mean) + + return cost, updates + + def get_pseudo_likelihood_cost(self, updates): + """Stochastic approximation to the pseudo-likelihood""" + + # index of bit i in expression p(x_i | x_{\i}) + bit_i_idx = theano.shared(value=0, name = 'bit_i_idx') + + # binarize the input image by rounding to nearest integer + xi = T.iround(self.input) + + # calculate free energy for the given bit configuration + fe_xi = self.free_energy(xi) + + # flip bit x_i of matrix xi and preserve all other bits x_{\i} + # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx] + # NB: slice(start,stop,step) is the python object used for + # slicing, e.g. to index matrix x as follows: x[start:stop:step] + xi_flip = T.setsubtensor(xi, 1-xi[:, bit_i_idx], + idx_list=(slice(None,None,None),bit_i_idx)) + + # calculate free energy with bit flipped + fe_xi_flip = self.free_energy(xi_flip) + + # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i}))) + cost = self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi)) + + # increment bit_i_idx % number as part of updates + updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible + + return cost + + def get_reconstruction_cost(self, updates, nv_mean): + """Approximation to the reconstruction error""" + + cross_entropy = T.mean( + T.sum(self.input*T.log(nv_mean) + + (1 - self.input)*T.log(1-nv_mean), axis = 1)) + + return cross_entropy + + +
--- a/deep/stacked_dae/v_sylvain/nist_sda.py Mon Apr 19 07:09:44 2010 -0400 +++ b/deep/stacked_dae/v_sylvain/nist_sda.py Thu Apr 22 19:57:05 2010 -0400 @@ -55,6 +55,11 @@ decrease_lr = state['decrease_lr'] else : decrease_lr = 0 + + if state.has_key('decrease_lr_pretrain'): + dec=state['decrease_lr_pretrain'] + else : + dec=0 n_ins = 32*32 n_outs = 62 # 10 digits, 26*2 (lower, capitals) @@ -87,7 +92,7 @@ nb_file=0 if state['pretrain_choice'] == 0: print('\n\tpretraining with NIST\n') - optimizer.pretrain(datasets.nist_all()) + optimizer.pretrain(datasets.nist_all(), decrease = dec) elif state['pretrain_choice'] == 1: #To know how many file will be used during pretraining nb_file = int(state['pretraining_epochs_per_layer']) @@ -97,7 +102,7 @@ "You have to correct the code (and be patient, P07 is huge !!)\n"+ "or reduce the number of pretraining epoch to run the code (better idea).\n") print('\n\tpretraining with P07') - optimizer.pretrain(datasets.nist_P07(min_file=0,max_file=nb_file)) + optimizer.pretrain(datasets.nist_P07(min_file=0,max_file=nb_file),decrease = dec) channel.save() #Set some of the parameters used for the finetuning
--- a/deep/stacked_dae/v_sylvain/sgd_optimization.py Mon Apr 19 07:09:44 2010 -0400 +++ b/deep/stacked_dae/v_sylvain/sgd_optimization.py Thu Apr 22 19:57:05 2010 -0400 @@ -9,7 +9,8 @@ import datetime import theano.tensor as T import sys -import pickle +#import pickle +import cPickle from jobman import DD import jobman, jobman.sql @@ -87,30 +88,40 @@ self.pretrain(self.dataset) self.finetune(self.dataset) - def pretrain(self,dataset): + def pretrain(self,dataset,decrease=0): print "STARTING PRETRAINING, time = ", datetime.datetime.now() sys.stdout.flush() un_fichier=int(819200.0/self.hp.minibatch_size) #Number of batches in a P07 file start_time = time.clock() + + ######## This is hardcoaded. THe 0.95 parameter is hardcoaded and can be changed at will ### + #Set the decreasing rate of the learning rate. We want the final learning rate to + #be 5% of the original learning rate. The decreasing factor is linear + decreasing = (decrease*self.hp.pretraining_lr)/float(self.hp.pretraining_epochs_per_layer*800000/self.hp.minibatch_size) + ## Pre-train layer-wise for i in xrange(self.classifier.n_layers): # go through pretraining epochs + + #To reset the learning rate to his original value + learning_rate=self.hp.pretraining_lr for epoch in xrange(self.hp.pretraining_epochs_per_layer): # go through the training set batch_index=0 count=0 num_files=0 for x,y in dataset.train(self.hp.minibatch_size): - c = self.classifier.pretrain_functions[i](x) + c = self.classifier.pretrain_functions[i](x,learning_rate) count +=1 self.series["reconstruction_error"].append((epoch, batch_index), c) batch_index+=1 - #if batch_index % 100 == 0: - # print "100 batches" + #If we need to decrease the learning rate for the pretrain + if decrease != 0: + learning_rate -= decreasing # useful when doing tests if self.max_minibatches and batch_index >= self.max_minibatches: @@ -141,7 +152,7 @@ #To be able to load them later for tests on finetune self.parameters_pre=[copy(x.value) for x in self.classifier.params] f = open('params_pretrain.txt', 'w') - pickle.dump(self.parameters_pre,f) + cPickle.dump(self.parameters_pre,f,protocol=-1) f.close() @@ -204,7 +215,7 @@ parameters_finetune=[] if ind_test == 21: - learning_rate = self.hp.finetuning_lr / 5.0 + learning_rate = self.hp.finetuning_lr / 10.0 else: learning_rate = self.hp.finetuning_lr #The initial finetune lr @@ -295,8 +306,8 @@ break if decrease == 1: - if (ind_test == 21 & epoch % 100 == 0) | ind_test == 20: - learning_rate /= 2 #divide the learning rate by 2 for each new epoch of P07 (or 100 of NIST) + if (ind_test == 21 & epoch % 100 == 0) | ind_test == 20: + learning_rate /= 2 #divide the learning rate by 2 for each new epoch of P07 (or 100 of NIST) self.series['params'].append((epoch,), self.classifier.all_params) @@ -322,23 +333,23 @@ if special == 1: #To keep a track of the value of the parameters f = open('params_finetune_stanford.txt', 'w') - pickle.dump(parameters_finetune,f) + cPickle.dump(parameters_finetune,f,protocol=-1) f.close() elif ind_test == 0 | ind_test == 20: #To keep a track of the value of the parameters f = open('params_finetune_P07.txt', 'w') - pickle.dump(parameters_finetune,f) + cPickle.dump(parameters_finetune,f,protocol=-1) f.close() elif ind_test== 1: #For the run with 2 finetunes. It will be faster. f = open('params_finetune_NIST.txt', 'w') - pickle.dump(parameters_finetune,f) + cPickle.dump(parameters_finetune,f,protocol=-1) f.close() elif ind_test== 21: #To keep a track of the value of the parameters f = open('params_finetune_P07_then_NIST.txt', 'w') - pickle.dump(parameters_finetune,f) + cPickle.dump(parameters_finetune,f,protocol=-1) f.close() @@ -347,7 +358,7 @@ #self.parameters_pre=pickle.load('params_pretrain.txt') f = open(which) - self.parameters_pre=pickle.load(f) + self.parameters_pre=cPickle.load(f) f.close() for idx,x in enumerate(self.parameters_pre): if x.dtype=='float64':
--- a/deep/stacked_dae/v_sylvain/stacked_dae.py Mon Apr 19 07:09:44 2010 -0400 +++ b/deep/stacked_dae/v_sylvain/stacked_dae.py Thu Apr 22 19:57:05 2010 -0400 @@ -27,8 +27,10 @@ # initialize the baises b as a vector of n_out 0s self.b = theano.shared( value=numpy.zeros((n_out,), dtype = theano.config.floatX) ) - # compute vector of class-membership probabilities in symbolic form - self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) + # compute vector of class-membership. This is a sigmoid instead of + #a softmax to be able to classify as nothing later +## self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) + self.p_y_given_x = T.nnet.sigmoid(T.dot(input, self.W)+self.b) # compute prediction as class whose probability is maximal in # symbolic form @@ -71,7 +73,25 @@ self.output = T.nnet.sigmoid(T.dot(input, self.W) + self.b) self.params = [self.W, self.b] + +class TanhLayer(object): + def __init__(self, rng, input, n_in, n_out): + self.input = input + + W_values = numpy.asarray( rng.uniform( \ + low = -numpy.sqrt(6./(n_in+n_out)), \ + high = numpy.sqrt(6./(n_in+n_out)), \ + size = (n_in, n_out)), dtype = theano.config.floatX) + self.W = theano.shared(value = W_values) + + b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + self.output = (T.tanh(T.dot(input, self.W) + self.b) + 1) /2 + # ( *+ 1) /2 is because tanh goes from -1 to 1 and sigmoid goes from 0 to 1 + # I want to use tanh, but the image has to stay the same. The correction is necessary. + self.params = [self.W, self.b] class dA(object): @@ -132,22 +152,24 @@ # Equation (2) # note : y is stored as an attribute of the class so that it can be # used later when stacking dAs. - self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b) - # Equation (3) - #self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime) - # Equation (4) - # note : we sum over the size of a datapoint; if we are using minibatches, - # L will be a vector, with one entry per example in minibatch - #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) - #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1) - - # bypassing z to avoid running to log(0) - z_a = T.dot(self.y, self.W_prime) + self.b_prime - log_sigmoid = T.log(1.) - T.log(1.+T.exp(-z_a)) - # log(1-sigmoid(z_a)) - log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a)) - self.L = -T.sum( self.x * (log_sigmoid) \ - + (1.0-self.x) * (log_1_sigmoid), axis=1 ) + +## self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b) +## +## # Equation (3) +## #self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime) +## # Equation (4) +## # note : we sum over the size of a datapoint; if we are using minibatches, +## # L will be a vector, with one entry per example in minibatch +## #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) +## #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1) +## +## # bypassing z to avoid running to log(0) +## z_a = T.dot(self.y, self.W_prime) + self.b_prime +## log_sigmoid = T.log(1.) - T.log(1.+T.exp(-z_a)) +## # log(1-sigmoid(z_a)) +## log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a)) +## self.L = -T.sum( self.x * (log_sigmoid) \ +## + (1.0-self.x) * (log_1_sigmoid), axis=1 ) # I added this epsilon to avoid getting log(0) and 1/0 in grad # This means conceptually that there'd be no probability of 0, but that @@ -160,6 +182,21 @@ # of the reconstruction of the corresponding example of the # minibatch. We need to compute the average of all these to get # the cost of the minibatch + + #Or use a Tanh everything is always between 0 and 1, the range is + #changed so it remain the same as when sigmoid is used + self.y = (T.tanh(T.dot(self.tilde_x, self.W ) + self.b)+1.0)/2.0 + + z_a = T.dot(self.y, self.W_prime) + self.b_prime + self.z = (T.tanh(z_a + self.b_prime)+1.0) / 2.0 + #To ensure to do not have a log(0) operation + if self.z <= 0: + self.z = 0.000001 + if self.z >= 1: + self.z = 0.999999 + + self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) + self.cost = T.mean(self.L) self.params = [ self.W, self.b, self.b_prime ] @@ -204,6 +241,7 @@ self.y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels self.finetune_lr = T.fscalar('finetune_lr') #To get a dynamic finetune learning rate + self.pretrain_lr = T.fscalar('pretrain_lr') #To get a dynamic pretrain learning rate for i in xrange( self.n_layers ): # construct the sigmoidal layer @@ -222,8 +260,12 @@ layer_input = self.x else: layer_input = self.layers[-1].output + #We have to choose between sigmoidal layer or tanh layer ! - layer = SigmoidalLayer(rng, layer_input, input_size, +## layer = SigmoidalLayer(rng, layer_input, input_size, +## hidden_layers_sizes[i] ) + + layer = TanhLayer(rng, layer_input, input_size, hidden_layers_sizes[i] ) # add the layer to the self.layers += [layer] @@ -244,10 +286,10 @@ # compute the list of updates updates = {} for param, gparam in zip(dA_layer.params, gparams): - updates[param] = param - gparam * pretrain_lr + updates[param] = param - gparam * self.pretrain_lr # create a function that trains the dA - update_fn = theano.function([self.x], dA_layer.cost, \ + update_fn = theano.function([self.x, self.pretrain_lr], dA_layer.cost, \ updates = updates)#, # givens = { # self.x : ensemble})
--- a/scripts/setup_batches.py Mon Apr 19 07:09:44 2010 -0400 +++ b/scripts/setup_batches.py Thu Apr 22 19:57:05 2010 -0400 @@ -61,8 +61,8 @@ self.raw_upper_test_data = ft.read(f_upper_test_data) self.raw_upper_test_labels = ft.read(f_upper_test_labels) - self.raw_test_data = ft.read(f_test_data) - self.raw_test_labels = ft.read(f_test_labels) + #self.raw_test_data = ft.read(f_test_data) + #self.raw_test_labels = ft.read(f_test_labels) f_digits_train_data.close() f_digits_train_labels.close() @@ -79,8 +79,8 @@ f_upper_test_data.close() f_upper_test_labels.close() - f_test_data.close() - f_test_labels.close() + #f_test_data.close() + #f_test_labels.close() print 'Data opened'