Mercurial > ift6266
changeset 373:39a3917db946
merge1
author | Xavier Glorot <glorotxa@iro.umontreal.ca> |
---|---|
date | Sun, 25 Apr 2010 13:54:57 -0400 |
parents | 543ae35e387e (diff) 8cf52a1c8055 (current diff) |
children | 846f0678ffe8 |
files | |
diffstat | 20 files changed, 3640 insertions(+), 92 deletions(-) [+] |
line wrap: on
line diff
--- a/baseline/mlp/mlp_nist.py Sun Apr 25 12:31:22 2010 -0400 +++ b/baseline/mlp/mlp_nist.py Sun Apr 25 13:54:57 2010 -0400 @@ -23,6 +23,7 @@ """ __docformat__ = 'restructedtext en' +import sys import pdb import numpy import pylab @@ -163,6 +164,75 @@ else: raise NotImplementedError() +def mlp_get_nist_error(model_name='/u/mullerx/ift6266h10_sandbox_db/xvm_final_lr1_p073/8/best_model.npy.npz', + data_set=0): + + + + # allocate symbolic variables for the data + x = T.fmatrix() # the data is presented as rasterized images + y = T.lvector() # the labels are presented as 1D vector of + # [long int] labels + + # load the data set and create an mlp based on the dimensions of the model + model=numpy.load(model_name) + W1=model['W1'] + W2=model['W2'] + b1=model['b1'] + b2=model['b2'] + nb_hidden=b1.shape[0] + input_dim=W1.shape[0] + nb_targets=b2.shape[0] + learning_rate=0.1 + + + if data_set==0: + dataset=datasets.nist_all() + elif data_set==1: + dataset=datasets.nist_P07() + + + classifier = MLP( input=x,\ + n_in=input_dim,\ + n_hidden=nb_hidden,\ + n_out=nb_targets, + learning_rate=learning_rate) + + + #overwrite weights with weigths from model + classifier.W1.value=W1 + classifier.W2.value=W2 + classifier.b1.value=b1 + classifier.b2.value=b2 + + + cost = classifier.negative_log_likelihood(y) \ + + 0.0 * classifier.L1 \ + + 0.0 * classifier.L2_sqr + + # compiling a theano function that computes the mistakes that are made by + # the model on a minibatch + test_model = theano.function([x,y], classifier.errors(y)) + + + + #get the test error + #use a batch size of 1 so we can get the sub-class error + #without messing with matrices (will be upgraded later) + test_score=0 + temp=0 + for xt,yt in dataset.test(20): + test_score += test_model(xt,yt) + temp = temp+1 + test_score /= temp + + + return test_score*100 + + + + + def mlp_full_nist( verbose = 1,\ adaptive_lr = 0,\ @@ -174,15 +244,19 @@ batch_size=20,\ nb_hidden = 30,\ nb_targets = 62, - tau=1e6,\ - lr_t2_factor=0.5): + tau=1e6,\ + lr_t2_factor=0.5,\ + init_model=0,\ + channel=0): + if channel!=0: + channel.save() configuration = [learning_rate,nb_max_exemples,nb_hidden,adaptive_lr] #save initial learning rate if classical adaptive lr is used initial_lr=learning_rate - max_div_count=3 + max_div_count=1000 total_validation_error_list = [] @@ -195,6 +269,8 @@ dataset=datasets.nist_all() elif data_set==1: dataset=datasets.nist_P07() + elif data_set==2: + dataset=datasets.PNIST07() @@ -215,6 +291,14 @@ learning_rate=learning_rate) + # check if we want to initialise the weights with a previously calculated model + # dimensions must be consistent between old model and current configuration!!!!!! (nb_hidden and nb_targets) + if init_model!=0: + old_model=numpy.load(init_model) + classifier.W1.value=old_model['W1'] + classifier.W2.value=old_model['W2'] + classifier.b1.value=old_model['b1'] + classifier.b2.value=old_model['b2'] # the cost we minimize during training is the negative log likelihood of @@ -289,8 +373,9 @@ - if verbose == 1: - print 'starting training' + + print 'starting training' + sys.stdout.flush() while(minibatch_index*batch_size<nb_max_exemples): for x, y in dataset.train(batch_size): @@ -303,10 +388,12 @@ #train model cost_ij = train_model(x,y) - if (minibatch_index+1) % validation_frequency == 0: + if (minibatch_index) % validation_frequency == 0: #save the current learning rate learning_rate_list.append(classifier.lr.value) divergence_flag_list.append(divergence_flag) + + # compute the validation error this_validation_loss = 0. @@ -319,10 +406,15 @@ this_validation_loss /= temp #save the validation loss total_validation_error_list.append(this_validation_loss) - if verbose == 1: - print(('epoch %i, minibatch %i, learning rate %f current validation error %f ') % - (epoch, minibatch_index+1,classifier.lr.value, - this_validation_loss*100.)) + + print(('epoch %i, minibatch %i, learning rate %f current validation error %f ') % + (epoch, minibatch_index+1,classifier.lr.value, + this_validation_loss*100.)) + sys.stdout.flush() + + #save temp results to check during training + numpy.savez('temp_results.npy',config=configuration,total_validation_error_list=total_validation_error_list,\ + learning_rate_list=learning_rate_list, divergence_flag_list=divergence_flag_list) # if we got the best validation score until now if this_validation_loss < best_validation_loss: @@ -344,11 +436,12 @@ test_score += test_model(xt,yt) temp = temp+1 test_score /= temp - if verbose == 1: - print(('epoch %i, minibatch %i, test error of best ' - 'model %f %%') % - (epoch, minibatch_index+1, - test_score*100.)) + + print(('epoch %i, minibatch %i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, + test_score*100.)) + sys.stdout.flush() # if the validation error is going up, we are overfitting (or oscillating) # check if we are allowed to continue and if we will adjust the learning rate @@ -374,12 +467,13 @@ test_score += test_model(xt,yt) temp=temp+1 test_score /= temp - if verbose == 1: - print ' validation error is going up, possibly stopping soon' - print((' epoch %i, minibatch %i, test error of best ' - 'model %f %%') % - (epoch, minibatch_index+1, - test_score*100.)) + + print ' validation error is going up, possibly stopping soon' + print((' epoch %i, minibatch %i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, + test_score*100.)) + sys.stdout.flush() @@ -393,6 +487,9 @@ #force one epoch at least if epoch>0 and minibatch_index*batch_size>nb_max_exemples: break + + + time_n= time_n + batch_size @@ -401,12 +498,13 @@ # we have finished looping through the training set epoch = epoch+1 end_time = time.clock() - if verbose == 1: - print(('Optimization complete. Best validation score of %f %% ' - 'obtained at iteration %i, with test performance %f %%') % - (best_validation_loss * 100., best_iter, test_score*100.)) - print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) - print minibatch_index + + print(('Optimization complete. Best validation score of %f %% ' + 'obtained at iteration %i, with test performance %f %%') % + (best_validation_loss * 100., best_iter, test_score*100.)) + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + print minibatch_index + sys.stdout.flush() #save the model and the weights numpy.savez('model.npy', config=configuration, W1=classifier.W1.value,W2=classifier.W2.value, b1=classifier.b1.value,b2=classifier.b2.value) @@ -427,7 +525,8 @@ tau=state.tau,\ verbose = state.verbose,\ lr_t2_factor=state.lr_t2_factor, - data_set=state.data_set) + data_set=state.data_set, + channel=channel) state.train_error=train_error state.validation_error=validation_error state.test_error=test_error
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/baseline/mlp/ratio_classes/mlp_nist_ratio.py Sun Apr 25 13:54:57 2010 -0400 @@ -0,0 +1,438 @@ +# -*- coding: utf-8 -*- +""" +This tutorial introduces the multilayer perceptron using Theano. + + A multilayer perceptron is a logistic regressor where +instead of feeding the input to the logistic regression you insert a +intermidiate layer, called the hidden layer, that has a nonlinear +activation function (usually tanh or sigmoid) . One can use many such +hidden layers making the architecture deep. The tutorial will also tackle +the problem of MNIST digit classification. + +.. math:: + + f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))), + +References: + + - textbooks: "Pattern Recognition and Machine Learning" - + Christopher M. Bishop, section 5 + +TODO: recommended preprocessing, lr ranges, regularization ranges (explain + to do lr first, then add regularization) + +""" +__docformat__ = 'restructedtext en' + +import ift6266 +from scripts import setup_batches +import pdb +import numpy + +import theano +import theano.tensor as T +import time +import theano.tensor.nnet +import pylearn +import theano,pylearn.version +from pylearn.io import filetensor as ft + +data_path = '/data/lisa/data/nist/by_class/' + +class MLP(object): + """Multi-Layer Perceptron Class + + A multilayer perceptron is a feedforward artificial neural network model + that has one layer or more of hidden units and nonlinear activations. + Intermidiate layers usually have as activation function thanh or the + sigmoid function while the top layer is a softamx layer. + """ + + + + def __init__(self, input, n_in, n_hidden, n_out,learning_rate): + """Initialize the parameters for the multilayer perceptron + + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + + :param n_hidden: number of hidden units + + :param n_out: number of output units, the dimension of the space in + which the labels lie + + """ + + # initialize the parameters theta = (W1,b1,W2,b2) ; note that this + # example contains only one hidden layer, but one can have as many + # layers as he/she wishes, making the network deeper. The only + # problem making the network deep this way is during learning, + # backpropagation being unable to move the network from the starting + # point towards; this is where pre-training helps, giving a good + # starting point for backpropagation, but more about this in the + # other tutorials + + # `W1` is initialized with `W1_values` which is uniformely sampled + # from -6./sqrt(n_in+n_hidden) and 6./sqrt(n_in+n_hidden) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + W1_values = numpy.asarray( numpy.random.uniform( \ + low = -numpy.sqrt(6./(n_in+n_hidden)), \ + high = numpy.sqrt(6./(n_in+n_hidden)), \ + size = (n_in, n_hidden)), dtype = theano.config.floatX) + # `W2` is initialized with `W2_values` which is uniformely sampled + # from -6./sqrt(n_hidden+n_out) and 6./sqrt(n_hidden+n_out) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + W2_values = numpy.asarray( numpy.random.uniform( + low = -numpy.sqrt(6./(n_hidden+n_out)), \ + high= numpy.sqrt(6./(n_hidden+n_out)),\ + size= (n_hidden, n_out)), dtype = theano.config.floatX) + + self.W1 = theano.shared( value = W1_values ) + self.b1 = theano.shared( value = numpy.zeros((n_hidden,), + dtype= theano.config.floatX)) + self.W2 = theano.shared( value = W2_values ) + self.b2 = theano.shared( value = numpy.zeros((n_out,), + dtype= theano.config.floatX)) + + #include the learning rate in the classifer so + #we can modify it on the fly when we want + lr_value=learning_rate + self.lr=theano.shared(value=lr_value) + # symbolic expression computing the values of the hidden layer + self.hidden = T.tanh(T.dot(input, self.W1)+ self.b1) + + + + # symbolic expression computing the values of the top layer + self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden, self.W2)+self.b2) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred = T.argmax( self.p_y_given_x, axis =1) + self.y_pred_num = T.argmax( self.p_y_given_x[0:9], axis =1) + + + + + # L1 norm ; one regularization option is to enforce L1 norm to + # be small + self.L1 = abs(self.W1).sum() + abs(self.W2).sum() + + # square of L2 norm ; one regularization option is to enforce + # square of L2 norm to be small + self.L2_sqr = (self.W1**2).sum() + (self.W2**2).sum() + + + + def negative_log_likelihood(self, y): + """Return the mean of the negative log-likelihood of the prediction + of this model under a given target distribution. + + .. math:: + + \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = + \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ + \ell (\theta=\{W,b\}, \mathcal{D}) + + + :param y: corresponds to a vector that gives for each example the + :correct label + """ + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + + + + def errors(self, y): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch + """ + + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + + +def mlp_full_nist( verbose = False,\ + adaptive_lr = 1,\ + train_data = 'all/all_train_data.ft',\ + train_labels = 'all/all_train_labels.ft',\ + test_data = 'all/all_test_data.ft',\ + test_labels = 'all/all_test_labels.ft',\ + learning_rate=0.5,\ + L1_reg = 0.00,\ + L2_reg = 0.0001,\ + nb_max_exemples=1000000,\ + batch_size=20,\ + nb_hidden = 500,\ + nb_targets = 62,\ + tau=1e6,\ + main_class="d",\ + start_ratio=1,\ + end_ratio=1): + + + configuration = [learning_rate,nb_max_exemples,nb_hidden,adaptive_lr] + + #save initial learning rate if classical adaptive lr is used + initial_lr=learning_rate + + total_validation_error_list = [] + total_train_error_list = [] + learning_rate_list=[] + best_training_error=float('inf'); + + # set up batches + batches = setup_batches.Batches() + batches.set_batches(main_class, start_ratio,end_ratio,batch_size,verbose) + + train_batches = batches.get_train_batches() + test_batches = batches.get_test_batches() + validation_batches = batches.get_validation_batches() + + ishape = (32,32) # this is the size of NIST images + + # allocate symbolic variables for the data + x = T.fmatrix() # the data is presented as rasterized images + y = T.lvector() # the labels are presented as 1D vector of + # [long int] labels + + if verbose==True: + print 'finished parsing the data' + # construct the logistic regression class + classifier = MLP( input=x.reshape((batch_size,32*32)),\ + n_in=32*32,\ + n_hidden=nb_hidden,\ + n_out=nb_targets, + learning_rate=learning_rate) + + + + + # the cost we minimize during training is the negative log likelihood of + # the model plus the regularization terms (L1 and L2); cost is expressed + # here symbolically + cost = classifier.negative_log_likelihood(y) \ + + L1_reg * classifier.L1 \ + + L2_reg * classifier.L2_sqr + + # compiling a theano function that computes the mistakes that are made by + # the model on a minibatch + test_model = theano.function([x,y], classifier.errors(y)) + + # compute the gradient of cost with respect to theta = (W1, b1, W2, b2) + g_W1 = T.grad(cost, classifier.W1) + g_b1 = T.grad(cost, classifier.b1) + g_W2 = T.grad(cost, classifier.W2) + g_b2 = T.grad(cost, classifier.b2) + + # specify how to update the parameters of the model as a dictionary + updates = \ + { classifier.W1: classifier.W1 - classifier.lr*g_W1 \ + , classifier.b1: classifier.b1 - classifier.lr*g_b1 \ + , classifier.W2: classifier.W2 - classifier.lr*g_W2 \ + , classifier.b2: classifier.b2 - classifier.lr*g_b2 } + + # compiling a theano function `train_model` that returns the cost, but in + # the same time updates the parameter of the model based on the rules + # defined in `updates` + train_model = theano.function([x, y], cost, updates = updates ) + n_minibatches = len(train_batches) + + + + + + + #conditions for stopping the adaptation: + #1) we have reached nb_max_exemples (this is rounded up to be a multiple of the train size) + #2) validation error is going up twice in a row(probable overfitting) + + # This means we no longer stop on slow convergence as low learning rates stopped + # too fast. + + # no longer relevant + patience =nb_max_exemples/batch_size + patience_increase = 2 # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = n_minibatches/4 + + + + + best_params = None + best_validation_loss = float('inf') + best_iter = 0 + test_score = 0. + start_time = time.clock() + n_iter = nb_max_exemples/batch_size # nb of max times we are allowed to run through all exemples + n_iter = n_iter/n_minibatches + 1 #round up + n_iter=max(1,n_iter) # run at least once on short debug call + time_n=0 #in unit of exemples + + + + if verbose == True: + print 'looping at most %d times through the data set' %n_iter + for iter in xrange(n_iter* n_minibatches): + + # get epoch and minibatch index + epoch = iter / n_minibatches + minibatch_index = iter % n_minibatches + + + if adaptive_lr==2: + classifier.lr.value = tau*initial_lr/(tau+time_n) + + # get the minibatches corresponding to `iter` modulo + # `len(train_batches)` + x,y = train_batches[ minibatch_index ] + # convert to float + x_float = x/255.0 + cost_ij = train_model(x_float,y) + + if (iter+1) % validation_frequency == 0: + # compute zero-one loss on validation set + + this_validation_loss = 0. + for x,y in validation_batches: + # sum up the errors for each minibatch + x_float = x/255.0 + this_validation_loss += test_model(x_float,y) + # get the average by dividing with the number of minibatches + this_validation_loss /= len(validation_batches) + #save the validation loss + total_validation_error_list.append(this_validation_loss) + + #get the training error rate + this_train_loss=0 + for x,y in train_batches: + # sum up the errors for each minibatch + x_float = x/255.0 + this_train_loss += test_model(x_float,y) + # get the average by dividing with the number of minibatches + this_train_loss /= len(train_batches) + #save the validation loss + total_train_error_list.append(this_train_loss) + if(this_train_loss<best_training_error): + best_training_error=this_train_loss + + if verbose == True: + print('epoch %i, minibatch %i/%i, validation error %f, training error %f %%' % \ + (epoch, minibatch_index+1, n_minibatches, \ + this_validation_loss*100.,this_train_loss*100)) + print 'learning rate = %f' %classifier.lr.value + print 'time = %i' %time_n + + + #save the learning rate + learning_rate_list.append(classifier.lr.value) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = iter + # reset patience if we are going down again + # so we continue exploring + patience=nb_max_exemples/batch_size + # test it on the test set + test_score = 0. + for x,y in test_batches: + x_float=x/255.0 + test_score += test_model(x_float,y) + test_score /= len(test_batches) + if verbose == True: + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, n_minibatches, + test_score*100.)) + + # if the validation error is going up, we are overfitting (or oscillating) + # stop converging but run at least to next validation + # to check overfitting or ocsillation + # the saved weights of the model will be a bit off in that case + elif this_validation_loss >= best_validation_loss: + #calculate the test error at this point and exit + # test it on the test set + # however, if adaptive_lr is true, try reducing the lr to + # get us out of an oscilliation + if adaptive_lr==1: + classifier.lr.value=classifier.lr.value/2.0 + + test_score = 0. + #cap the patience so we are allowed one more validation error + #calculation before aborting + patience = iter+validation_frequency+1 + for x,y in test_batches: + x_float=x/255.0 + test_score += test_model(x_float,y) + test_score /= len(test_batches) + if verbose == True: + print ' validation error is going up, possibly stopping soon' + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, n_minibatches, + test_score*100.)) + + + + + if iter>patience: + print 'we have diverged' + break + + + time_n= time_n + batch_size + end_time = time.clock() + if verbose == True: + print(('Optimization complete. Best validation score of %f %% ' + 'obtained at iteration %i, with test performance %f %%') % + (best_validation_loss * 100., best_iter, test_score*100.)) + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + print iter + + #save the model and the weights + numpy.savez('model.npy', config=configuration, W1=classifier.W1.value,W2=classifier.W2.value, b1=classifier.b1.value,b2=classifier.b2.value) + numpy.savez('results.npy',config=configuration,total_train_error_list=total_train_error_list,total_validation_error_list=total_validation_error_list,\ + learning_rate_list=learning_rate_list) + + return (best_training_error*100.0,best_validation_loss * 100.,test_score*100.,best_iter*batch_size,(end_time-start_time)/60) + + +if __name__ == '__main__': + mlp_full_nist(True) + +def jobman_mlp_full_nist(state,channel): + (train_error,validation_error,test_error,nb_exemples,time)=mlp_full_nist(learning_rate=state.learning_rate,\ + nb_max_exemples=state.nb_max_exemples,\ + nb_hidden=state.nb_hidden,\ + adaptive_lr=state.adaptive_lr,\ + tau=state.tau,\ + main_class=state.main_class,\ + start_ratio=state.start_ratio,\ + end_ratio=state.end_ratio) + state.train_error=train_error + state.validation_error=validation_error + state.test_error=test_error + state.nb_exemples=nb_exemples + state.time=time + return channel.COMPLETE + + \ No newline at end of file
--- a/datasets/defs.py Sun Apr 25 12:31:22 2010 -0400 +++ b/datasets/defs.py Sun Apr 25 13:54:57 2010 -0400 @@ -1,5 +1,5 @@ __all__ = ['nist_digits', 'nist_lower', 'nist_upper', 'nist_all', 'ocr', - 'nist_P07', 'mnist'] + 'nist_P07', 'PNIST07', 'mnist'] from ftfile import FTDataSet from gzpklfile import GzpklDataSet @@ -52,6 +52,15 @@ valid_data = [os.path.join(DATA_PATH,'data/P07_valid_data.ft')], valid_lbl = [os.path.join(DATA_PATH,'data/P07_valid_labels.ft')], indtype=theano.config.floatX, inscale=255., maxsize=maxsize) + +#Added PNIST07 +PNIST07 = lambda maxsize=None, min_file=0, max_file=100: FTDataSet(train_data = [os.path.join(DATA_PATH,'data/PNIST07_train'+str(i)+'_data.ft') for i in range(min_file, max_file)], + train_lbl = [os.path.join(DATA_PATH,'data/PNIST07_train'+str(i)+'_labels.ft') for i in range(min_file, max_file)], + test_data = [os.path.join(DATA_PATH,'data/PNIST07_test_data.ft')], + test_lbl = [os.path.join(DATA_PATH,'data/PNIST07_test_labels.ft')], + valid_data = [os.path.join(DATA_PATH,'data/PNIST07_valid_data.ft')], + valid_lbl = [os.path.join(DATA_PATH,'data/PNIST07_valid_labels.ft')], + indtype=theano.config.floatX, inscale=255., maxsize=maxsize) mnist = lambda maxsize=None: GzpklDataSet(os.path.join(DATA_PATH,'mnist.pkl.gz'), maxsize=maxsize)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/convolutional_dae/salah_exp/config.py Sun Apr 25 13:54:57 2010 -0400 @@ -0,0 +1,177 @@ +''' +These are parameters used by nist_sda.py. They'll end up as globals in there. + +Rename this file to config.py and configure as needed. +DON'T add the renamed file to the repository, as others might use it +without realizing it, with dire consequences. +''' + +# Set this to True when you want to run cluster tests, ie. you want +# to run on the cluster, many jobs, but want to reduce the training +# set size and the number of epochs, so you know everything runs +# fine on the cluster. +# Set this PRIOR to inserting your test jobs in the DB. +TEST_CONFIG = False + +NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all' +NIST_ALL_TRAIN_SIZE = 649081 +# valid et test =82587 82587 + +# change "sandbox" when you're ready +JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_db/rifaisal_csda' +EXPERIMENT_PATH = "ift6266.deep.convolutional_dae.salah_exp.nist_csda.jobman_entrypoint" + +##Pour lancer des travaux sur le cluster: (il faut etre ou se trouve les fichiers) +##python nist_sda.py jobman_insert +##dbidispatch --condor --repeat_jobs=2 jobman sql 'postgres://ift6266h10@gershwin/ift6266h10_db/pannetis_finetuningSDA0' . #C'est le path dans config.py + +# reduce training set to that many examples +REDUCE_TRAIN_TO = None +# that's a max, it usually doesn't get to that point +MAX_FINETUNING_EPOCHS = 1000 +# number of minibatches before taking means for valid error etc. +REDUCE_EVERY = 100 +#Set the finetune dataset +FINETUNE_SET=1 +#Set the pretrain dataset used. 0: NIST, 1:P07 +PRETRAIN_CHOICE=1 + + +if TEST_CONFIG: + REDUCE_TRAIN_TO = 1000 + MAX_FINETUNING_EPOCHS = 2 + REDUCE_EVERY = 10 + + +# This is to configure insertion of jobs on the cluster. +# Possible values the hyperparameters can take. These are then +# combined with produit_cartesien_jobs so we get a list of all +# possible combinations, each one resulting in a job inserted +# in the jobman DB. + + +JOB_VALS = {'pretraining_lr': [0.01],#, 0.001],#, 0.0001], + 'pretraining_epochs_per_layer': [10], + 'kernels' : [[[52,5,5], [32,3,3]], [[52,7,7], [52,3,3]]], + 'mlp_size' : [[1000],[500]], + 'imgshp' : [[32,32]], + 'max_pool_layers' : [[[2,2],[2,2]]], + 'corruption_levels': [[0.2,0.1]], + 'minibatch_size': [100], + 'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS], + 'max_finetuning_epochs_P07':[1000], + 'finetuning_lr':[0.1,0.01], #0.001 was very bad, so we leave it out + 'num_hidden_layers':[2], + 'finetune_set':[1], + 'pretrain_choice':[1] + } + +DEFAULT_HP_NIST = {'pretraining_lr': 0.01, + 'pretraining_epochs_per_layer': 1, + 'kernels' : [[4,5,5], [2,3,3]], + 'mlp_size' : [10], + 'imgshp' : [32,32], + 'max_pool_layers' : [[2,2],[2,2]], + 'corruption_levels': [0.1,0.2], + 'minibatch_size': 20, + 'max_finetuning_epochs':MAX_FINETUNING_EPOCHS, + 'max_finetuning_epochs_P07':1000, + 'finetuning_lr':0.1, #0.001 was very bad, so we leave it out + 'num_hidden_layers':2, + 'finetune_set':1, + 'pretrain_choice':1, + #'reduce_train_to':1000, + } + + + +##[pannetis@ceylon test]$ python nist_sda.py test_jobman_entrypoint +##WARNING: untracked file /u/pannetis/IFT6266/ift6266/deep/stacked_dae/v_sylvain/TMP_DBI/configobj.py +##WARNING: untracked file /u/pannetis/IFT6266/ift6266/deep/stacked_dae/v_sylvain/TMP_DBI/utils.py +##WARNING: untracked file /u/pannetis/IFT6266/ift6266/deep/stacked_dae/v_sylvain/config.py +##WARNING: untracked file /u/pannetis/IFT6266/ift6266/deep/stacked_dae/v_sylvain/config2.py +##Creating optimizer with state, DD{'reduce_train_to': 11000, 'pretraining_epochs_per_layer': 2, 'hidden_layers_sizes': 300, 'num_hidden_layers': 2, 'corruption_levels': 0.20000000000000001, 'finetuning_lr': 0.10000000000000001, 'pretrain_choice': 0, 'max_finetuning_epochs': 2, 'version_pylearn': '08b37147dec1', 'finetune_set': -1, 'pretraining_lr': 0.10000000000000001, 'version_ift6266': 'a6b6b1140de9', 'version_theano': 'fb6c3a06cb65', 'minibatch_size': 20} +##SdaSgdOptimizer, max_minibatches = 11000 +##C##n_outs 62 +##pretrain_lr 0.1 +##finetune_lr 0.1 +##---- +## +##pretraining with NIST +## +##STARTING PRETRAINING, time = 2010-03-29 15:07:43.945981 +##Pre-training layer 0, epoch 0, cost 113.562562494 +##Pre-training layer 0, epoch 1, cost 113.410032944 +##Pre-training layer 1, epoch 0, cost 98.4539954687 +##Pre-training layer 1, epoch 1, cost 97.8658966686 +##Pretraining took 9.011333 minutes +## +##SERIE OF 3 DIFFERENT FINETUNINGS +## +## +##finetune with NIST +## +## +##STARTING FINETUNING, time = 2010-03-29 15:16:46.512235 +##epoch 1, minibatch 4999, validation error on P07 : 29.511250 % +## epoch 1, minibatch 4999, test error on dataset NIST (train data) of best model 40.408509 % +## epoch 1, minibatch 4999, test error on dataset P07 of best model 96.700000 % +##epoch 1, minibatch 9999, validation error on P07 : 25.560000 % +## epoch 1, minibatch 9999, test error on dataset NIST (train data) of best model 34.778969 % +## epoch 1, minibatch 9999, test error on dataset P07 of best model 97.037500 % +## +##Optimization complete with best validation score of 25.560000 %,with test performance 34.778969 % on dataset NIST +##The test score on the P07 dataset is 97.037500 +##The finetuning ran for 3.281833 minutes +## +## +##finetune with P07 +## +## +##STARTING FINETUNING, time = 2010-03-29 15:20:06.346009 +##epoch 1, minibatch 4999, validation error on NIST : 65.226250 % +## epoch 1, minibatch 4999, test error on dataset P07 (train data) of best model 84.465000 % +## epoch 1, minibatch 4999, test error on dataset NIST of best model 65.965237 % +##epoch 1, minibatch 9999, validation error on NIST : 58.745000 % +## epoch 1, minibatch 9999, test error on dataset P07 (train data) of best model 80.405000 % +## epoch 1, minibatch 9999, test error on dataset NIST of best model 61.341923 % +## +##Optimization complete with best validation score of 58.745000 %,with test performance 80.405000 % on dataset P07 +##The test score on the NIST dataset is 61.341923 +##The finetuning ran for 3.299500 minutes +## +## +##finetune with NIST (done earlier) followed by P07 (written here) +## +## +##STARTING FINETUNING, time = 2010-03-29 15:23:27.947374 +##epoch 1, minibatch 4999, validation error on NIST : 83.975000 % +## epoch 1, minibatch 4999, test error on dataset P07 (train data) of best model 83.872500 % +## epoch 1, minibatch 4999, test error on dataset NIST of best model 43.170010 % +##epoch 1, minibatch 9999, validation error on NIST : 79.775000 % +## epoch 1, minibatch 9999, test error on dataset P07 (train data) of best model 80.971250 % +## epoch 1, minibatch 9999, test error on dataset NIST of best model 49.017468 % +## +##Optimization complete with best validation score of 79.775000 %,with test performance 80.971250 % on dataset P07 +##The test score on the NIST dataset is 49.017468 +##The finetuning ran for 2.851500 minutes +## +## +##finetune with NIST only on the logistic regression on top. +## All hidden units output are input of the logistic regression +## +## +##STARTING FINETUNING, time = 2010-03-29 15:26:21.430557 +##epoch 1, minibatch 4999, validation error on P07 : 95.223750 % +## epoch 1, minibatch 4999, test error on dataset NIST (train data) of best model 93.268765 % +## epoch 1, minibatch 4999, test error on dataset P07 of best model 96.535000 % +##epoch 1, minibatch 9999, validation error on P07 : 95.223750 % +## +##Optimization complete with best validation score of 95.223750 %,with test performance 93.268765 % on dataset NIST +##The test score on the P07 dataset is 96.535000 +##The finetuning ran for 2.013167 minutes +##Closing remaining open files: /u/pannetis/IFT6266/test/series.h5... done +##[pannetis@ceylon test]$ + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/convolutional_dae/salah_exp/nist_csda.py Sun Apr 25 13:54:57 2010 -0400 @@ -0,0 +1,261 @@ +#!/usr/bin/python +# coding: utf-8 + +import ift6266 +import pylearn + +import numpy +import theano +import time + +import pylearn.version +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +import copy +import sys +import os +import os.path + +from jobman import DD +import jobman, jobman.sql +from pylearn.io import filetensor + +from utils import produit_cartesien_jobs +from copy import copy + +from sgd_optimization_new import CSdASgdOptimizer + +#from ift6266.utils.scalar_series import * +from ift6266.utils.seriestables import * +import tables + +from ift6266 import datasets +from config import * + +''' +Function called by jobman upon launching each job +Its path is the one given when inserting jobs: see EXPERIMENT_PATH +''' +def jobman_entrypoint(state, channel): + # record mercurial versions of each package + pylearn.version.record_versions(state,[theano,ift6266,pylearn]) + # TODO: remove this, bad for number of simultaneous requests on DB + channel.save() + + # For test runs, we don't want to use the whole dataset so + # reduce it to fewer elements if asked to. + rtt = None + #REDUCE_TRAIN_TO = 40000 + if state.has_key('reduce_train_to'): + rtt = state['reduce_train_to'] + elif REDUCE_TRAIN_TO: + rtt = REDUCE_TRAIN_TO + + if state.has_key('decrease_lr'): + decrease_lr = state['decrease_lr'] + else : + decrease_lr = 0 + + n_ins = 32*32 + n_outs = 62 # 10 digits, 26*2 (lower, capitals) + + examples_per_epoch = 100000#NIST_ALL_TRAIN_SIZE + + #To be sure variables will not be only in the if statement + PATH = '' + nom_reptrain = '' + nom_serie = "" + if state['pretrain_choice'] == 0: + nom_serie="series_NIST.h5" + elif state['pretrain_choice'] == 1: + nom_serie="series_P07.h5" + + series = create_series(state.num_hidden_layers,nom_serie) + + + print "Creating optimizer with state, ", state + + optimizer = CSdASgdOptimizer(dataset=datasets.nist_all(), + hyperparameters=state, \ + n_ins=n_ins, n_outs=n_outs,\ + examples_per_epoch=examples_per_epoch, \ + series=series, + max_minibatches=rtt) + + parameters=[] + #Number of files of P07 used for pretraining + nb_file=0 + if state['pretrain_choice'] == 0: + print('\n\tpretraining with NIST\n') + optimizer.pretrain(datasets.nist_all()) + elif state['pretrain_choice'] == 1: + #To know how many file will be used during pretraining + nb_file = int(state['pretraining_epochs_per_layer']) + state['pretraining_epochs_per_layer'] = 1 #Only 1 time over the dataset + if nb_file >=100: + sys.exit("The code does not support this much pretraining epoch (99 max with P07).\n"+ + "You have to correct the code (and be patient, P07 is huge !!)\n"+ + "or reduce the number of pretraining epoch to run the code (better idea).\n") + print('\n\tpretraining with P07') + optimizer.pretrain(datasets.nist_P07(min_file=0,max_file=nb_file)) + channel.save() + + #Set some of the parameters used for the finetuning + if state.has_key('finetune_set'): + finetune_choice=state['finetune_set'] + else: + finetune_choice=FINETUNE_SET + + if state.has_key('max_finetuning_epochs'): + max_finetune_epoch_NIST=state['max_finetuning_epochs'] + else: + max_finetune_epoch_NIST=MAX_FINETUNING_EPOCHS + + if state.has_key('max_finetuning_epochs_P07'): + max_finetune_epoch_P07=state['max_finetuning_epochs_P07'] + else: + max_finetune_epoch_P07=max_finetune_epoch_NIST + + #Decide how the finetune is done + + if finetune_choice == 0: + print('\n\n\tfinetune with NIST\n\n') + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1,decrease=decrease_lr) + channel.save() + if finetune_choice == 1: + print('\n\n\tfinetune with P07\n\n') + optimizer.finetune(datasets.nist_P07(),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0,decrease=decrease_lr) + channel.save() + if finetune_choice == 2: + print('\n\n\tfinetune with P07 followed by NIST\n\n') + optimizer.finetune(datasets.nist_P07(),datasets.nist_all(),max_finetune_epoch_P07,ind_test=20,decrease=decrease_lr) + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=21,decrease=decrease_lr) + channel.save() + if finetune_choice == 3: + print('\n\n\tfinetune with NIST only on the logistic regression on top (but validation on P07).\n\ + All hidden units output are input of the logistic regression\n\n') + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1,special=1,decrease=decrease_lr) + + + if finetune_choice==-1: + print('\nSERIE OF 4 DIFFERENT FINETUNINGS') + print('\n\n\tfinetune with NIST\n\n') + sys.stdout.flush() + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1,decrease=decrease_lr) + channel.save() + print('\n\n\tfinetune with P07\n\n') + sys.stdout.flush() + optimizer.reload_parameters('params_pretrain.txt') + optimizer.finetune(datasets.nist_P07(),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0,decrease=decrease_lr) + channel.save() + print('\n\n\tfinetune with P07 (done earlier) followed by NIST (written here)\n\n') + sys.stdout.flush() + optimizer.reload_parameters('params_finetune_P07.txt') + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=21,decrease=decrease_lr) + channel.save() + print('\n\n\tfinetune with NIST only on the logistic regression on top.\n\ + All hidden units output are input of the logistic regression\n\n') + sys.stdout.flush() + optimizer.reload_parameters('params_pretrain.txt') + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1,special=1,decrease=decrease_lr) + channel.save() + + channel.save() + + return channel.COMPLETE + +# These Series objects are used to save various statistics +# during the training. +def create_series(num_hidden_layers, nom_serie): + + # Replace series we don't want to save with DummySeries, e.g. + # series['training_error'] = DummySeries() + + series = {} + + basedir = os.getcwd() + + h5f = tables.openFile(os.path.join(basedir, nom_serie), "w") + + #REDUCE_EVERY=10 + # reconstruction + reconstruction_base = \ + ErrorSeries(error_name="reconstruction_error", + table_name="reconstruction_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Reconstruction error (mean over "+str(REDUCE_EVERY)+" minibatches)") + series['reconstruction_error'] = \ + AccumulatorSeriesWrapper(base_series=reconstruction_base, + reduce_every=REDUCE_EVERY) + + # train + training_base = \ + ErrorSeries(error_name="training_error", + table_name="training_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Training error (mean over "+str(REDUCE_EVERY)+" minibatches)") + series['training_error'] = \ + AccumulatorSeriesWrapper(base_series=training_base, + reduce_every=REDUCE_EVERY) + + # valid and test are not accumulated/mean, saved directly + series['validation_error'] = \ + ErrorSeries(error_name="validation_error", + table_name="validation_error", + hdf5_file=h5f, + index_names=('epoch','minibatch')) + + series['test_error'] = \ + ErrorSeries(error_name="test_error", + table_name="test_error", + hdf5_file=h5f, + index_names=('epoch','minibatch')) + + param_names = [] + for i in range(num_hidden_layers): + param_names += ['layer%d_W'%i, 'layer%d_b'%i, 'layer%d_bprime'%i] + param_names += ['logreg_layer_W', 'logreg_layer_b'] + + # comment out series we don't want to save + series['params'] = SharedParamsStatisticsWrapper( + new_group_name="params", + base_group="/", + arrays_names=param_names, + hdf5_file=h5f, + index_names=('epoch',)) + + return series + +# Perform insertion into the Postgre DB based on combination +# of hyperparameter values above +# (see comment for produit_cartesien_jobs() to know how it works) +def jobman_insert_nist(): + jobs = produit_cartesien_jobs(JOB_VALS) + + db = jobman.sql.db(JOBDB) + for job in jobs: + job.update({jobman.sql.EXPERIMENT: EXPERIMENT_PATH}) + jobman.sql.insert_dict(job, db) + + print "inserted" + +if __name__ == '__main__': + + args = sys.argv[1:] + + #if len(args) > 0 and args[0] == 'load_nist': + # test_load_nist() + + if len(args) > 0 and args[0] == 'jobman_insert': + jobman_insert_nist() + + elif len(args) > 0 and args[0] == 'test_jobman_entrypoint': + chanmock = DD({'COMPLETE':0,'save':(lambda:None)}) + jobman_entrypoint(DD(DEFAULT_HP_NIST), chanmock) + + else: + print "Bad arguments" +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/convolutional_dae/salah_exp/sgd_optimization_new.py Sun Apr 25 13:54:57 2010 -0400 @@ -0,0 +1,394 @@ +#!/usr/bin/python +# coding: utf-8 + +import numpy +import theano +import time +import datetime +import theano.tensor as T +import sys +import pickle + +from jobman import DD +import jobman, jobman.sql +from copy import copy + +from stacked_convolutional_dae_uit import CSdA + +from ift6266.utils.seriestables import * + +buffersize=1000 + +default_series = { \ + 'reconstruction_error' : DummySeries(), + 'training_error' : DummySeries(), + 'validation_error' : DummySeries(), + 'test_error' : DummySeries(), + 'params' : DummySeries() + } + +def itermax(iter, max): + for i,it in enumerate(iter): + if i >= max: + break + yield it +def get_conv_shape(kernels,imgshp,batch_size,max_pool_layers): + # Returns the dimension at the output of the convoluational net + # and a list of Image and kernel shape for every + # Convolutional layer + conv_layers=[] + init_layer = [ [ kernels[0][0],1,kernels[0][1],kernels[0][2] ],\ + [ batch_size , 1, imgshp[0], imgshp[1] ], + max_pool_layers[0] ] + conv_layers.append(init_layer) + + conv_n_out = int((32-kernels[0][2]+1)/max_pool_layers[0][0]) + + for i in range(1,len(kernels)): + layer = [ [ kernels[i][0],kernels[i-1][0],kernels[i][1],kernels[i][2] ],\ + [ batch_size, kernels[i-1][0],conv_n_out,conv_n_out ], + max_pool_layers[i] ] + conv_layers.append(layer) + conv_n_out = int( (conv_n_out - kernels[i][2]+1)/max_pool_layers[i][0]) + conv_n_out=kernels[-1][0]*conv_n_out**2 + return conv_n_out,conv_layers + + + + + +class CSdASgdOptimizer: + def __init__(self, dataset, hyperparameters, n_ins, n_outs, + examples_per_epoch, series=default_series, max_minibatches=None): + self.dataset = dataset + self.hp = hyperparameters + self.n_ins = n_ins + self.n_outs = n_outs + self.parameters_pre=[] + + self.max_minibatches = max_minibatches + print "CSdASgdOptimizer, max_minibatches =", max_minibatches + + self.ex_per_epoch = examples_per_epoch + self.mb_per_epoch = examples_per_epoch / self.hp.minibatch_size + + self.series = series + + self.rng = numpy.random.RandomState(1234) + self.init_classifier() + + sys.stdout.flush() + + def init_classifier(self): + print "Constructing classifier" + + n_ins,convlayers = get_conv_shape(self.hp.kernels,self.hp.imgshp,self.hp.minibatch_size,self.hp.max_pool_layers) + + self.classifier = CSdA(n_ins_mlp = n_ins, + batch_size = self.hp.minibatch_size, + conv_hidden_layers_sizes = convlayers, + mlp_hidden_layers_sizes = self.hp.mlp_size, + corruption_levels = self.hp.corruption_levels, + rng = self.rng, + n_out = self.n_outs, + pretrain_lr = self.hp.pretraining_lr, + finetune_lr = self.hp.finetuning_lr) + + + + #theano.printing.pydotprint(self.classifier.pretrain_functions[0], "function.graph") + + sys.stdout.flush() + + def train(self): + self.pretrain(self.dataset) + self.finetune(self.dataset) + + def pretrain(self,dataset): + print "STARTING PRETRAINING, time = ", datetime.datetime.now() + sys.stdout.flush() + + un_fichier=int(819200.0/self.hp.minibatch_size) #Number of batches in a P07 file + + start_time = time.clock() + ## Pre-train layer-wise + for i in xrange(self.classifier.n_layers): + # go through pretraining epochs + for epoch in xrange(self.hp.pretraining_epochs_per_layer): + # go through the training set + batch_index=0 + count=0 + num_files=0 + for x,y in dataset.train(self.hp.minibatch_size): + if x.shape[0] != self.hp.minibatch_size: + continue + c = self.classifier.pretrain_functions[i](x) + count +=1 + + self.series["reconstruction_error"].append((epoch, batch_index), c) + batch_index+=1 + + #if batch_index % 100 == 0: + # print "100 batches" + + # useful when doing tests + if self.max_minibatches and batch_index >= self.max_minibatches: + break + + #When we pass through the data only once (the case with P07) + #There is approximately 800*1024=819200 examples per file (1k per example and files are 800M) + if self.hp.pretraining_epochs_per_layer == 1 and count%un_fichier == 0: + print 'Pre-training layer %i, epoch %d, cost '%(i,num_files),c + num_files+=1 + sys.stdout.flush() + self.series['params'].append((num_files,), self.classifier.all_params) + + #When NIST is used + if self.hp.pretraining_epochs_per_layer > 1: + print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c + sys.stdout.flush() + + self.series['params'].append((epoch,), self.classifier.all_params) + end_time = time.clock() + + print ('Pretraining took %f minutes' %((end_time-start_time)/60.)) + self.hp.update({'pretraining_time': end_time-start_time}) + + sys.stdout.flush() + + #To be able to load them later for tests on finetune + self.parameters_pre=[copy(x.value) for x in self.classifier.params] + f = open('params_pretrain.txt', 'w') + pickle.dump(self.parameters_pre,f) + f.close() + def finetune(self,dataset,dataset_test,num_finetune,ind_test,special=0,decrease=0): + + if special != 0 and special != 1: + sys.exit('Bad value for variable special. Must be in {0,1}') + print "STARTING FINETUNING, time = ", datetime.datetime.now() + + minibatch_size = self.hp.minibatch_size + if ind_test == 0 or ind_test == 20: + nom_test = "NIST" + nom_train="P07" + else: + nom_test = "P07" + nom_train = "NIST" + + + # create a function to compute the mistakes that are made by the model + # on the validation set, or testing set + test_model = \ + theano.function( + [self.classifier.x,self.classifier.y], self.classifier.errors) + # givens = { + # self.classifier.x: ensemble_x, + # self.classifier.y: ensemble_y]}) + + validate_model = \ + theano.function( + [self.classifier.x,self.classifier.y], self.classifier.errors) + # givens = { + # self.classifier.x: , + # self.classifier.y: ]}) + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2. # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(self.mb_per_epoch, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + if self.max_minibatches and validation_frequency > self.max_minibatches: + validation_frequency = self.max_minibatches / 2 + best_params = None + best_validation_loss = float('inf') + test_score = 0. + start_time = time.clock() + + done_looping = False + epoch = 0 + + total_mb_index = 0 + minibatch_index = 0 + parameters_finetune=[] + learning_rate = self.hp.finetuning_lr + + + while (epoch < num_finetune) and (not done_looping): + epoch = epoch + 1 + + for x,y in dataset.train(minibatch_size,bufsize=buffersize): + + minibatch_index += 1 + + if x.shape[0] != self.hp.minibatch_size: + print 'bim' + continue + + cost_ij = self.classifier.finetune(x,y)#,learning_rate) + total_mb_index += 1 + + self.series["training_error"].append((epoch, minibatch_index), cost_ij) + + if (total_mb_index+1) % validation_frequency == 0: + #minibatch_index += 1 + #The validation set is always NIST (we want the model to be good on NIST) + + iter=dataset_test.valid(minibatch_size,bufsize=buffersize) + + + if self.max_minibatches: + iter = itermax(iter, self.max_minibatches) + + validation_losses = [] + + for x,y in iter: + if x.shape[0] != self.hp.minibatch_size: + print 'bim' + continue + validation_losses.append(validate_model(x,y)) + + this_validation_loss = numpy.mean(validation_losses) + + self.series["validation_error"].\ + append((epoch, minibatch_index), this_validation_loss*100.) + + print('epoch %i, minibatch %i, validation error on NIST : %f %%' % \ + (epoch, minibatch_index+1, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, total_mb_index * patience_increase) + + # save best validation score, iteration number and parameters + best_validation_loss = this_validation_loss + best_iter = total_mb_index + parameters_finetune=[copy(x.value) for x in self.classifier.params] + + # test it on the test set + iter = dataset.test(minibatch_size,bufsize=buffersize) + if self.max_minibatches: + iter = itermax(iter, self.max_minibatches) + test_losses = [] + test_losses2 = [] + for x,y in iter: + if x.shape[0] != self.hp.minibatch_size: + print 'bim' + continue + test_losses.append(test_model(x,y)) + + test_score = numpy.mean(test_losses) + + #test it on the second test set + iter2 = dataset_test.test(minibatch_size,bufsize=buffersize) + if self.max_minibatches: + iter2 = itermax(iter2, self.max_minibatches) + for x,y in iter2: + if x.shape[0] != self.hp.minibatch_size: + continue + test_losses2.append(test_model(x,y)) + + test_score2 = numpy.mean(test_losses2) + + self.series["test_error"].\ + append((epoch, minibatch_index), test_score*100.) + + print((' epoch %i, minibatch %i, test error on dataset %s (train data) of best ' + 'model %f %%') % + (epoch, minibatch_index+1,nom_train, + test_score*100.)) + + print((' epoch %i, minibatch %i, test error on dataset %s of best ' + 'model %f %%') % + (epoch, minibatch_index+1,nom_test, + test_score2*100.)) + + if patience <= total_mb_index: + done_looping = True + break #to exit the FOR loop + + sys.stdout.flush() + + # useful when doing tests + if self.max_minibatches and minibatch_index >= self.max_minibatches: + break + + if decrease == 1: + learning_rate /= 2 #divide the learning rate by 2 for each new epoch + + self.series['params'].append((epoch,), self.classifier.all_params) + + if done_looping == True: #To exit completly the fine-tuning + break #to exit the WHILE loop + + end_time = time.clock() + self.hp.update({'finetuning_time':end_time-start_time,\ + 'best_validation_error':best_validation_loss,\ + 'test_score':test_score, + 'num_finetuning_epochs':epoch}) + + print(('\nOptimization complete with best validation score of %f %%,' + 'with test performance %f %% on dataset %s ') % + (best_validation_loss * 100., test_score*100.,nom_train)) + print(('The test score on the %s dataset is %f')%(nom_test,test_score2*100.)) + + print ('The finetuning ran for %f minutes' % ((end_time-start_time)/60.)) + + sys.stdout.flush() + + #Save a copy of the parameters in a file to be able to get them in the future + + if special == 1: #To keep a track of the value of the parameters + f = open('params_finetune_stanford.txt', 'w') + pickle.dump(parameters_finetune,f) + f.close() + + elif ind_test == 0 | ind_test == 20: #To keep a track of the value of the parameters + f = open('params_finetune_P07.txt', 'w') + pickle.dump(parameters_finetune,f) + f.close() + + + elif ind_test== 1: #For the run with 2 finetunes. It will be faster. + f = open('params_finetune_NIST.txt', 'w') + pickle.dump(parameters_finetune,f) + f.close() + + elif ind_test== 21: #To keep a track of the value of the parameters + f = open('params_finetune_P07_then_NIST.txt', 'w') + pickle.dump(parameters_finetune,f) + f.close() + #Set parameters like they where right after pre-train or finetune + def reload_parameters(self,which): + + #self.parameters_pre=pickle.load('params_pretrain.txt') + f = open(which) + self.parameters_pre=pickle.load(f) + f.close() + for idx,x in enumerate(self.parameters_pre): + if x.dtype=='float64': + self.classifier.params[idx].value=theano._asarray(copy(x),dtype=theano.config.floatX) + else: + self.classifier.params[idx].value=copy(x) + + def training_error(self,dataset): + # create a function to compute the mistakes that are made by the model + # on the validation set, or testing set + test_model = \ + theano.function( + [self.classifier.x,self.classifier.y], self.classifier.errors) + + iter2 = dataset.train(self.hp.minibatch_size,bufsize=buffersize) + train_losses2 = [test_model(x,y) for x,y in iter2] + train_score2 = numpy.mean(train_losses2) + print "Training error is: " + str(train_score2)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/convolutional_dae/salah_exp/stacked_convolutional_dae_uit.py Sun Apr 25 13:54:57 2010 -0400 @@ -0,0 +1,243 @@ +import numpy +import theano +import time +import sys +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams +import theano.sandbox.softsign +import copy +from theano.tensor.signal import downsample +from theano.tensor.nnet import conv + +sys.path.append('../../') +#import ift6266.datasets +import ift6266.datasets +from ift6266.baseline.log_reg.log_reg import LogisticRegression + +from theano.tensor.xlogx import xlogx, xlogy0 +# it's target*log(output) +def binary_cross_entropy(target, output, sum_axis=1): + XE = xlogy0(target, output) + xlogy0((1 - target), (1 - output)) + return -T.sum(XE, axis=sum_axis) + + + +class SigmoidalLayer(object): + def __init__(self, rng, input, n_in, n_out): + + self.input = input + + W_values = numpy.asarray( rng.uniform( \ + low = -numpy.sqrt(6./(n_in+n_out)), \ + high = numpy.sqrt(6./(n_in+n_out)), \ + size = (n_in, n_out)), dtype = theano.config.floatX) + self.W = theano.shared(value = W_values) + + b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + self.output = T.tanh(T.dot(input, self.W) + self.b) + self.params = [self.W, self.b] + +class dA_conv(object): + + def __init__(self, input, filter_shape, corruption_level = 0.1, + shared_W = None, shared_b = None, image_shape = None, num = 0,batch_size=20): + + theano_rng = RandomStreams() + + fan_in = numpy.prod(filter_shape[1:]) + fan_out = filter_shape[0] * numpy.prod(filter_shape[2:]) + + center = theano.shared(value = 1, name="center") + scale = theano.shared(value = 2, name="scale") + + if shared_W != None and shared_b != None : + self.W = shared_W + self.b = shared_b + else: + initial_W = numpy.asarray( numpy.random.uniform( + low = -numpy.sqrt(6./(fan_in+fan_out)), + high = numpy.sqrt(6./(fan_in+fan_out)), + size = filter_shape), dtype = theano.config.floatX) + initial_b = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) + self.W = theano.shared(value = initial_W, name = "W") + self.b = theano.shared(value = initial_b, name = "b") + + + initial_b_prime= numpy.zeros((filter_shape[1],),dtype=theano.config.floatX) + + self.b_prime = theano.shared(value = initial_b_prime, name = "b_prime") + + self.x = input + + self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level,dtype=theano.config.floatX) * self.x + + conv1_out = conv.conv2d(self.tilde_x, self.W, filter_shape=filter_shape, + image_shape=image_shape, + unroll_kern=4,unroll_batch=4, + border_mode='valid') + + + self.y = T.tanh(conv1_out + self.b.dimshuffle('x', 0, 'x', 'x')) + + + da_filter_shape = [ filter_shape[1], filter_shape[0], filter_shape[2],\ + filter_shape[3] ] + da_image_shape = [ batch_size, filter_shape[0], image_shape[2]-filter_shape[2]+1, + image_shape[3]-filter_shape[3]+1 ] + #import pdb; pdb.set_trace() + initial_W_prime = numpy.asarray( numpy.random.uniform( \ + low = -numpy.sqrt(6./(fan_in+fan_out)), \ + high = numpy.sqrt(6./(fan_in+fan_out)), \ + size = da_filter_shape), dtype = theano.config.floatX) + self.W_prime = theano.shared(value = initial_W_prime, name = "W_prime") + + conv2_out = conv.conv2d(self.y, self.W_prime, + filter_shape = da_filter_shape,\ + image_shape = da_image_shape, \ + unroll_kern=4,unroll_batch=4, \ + border_mode='full') + + self.z = (T.tanh(conv2_out + self.b_prime.dimshuffle('x', 0, 'x', 'x'))+center) / scale + + if num != 0 : + scaled_x = (self.x + center) / scale + else: + scaled_x = self.x + self.L = - T.sum( scaled_x*T.log(self.z) + (1-scaled_x)*T.log(1-self.z), axis=1 ) + + self.cost = T.mean(self.L) + + self.params = [ self.W, self.b, self.b_prime ] + +class LeNetConvPoolLayer(object): + + def __init__(self, rng, input, filter_shape, image_shape=None, poolsize=(2,2)): + self.input = input + + W_values = numpy.zeros(filter_shape, dtype=theano.config.floatX) + self.W = theano.shared(value=W_values) + + b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) + self.b = theano.shared(value=b_values) + + conv_out = conv.conv2d(input, self.W, + filter_shape=filter_shape, image_shape=image_shape, + unroll_kern=4,unroll_batch=4) + + + fan_in = numpy.prod(filter_shape[1:]) + fan_out = filter_shape[0] * numpy.prod(filter_shape[2:]) / numpy.prod(poolsize) + + W_bound = numpy.sqrt(6./(fan_in + fan_out)) + self.W.value = numpy.asarray( + rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), + dtype = theano.config.floatX) + + + pooled_out = downsample.max_pool2D(conv_out, poolsize, ignore_border=True) + + self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) + self.params = [self.W, self.b] + + +class CSdA(): + def __init__(self, n_ins_mlp,batch_size, conv_hidden_layers_sizes, + mlp_hidden_layers_sizes, corruption_levels, rng, n_out, + pretrain_lr, finetune_lr): + + # Just to make sure those are not modified somewhere else afterwards + hidden_layers_sizes = copy.deepcopy(mlp_hidden_layers_sizes) + corruption_levels = copy.deepcopy(corruption_levels) + + #update_locals(self, locals()) + + + + self.layers = [] + self.pretrain_functions = [] + self.params = [] + self.n_layers = len(conv_hidden_layers_sizes) + self.mlp_n_layers = len(mlp_hidden_layers_sizes) + + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + + for i in xrange( self.n_layers ): + filter_shape=conv_hidden_layers_sizes[i][0] + image_shape=conv_hidden_layers_sizes[i][1] + max_poolsize=conv_hidden_layers_sizes[i][2] + + if i == 0 : + layer_input=self.x.reshape((batch_size, 1, 32, 32)) + else: + layer_input=self.layers[-1].output + + layer = LeNetConvPoolLayer(rng, input=layer_input, + image_shape=image_shape, + filter_shape=filter_shape, + poolsize=max_poolsize) + print 'Convolutional layer', str(i+1), 'created' + + self.layers += [layer] + self.params += layer.params + + da_layer = dA_conv(corruption_level = corruption_levels[0], + input = layer_input, + shared_W = layer.W, shared_b = layer.b, + filter_shape=filter_shape, + image_shape = image_shape, num=i , batch_size=batch_size) + + gparams = T.grad(da_layer.cost, da_layer.params) + + updates = {} + for param, gparam in zip(da_layer.params, gparams): + updates[param] = param - gparam * pretrain_lr + + update_fn = theano.function([self.x], da_layer.cost, updates = updates) + + self.pretrain_functions += [update_fn] + + for i in xrange( self.mlp_n_layers ): + if i == 0 : + input_size = n_ins_mlp + else: + input_size = mlp_hidden_layers_sizes[i-1] + + if i == 0 : + if len( self.layers ) == 0 : + layer_input=self.x + else : + layer_input = self.layers[-1].output.flatten(2) + else: + layer_input = self.layers[-1].output + + layer = SigmoidalLayer(rng, layer_input, input_size, + mlp_hidden_layers_sizes[i] ) + + self.layers += [layer] + self.params += layer.params + + print 'MLP layer', str(i+1), 'created' + + self.logLayer = LogisticRegression(input=self.layers[-1].output, \ + n_in=mlp_hidden_layers_sizes[-1], n_out=n_out) + + + self.params += self.logLayer.params + self.all_params = self.params + cost = self.logLayer.negative_log_likelihood(self.y) + + gparams = T.grad(cost, self.params) + + updates = {} + for param,gparam in zip(self.params, gparams): + updates[param] = param - gparam*finetune_lr + + self.finetune = theano.function([self.x, self.y], cost, updates = updates) + + self.errors = self.logLayer.errors(self.y) + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/convolutional_dae/salah_exp/utils.py Sun Apr 25 13:54:57 2010 -0400 @@ -0,0 +1,69 @@ +#!/usr/bin/python +# coding: utf-8 + +from __future__ import with_statement + +from jobman import DD + +# from pylearn codebase +# useful in __init__(param1, param2, etc.) to save +# values in self.param1, self.param2... just call +# update_locals(self, locals()) +def update_locals(obj, dct): + if 'self' in dct: + del dct['self'] + obj.__dict__.update(dct) + +# from a dictionary of possible values for hyperparameters, e.g. +# hp_values = {'learning_rate':[0.1, 0.01], 'num_layers': [1,2]} +# create a list of other dictionaries representing all the possible +# combinations, thus in this example creating: +# [{'learning_rate': 0.1, 'num_layers': 1}, ...] +# (similarly for combinations (0.1, 2), (0.01, 1), (0.01, 2)) +def produit_cartesien_jobs(val_dict): + job_list = [DD()] + all_keys = val_dict.keys() + + for key in all_keys: + possible_values = val_dict[key] + new_job_list = [] + for val in possible_values: + for job in job_list: + to_insert = job.copy() + to_insert.update({key: val}) + new_job_list.append(to_insert) + job_list = new_job_list + + return job_list + +def test_produit_cartesien_jobs(): + vals = {'a': [1,2], 'b': [3,4,5]} + print produit_cartesien_jobs(vals) + + +# taken from http://stackoverflow.com/questions/276052/how-to-get-current-cpu-and-ram-usage-in-python +"""Simple module for getting amount of memory used by a specified user's +processes on a UNIX system. +It uses UNIX ps utility to get the memory usage for a specified username and +pipe it to awk for summing up per application memory usage and return the total. +Python's Popen() from subprocess module is used for spawning ps and awk. + +""" + +import subprocess + +class MemoryMonitor(object): + + def __init__(self, username): + """Create new MemoryMonitor instance.""" + self.username = username + + def usage(self): + """Return int containing memory used by user's processes.""" + self.process = subprocess.Popen("ps -u %s -o rss | awk '{sum+=$1} END {print sum}'" % self.username, + shell=True, + stdout=subprocess.PIPE, + ) + self.stdout_list = self.process.communicate()[0].split('\n') + return int(self.stdout_list[0]) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/crbm/crbm.py Sun Apr 25 13:54:57 2010 -0400 @@ -0,0 +1,374 @@ +import sys +import os, os.path + +import numpy + +import theano + +USING_GPU = "gpu" in theano.config.device + +import theano.tensor as T +from theano.tensor.nnet import conv, sigmoid + +if not USING_GPU: + from theano.tensor.shared_randomstreams import RandomStreams +else: + from theano.sandbox.rng_mrg import MRG_RandomStreams + +_PRINT_GRAPHS = True + +def _init_conv_biases(num_filters, varname, rng=numpy.random): + b_shp = (num_filters,) + b = theano.shared( numpy.asarray( + rng.uniform(low=-.5, high=.5, size=b_shp), + dtype=theano.config.floatX), name=varname) + return b + +def _init_conv_weights(conv_params, varname, rng=numpy.random): + cp = conv_params + + # initialize shared variable for weights. + w_shp = conv_params.as_conv2d_shape_tuple() + w_bound = numpy.sqrt(cp.num_input_planes * \ + cp.height_filters * cp.width_filters) + W = theano.shared( numpy.asarray( + rng.uniform( + low=-1.0 / w_bound, + high=1.0 / w_bound, + size=w_shp), + dtype=theano.config.floatX), name=varname) + + return W + +# Shape of W for conv2d +class ConvolutionParams: + def __init__(self, num_filters, num_input_planes, height_filters, width_filters): + self.num_filters = num_filters + self.num_input_planes = num_input_planes + self.height_filters = height_filters + self.width_filters = width_filters + + def as_conv2d_shape_tuple(self): + cp = self + return (cp.num_filters, cp.num_input_planes, + cp.height_filters, cp.width_filters) + +class CRBM: + def __init__(self, minibatch_size, image_size, conv_params, + learning_rate, sparsity_lambda, sparsity_p): + ''' + Parameters + ---------- + image_size + height, width + ''' + self.minibatch_size = minibatch_size + self.image_size = image_size + self.conv_params = conv_params + + ''' + Dimensions: + 0- minibatch + 1- plane/color + 2- y (rows) + 3- x (cols) + ''' + self.x = T.tensor4('x') + self.h = T.tensor4('h') + + self.lr = theano.shared(numpy.asarray(learning_rate, + dtype=theano.config.floatX)) + self.sparsity_lambda = \ + theano.shared( \ + numpy.asarray( \ + sparsity_lambda, + dtype=theano.config.floatX)) + self.sparsity_p = \ + theano.shared( \ + numpy.asarray(sparsity_p, \ + dtype=theano.config.floatX)) + + self.numpy_rng = numpy.random.RandomState(1234) + + if not USING_GPU: + self.theano_rng = RandomStreams(self.numpy_rng.randint(2**30)) + else: + self.theano_rng = MRG_RandomStreams(234, use_cuda=True) + + self._init_params() + self._init_functions() + + def _get_visibles_shape(self): + imsz = self.image_size + return (self.minibatch_size, + self.conv_params.num_input_planes, + imsz[0], imsz[1]) + + def _get_hiddens_shape(self): + cp = self.conv_params + imsz = self.image_size + wf, hf = cp.height_filters, cp.width_filters + return (self.minibatch_size, cp.num_filters, + imsz[0] - hf + 1, imsz[1] - wf + 1) + + def _init_params(self): + cp = self.conv_params + + self.W = _init_conv_weights(cp, 'W') + self.b_h = _init_conv_biases(cp.num_filters, 'b_h') + ''' + Lee09 mentions "all visible units share a single bias c" + but for upper layers it's pretty clear we need one + per plane, by symmetry + ''' + self.b_x = _init_conv_biases(cp.num_input_planes, 'b_x') + + self.params = [self.W, self.b_h, self.b_x] + + # flip filters horizontally and vertically + W_flipped = self.W[:, :, ::-1, ::-1] + # also have to invert the filters/num_planes + self.W_tilde = W_flipped.dimshuffle(1,0,2,3) + + ''' + I_up and I_down come from the symbol used in the + Lee 2009 CRBM paper + ''' + def _I_up(self, visibles_mb): + ''' + output of conv is features maps of size + image_size - filter_size + 1 + The dimshuffle serves to broadcast b_h so that it + corresponds to output planes + ''' + fshp = self.conv_params.as_conv2d_shape_tuple() + return conv.conv2d(visibles_mb, self.W, + filter_shape=fshp) + \ + self.b_h.dimshuffle('x',0,'x','x') + + def _I_down(self, hiddens_mb): + ''' + notice border_mode='full'... we want to get + back the original size + so we get feature_map_size + filter_size - 1 + The dimshuffle serves to broadcast b_x so that + it corresponds to output planes + ''' + fshp = list(self.conv_params.as_conv2d_shape_tuple()) + # num_filters and num_planes swapped + fshp[0], fshp[1] = fshp[1], fshp[0] + return conv.conv2d(hiddens_mb, self.W_tilde, + border_mode='full',filter_shape=tuple(fshp)) + \ + self.b_x.dimshuffle('x',0,'x','x') + + def _mean_free_energy(self, visibles_mb): + ''' + visibles_mb is mb_size x num_planes x h x w + + we want to match the summed input planes + (second dimension, first is mb index) + to respective bias terms for the visibles + The dimshuffle isn't really necessary, + but I put it there for clarity. + ''' + vbias_term = \ + self.b_x.dimshuffle('x',0) * \ + T.sum(visibles_mb,axis=[2,3]) + # now sum over term per planes, get one free energy + # contribution per element of minibatch + vbias_term = - T.sum(vbias_term, axis=1) + + ''' + Here it's a bit more complex, a few points: + - The usual free energy, in the fully connected case, + is a sum over all hiddens. + We do the same thing here, but each unit has limited + connectivity and there's weight reuse. + Therefore we only need to first do the convolutions + (with I_up) which gives us + what would normally be the Wx+b_h for each hidden. + Once we have this, + we take the log(1+exp(sum for this hidden)) elemwise + for each hidden, + then we sum for all hiddens in one example of the minibatch. + + - Notice that we reuse the same b_h everywhere instead of + using one b per hidden, + so the broadcasting for b_h done in I_up is all right. + + That sum is over all hiddens, so all filters + (planes of hiddens), x, and y. + In the end we get one free energy contribution per + example of the minibatch. + ''' + softplused = T.log(1.0+T.exp(self._I_up(visibles_mb))) + # h_sz = self._get_hiddens_shape() + # this simplifies the sum + # num_hiddens = h_sz[1] * h_sz[2] * h_sz[3] + # reshaped = T.reshape(softplused, + # (self.minibatch_size, num_hiddens)) + + # this is because the 0,1,1,1 sum pattern is not + # implemented on gpu, but the 1,0,1,1 pattern is + dimshuffled = softplused.dimshuffle(1,0,2,3) + xh_and_hbias_term = - T.sum(dimshuffled, axis=[0,2,3]) + + ''' + both bias_term and vbias_term end up with one + contributor to free energy per minibatch + so we mean over minibatches + ''' + return T.mean(vbias_term + xh_and_hbias_term) + + def _init_functions(self): + # propup + # b_h is broadcasted keeping in mind we want it to + # correspond to each new plane (corresponding to filters) + I_up = self._I_up(self.x) + # expected values for the distributions for each hidden + E_h_given_x = sigmoid(I_up) + # might be needed if we ever want a version where we + # take expectations instead of samples for CD learning + self.E_h_given_x_func = theano.function([self.x], E_h_given_x) + + if _PRINT_GRAPHS: + print "----------------------\nE_h_given_x_func" + theano.printing.debugprint(self.E_h_given_x_func) + + h_sample_given_x = \ + self.theano_rng.binomial( \ + size = self._get_hiddens_shape(), + n = 1, + p = E_h_given_x, + dtype = theano.config.floatX) + + self.h_sample_given_x_func = \ + theano.function([self.x], + h_sample_given_x) + + if _PRINT_GRAPHS: + print "----------------------\nh_sample_given_x_func" + theano.printing.debugprint(self.h_sample_given_x_func) + + # propdown + I_down = self._I_down(self.h) + E_x_given_h = sigmoid(I_down) + self.E_x_given_h_func = theano.function([self.h], E_x_given_h) + + if _PRINT_GRAPHS: + print "----------------------\nE_x_given_h_func" + theano.printing.debugprint(self.E_x_given_h_func) + + x_sample_given_h = \ + self.theano_rng.binomial( \ + size = self._get_visibles_shape(), + n = 1, + p = E_x_given_h, + dtype = theano.config.floatX) + + self.x_sample_given_h_func = \ + theano.function([self.h], + x_sample_given_h) + + if _PRINT_GRAPHS: + print "----------------------\nx_sample_given_h_func" + theano.printing.debugprint(self.x_sample_given_h_func) + + ############################################## + # cd update done by grad of free energy + + x_tilde = T.tensor4('x_tilde') + cd_update_cost = self._mean_free_energy(self.x) - \ + self._mean_free_energy(x_tilde) + + cd_grad = T.grad(cd_update_cost, self.params) + # This is NLL minimization so we use a - + cd_updates = {self.W: self.W - self.lr * cd_grad[0], + self.b_h: self.b_h - self.lr * cd_grad[1], + self.b_x: self.b_x - self.lr * cd_grad[2]} + + cd_returned = [cd_update_cost, + cd_grad[0], cd_grad[1], cd_grad[2], + self.lr * cd_grad[0], + self.lr * cd_grad[1], + self.lr * cd_grad[2]] + self.cd_return_desc = \ + ['cd_update_cost', + 'cd_grad_W', 'cd_grad_b_h', 'cd_grad_b_x', + 'lr_times_cd_grad_W', + 'lr_times_cd_grad_b_h', + 'lr_times_cd_grad_b_x'] + + self.cd_update_function = \ + theano.function([self.x, x_tilde], + cd_returned, updates=cd_updates) + + if _PRINT_GRAPHS: + print "----------------------\ncd_update_function" + theano.printing.debugprint(self.cd_update_function) + + ############## + # sparsity update, based on grad for b_h only + + ''' + This mean returns an array of shape + (num_hiddens_planes, feature_map_height, feature_map_width) + (so it's a mean over each unit's activation) + ''' + mean_expected_activation = T.mean(E_h_given_x, axis=0) + # sparsity_p is broadcasted everywhere + sparsity_update_cost = \ + T.sqr(self.sparsity_p - mean_expected_activation) + sparsity_update_cost = \ + T.sum(T.sum(T.sum( \ + sparsity_update_cost, axis=2), axis=1), axis=0) + sparsity_grad = T.grad(sparsity_update_cost, [self.W, self.b_h]) + + sparsity_returned = \ + [sparsity_update_cost, + sparsity_grad[0], sparsity_grad[1], + self.sparsity_lambda * self.lr * sparsity_grad[0], + self.sparsity_lambda * self.lr * sparsity_grad[1]] + self.sparsity_return_desc = \ + ['sparsity_update_cost', + 'sparsity_grad_W', + 'sparsity_grad_b_h', + 'lambda_lr_times_sparsity_grad_W', + 'lambda_lr_times_sparsity_grad_b_h'] + + # gradient _descent_ so we use a - + sparsity_update = \ + {self.b_h: self.b_h - \ + self.sparsity_lambda * self.lr * sparsity_grad[1], + self.W: self.W - \ + self.sparsity_lambda * self.lr * sparsity_grad[0]} + self.sparsity_update_function = \ + theano.function([self.x], + sparsity_returned, updates=sparsity_update) + + if _PRINT_GRAPHS: + print "----------------------\nsparsity_update_function" + theano.printing.debugprint(self.sparsity_update_function) + + def CD_step(self, x): + h1 = self.h_sample_given_x_func(x) + x2 = self.x_sample_given_h_func(h1) + return self.cd_update_function(x, x2) + + def sparsity_step(self, x): + return self.sparsity_update_function(x) + + # these two also operate on minibatches + + def random_gibbs_samples(self, num_updown_steps): + start_x = self.numpy_rng.rand(*self._get_visibles_shape()) + return self.gibbs_samples_from(start_x, num_updown_steps) + + def gibbs_samples_from(self, start_x, num_updown_steps): + x_sample = start_x + for i in xrange(num_updown_steps): + h_sample = self.h_sample_given_x_func(x_sample) + x_sample = self.x_sample_given_h_func(h_sample) + return x_sample + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/crbm/mnist_config.py.example Sun Apr 25 13:54:57 2010 -0400 @@ -0,0 +1,111 @@ +# ---------------------------------------------------------------------------- +# BEGIN EXPERIMENT ISOLATION CODE + +# Path to pass to jobman sqlschedule. IMPORTANT TO CHANGE TO REFLECT YOUR CLONE. +# Make sure this is accessible from the default $PYTHONPATH (in your .bashrc) +# (and make sure every subdirectory has its __init__.py file) +EXPERIMENT_PATH = "ift6266_mnistcrbm_exp1.ift6266.deep.crbm.mnist_crbm.jobman_entrypoint" + +def isolate_experiment(): + ''' + This makes sure we use the codebase clone created for this experiment. + I.e. if you want to make modifications to the codebase but don't want your + running experiment code to be impacted by those changes, first copy the + codebase somewhere, and configure this section. It will make sure we import + from the right place. + + MUST BE DONE BEFORE IMPORTING ANYTHING ELSE + (Leave this comment there so others will understand what's going on) + ''' + + # Place where you copied modules that should be frozen for this experiment + codebase_clone_path = "/u/savardf/ift6266/experiment_clones/ift6266_mnistcrbm_exp1" + + # Places where there might be conflicting modules from your $PYTHONPATH + remove_these_from_pythonpath = ["/u/savardf/ift6266/dev_code"] + + import sys + sys.path[0:0] = [codebase_clone_path] + + # remove paths we specifically don't want in $PYTHONPATH + for bad_path in remove_these_from_pythonpath: + sys.path[:] = [el for el in sys.path if not el in (bad_path, bad_path+"/")] + + # Make the imports + import ift6266 + + # Just making sure we're importing from the right place + modules_to_check = [ift6266] + for module in modules_to_check: + if not codebase_clone_path in module.__path__[0]: + raise RuntimeError("Module loaded from incorrect path "+module.__path__[0]) + +# END EXPERIMENT ISOLATION CODE +# ---------------------------------------------------------------------------- + +from jobman import DD + +''' +These are parameters used by mnist_crbm.py. They'll end up as globals in there. + +Rename this file to config.py and configure as needed. +DON'T add the renamed file to the repository, as others might use it +without realizing it, with dire consequences. +''' + +# change "sandbox" when you're ready +JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/yourtablenamehere' + +# Set this to True when you want to run cluster tests, ie. you want +# to run on the cluster, many jobs, but want to reduce the training +# set size and the number of epochs, so you know everything runs +# fine on the cluster. +# Set this PRIOR to inserting your test jobs in the DB. +TEST_CONFIG = False + +# save params at training end +SAVE_PARAMS = True + +IMAGE_OUTPUT_DIR = 'img/' + +# number of minibatches before taking means for valid error etc. +REDUCE_EVERY = 100 + +# print series to stdout too (otherwise just produce the HDF5 file) +SERIES_STDOUT_TOO = False + +VISUALIZE_EVERY = 20000 +GIBBS_STEPS_IN_VIZ_CHAIN = 1000 + +if TEST_CONFIG: + REDUCE_EVERY = 10 + VISUALIZE_EVERY = 20 + +# This is to configure insertion of jobs on the cluster. +# Possible values the hyperparameters can take. These are then +# combined with produit_cartesien_jobs so we get a list of all +# possible combinations, each one resulting in a job inserted +# in the jobman DB. +JOB_VALS = {'learning_rate': [1.0, 0.1, 0.01], + 'sparsity_lambda': [3.0,0.5], + 'sparsity_p': [0.3,0.05], + 'num_filters': [40,15], + 'filter_size': [12,7], + 'minibatch_size': [20], + 'num_epochs': [20]} + +# Just useful for tests... minimal number of epochs +# Useful when launching a single local job +DEFAULT_STATE = DD({'learning_rate': 0.1, + 'sparsity_lambda': 1.0, + 'sparsity_p': 0.05, + 'num_filters': 40, + 'filter_size': 12, + 'minibatch_size': 10, + 'num_epochs': 20}) + +# To reinsert duplicate of jobs that crashed +REINSERT_COLS = ['learning_rate','sparsity_lambda','sparsity_p','num_filters','filter_size','minibatch_size','dupe'] +#REINSERT_JOB_VALS = [\ +# [,2],] +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/crbm/mnist_crbm.py Sun Apr 25 13:54:57 2010 -0400 @@ -0,0 +1,234 @@ +#!/usr/bin/python + +import sys +import os, os.path + +# do this before importing custom modules +from mnist_config import * + +if not (len(sys.argv) > 1 and sys.argv[1] in \ + ('test_jobman_entrypoint', 'run_local')): + # in those cases don't use isolated code, use dev code + print "Running experiment isolation code" + isolate_experiment() + +import numpy as N + +import theano +import theano.tensor as T + +from crbm import CRBM, ConvolutionParams + +import pylearn, pylearn.version +from pylearn.datasets import MNIST +from pylearn.io.image_tiling import tile_raster_images + +import Image + +from pylearn.io.seriestables import * +import tables + +import ift6266 + +import utils + +def setup_workdir(): + if not os.path.exists(IMAGE_OUTPUT_DIR): + os.mkdir(IMAGE_OUTPUT_DIR) + if not os.path.exists(IMAGE_OUTPUT_DIR): + print "For some reason mkdir(IMAGE_OUTPUT_DIR) failed!" + sys.exit(1) + print "Created image output dir" + elif os.path.isfile(IMAGE_OUTPUT_DIR): + print "IMAGE_OUTPUT_DIR is not a directory!" + sys.exit(1) + +#def filename_from_time(suffix): +# import datetime +# return str(datetime.datetime.now()) + suffix + ".png" + +def jobman_entrypoint(state, channel): + # record mercurial versions of each package + pylearn.version.record_versions(state,[theano,ift6266,pylearn]) + channel.save() + + setup_workdir() + + crbm = MnistCrbm(state) + crbm.train() + + return channel.COMPLETE + +class MnistCrbm(object): + def __init__(self, state): + self.state = state + + if TEST_CONFIG: + self.mnist = MNIST.first_1k() + print "Test config, so loaded MNIST first 1000" + else: + self.mnist = MNIST.full()#first_10k() + print "Loaded MNIST full" + + self.cp = ConvolutionParams( \ + num_filters=state.num_filters, + num_input_planes=1, + height_filters=state.filter_size, + width_filters=state.filter_size) + + self.image_size = (28,28) + + self.minibatch_size = state.minibatch_size + + self.lr = state.learning_rate + self.sparsity_lambda = state.sparsity_lambda + # about 1/num_filters, so only one filter active at a time + # 40 * 0.05 = ~2 filters active for any given pixel + self.sparsity_p = state.sparsity_p + + self.crbm = CRBM( \ + minibatch_size=self.minibatch_size, + image_size=self.image_size, + conv_params=self.cp, + learning_rate=self.lr, + sparsity_lambda=self.sparsity_lambda, + sparsity_p=self.sparsity_p) + + self.num_epochs = state.num_epochs + + self.init_series() + + def init_series(self): + series = {} + + basedir = os.getcwd() + + h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w") + + cd_series_names = self.crbm.cd_return_desc + series['cd'] = \ + utils.get_accumulator_series_array( \ + h5f, 'cd', cd_series_names, + REDUCE_EVERY, + stdout_too=SERIES_STDOUT_TOO) + + sparsity_series_names = self.crbm.sparsity_return_desc + series['sparsity'] = \ + utils.get_accumulator_series_array( \ + h5f, 'sparsity', sparsity_series_names, + REDUCE_EVERY, + stdout_too=SERIES_STDOUT_TOO) + + # so first we create the names for each table, based on + # position of each param in the array + params_stdout = [] + if SERIES_STDOUT_TOO: + params_stdout = [StdoutAppendTarget()] + series['params'] = SharedParamsStatisticsWrapper( + new_group_name="params", + base_group="/", + arrays_names=['W','b_h','b_x'], + hdf5_file=h5f, + index_names=('epoch','minibatch'), + other_targets=params_stdout) + + self.series = series + + def train(self): + num_minibatches = len(self.mnist.train.x) / self.minibatch_size + + for epoch in xrange(self.num_epochs): + for mb_index in xrange(num_minibatches): + mb_x = self.mnist.train.x \ + [mb_index : mb_index+self.minibatch_size] + mb_x = mb_x.reshape((self.minibatch_size, 1, 28, 28)) + + #E_h = crbm.E_h_given_x_func(mb_x) + #print "Shape of E_h", E_h.shape + + cd_return = self.crbm.CD_step(mb_x) + sp_return = self.crbm.sparsity_step(mb_x) + + self.series['cd'].append( \ + (epoch, mb_index), cd_return) + self.series['sparsity'].append( \ + (epoch, mb_index), sp_return) + + total_idx = epoch*num_minibatches + mb_index + + if (total_idx+1) % REDUCE_EVERY == 0: + self.series['params'].append( \ + (epoch, mb_index), self.crbm.params) + + if total_idx % VISUALIZE_EVERY == 0: + self.visualize_gibbs_result(\ + mb_x, GIBBS_STEPS_IN_VIZ_CHAIN, + "gibbs_chain_"+str(epoch)+"_"+str(mb_index)) + self.visualize_gibbs_result(mb_x, 1, + "gibbs_1_"+str(epoch)+"_"+str(mb_index)) + self.visualize_filters( + "filters_"+str(epoch)+"_"+str(mb_index)) + if TEST_CONFIG: + # do a single epoch for cluster tests config + break + + if SAVE_PARAMS: + utils.save_params(self.crbm.params, "params.pkl") + + def visualize_gibbs_result(self, start_x, gibbs_steps, filename): + # Run minibatch_size chains for gibbs_steps + x_samples = None + if not start_x is None: + x_samples = self.crbm.gibbs_samples_from(start_x, gibbs_steps) + else: + x_samples = self.crbm.random_gibbs_samples(gibbs_steps) + x_samples = x_samples.reshape((self.minibatch_size, 28*28)) + + tile = tile_raster_images(x_samples, self.image_size, + (1, self.minibatch_size), output_pixel_vals=True) + + filepath = os.path.join(IMAGE_OUTPUT_DIR, filename+".png") + img = Image.fromarray(tile) + img.save(filepath) + + print "Result of running Gibbs", \ + gibbs_steps, "times outputed to", filepath + + def visualize_filters(self, filename): + cp = self.cp + + # filter size + fsz = (cp.height_filters, cp.width_filters) + tile_shape = (cp.num_filters, cp.num_input_planes) + + filters_flattened = self.crbm.W.value.reshape( + (tile_shape[0]*tile_shape[1], + fsz[0]*fsz[1])) + + tile = tile_raster_images(filters_flattened, fsz, + tile_shape, output_pixel_vals=True) + + filepath = os.path.join(IMAGE_OUTPUT_DIR, filename+".png") + img = Image.fromarray(tile) + img.save(filepath) + + print "Filters (as images) outputed to", filepath + + + +if __name__ == '__main__': + args = sys.argv[1:] + + if len(args) == 0: + print "Bad usage" + elif args[0] == 'jobman_insert': + utils.jobman_insert_job_vals(JOBDB, EXPERIMENT_PATH, JOB_VALS) + elif args[0] == 'test_jobman_entrypoint': + chanmock = DD({'COMPLETE':0,'save':(lambda:None)}) + jobman_entrypoint(DEFAULT_STATE, chanmock) + elif args[0] == 'run_default': + setup_workdir() + mc = MnistCrbm(DEFAULT_STATE) + mc.train() + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/crbm/utils.py Sun Apr 25 13:54:57 2010 -0400 @@ -0,0 +1,118 @@ +#!/usr/bin/python +# coding: utf-8 + +from __future__ import with_statement + +import jobman +from jobman import DD + +from pylearn.io.seriestables import * +import tables + + + +# from pylearn codebase +# useful in __init__(param1, param2, etc.) to save +# values in self.param1, self.param2... just call +# update_locals(self, locals()) +def update_locals(obj, dct): + if 'self' in dct: + del dct['self'] + obj.__dict__.update(dct) + +# from a dictionary of possible values for hyperparameters, e.g. +# hp_values = {'learning_rate':[0.1, 0.01], 'num_layers': [1,2]} +# create a list of other dictionaries representing all the possible +# combinations, thus in this example creating: +# [{'learning_rate': 0.1, 'num_layers': 1}, ...] +# (similarly for combinations (0.1, 2), (0.01, 1), (0.01, 2)) +def produit_cartesien_jobs(val_dict): + job_list = [DD()] + all_keys = val_dict.keys() + + for key in all_keys: + possible_values = val_dict[key] + new_job_list = [] + for val in possible_values: + for job in job_list: + to_insert = job.copy() + to_insert.update({key: val}) + new_job_list.append(to_insert) + job_list = new_job_list + + return job_list + +def jobs_from_reinsert_list(cols, job_vals): + job_list = [] + for vals in job_vals: + job = DD() + for i, col in enumerate(cols): + job[col] = vals[i] + job_list.append(job) + + return job_list + +def save_params(all_params, filename): + import pickle + with open(filename, 'wb') as f: + values = [p.value for p in all_params] + + # -1 for HIGHEST_PROTOCOL + pickle.dump(values, f, -1) + +# Perform insertion into the Postgre DB based on combination +# of hyperparameter values above +# (see comment for produit_cartesien_jobs() to know how it works) +def jobman_insert_job_vals(job_db, experiment_path, job_vals): + jobs = produit_cartesien_jobs(job_vals) + + db = jobman.sql.db(job_db) + for job in jobs: + job.update({jobman.sql.EXPERIMENT: experiment_path}) + jobman.sql.insert_dict(job, db) + +def jobman_insert_specific_jobs(job_db, experiment_path, + insert_cols, insert_vals): + jobs = jobs_from_reinsert_list(insert_cols, insert_vals) + + db = jobman.sql.db(job_db) + for job in jobs: + job.update({jobman.sql.EXPERIMENT: experiment_path}) + jobman.sql.insert_dict(job, db) + +# Just a shortcut for a common case where we need a few +# related Error (float) series +def get_accumulator_series_array( \ + hdf5_file, group_name, series_names, + reduce_every, + index_names=('epoch','minibatch'), + stdout_too=True, + skip_hdf5_append=False): + all_series = [] + + new_group = hdf5_file.createGroup('/', group_name) + + other_targets = [] + if stdout_too: + other_targets = [StdoutAppendTarget()] + + for sn in series_names: + series_base = \ + ErrorSeries(error_name=sn, + table_name=sn, + hdf5_file=hdf5_file, + hdf5_group=new_group._v_pathname, + index_names=index_names, + other_targets=other_targets, + skip_hdf5_append=skip_hdf5_append) + + all_series.append( \ + AccumulatorSeriesWrapper( \ + base_series=series_base, + reduce_every=reduce_every)) + + ret_wrapper = SeriesArrayWrapper(all_series) + + return ret_wrapper + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/rbm/mnistrbm.py Sun Apr 25 13:54:57 2010 -0400 @@ -0,0 +1,215 @@ +import sys +import os, os.path + +import numpy as N + +import theano +import theano.tensor as T + +from crbm import CRBM, ConvolutionParams + +from pylearn.datasets import MNIST +from pylearn.io.image_tiling import tile_raster_images + +import Image + +from pylearn.io.seriestables import * +import tables + +IMAGE_OUTPUT_DIR = 'img/' + +REDUCE_EVERY = 100 + +def filename_from_time(suffix): + import datetime + return str(datetime.datetime.now()) + suffix + ".png" + +# Just a shortcut for a common case where we need a few +# related Error (float) series + +def get_accumulator_series_array( \ + hdf5_file, group_name, series_names, + reduce_every, + index_names=('epoch','minibatch'), + stdout_too=True, + skip_hdf5_append=False): + all_series = [] + + hdf5_file.createGroup('/', group_name) + + other_targets = [] + if stdout_too: + other_targets = [StdoutAppendTarget()] + + for sn in series_names: + series_base = \ + ErrorSeries(error_name=sn, + table_name=sn, + hdf5_file=hdf5_file, + hdf5_group='/'+group_name, + index_names=index_names, + other_targets=other_targets, + skip_hdf5_append=skip_hdf5_append) + + all_series.append( \ + AccumulatorSeriesWrapper( \ + base_series=series_base, + reduce_every=reduce_every)) + + ret_wrapper = SeriesArrayWrapper(all_series) + + return ret_wrapper + +class ExperienceRbm(object): + def __init__(self): + self.mnist = MNIST.full()#first_10k() + + + datasets = load_data(dataset) + + train_set_x, train_set_y = datasets[0] + test_set_x , test_set_y = datasets[2] + + + batch_size = 100 # size of the minibatch + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + + # allocate symbolic variables for the data + index = T.lscalar() # index to a [mini]batch + x = T.matrix('x') # the data is presented as rasterized images + + rng = numpy.random.RandomState(123) + theano_rng = RandomStreams( rng.randint(2**30)) + + # initialize storage fot the persistent chain (state = hidden layer of chain) + persistent_chain = theano.shared(numpy.zeros((batch_size, 500))) + + # construct the RBM class + self.rbm = RBM( input = x, n_visible=28*28, \ + n_hidden = 500,numpy_rng = rng, theano_rng = theano_rng) + + # get the cost and the gradient corresponding to one step of CD + + + self.init_series() + + def init_series(self): + + series = {} + + basedir = os.getcwd() + + h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w") + + cd_series_names = self.rbm.cd_return_desc + series['cd'] = \ + get_accumulator_series_array( \ + h5f, 'cd', cd_series_names, + REDUCE_EVERY, + stdout_too=True) + + + + # so first we create the names for each table, based on + # position of each param in the array + params_stdout = StdoutAppendTarget("\n------\nParams") + series['params'] = SharedParamsStatisticsWrapper( + new_group_name="params", + base_group="/", + arrays_names=['W','b_h','b_x'], + hdf5_file=h5f, + index_names=('epoch','minibatch'), + other_targets=[params_stdout]) + + self.series = series + + def train(self, persistent, learning_rate): + + training_epochs = 15 + + #get the cost and the gradient corresponding to one step of CD + if persistant: + persistent_chain = theano.shared(numpy.zeros((batch_size, self.rbm.n_hidden))) + cost, updates = self.rbm.cd(lr=learning_rate, persistent=persistent_chain) + + else: + cost, updates = self.rbm.cd(lr=learning_rate) + + dirname = 'lr=%.5f'%self.rbm.learning_rate + os.makedirs(dirname) + os.chdir(dirname) + + # the purpose of train_rbm is solely to update the RBM parameters + train_rbm = theano.function([index], cost, + updates = updates, + givens = { x: train_set_x[index*batch_size:(index+1)*batch_size]}) + + plotting_time = 0. + start_time = time.clock() + + + # go through training epochs + for epoch in xrange(training_epochs): + + # go through the training set + mean_cost = [] + for batch_index in xrange(n_train_batches): + mean_cost += [train_rbm(batch_index)] + + + pretraining_time = (end_time - start_time) + + + + + def sample_from_rbm(self, gibbs_steps, test_set_x): + + # find out the number of test samples + number_of_test_samples = test_set_x.value.shape[0] + + # pick random test examples, with which to initialize the persistent chain + test_idx = rng.randint(number_of_test_samples-20) + persistent_vis_chain = theano.shared(test_set_x.value[test_idx:test_idx+20]) + + # define one step of Gibbs sampling (mf = mean-field) + [hid_mf, hid_sample, vis_mf, vis_sample] = self.rbm.gibbs_vhv(persistent_vis_chain) + + # the sample at the end of the channel is returned by ``gibbs_1`` as + # its second output; note that this is computed as a binomial draw, + # therefore it is formed of ints (0 and 1) and therefore needs to + # be converted to the same dtype as ``persistent_vis_chain`` + vis_sample = T.cast(vis_sample, dtype=theano.config.floatX) + + # construct the function that implements our persistent chain + # we generate the "mean field" activations for plotting and the actual samples for + # reinitializing the state of our persistent chain + sample_fn = theano.function([], [vis_mf, vis_sample], + updates = { persistent_vis_chain:vis_sample}) + + # sample the RBM, plotting every `plot_every`-th sample; do this + # until you plot at least `n_samples` + n_samples = 10 + plot_every = 1000 + + for idx in xrange(n_samples): + + # do `plot_every` intermediate samplings of which we do not care + for jdx in xrange(plot_every): + vis_mf, vis_sample = sample_fn() + + # construct image + image = PIL.Image.fromarray(tile_raster_images( + X = vis_mf, + img_shape = (28,28), + tile_shape = (10,10), + tile_spacing = (1,1) ) ) + + image.save('sample_%i_step_%i.png'%(idx,idx*jdx)) + + +if __name__ == '__main__': + mc = ExperienceRbm() + mc.train() +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/rbm/rbm.py Sun Apr 25 13:54:57 2010 -0400 @@ -0,0 +1,427 @@ +"""This tutorial introduces restricted boltzmann machines (RBM) using Theano. + +Boltzmann Machines (BMs) are a particular form of energy-based model which +contain hidden variables. Restricted Boltzmann Machines further restrict BMs +to those without visible-visible and hidden-hidden connections. +""" + +import numpy, time, cPickle, gzip, PIL.Image + +import theano +import theano.tensor as T +import os +import pdb +import numpy +import pylab +import time +import theano.tensor.nnet +import pylearn +import ift6266 +import theano,pylearn.version,ift6266 +from pylearn.io import filetensor as ft +from ift6266 import datasets + +from theano.tensor.shared_randomstreams import RandomStreams + +from utils import tile_raster_images +from logistic_sgd import load_data + +class RBM(object): + """Restricted Boltzmann Machine (RBM) """ + def __init__(self, input=None, n_visible=32*32, n_hidden=500, \ + W = None, hbias = None, vbias = None, numpy_rng = None, + theano_rng = None): + """ + RBM constructor. Defines the parameters of the model along with + basic operations for inferring hidden from visible (and vice-versa), + as well as for performing CD updates. + + :param input: None for standalone RBMs or symbolic variable if RBM is + part of a larger graph. + + :param n_visible: number of visible units + + :param n_hidden: number of hidden units + + :param W: None for standalone RBMs or symbolic variable pointing to a + shared weight matrix in case RBM is part of a DBN network; in a DBN, + the weights are shared between RBMs and layers of a MLP + + :param hbias: None for standalone RBMs or symbolic variable pointing + to a shared hidden units bias vector in case RBM is part of a + different network + + :param vbias: None for standalone RBMs or a symbolic variable + pointing to a shared visible units bias + """ + + self.n_visible = n_visible + self.n_hidden = n_hidden + + + if W is None : + # W is initialized with `initial_W` which is uniformely sampled + # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + initial_W = numpy.asarray( numpy.random.uniform( + low = -numpy.sqrt(6./(n_hidden+n_visible)), + high = numpy.sqrt(6./(n_hidden+n_visible)), + size = (n_visible, n_hidden)), + dtype = theano.config.floatX) + # theano shared variables for weights and biases + W = theano.shared(value = initial_W, name = 'W') + + if hbias is None : + # create shared variable for hidden units bias + hbias = theano.shared(value = numpy.zeros(n_hidden, + dtype = theano.config.floatX), name='hbias') + + if vbias is None : + # create shared variable for visible units bias + vbias = theano.shared(value =numpy.zeros(n_visible, + dtype = theano.config.floatX),name='vbias') + + if numpy_rng is None: + # create a number generator + numpy_rng = numpy.random.RandomState(1234) + + if theano_rng is None : + theano_rng = RandomStreams(numpy_rng.randint(2**30)) + + + # initialize input layer for standalone RBM or layer0 of DBN + self.input = input if input else T.dmatrix('input') + + self.W = W + self.hbias = hbias + self.vbias = vbias + self.theano_rng = theano_rng + + # **** WARNING: It is not a good idea to put things in this list + # other than shared variables created in this function. + self.params = [self.W, self.hbias, self.vbias] + self.batch_size = self.input.shape[0] + + def free_energy(self, v_sample): + ''' Function to compute the free energy ''' + wx_b = T.dot(v_sample, self.W) + self.hbias + vbias_term = T.sum(T.dot(v_sample, self.vbias)) + hidden_term = T.sum(T.log(1+T.exp(wx_b))) + return -hidden_term - vbias_term + + def sample_h_given_v(self, v0_sample): + ''' This function infers state of hidden units given visible units ''' + # compute the activation of the hidden units given a sample of the visibles + h1_mean = T.nnet.sigmoid(T.dot(v0_sample, self.W) + self.hbias) + # get a sample of the hiddens given their activation + h1_sample = self.theano_rng.binomial(size = h1_mean.shape, n = 1, prob = h1_mean) + return [h1_mean, h1_sample] + + def sample_v_given_h(self, h0_sample): + ''' This function infers state of visible units given hidden units ''' + # compute the activation of the visible given the hidden sample + v1_mean = T.nnet.sigmoid(T.dot(h0_sample, self.W.T) + self.vbias) + # get a sample of the visible given their activation + v1_sample = self.theano_rng.binomial(size = v1_mean.shape,n = 1,prob = v1_mean) + return [v1_mean, v1_sample] + + def gibbs_hvh(self, h0_sample): + ''' This function implements one step of Gibbs sampling, + starting from the hidden state''' + v1_mean, v1_sample = self.sample_v_given_h(h0_sample) + h1_mean, h1_sample = self.sample_h_given_v(v1_sample) + return [v1_mean, v1_sample, h1_mean, h1_sample] + + def gibbs_vhv(self, v0_sample): + ''' This function implements one step of Gibbs sampling, + starting from the visible state''' + h1_mean, h1_sample = self.sample_h_given_v(v0_sample) + v1_mean, v1_sample = self.sample_v_given_h(h1_sample) + return [h1_mean, h1_sample, v1_mean, v1_sample] + + def cd(self, lr = 0.1, persistent=None, k=1): + """ + This functions implements one step of CD-1 or PCD-1 + + :param lr: learning rate used to train the RBM + :param persistent: None for CD. For PCD, shared variable containing old state + of Gibbs chain. This must be a shared variable of size (batch size, number of + hidden units). + + Returns the updates dictionary. The dictionary contains the update rules for weights + and biases but also an update of the shared variable used to store the persistent + chain, if one is used. + """ + + # compute positive phase + ph_mean, ph_sample = self.sample_h_given_v(self.input) + + # decide how to initialize persistent chain: + # for CD, we use the newly generate hidden sample + # for PCD, we initialize from the old state of the chain + if persistent is None: + chain_start = ph_sample + else: + chain_start = persistent + + # perform actual negative phase (the CD-1) + [nv_mean, nv_sample, nh_mean, nh_sample] = self.gibbs_hvh(chain_start) + + #perform CD-k + if k-1>0: + for i in range(k-1): + [nv_mean, nv_sample, nh_mean, nh_sample] = self.gibbs_hvh(nh_sample) + + + + # determine gradients on RBM parameters + g_vbias = T.sum( self.input - nv_mean, axis = 0)/self.batch_size + g_hbias = T.sum( ph_mean - nh_mean, axis = 0)/self.batch_size + g_W = T.dot(ph_mean.T, self.input )/ self.batch_size - \ + T.dot(nh_mean.T, nv_mean )/ self.batch_size + + gparams = [g_W.T, g_hbias, g_vbias] + + # constructs the update dictionary + updates = {} + for gparam, param in zip(gparams, self.params): + updates[param] = param + gparam * lr + + if persistent: + # Note that this works only if persistent is a shared variable + updates[persistent] = T.cast(nh_sample, dtype=theano.config.floatX) + # pseudo-likelihood is a better proxy for PCD + cost = self.get_pseudo_likelihood_cost(updates) + else: + # reconstruction cross-entropy is a better proxy for CD + cost = self.get_reconstruction_cost(updates, nv_mean) + + return cost, updates + + def get_pseudo_likelihood_cost(self, updates): + """Stochastic approximation to the pseudo-likelihood""" + + # index of bit i in expression p(x_i | x_{\i}) + bit_i_idx = theano.shared(value=0, name = 'bit_i_idx') + + # binarize the input image by rounding to nearest integer + xi = T.iround(self.input) + + # calculate free energy for the given bit configuration + fe_xi = self.free_energy(xi) + + # flip bit x_i of matrix xi and preserve all other bits x_{\i} + # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx] + # NB: slice(start,stop,step) is the python object used for + # slicing, e.g. to index matrix x as follows: x[start:stop:step] + xi_flip = T.setsubtensor(xi, 1-xi[:, bit_i_idx], + idx_list=(slice(None,None,None),bit_i_idx)) + + # calculate free energy with bit flipped + fe_xi_flip = self.free_energy(xi_flip) + + # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i}))) + cost = self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi)) + + # increment bit_i_idx % number as part of updates + updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible + + return cost + + def get_reconstruction_cost(self, updates, nv_mean): + """Approximation to the reconstruction error""" + + cross_entropy = T.mean( + T.sum(self.input*T.log(nv_mean) + + (1 - self.input)*T.log(1-nv_mean), axis = 1)) + + return cross_entropy + + + +def test_rbm(b_size = 25, nhidden = 1000, kk = 1, persistance = 0, + dataset= 0): + """ + Demonstrate *** + + This is demonstrated on MNIST. + + :param learning_rate: learning rate used for training the RBM + + :param training_epochs: number of epochs used for training + + :param dataset: path the the pickled dataset + + """ + + learning_rate=0.1 + + if data_set==0: + datasets=datasets.nist_all() + elif data_set==1: + datasets=datasets.nist_P07() + elif data_set==2: + datasets=datasets.PNIST07() + + + # revoir la recuperation des donnees +## dataset = load_data(dataset) +## +## train_set_x, train_set_y = datasets[0] +## test_set_x , test_set_y = datasets[2] +## training_epochs = 10 # a determiner + + batch_size = b_size # size of the minibatch + + # compute number of minibatches for training, validation and testing + #n_train_batches = train_set_x.value.shape[0] / batch_size + + # allocate symbolic variables for the data + index = T.lscalar() # index to a [mini]batch + x = T.matrix('x') # the data is presented as rasterized images + + rng = numpy.random.RandomState(123) + theano_rng = RandomStreams( rng.randint(2**30)) + + + # construct the RBM class + rbm = RBM( input = x, n_visible=32*32, \ + n_hidden = nhidden, numpy_rng = rng, theano_rng = theano_rng) + + + # initialize storage fot the persistent chain (state = hidden layer of chain) + if persistance == 1: + persistent_chain = theano.shared(numpy.zeros((batch_size, 500))) + # get the cost and the gradient corresponding to one step of CD + cost, updates = rbm.cd(lr=learning_rate, persistent=persistent_chain, k= kk) + + else: + # get the cost and the gradient corresponding to one step of CD + #persistance_chain = None + cost, updates = rbm.cd(lr=learning_rate, persistent=None, k= kk) + + ################################# + # Training the RBM # + ################################# + dirname = 'data=%i'%dataset + ' persistance=%i'%persistance + ' n_hidden=%i'%n_hidden + 'batch_size=i%'%b_size + os.makedirs(dirname) + os.chdir(dirname) + + # it is ok for a theano function to have no output + # the purpose of train_rbm is solely to update the RBM parameters + train_rbm = theano.function([x], cost, + updates = updates, + ) + + plotting_time = 0.0 + start_time = time.clock() + bufsize = 1000 + + # go through training epochs + costs = [] + for epoch in xrange(training_epochs): + + # go through the training set + mean_cost = [] + for mini_x, mini_y in datasets.train(b_size): + mean_cost += [train_rbm(mini_x)] +## learning_rate = learning_rate - 0.0001 +## learning_rate = learning_rate/(tau+( epoch*batch_index*batch_size)) + + #learning_rate = learning_rate/10 + + costs.append(numpy.mean(mean_cost)) + + # Plot filters after each training epoch + plotting_start = time.clock() + # Construct image from the weight matrix + image = PIL.Image.fromarray(tile_raster_images( X = rbm.W.value.T, + img_shape = (32,32),tile_shape = (10,10), + tile_spacing=(1,1))) + image.save('filters_at_epoch_%i.png'%epoch) + plotting_stop = time.clock() + plotting_time += (plotting_stop - plotting_start) + + end_time = time.clock() + + pretraining_time = (end_time - start_time) - plotting_time + + + + + + + ################################# + # Sampling from the RBM # + ################################# + + # find out the number of test samples + number_of_test_samples = 1000 + + test_set_x, test_y = datasets.test(100*b_size) + # pick random test examples, with which to initialize the persistent chain + test_idx = rng.randint(number_of_test_samples - b_size) + persistent_vis_chain = theano.shared(test_set_x.value[test_idx:test_idx+b_size]) + + # define one step of Gibbs sampling (mf = mean-field) + [hid_mf, hid_sample, vis_mf, vis_sample] = rbm.gibbs_vhv(persistent_vis_chain) + + # the sample at the end of the channel is returned by ``gibbs_1`` as + # its second output; note that this is computed as a binomial draw, + # therefore it is formed of ints (0 and 1) and therefore needs to + # be converted to the same dtype as ``persistent_vis_chain`` + vis_sample = T.cast(vis_sample, dtype=theano.config.floatX) + + # construct the function that implements our persistent chain + # we generate the "mean field" activations for plotting and the actual samples for + # reinitializing the state of our persistent chain + sample_fn = theano.function([], [vis_mf, vis_sample], + updates = { persistent_vis_chain:vis_sample}) + + # sample the RBM, plotting every `plot_every`-th sample; do this + # until you plot at least `n_samples` + n_samples = 10 + # run minibatch size chains for gibbs samples (number of negative particles) + plot_every = b_size + + for idx in xrange(n_samples): + + # do `plot_every` intermediate samplings of which we do not care + for jdx in xrange(plot_every): + vis_mf, vis_sample = sample_fn() + + # construct image + image = PIL.Image.fromarray(tile_raster_images( + X = vis_mf, + img_shape = (32,32), + tile_shape = (10,10), + tile_spacing = (1,1) ) ) + #print ' ... plotting sample ', idx + image.save('sample_%i_step_%i.png'%(idx,idx*jdx)) + + #save the model + model = [rbm.W, rbm.vbias, rbm.hbias] + f = fopen('params.txt', 'w') + pickle.dump(model, f) + f.close() + #os.chdir('./..') + return numpy.mean(costs), pretraining_time/360 + + +def experiment(state, channel): + + (mean_cost, time_execution) = test_rbm(b_size = state.b_size,\ + nhidden = state.ndidden,\ + kk = state.kk,\ + persistance = state.persistance,\ + dataset = state.dataset) + + state.mean_costs = mean_costs + state.time_execution = time_execution + pylearn.version.record_versions(state,[theano,ift6266,pylearn]) + return channel.COMPLETE + +if __name__ == '__main__': + + test_rbm()
--- a/deep/stacked_dae/v_sylvain/nist_sda.py Sun Apr 25 12:31:22 2010 -0400 +++ b/deep/stacked_dae/v_sylvain/nist_sda.py Sun Apr 25 13:54:57 2010 -0400 @@ -55,6 +55,11 @@ decrease_lr = state['decrease_lr'] else : decrease_lr = 0 + + if state.has_key('decrease_lr_pretrain'): + dec=state['decrease_lr_pretrain'] + else : + dec=0 n_ins = 32*32 n_outs = 62 # 10 digits, 26*2 (lower, capitals) @@ -87,7 +92,7 @@ nb_file=0 if state['pretrain_choice'] == 0: print('\n\tpretraining with NIST\n') - optimizer.pretrain(datasets.nist_all()) + optimizer.pretrain(datasets.nist_all(), decrease = dec) elif state['pretrain_choice'] == 1: #To know how many file will be used during pretraining nb_file = int(state['pretraining_epochs_per_layer']) @@ -97,7 +102,7 @@ "You have to correct the code (and be patient, P07 is huge !!)\n"+ "or reduce the number of pretraining epoch to run the code (better idea).\n") print('\n\tpretraining with P07') - optimizer.pretrain(datasets.nist_P07(min_file=0,max_file=nb_file)) + optimizer.pretrain(datasets.nist_P07(min_file=0,max_file=nb_file),decrease = dec) channel.save() #Set some of the parameters used for the finetuning
--- a/deep/stacked_dae/v_sylvain/sgd_optimization.py Sun Apr 25 12:31:22 2010 -0400 +++ b/deep/stacked_dae/v_sylvain/sgd_optimization.py Sun Apr 25 13:54:57 2010 -0400 @@ -9,7 +9,8 @@ import datetime import theano.tensor as T import sys -import pickle +#import pickle +import cPickle from jobman import DD import jobman, jobman.sql @@ -87,30 +88,40 @@ self.pretrain(self.dataset) self.finetune(self.dataset) - def pretrain(self,dataset): + def pretrain(self,dataset,decrease=0): print "STARTING PRETRAINING, time = ", datetime.datetime.now() sys.stdout.flush() un_fichier=int(819200.0/self.hp.minibatch_size) #Number of batches in a P07 file start_time = time.clock() + + ######## This is hardcoaded. THe 0.95 parameter is hardcoaded and can be changed at will ### + #Set the decreasing rate of the learning rate. We want the final learning rate to + #be 5% of the original learning rate. The decreasing factor is linear + decreasing = (decrease*self.hp.pretraining_lr)/float(self.hp.pretraining_epochs_per_layer*800000/self.hp.minibatch_size) + ## Pre-train layer-wise for i in xrange(self.classifier.n_layers): # go through pretraining epochs + + #To reset the learning rate to his original value + learning_rate=self.hp.pretraining_lr for epoch in xrange(self.hp.pretraining_epochs_per_layer): # go through the training set batch_index=0 count=0 num_files=0 for x,y in dataset.train(self.hp.minibatch_size): - c = self.classifier.pretrain_functions[i](x) + c = self.classifier.pretrain_functions[i](x,learning_rate) count +=1 self.series["reconstruction_error"].append((epoch, batch_index), c) batch_index+=1 - #if batch_index % 100 == 0: - # print "100 batches" + #If we need to decrease the learning rate for the pretrain + if decrease != 0: + learning_rate -= decreasing # useful when doing tests if self.max_minibatches and batch_index >= self.max_minibatches: @@ -141,7 +152,7 @@ #To be able to load them later for tests on finetune self.parameters_pre=[copy(x.value) for x in self.classifier.params] f = open('params_pretrain.txt', 'w') - pickle.dump(self.parameters_pre,f) + cPickle.dump(self.parameters_pre,f,protocol=-1) f.close() @@ -204,7 +215,7 @@ parameters_finetune=[] if ind_test == 21: - learning_rate = self.hp.finetuning_lr / 5.0 + learning_rate = self.hp.finetuning_lr / 10.0 else: learning_rate = self.hp.finetuning_lr #The initial finetune lr @@ -295,8 +306,8 @@ break if decrease == 1: - if (ind_test == 21 & epoch % 100 == 0) | ind_test == 20: - learning_rate /= 2 #divide the learning rate by 2 for each new epoch of P07 (or 100 of NIST) + if (ind_test == 21 & epoch % 100 == 0) | ind_test == 20: + learning_rate /= 2 #divide the learning rate by 2 for each new epoch of P07 (or 100 of NIST) self.series['params'].append((epoch,), self.classifier.all_params) @@ -322,23 +333,23 @@ if special == 1: #To keep a track of the value of the parameters f = open('params_finetune_stanford.txt', 'w') - pickle.dump(parameters_finetune,f) + cPickle.dump(parameters_finetune,f,protocol=-1) f.close() elif ind_test == 0 | ind_test == 20: #To keep a track of the value of the parameters f = open('params_finetune_P07.txt', 'w') - pickle.dump(parameters_finetune,f) + cPickle.dump(parameters_finetune,f,protocol=-1) f.close() elif ind_test== 1: #For the run with 2 finetunes. It will be faster. f = open('params_finetune_NIST.txt', 'w') - pickle.dump(parameters_finetune,f) + cPickle.dump(parameters_finetune,f,protocol=-1) f.close() elif ind_test== 21: #To keep a track of the value of the parameters f = open('params_finetune_P07_then_NIST.txt', 'w') - pickle.dump(parameters_finetune,f) + cPickle.dump(parameters_finetune,f,protocol=-1) f.close() @@ -347,7 +358,7 @@ #self.parameters_pre=pickle.load('params_pretrain.txt') f = open(which) - self.parameters_pre=pickle.load(f) + self.parameters_pre=cPickle.load(f) f.close() for idx,x in enumerate(self.parameters_pre): if x.dtype=='float64': @@ -366,6 +377,147 @@ train_losses2 = [test_model(x,y) for x,y in iter2] train_score2 = numpy.mean(train_losses2) print "Training error is: " + str(train_score2) + + #To see the prediction of the model, the real answer and the image to judge + def see_error(self, dataset): + import pylab + #The function to know the prediction + test_model = \ + theano.function( + [self.classifier.x,self.classifier.y], self.classifier.logLayer.y_pred) + user = [] + nb_total = 0 #total number of exemples seen + nb_error = 0 #total number of errors + for x,y in dataset.test(1): + nb_total += 1 + pred = self.translate(test_model(x,y)) + rep = self.translate(y) + error = pred != rep + print 'prediction: ' + str(pred) +'\t answer: ' + str(rep) + '\t right: ' + str(not(error)) + pylab.imshow(x.reshape((32,32))) + pylab.draw() + if error: + nb_error += 1 + user.append(int(raw_input("1 = The error is normal, 0 = The error is not normal : "))) + print '\t\t character is hard to distinguish: ' + str(user[-1]) + else: + time.sleep(3) + print '\n Over the '+str(nb_total)+' exemples, there is '+str(nb_error)+' errors. \nThe percentage of errors is'+ str(float(nb_error)/float(nb_total)) + print 'The percentage of errors done by the model that an human will also do: ' + str(numpy.mean(user)) + + + + + #To translate the numeric prediction in character if necessary + def translate(self,y): + + if y <= 9: + return y[0] + elif y == 10: + return 'A' + elif y == 11: + return 'B' + elif y == 12: + return 'C' + elif y == 13: + return 'D' + elif y == 14: + return 'E' + elif y == 15: + return 'F' + elif y == 16: + return 'G' + elif y == 17: + return 'H' + elif y == 18: + return 'I' + elif y == 19: + return 'J' + elif y == 20: + return 'K' + elif y == 21: + return 'L' + elif y == 22: + return 'M' + elif y == 23: + return 'N' + elif y == 24: + return 'O' + elif y == 25: + return 'P' + elif y == 26: + return 'Q' + elif y == 27: + return 'R' + elif y == 28: + return 'S' + elif y == 29: + return 'T' + elif y == 30: + return 'U' + elif y == 31: + return 'V' + elif y == 32: + return 'W' + elif y == 33: + return 'X' + elif y == 34: + return 'Y' + elif y == 35: + return 'Z' + + elif y == 36: + return 'a' + elif y == 37: + return 'b' + elif y == 38: + return 'c' + elif y == 39: + return 'd' + elif y == 40: + return 'e' + elif y == 41: + return 'f' + elif y == 42: + return 'g' + elif y == 43: + return 'h' + elif y == 44: + return 'i' + elif y == 45: + return 'j' + elif y == 46: + return 'k' + elif y == 47: + return 'l' + elif y == 48: + return 'm' + elif y == 49: + return 'n' + elif y == 50: + return 'o' + elif y == 51: + return 'p' + elif y == 52: + return 'q' + elif y == 53: + return 'r' + elif y == 54: + return 's' + elif y == 55: + return 't' + elif y == 56: + return 'u' + elif y == 57: + return 'v' + elif y == 58: + return 'w' + elif y == 59: + return 'x' + elif y == 60: + return 'y' + elif y == 61: + return 'z'
--- a/deep/stacked_dae/v_sylvain/stacked_dae.py Sun Apr 25 12:31:22 2010 -0400 +++ b/deep/stacked_dae/v_sylvain/stacked_dae.py Sun Apr 25 13:54:57 2010 -0400 @@ -27,8 +27,10 @@ # initialize the baises b as a vector of n_out 0s self.b = theano.shared( value=numpy.zeros((n_out,), dtype = theano.config.floatX) ) - # compute vector of class-membership probabilities in symbolic form - self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) + # compute vector of class-membership. This is a sigmoid instead of + #a softmax to be able later to classify as nothing +## self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) #row-wise + self.p_y_given_x = T.nnet.sigmoid(T.dot(input, self.W)+self.b) # compute prediction as class whose probability is maximal in # symbolic form @@ -39,7 +41,13 @@ def negative_log_likelihood(self, y): - return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) +## return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]+T.sum(T.log(1-self.p_y_given_x), axis=1)-T.log(1-self.p_y_given_x)[T.arange(y.shape[0]),y]) + + +## def kullback_leibler(self,y): +## return -T.mean(T.log(1/float(self.p_y_given_x))[T.arange(y.shape[0]),y]) + def errors(self, y): # check if y has same dimension of y_pred @@ -71,7 +79,25 @@ self.output = T.nnet.sigmoid(T.dot(input, self.W) + self.b) self.params = [self.W, self.b] + +class TanhLayer(object): + def __init__(self, rng, input, n_in, n_out): + self.input = input + + W_values = numpy.asarray( rng.uniform( \ + low = -numpy.sqrt(6./(n_in+n_out)), \ + high = numpy.sqrt(6./(n_in+n_out)), \ + size = (n_in, n_out)), dtype = theano.config.floatX) + self.W = theano.shared(value = W_values) + + b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + self.output = (T.tanh(T.dot(input, self.W) + self.b) + 1.0)/2.0 + # ( *+ 1) /2 is because tanh goes from -1 to 1 and sigmoid goes from 0 to 1 + # I want to use tanh, but the image has to stay the same. The correction is necessary. + self.params = [self.W, self.b] class dA(object): @@ -132,22 +158,24 @@ # Equation (2) # note : y is stored as an attribute of the class so that it can be # used later when stacking dAs. - self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b) - # Equation (3) - #self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime) - # Equation (4) - # note : we sum over the size of a datapoint; if we are using minibatches, - # L will be a vector, with one entry per example in minibatch - #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) - #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1) - - # bypassing z to avoid running to log(0) - z_a = T.dot(self.y, self.W_prime) + self.b_prime - log_sigmoid = T.log(1.) - T.log(1.+T.exp(-z_a)) - # log(1-sigmoid(z_a)) - log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a)) - self.L = -T.sum( self.x * (log_sigmoid) \ - + (1.0-self.x) * (log_1_sigmoid), axis=1 ) + +## self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b) +## +## # Equation (3) +## #self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime) +## # Equation (4) +## # note : we sum over the size of a datapoint; if we are using minibatches, +## # L will be a vector, with one entry per example in minibatch +## #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) +## #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1) +## +## # bypassing z to avoid running to log(0) +## z_a = T.dot(self.y, self.W_prime) + self.b_prime +## log_sigmoid = T.log(1.) - T.log(1.+T.exp(-z_a)) +## # log(1-sigmoid(z_a)) +## log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a)) +## self.L = -T.sum( self.x * (log_sigmoid) \ +## + (1.0-self.x) * (log_1_sigmoid), axis=1 ) # I added this epsilon to avoid getting log(0) and 1/0 in grad # This means conceptually that there'd be no probability of 0, but that @@ -160,6 +188,20 @@ # of the reconstruction of the corresponding example of the # minibatch. We need to compute the average of all these to get # the cost of the minibatch + + #Or use a Tanh everything is always between 0 and 1, the range is + #changed so it remain the same as when sigmoid is used + self.y = (T.tanh(T.dot(self.tilde_x, self.W ) + self.b)+1.0)/2.0 + + self.z = (T.tanh(T.dot(self.y, self.W_prime) + self.b_prime)+1.0) / 2.0 + #To ensure to do not have a log(0) operation + if self.z <= 0: + self.z = 0.000001 + if self.z >= 1: + self.z = 0.999999 + + self.L = - T.sum( self.x*T.log(self.z) + (1.0-self.x)*T.log(1.0-self.z), axis=1 ) + self.cost = T.mean(self.L) self.params = [ self.W, self.b, self.b_prime ] @@ -204,6 +246,7 @@ self.y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels self.finetune_lr = T.fscalar('finetune_lr') #To get a dynamic finetune learning rate + self.pretrain_lr = T.fscalar('pretrain_lr') #To get a dynamic pretrain learning rate for i in xrange( self.n_layers ): # construct the sigmoidal layer @@ -222,8 +265,12 @@ layer_input = self.x else: layer_input = self.layers[-1].output + #We have to choose between sigmoidal layer or tanh layer ! - layer = SigmoidalLayer(rng, layer_input, input_size, +## layer = SigmoidalLayer(rng, layer_input, input_size, +## hidden_layers_sizes[i] ) + + layer = TanhLayer(rng, layer_input, input_size, hidden_layers_sizes[i] ) # add the layer to the self.layers += [layer] @@ -244,10 +291,10 @@ # compute the list of updates updates = {} for param, gparam in zip(dA_layer.params, gparams): - updates[param] = param - gparam * pretrain_lr + updates[param] = param - gparam * self.pretrain_lr # create a function that trains the dA - update_fn = theano.function([self.x], dA_layer.cost, \ + update_fn = theano.function([self.x, self.pretrain_lr], dA_layer.cost, \ updates = updates)#, # givens = { # self.x : ensemble})
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/v_sylvain/voir_erreurs.py Sun Apr 25 13:54:57 2010 -0400 @@ -0,0 +1,118 @@ +#!/usr/bin/python +# coding: utf-8 + +import ift6266 +import pylearn + +import numpy +import theano +import time + +import pylearn.version +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +import copy +import sys +import os +import os.path + +from jobman import DD +import jobman, jobman.sql +from pylearn.io import filetensor + +from utils import produit_cartesien_jobs +from copy import copy + +from sgd_optimization import SdaSgdOptimizer + +#from ift6266.utils.scalar_series import * +from ift6266.utils.seriestables import * +import tables + +from ift6266 import datasets +from config import * + +''' +Function called by jobman upon launching each job +Its path is the one given when inserting jobs: see EXPERIMENT_PATH +''' +def jobman_entrypoint(state, channel): + # record mercurial versions of each package + pylearn.version.record_versions(state,[theano,ift6266,pylearn]) + # TODO: remove this, bad for number of simultaneous requests on DB + channel.save() + + # For test runs, we don't want to use the whole dataset so + # reduce it to fewer elements if asked to. + rtt = None + if state.has_key('reduce_train_to'): + rtt = state['reduce_train_to'] + elif REDUCE_TRAIN_TO: + rtt = REDUCE_TRAIN_TO + + n_ins = 32*32 + n_outs = 62 # 10 digits + 26*2 (lower, capitals) + + examples_per_epoch = NIST_ALL_TRAIN_SIZE + + PATH = PATH_P07 + maximum_exemples=int(100) #Maximum number of exemples seen + + + + print "Creating optimizer with state, ", state + + optimizer = SdaSgdOptimizer(dataset=datasets.nist_all(), + hyperparameters=state, \ + n_ins=n_ins, n_outs=n_outs,\ + examples_per_epoch=examples_per_epoch, \ + max_minibatches=rtt) + + + + + print 'The model is created' + if os.path.exists(PATH+'params_finetune_NIST.txt'): + print ('\n finetune = NIST ') + optimizer.reload_parameters(PATH+'params_finetune_NIST.txt') + print "For" + str(maximum_exemples) + "over the NIST test set: " + optimizer.see_error(datasets.nist_all(maxsize=maximum_exemples)) + + + if os.path.exists(PATH+'params_finetune_P07.txt'): + print ('\n finetune = P07 ') + optimizer.reload_parameters(PATH+'params_finetune_P07.txt') + print "For" + str(maximum_exemples) + "over the P07 test set: " + optimizer.see_error(datasets.nist_P07(maxsize=maximum_exemples)) + + + if os.path.exists(PATH+'params_finetune_NIST_then_P07.txt'): + print ('\n finetune = NIST then P07') + optimizer.reload_parameters(PATH+'params_finetune_NIST_then_P07.txt') + print "For" + str(maximum_exemples) + "over the NIST test set: " + optimizer.see_error(datasets.nist_all(maxsize=maximum_exemples)) + print "For" + str(maximum_exemples) + "over the P07 test set: " + optimizer.see_error(datasets.nist_P07(maxsize=maximum_exemples)) + + if os.path.exists(PATH+'params_finetune_P07_then_NIST.txt'): + print ('\n finetune = P07 then NIST') + optimizer.reload_parameters(PATH+'params_finetune_P07_then_NIST.txt') + print "For" + str(maximum_exemples) + "over the P07 test set: " + optimizer.see_error(datasets.nist_P07(maxsize=maximum_exemples)) + print "For" + str(maximum_exemples) + "over the NIST test set: " + optimizer.see_error(datasets.nist_all(maxsize=maximum_exemples)) + + channel.save() + + return channel.COMPLETE + + + +if __name__ == '__main__': + + + chanmock = DD({'COMPLETE':0,'save':(lambda:None)}) + jobman_entrypoint(DD(DEFAULT_HP_NIST), chanmock) + +
--- a/scripts/launch_generate100.py Sun Apr 25 12:31:22 2010 -0400 +++ b/scripts/launch_generate100.py Sun Apr 25 13:54:57 2010 -0400 @@ -1,9 +1,10 @@ #!/usr/bin/env python import os -dir1 = "/data/lisa/data/ift6266h10/" +dir1 = "/data/lisa8/data/ift6266h10/" +dir2="/data/lisa/data/ift6266h10/" -mach = ["maggie16.iro.umontreal.ca,zappa8@iro.umontreal.ca"] +mach = "maggie16.iro.umontreal.ca,zappa8.iro.umontreal.ca,maggie15.iro.umontreal.ca,brams04.iro.umontreal.ca" #test and valid sets for i,s in enumerate(['valid','test']): @@ -16,11 +17,18 @@ os.system("dbidispatch --condor --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/P07_train%d_data.ft -p %sdata/P07_train%d_params -x %sdata/P07_train%d_labels.ft -f %strain_data.ft -l %strain_labels.ft -c %socr_train_data.ft -d %socr_train_labels.ft -m 0.7 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s 819200 -y %d" % (mach, dir1, i, dir1, i, dir1, i, dir1, dir1, dir1, dir1, 100+i)) #PNIST07 -for i in range(100): - os.system("dbidispatch --condor --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/PNIST07_train%d_data.ft -p %sdata/PNIST07_train%d_params -x %sdata/PNIST07_train%d_labels.ft -f %strain_data.ft -l %strain_labels.ft -c %socr_train_data.ft -d %socr_train_labels.ft -m 0.7 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s 819200 -y %d -t %d" % (mach, dir1, i, dir1, i, dir1, i, dir1, dir1, dir1, dir1, 100+i,1)) +for i in xrange(10,100): + os.system("dbidispatch --condor --mem=3900 --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/PNIST07_train%d_data.ft -p %sdata/PNIST07_train%d_params -x %sdata/PNIST07_train%d_labels.ft -f %strain_data.ft -l %strain_labels.ft -c %socr_train_data.ft -d %socr_train_labels.ft -m 0.7 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s 819200 -y %d -t 1" % (mach, dir1, i, dir1, i, dir1, i, dir2, dir2, dir2, dir2, 100+i)) -#P07 -#for i in [90,94]:#[2,10,13,15,20,49,68,82,86,90,94]: - #os.system("dbidispatch --condor --mem=3900 --os=fc4,fc7,fc9 --machine=maggie16.iro.umontreal.ca --machine=maggie15.iro.umontreal.ca --machine=zappa8@iro.umontreal.ca ./run_pipeline.sh -o %sdata2/P07_train%d_data.ft -p %sdata2/P07_train%d_params -x %sdata2/P07_train%d_labels.ft -f %strain_data.ft -l %strain_labels.ft -c %socr_train_data.ft -d %socr_train_labels.ft -m 0.7 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s 819200 -y %d" % (dir1, i, dir1, i, dir1, i, dir1, dir1, dir1, dir1,100+i)) +#PNIST_full_noise +for i in range(100): + os.system("dbidispatch --condor --mem=3900 --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/Pin07_train%d_data.ft -p %sdata/Pin07_train%d_params -x %sdata/Pin07_train%d_labels.ft -f %strain_data.ft -l %strain_labels.ft -c %socr_train_data.ft -d %socr_train_labels.ft -m 0.7 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s 819200 -y %d" % (mach, dir1, i, dir1, i, dir1, i, dir2, dir2, dir2, dir2,100+i)) + + + +#P07_safe +for i in xrange(89,100,1): + os.system("dbidispatch --condor --mem=3900 --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/P07safe_train%d_data.ft -p %sdata/P07safe_train%d_params -x %sdata/P07safe_train%d_labels.ft -f %strain_data.ft -l %strain_labels.ft -c %socr_train_data.ft -d %socr_train_labels.ft -m 0.7 -z 0.1 -a 0.0 -b 0.0 -g 0.0 -s 819200 -y %d" % (mach, dir1, i, dir1, i, dir1, i, dir2, dir2, dir2,dir2,100+i)) +
--- a/scripts/setup_batches.py Sun Apr 25 12:31:22 2010 -0400 +++ b/scripts/setup_batches.py Sun Apr 25 13:54:57 2010 -0400 @@ -15,8 +15,14 @@ lower_train_data = 'lower/lower_train_data.ft' lower_train_labels = 'lower/lower_train_labels.ft' + lower_test_data = 'lower/lower_test_data.ft' + lower_test_labels = 'lower/lower_test_labels.ft' + upper_train_data = 'upper/upper_train_data.ft' upper_train_labels = 'upper/upper_train_labels.ft' + upper_test_data = 'upper/upper_test_data.ft' + upper_test_labels = 'upper/upper_test_labels.ft' + test_data = 'all/all_test_data.ft' test_labels = 'all/all_test_labels.ft' @@ -29,11 +35,16 @@ f_lower_train_data = open(data_path + lower_train_data) f_lower_train_labels = open(data_path + lower_train_labels) + f_lower_test_data = open(data_path + lower_test_data) + f_lower_test_labels = open(data_path + lower_test_labels) + f_upper_train_data = open(data_path + upper_train_data) f_upper_train_labels = open(data_path + upper_train_labels) + f_upper_test_data = open(data_path + upper_test_data) + f_upper_test_labels = open(data_path + upper_test_labels) - f_test_data = open(data_path + test_data) - f_test_labels = open(data_path + test_labels) + #f_test_data = open(data_path + test_data) + #f_test_labels = open(data_path + test_labels) self.raw_digits_train_data = ft.read(f_digits_train_data) self.raw_digits_train_labels = ft.read(f_digits_train_labels) @@ -42,11 +53,16 @@ self.raw_lower_train_data = ft.read(f_lower_train_data) self.raw_lower_train_labels = ft.read(f_lower_train_labels) + self.raw_lower_test_data = ft.read(f_lower_test_data) + self.raw_lower_test_labels = ft.read(f_lower_test_labels) + self.raw_upper_train_data = ft.read(f_upper_train_data) self.raw_upper_train_labels = ft.read(f_upper_train_labels) + self.raw_upper_test_data = ft.read(f_upper_test_data) + self.raw_upper_test_labels = ft.read(f_upper_test_labels) - self.raw_test_data = ft.read(f_test_data) - self.raw_test_labels = ft.read(f_test_labels) + #self.raw_test_data = ft.read(f_test_data) + #self.raw_test_labels = ft.read(f_test_labels) f_digits_train_data.close() f_digits_train_labels.close() @@ -55,41 +71,73 @@ f_lower_train_data.close() f_lower_train_labels.close() + f_lower_test_data.close() + f_lower_test_labels.close() + f_upper_train_data.close() f_upper_train_labels.close() + f_upper_test_data.close() + f_upper_test_labels.close() - f_test_data.close() - f_test_labels.close() + #f_test_data.close() + #f_test_labels.close() print 'Data opened' - def set_batches(self, start_ratio = -1, end_ratio = -1, batch_size = 20, verbose = False): + def set_batches(self, main_class = "d", start_ratio = -1, end_ratio = -1, batch_size = 20, verbose = False): self.batch_size = batch_size digits_train_size = len(self.raw_digits_train_labels) digits_test_size = len(self.raw_digits_test_labels) lower_train_size = len(self.raw_lower_train_labels) + upper_train_size = len(self.raw_upper_train_labels) + upper_test_size = len(self.raw_upper_test_labels) if verbose == True: print 'digits_train_size = %d' %digits_train_size print 'digits_test_size = %d' %digits_test_size print 'lower_train_size = %d' %lower_train_size print 'upper_train_size = %d' %upper_train_size + print 'upper_test_size = %d' %upper_test_size - # define main and other datasets - raw_main_train_data = self.raw_digits_train_data - raw_other_train_data1 = self.raw_lower_train_labels - raw_other_train_data2 = self.raw_upper_train_labels - raw_test_data = self.raw_digits_test_data - #raw_test_data = self.raw_test_data + if main_class == "u": + # define main and other datasets + raw_main_train_data = self.raw_upper_train_data + raw_other_train_data1 = self.raw_lower_train_labels + raw_other_train_data2 = self.raw_digits_train_labels + raw_test_data = self.raw_upper_test_data + + raw_main_train_labels = self.raw_upper_train_labels + raw_other_train_labels1 = self.raw_lower_train_labels + raw_other_train_labels2 = self.raw_digits_train_labels + raw_test_labels = self.raw_upper_test_labels - raw_main_train_labels = self.raw_digits_train_labels - raw_other_train_labels1 = self.raw_lower_train_labels - raw_other_train_labels2 = self.raw_upper_train_labels - raw_test_labels = self.raw_digits_test_labels - #raw_test_labels = self.raw_test_labels + elif main_class == "l": + # define main and other datasets + raw_main_train_data = self.raw_lower_train_data + raw_other_train_data1 = self.raw_upper_train_labels + raw_other_train_data2 = self.raw_digits_train_labels + raw_test_data = self.raw_lower_test_data + + raw_main_train_labels = self.raw_lower_train_labels + raw_other_train_labels1 = self.raw_upper_train_labels + raw_other_train_labels2 = self.raw_digits_train_labels + raw_test_labels = self.raw_lower_test_labels + + else: + main_class = "d" + # define main and other datasets + raw_main_train_data = self.raw_digits_train_data + raw_other_train_data1 = self.raw_lower_train_labels + raw_other_train_data2 = self.raw_upper_train_labels + raw_test_data = self.raw_digits_test_data + + raw_main_train_labels = self.raw_digits_train_labels + raw_other_train_labels1 = self.raw_lower_train_labels + raw_other_train_labels2 = self.raw_upper_train_labels + raw_test_labels = self.raw_digits_test_labels main_train_size = len(raw_main_train_labels) other_train_size1 = len(raw_other_train_labels1) @@ -113,6 +161,7 @@ self.end_ratio = end_ratio if verbose == True: + print 'main class : %s' %main_class print 'start_ratio = %f' %self.start_ratio print 'end_ratio = %f' %self.end_ratio