# HG changeset patch # User Xavier Glorot # Date 1268756050 14400 # Node ID 2024368a8d3d0ab3e7b359b0a3bd7ed74e40ebee # Parent 0de14b2034c60f89d0635b4b803072aba30db291# Parent 3c54cb3713ef1b1ab708d2dcb28f86ade57949e3 merge diff -r 0de14b2034c6 -r 2024368a8d3d baseline/__init__.py diff -r 0de14b2034c6 -r 2024368a8d3d baseline/conv_mlp/__init__.py diff -r 0de14b2034c6 -r 2024368a8d3d baseline/conv_mlp/convolutional_mlp.py --- a/baseline/conv_mlp/convolutional_mlp.py Tue Mar 16 12:13:49 2010 -0400 +++ b/baseline/conv_mlp/convolutional_mlp.py Tue Mar 16 12:14:10 2010 -0400 @@ -26,7 +26,8 @@ import theano.sandbox.softsign import pylearn.datasets.MNIST from pylearn.io import filetensor as ft -from theano.sandbox import conv, downsample +from theano.tensor.signal import downsample +from theano.tensor.nnet import conv class LeNetConvPoolLayer(object): diff -r 0de14b2034c6 -r 2024368a8d3d baseline/deep_mlp/__init__.py diff -r 0de14b2034c6 -r 2024368a8d3d baseline/log_reg/__init__.py diff -r 0de14b2034c6 -r 2024368a8d3d baseline/log_reg/log_reg.py --- a/baseline/log_reg/log_reg.py Tue Mar 16 12:13:49 2010 -0400 +++ b/baseline/log_reg/log_reg.py Tue Mar 16 12:14:10 2010 -0400 @@ -35,11 +35,11 @@ """ __docformat__ = 'restructedtext en' -import numpy, time, cPickle, gzip +import numpy, time import theano import theano.tensor as T - +from ift6266 import datasets class LogisticRegression(object): """Multi-class Logistic Regression Class @@ -112,6 +112,8 @@ # i.e., the mean log-likelihood across the minibatch. return -T.mean( T.log( self.p_y_given_x )[ T.arange( y.shape[0] ), y ] ) + def MSE(self, y): + return -T.mean(abs((self.p_t_given_x)[T.arange(y.shape[0]), y]-y)**2) def errors( self, y ): """Return a float representing the number of errors in the minibatch @@ -135,109 +137,15 @@ else: raise NotImplementedError() -def shared_dataset( data_xy ): - """ Function that loads the dataset into shared variables - - The reason we store our dataset in shared variables is to allow - Theano to copy it into the GPU memory (when code is run on GPU). - Since copying data into the GPU is slow, copying a minibatch everytime - is needed (the default behaviour if the data is not in a shared - variable) would lead to a large decrease in performance. - """ - data_x, data_y = data_xy - shared_x = theano.shared( numpy.asarray( data_x, dtype = theano.config.floatX ) ) - shared_y = theano.shared( numpy.asarray( data_y, dtype = theano.config.floatX ) ) - # When storing data on the GPU it has to be stored as floats - # therefore we will store the labels as ``floatX`` as well - # (``shared_y`` does exactly that). But during our computations - # we need them as ints (we use labels as index, and if they are - # floats it doesn't make sense) therefore instead of returning - # ``shared_y`` we will have to cast it to int. This little hack - # lets ous get around this issue - return shared_x, T.cast( shared_y, 'int32' ) - -def load_data_pkl_gz( dataset ): - ''' Loads the dataset - - :type dataset: string - :param dataset: the path to the dataset (here MNIST) - ''' - - #-------------------------------------------------------------------------------------------------------------------- - # Load Data - #-------------------------------------------------------------------------------------------------------------------- - - - print '... 
loading data' - - # Load the dataset - f = gzip.open(dataset,'rb') - train_set, valid_set, test_set = cPickle.load(f) - f.close() - - test_set_x, test_set_y = shared_dataset( test_set ) - valid_set_x, valid_set_y = shared_dataset( valid_set ) - train_set_x, train_set_y = shared_dataset( train_set ) - - rval = [ ( train_set_x, train_set_y ), ( valid_set_x,valid_set_y ), ( test_set_x, test_set_y ) ] - return rval - -##def load_data_ft( verbose = False,\ -## data_path = '/data/lisa/data/nist/by_class/'\ -## train_data = 'all/all_train_data.ft',\ -## train_labels = 'all/all_train_labels.ft',\ -## test_data = 'all/all_test_data.ft',\ -## test_labels = 'all/all_test_labels.ft'): -## -## train_data_file = open(data_path + train_data) -## train_labels_file = open(data_path + train_labels) -## test_labels_file = open(data_path + test_data) -## test_data_file = open(data_path + test_labels) -## -## raw_train_data = ft.read( train_data_file) -## raw_train_labels = ft.read(train_labels_file) -## raw_test_data = ft.read( test_labels_file) -## raw_test_labels = ft.read( test_data_file) -## -## f.close() -## g.close() -## i.close() -## h.close() -## -## -## test_set_x, test_set_y = shared_dataset(test_set) -## valid_set_x, valid_set_y = shared_dataset(valid_set) -## train_set_x, train_set_y = shared_dataset(train_set) -## -## rval = [(train_set_x, train_set_y), (valid_set_x,valid_set_y), (test_set_x, test_set_y)] -## return rval -## #create a validation set the same size as the test size -## #use the end of the training array for this purpose -## #discard the last remaining so we get a %batch_size number -## test_size=len(raw_test_labels) -## test_size = int(test_size/batch_size) -## test_size*=batch_size -## train_size = len(raw_train_data) -## train_size = int(train_size/batch_size) -## train_size*=batch_size -## validation_size =test_size -## offset = train_size-test_size -## if verbose == True: -## print 'train size = %d' %train_size -## print 'test size = %d' %test_size -## print 'valid size = %d' %validation_size -## print 'offset = %d' %offset -## -## - #-------------------------------------------------------------------------------------------------------------------- # MAIN #-------------------------------------------------------------------------------------------------------------------- def log_reg( learning_rate = 0.13, nb_max_examples =1000000, batch_size = 50, \ - dataset_name = 'mnist.pkl.gz', image_size = 28 * 28, nb_class = 10, \ + dataset=datasets.nist_digits, image_size = 32 * 32, nb_class = 10, \ patience = 5000, patience_increase = 2, improvement_threshold = 0.995): + #28 * 28 = 784 """ Demonstrate stochastic gradient descent optimization of a log-linear model @@ -254,9 +162,8 @@ :type batch_size: int :param batch_size: size of the minibatch - :type dataset_name: string - :param dataset: the path of the MNIST dataset file from - http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz + :type dataset: dataset + :param dataset: a dataset instance from ift6266.datasets :type image_size: int :param image_size: size of the input image in pixels (width * height) @@ -275,17 +182,6 @@ """ - datasets = load_data_pkl_gz( dataset_name ) - - train_set_x, train_set_y = datasets[0] - valid_set_x, valid_set_y = datasets[1] - test_set_x , test_set_y = datasets[2] - - # compute number of minibatches for training, validation and testing - n_train_batches = train_set_x.value.shape[0] / batch_size - n_valid_batches = valid_set_x.value.shape[0] / batch_size - n_test_batches = 
test_set_x.value.shape[0] / batch_size - #-------------------------------------------------------------------------------------------------------------------- # Build actual model #-------------------------------------------------------------------------------------------------------------------- @@ -308,17 +204,11 @@ # compiling a Theano function that computes the mistakes that are made by # the model on a minibatch - test_model = theano.function( inputs = [ index ], - outputs = classifier.errors( y ), - givens = { - x:test_set_x[ index * batch_size: ( index + 1 ) * batch_size ], - y:test_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } ) + test_model = theano.function( inputs = [ x, y ], + outputs = classifier.errors( y )) - validate_model = theano.function( inputs = [ index ], - outputs = classifier.errors( y ), - givens = { - x:valid_set_x[ index * batch_size: ( index + 1 ) * batch_size ], - y:valid_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } ) + validate_model = theano.function( inputs = [ x, y ], + outputs = classifier.errors( y )) # compute the gradient of cost with respect to theta = ( W, b ) g_W = T.grad( cost = cost, wrt = classifier.W ) @@ -331,12 +221,9 @@ # compiling a Theano function `train_model` that returns the cost, but in # the same time updates the parameter of the model based on the rules # defined in `updates` - train_model = theano.function( inputs = [ index ], + train_model = theano.function( inputs = [ x, y ], outputs = cost, - updates = updates, - givens = { - x: train_set_x[ index * batch_size: ( index + 1 ) * batch_size ], - y: train_set_y[ index * batch_size: ( index + 1 ) * batch_size ] } ) + updates = updates) #-------------------------------------------------------------------------------------------------------------------- # Train model @@ -349,38 +236,38 @@ # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant - validation_frequency = min( n_train_batches, patience * 0.5 ) + validation_frequency = patience * 0.5 # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch - best_params = None + best_params = None best_validation_loss = float('inf') - test_score = 0. - start_time = time.clock() + test_score = 0. + start_time = time.clock() done_looping = False - n_epochs = nb_max_examples / train_set_x.value.shape[0] - epoch = 0 + n_iters = nb_max_examples / batch_size + epoch = 0 + iter = 0 - while ( epoch < n_epochs ) and ( not done_looping ): + while ( iter < n_iters ) and ( not done_looping ): epoch = epoch + 1 - for minibatch_index in xrange( n_train_batches ): + for x, y in dataset.train(batch_size): - minibatch_avg_cost = train_model( minibatch_index ) + minibatch_avg_cost = train_model( x, y ) # iteration number - iter = epoch * n_train_batches + minibatch_index + iter += 1 - if ( iter + 1 ) % validation_frequency == 0: + if iter % validation_frequency == 0: # compute zero-one loss on validation set - validation_losses = [ validate_model( i ) for i in xrange( n_valid_batches ) ] + validation_losses = [ validate_model( xv, yv ) for xv, yv in dataset.valid(batch_size) ] this_validation_loss = numpy.mean( validation_losses ) - print('epoch %i, minibatch %i/%i, validation error %f %%' % \ - ( epoch, minibatch_index + 1,n_train_batches, \ - this_validation_loss*100. ) ) + print('epoch %i, iter %i, validation error %f %%' % \ + ( epoch, iter, this_validation_loss*100. 
) ) # if we got the best validation score until now @@ -393,12 +280,12 @@ best_validation_loss = this_validation_loss # test it on the test set - test_losses = [test_model(i) for i in xrange(n_test_batches)] + test_losses = [test_model(xt, yt) for xt, yt in dataset.test(batch_size)] test_score = numpy.mean(test_losses) - print((' epoch %i, minibatch %i/%i, test error of best ' + print((' epoch %i, iter %i, test error of best ' 'model %f %%') % \ - (epoch, minibatch_index+1, n_train_batches,test_score*100.)) + (epoch, iter, test_score*100.)) if patience <= iter : done_looping = True @@ -410,20 +297,25 @@ ( best_validation_loss * 100., test_score * 100.)) print ('The code ran for %f minutes' % ((end_time-start_time) / 60.)) - ###### return validation_error, test_error, nb_exemples, time + return best_validation_loss, test_score, iter*batch_size, (end_time-start_time) / 60. if __name__ == '__main__': log_reg() def jobman_log_reg(state, channel): - (validation_error, test_error, nb_exemples, time) = log_reg( learning_rate = state.learning_rate,\ - nb_max_examples = state.nb_max_examples,\ - batch_size = state.batch_size,\ - dataset_name = state.dataset_name, \ + print state + (validation_error, test_error, nb_exemples, time) = log_reg( learning_rate = state.learning_rate, \ + nb_max_examples = state.nb_max_examples, \ + batch_size = state.batch_size,\ image_size = state.image_size, \ - nb_class = state.nb_class ) - + nb_class = state.nb_class, \ + patience = state.patience, \ + patience_increase = state.patience_increase, \ + improvement_threshold = state.improvement_threshold ) + + + print state state.validation_error = validation_error state.test_error = test_error state.nb_exemples = nb_exemples diff -r 0de14b2034c6 -r 2024368a8d3d baseline/mlp/__init__.py diff -r 0de14b2034c6 -r 2024368a8d3d baseline/mlp/mlp_get_error_from_model.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/baseline/mlp/mlp_get_error_from_model.py Tue Mar 16 12:14:10 2010 -0400 @@ -0,0 +1,151 @@ +__docformat__ = 'restructedtext en' + +import pdb +import numpy as np +import pylab +import time +import pylearn +from pylearn.io import filetensor as ft + +data_path = '/data/lisa/data/nist/by_class/' +test_data = 'all/all_train_data.ft' +test_labels = 'all/all_train_labels.ft' + +def read_test_data(mlp_model): + + + #read the data + h = open(data_path+test_data) + i= open(data_path+test_labels) + raw_test_data = ft.read(h) + raw_test_labels = ft.read(i) + i.close() + h.close() + + #read the model chosen + a=np.load(mlp_model) + W1=a['W1'] + W2=a['W2'] + b1=a['b1'] + b2=a['b2'] + + return (W1,b1,W2,b2,raw_test_data,raw_test_labels) + + + + +def get_total_test_error(everything): + + W1=everything[0] + b1=everything[1] + W2=everything[2] + b2=everything[3] + test_data=everything[4] + test_labels=everything[5] + total_error_count=0 + total_exemple_count=0 + + nb_error_count=0 + nb_exemple_count=0 + + char_error_count=0 + char_exemple_count=0 + + min_error_count=0 + min_exemple_count=0 + + maj_error_count=0 + maj_exemple_count=0 + + for i in range(test_labels.size): + total_exemple_count = total_exemple_count +1 + #get activation for layer 1 + a0=np.dot(np.transpose(W1),np.transpose(test_data[i]/255.0)) + b1 + #add non linear function to layer 1 activation + a0_out=np.tanh(a0) + + #get activation for output layer + a1= np.dot(np.transpose(W2),a0_out) + b2 + #add non linear function for output activation (softmax) + a1_exp = np.exp(a1) + sum_a1=np.sum(a1_exp) + a1_out=a1_exp/sum_a1 + + predicted_class=np.argmax(a1_out) + 
wanted_class=test_labels[i] + + if(predicted_class!=wanted_class): + total_error_count = total_error_count +1 + + #get grouped based error + #with a priori +# if(wanted_class>9 and wanted_class<35): +# min_exemple_count=min_exemple_count+1 +# predicted_class=np.argmax(a1_out[10:35])+10 +# if(predicted_class!=wanted_class): +# min_error_count=min_error_count+1 +# if(wanted_class<10): +# nb_exemple_count=nb_exemple_count+1 +# predicted_class=np.argmax(a1_out[0:10]) +# if(predicted_class!=wanted_class): +# nb_error_count=nb_error_count+1 +# if(wanted_class>34): +# maj_exemple_count=maj_exemple_count+1 +# predicted_class=np.argmax(a1_out[35:])+35 +# if(predicted_class!=wanted_class): +# maj_error_count=maj_error_count+1 +# +# if(wanted_class>9): +# char_exemple_count=char_exemple_count+1 +# predicted_class=np.argmax(a1_out[10:])+10 +# if(predicted_class!=wanted_class): +# char_error_count=char_error_count+1 + + + + #get grouped based error + #with no a priori + if(wanted_class>9 and wanted_class<35): + min_exemple_count=min_exemple_count+1 + predicted_class=np.argmax(a1_out) + if(predicted_class!=wanted_class): + min_error_count=min_error_count+1 + if(wanted_class<10): + nb_exemple_count=nb_exemple_count+1 + predicted_class=np.argmax(a1_out) + if(predicted_class!=wanted_class): + nb_error_count=nb_error_count+1 + if(wanted_class>34): + maj_exemple_count=maj_exemple_count+1 + predicted_class=np.argmax(a1_out) + if(predicted_class!=wanted_class): + maj_error_count=maj_error_count+1 + + if(wanted_class>9): + char_exemple_count=char_exemple_count+1 + predicted_class=np.argmax(a1_out) + if(predicted_class!=wanted_class): + char_error_count=char_error_count+1 + + + #convert to float + return ( total_exemple_count,nb_exemple_count,char_exemple_count,min_exemple_count,maj_exemple_count,\ + total_error_count,nb_error_count,char_error_count,min_error_count,maj_error_count,\ + total_error_count*100.0/total_exemple_count*1.0,\ + nb_error_count*100.0/nb_exemple_count*1.0,\ + char_error_count*100.0/char_exemple_count*1.0,\ + min_error_count*100.0/min_exemple_count*1.0,\ + maj_error_count*100.0/maj_exemple_count*1.0) + + + + + + + + + + + + + \ No newline at end of file diff -r 0de14b2034c6 -r 2024368a8d3d baseline/mlp/mlp_nist.py --- a/baseline/mlp/mlp_nist.py Tue Mar 16 12:13:49 2010 -0400 +++ b/baseline/mlp/mlp_nist.py Tue Mar 16 12:14:10 2010 -0400 @@ -31,6 +31,7 @@ import time import theano.tensor.nnet import pylearn +import theano,pylearn.version from pylearn.io import filetensor as ft data_path = '/data/lisa/data/nist/by_class/' @@ -174,17 +175,22 @@ nb_max_exemples=1000000,\ batch_size=20,\ nb_hidden = 500,\ - nb_targets = 62): + nb_targets = 62, + tau=1e6): configuration = [learning_rate,nb_max_exemples,nb_hidden,adaptive_lr] + #save initial learning rate if classical adaptive lr is used + initial_lr=learning_rate + total_validation_error_list = [] total_train_error_list = [] learning_rate_list=[] best_training_error=float('inf'); + f = open(data_path+train_data) g= open(data_path+train_labels) @@ -315,6 +321,8 @@ n_iter = nb_max_exemples/batch_size # nb of max times we are allowed to run through all exemples n_iter = n_iter/n_minibatches + 1 #round up n_iter=max(1,n_iter) # run at least once on short debug call + time_n=0 #in unit of exemples + if verbose == True: @@ -325,6 +333,9 @@ epoch = iter / n_minibatches minibatch_index = iter % n_minibatches + + if adaptive_lr==2: + classifier.lr.value = tau*initial_lr/(tau+time_n) # get the minibatches corresponding to `iter` modulo @@ -364,6 +375,8 @@ 
print('epoch %i, minibatch %i/%i, validation error %f, training error %f %%' % \ (epoch, minibatch_index+1, n_minibatches, \ this_validation_loss*100.,this_train_loss*100)) + print 'learning rate = %f' %classifier.lr.value + print 'time = %i' %time_n #save the learning rate @@ -425,6 +438,7 @@ break + time_n= time_n + batch_size end_time = time.clock() if verbose == True: print(('Optimization complete. Best validation score of %f %% ' @@ -448,7 +462,8 @@ (train_error,validation_error,test_error,nb_exemples,time)=mlp_full_nist(learning_rate=state.learning_rate,\ nb_max_exemples=state.nb_max_exemples,\ nb_hidden=state.nb_hidden,\ - adaptive_lr=state.adaptive_lr) + adaptive_lr=state.adaptive_lr,\ + tau=state.tau) state.train_error=train_error state.validation_error=validation_error state.test_error=test_error diff -r 0de14b2034c6 -r 2024368a8d3d datasets/defs.py --- a/datasets/defs.py Tue Mar 16 12:13:49 2010 -0400 +++ b/datasets/defs.py Tue Mar 16 12:14:10 2010 -0400 @@ -1,38 +1,54 @@ -__all__ = ['nist_digits', 'nist_lower', 'nist_upper', 'nist_all', 'ocr'] +__all__ = ['nist_digits', 'nist_lower', 'nist_upper', 'nist_all', 'ocr', + 'nist_P07', 'mnist'] from ftfile import FTDataSet +from gzpklfile import GzpklDataSet import theano - -NIST_PATH = '/data/lisa/data/nist/by_class/' -DATA_PATH = '/data/lisa/data/ift6266h10/' +import os -nist_digits = FTDataSet(train_data = [NIST_PATH+'digits/digits_train_data.ft'], - train_lbl = [NIST_PATH+'digits/digits_train_labels.ft'], - test_data = [NIST_PATH+'digits/digits_test_data.ft'], - test_lbl = [NIST_PATH+'digits/digits_test_labels.ft'], +# if the environmental variables exist, get the path from them, +# otherwise fall back on the default +NIST_PATH = os.getenv('NIST_PATH','/data/lisa/data/nist/by_class/') +DATA_PATH = os.getenv('DATA_PATH','/data/lisa/data/ift6266h10/') + +nist_digits = FTDataSet(train_data = [os.path.join(NIST_PATH,'digits/digits_train_data.ft')], + train_lbl = [os.path.join(NIST_PATH,'digits/digits_train_labels.ft')], + test_data = [os.path.join(NIST_PATH,'digits/digits_test_data.ft')], + test_lbl = [os.path.join(NIST_PATH,'digits/digits_test_labels.ft')], indtype=theano.config.floatX, inscale=255.) -nist_lower = FTDataSet(train_data = [NIST_PATH+'lower/lower_train_data.ft'], - train_lbl = [NIST_PATH+'lower/lower_train_labels.ft'], - test_data = [NIST_PATH+'lower/lower_test_data.ft'], - test_lbl = [NIST_PATH+'lower/lower_test_labels.ft'], +nist_lower = FTDataSet(train_data = [os.path.join(NIST_PATH,'lower/lower_train_data.ft')], + train_lbl = [os.path.join(NIST_PATH,'lower/lower_train_labels.ft')], + test_data = [os.path.join(NIST_PATH,'lower/lower_test_data.ft')], + test_lbl = [os.path.join(NIST_PATH,'lower/lower_test_labels.ft')], indtype=theano.config.floatX, inscale=255.) -nist_upper = FTDataSet(train_data = [NIST_PATH+'upper/upper_train_data.ft'], - train_lbl = [NIST_PATH+'upper/upper_train_labels.ft'], - test_data = [NIST_PATH+'upper/upper_test_data.ft'], - test_lbl = [NIST_PATH+'upper/upper_test_labels.ft'], +nist_upper = FTDataSet(train_data = [os.path.join(NIST_PATH,'upper/upper_train_data.ft')], + train_lbl = [os.path.join(NIST_PATH,'upper/upper_train_labels.ft')], + test_data = [os.path.join(NIST_PATH,'upper/upper_test_data.ft')], + test_lbl = [os.path.join(NIST_PATH,'upper/upper_test_labels.ft')], indtype=theano.config.floatX, inscale=255.) 
-nist_all = FTDataSet(train_data = [DATA_PATH+'train_data.ft'], - train_lbl = [DATA_PATH+'train_labels.ft'], - test_data = [DATA_PATH+'test_data.ft'], - test_lbl = [DATA_PATH+'test_labels.ft'], - valid_data = [DATA_PATH+'valid_data.ft'], - valid_lbl = [DATA_PATH+'valid_labels.ft'], +nist_all = FTDataSet(train_data = [os.path.join(DATA_PATH,'train_data.ft')], + train_lbl = [os.path.join(DATA_PATH,'train_labels.ft')], + test_data = [os.path.join(DATA_PATH,'test_data.ft')], + test_lbl = [os.path.join(DATA_PATH,'test_labels.ft')], + valid_data = [os.path.join(DATA_PATH,'valid_data.ft')], + valid_lbl = [os.path.join(DATA_PATH,'valid_labels.ft')], indtype=theano.config.floatX, inscale=255.) -ocr = FTDataSet(train_data = [DATA_PATH+'ocr_train_data.ft'], - train_lbl = [DATA_PATH+'ocr_train_labels.ft'], - test_data = [DATA_PATH+'ocr_test_data.ft'], - test_lbl = [DATA_PATH+'ocr_test_labels.ft'], - valid_data = [DATA_PATH+'ocr_valid_data.ft'], - valid_lbl = [DATA_PATH+'ocr_valid_labels.ft']) +ocr = FTDataSet(train_data = [os.path.join(DATA_PATH,'ocr_train_data.ft')], + train_lbl = [os.path.join(DATA_PATH,'ocr_train_labels.ft')], + test_data = [os.path.join(DATA_PATH,'ocr_test_data.ft')], + test_lbl = [os.path.join(DATA_PATH,'ocr_test_labels.ft')], + valid_data = [os.path.join(DATA_PATH,'ocr_valid_data.ft')], + valid_lbl = [os.path.join(DATA_PATH,'ocr_valid_labels.ft')], + indtype=theano.config.floatX, inscale=255.) + +nist_P07 = FTDataSet(train_data = [os.path.join(DATA_PATH,'data/P07_train'+str(i)+'_data.ft') for i in range(100)], + train_lbl = [os.path.join(DATA_PATH,'data/P07_train'+str(i)+'_labels.ft') for i in range(100)], + test_data = [os.path.join(DATA_PATH,'data/P07_test_data.ft')], + test_lbl = [os.path.join(DATA_PATH,'data/P07_test_labels.ft')], + valid_data = [os.path.join(DATA_PATH,'data/P07_valid_data.ft')], + valid_lbl = [os.path.join(DATA_PATH,'data/P07_valid_labels.ft')], + indtype=theano.config.floatX, inscale=255.) 
+ +mnist = GzpklDataSet(os.path.join(DATA_PATH,'mnist.pkl.gz')) diff -r 0de14b2034c6 -r 2024368a8d3d datasets/ftfile.py --- a/datasets/ftfile.py Tue Mar 16 12:13:49 2010 -0400 +++ b/datasets/ftfile.py Tue Mar 16 12:14:10 2010 -0400 @@ -193,12 +193,19 @@ if valid_data is None: total_valid_size = sum(FTFile(td).size for td in test_data) valid_size = total_valid_size/len(train_data) - self._train = FTData(train_data, train_lbl, size=-valid_size) - self._valid = FTData(train_data, train_lbl, skip=-valid_size) + self._train = FTData(train_data, train_lbl, size=-valid_size, + inscale=inscale, outscale=outscale, indtype=indtype, + outdtype=outdtype) + self._valid = FTData(train_data, train_lbl, skip=-valid_size, + inscale=inscale, outscale=outscale, indtype=indtype, + outdtype=outdtype) else: - self._train = FTData(train_data, train_lbl) - self._valid = FTData(valid_data, valid_lbl) - self._test = FTData(test_data, test_lbl) + self._train = FTData(train_data, train_lbl,inscale=inscale, + outscale=outscale, indtype=indtype, outdtype=outdtype) + self._valid = FTData(valid_data, valid_lbl,inscale=inscale, + outscale=outscale, indtype=indtype, outdtype=outdtype) + self._test = FTData(test_data, test_lbl,inscale=inscale, + outscale=outscale, indtype=indtype, outdtype=outdtype) def _return_it(self, batchsize, bufsize, ftdata): return izip(DataIterator(ftdata.open_inputs(), batchsize, bufsize), diff -r 0de14b2034c6 -r 2024368a8d3d datasets/gzpklfile.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datasets/gzpklfile.py Tue Mar 16 12:14:10 2010 -0400 @@ -0,0 +1,39 @@ +import gzip +try: + import cPickle as pickle +except ImportError: + import pickle + +from dataset import DataSet +from dsetiter import DataIterator +from itertools import izip + +class ArrayFile(object): + def __init__(self, ary): + self.ary = ary + self.pos = 0 + + def read(self, num): + res = self.ary[self.pos:self.pos+num] + self.pos += num + return res + +class GzpklDataSet(DataSet): + def __init__(self, fname): + self._fname = fname + self._train = 0 + self._valid = 1 + self._test = 2 + + def _load(self): + f = gzip.open(self._fname, 'rb') + try: + self.datas = pickle.load(f) + finally: + f.close() + + def _return_it(self, batchsz, bufsz, id): + if not hasattr(self, 'datas'): + self._load() + return izip(DataIterator([ArrayFile(self.datas[id][0])], batchsz, bufsz), + DataIterator([ArrayFile(self.datas[id][1])], batchsz, bufsz)) diff -r 0de14b2034c6 -r 2024368a8d3d deep/autoencoder/DA_training.py --- a/deep/autoencoder/DA_training.py Tue Mar 16 12:13:49 2010 -0400 +++ b/deep/autoencoder/DA_training.py Tue Mar 16 12:14:10 2010 -0400 @@ -93,7 +93,12 @@ theano_rng = RandomStreams() # create a numpy random generator numpy_rng = numpy.random.RandomState() - + + # print the parameter of the DA + if True : + print 'input size = %d' %n_visible + print 'hidden size = %d' %n_hidden + print 'complexity = %2.2f' %complexity # initial values for weights and biases # note : W' was written as `W_prime` and b' as `b_prime` @@ -250,7 +255,7 @@ # construct the denoising autoencoder class n_ins = 32*32 - encoder = dA(n_ins, n_code_layer, input = x.reshape((batch_size,n_ins))) + encoder = dA(n_ins, n_code_layer, complexity, input = x.reshape((batch_size,n_ins))) # Train autoencoder @@ -363,7 +368,7 @@ test_score)) if patience <= iter : - print('iter (%i) is superior than patience(%i). break', iter, patience) + print('iter (%i) is superior than patience(%i). 
break', (iter, patience)) break @@ -451,7 +456,7 @@ # construct the denoising autoencoder class n_ins = 28*28 - encoder = dA(n_ins, n_code_layer, input = x.reshape((batch_size,n_ins))) + encoder = dA(n_ins, n_code_layer, complexity, input = x.reshape((batch_size,n_ins))) # Train autoencoder diff -r 0de14b2034c6 -r 2024368a8d3d deep/autoencoder/__init__.py diff -r 0de14b2034c6 -r 2024368a8d3d deep/convolutional_dae/__init__.py diff -r 0de14b2034c6 -r 2024368a8d3d deep/convolutional_dae/stacked_convolutional_dae.py --- a/deep/convolutional_dae/stacked_convolutional_dae.py Tue Mar 16 12:13:49 2010 -0400 +++ b/deep/convolutional_dae/stacked_convolutional_dae.py Tue Mar 16 12:14:10 2010 -0400 @@ -7,44 +7,10 @@ from theano.tensor.signal import downsample from theano.tensor.nnet import conv -import gzip -import cPickle - - -class LogisticRegression(object): - - def __init__(self, input, n_in, n_out): - - self.W = theano.shared( value=numpy.zeros((n_in,n_out), - dtype = theano.config.floatX) ) - - self.b = theano.shared( value=numpy.zeros((n_out,), - dtype = theano.config.floatX) ) - - self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) - - self.y_pred=T.argmax(self.p_y_given_x, axis=1) - - self.params = [self.W, self.b] - - def negative_log_likelihood(self, y): - return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) - - def MSE(self, y): - return -T.mean(abs((self.p_y_given_x)[T.arange(y.shape[0]),y]-y)**2) +from ift6266 import datasets - def errors(self, y): - if y.ndim != self.y_pred.ndim: - raise TypeError('y should have the same shape as self.y_pred', - ('y', target.type, 'y_pred', self.y_pred.type)) - - - if y.dtype.startswith('int'): - return T.mean(T.neq(self.y_pred, y)) - else: - raise NotImplementedError() - +from ift6266.baseline.log_reg.log_reg import LogisticRegression class SigmoidalLayer(object): def __init__(self, rng, input, n_in, n_out): @@ -65,8 +31,9 @@ class dA_conv(object): - def __init__(self, corruption_level = 0.1, input = None, shared_W = None,\ - shared_b = None, filter_shape = None, image_shape = None, poolsize = (2,2)): + def __init__(self, input, filter_shape, corruption_level = 0.1, + shared_W = None, shared_b = None, image_shape = None, + poolsize = (2,2)): theano_rng = RandomStreams() @@ -80,18 +47,16 @@ self.W = shared_W self.b = shared_b else: - initial_W = numpy.asarray( numpy.random.uniform( \ - low = -numpy.sqrt(6./(fan_in+fan_out)), \ - high = numpy.sqrt(6./(fan_in+fan_out)), \ + initial_W = numpy.asarray( numpy.random.uniform( + low = -numpy.sqrt(6./(fan_in+fan_out)), + high = numpy.sqrt(6./(fan_in+fan_out)), size = filter_shape), dtype = theano.config.floatX) - initial_b = numpy.zeros((filter_shape[0],), dtype= theano.config.floatX) - - + initial_b = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) self.W = theano.shared(value = initial_W, name = "W") self.b = theano.shared(value = initial_b, name = "b") - initial_b_prime= numpy.zeros((filter_shape[1],)) + initial_b_prime= numpy.zeros((filter_shape[1],),dtype=theano.config.floatX) self.W_prime=T.dtensor4('W_prime') @@ -99,11 +64,10 @@ self.x = input - self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level) * self.x + self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level,dtype=theano.config.floatX) * self.x - conv1_out = conv.conv2d(self.tilde_x, self.W, \ - filter_shape=filter_shape, \ - image_shape=image_shape, border_mode='valid') + conv1_out = conv.conv2d(self.tilde_x, self.W, filter_shape=filter_shape, + image_shape=image_shape, 
border_mode='valid') self.y = T.tanh(conv1_out + self.b.dimshuffle('x', 0, 'x', 'x')) @@ -111,19 +75,15 @@ da_filter_shape = [ filter_shape[1], filter_shape[0], filter_shape[2],\ filter_shape[3] ] - da_image_shape = [ image_shape[0],filter_shape[0],image_shape[2]-filter_shape[2]+1, \ - image_shape[3]-filter_shape[3]+1 ] initial_W_prime = numpy.asarray( numpy.random.uniform( \ low = -numpy.sqrt(6./(fan_in+fan_out)), \ high = numpy.sqrt(6./(fan_in+fan_out)), \ size = da_filter_shape), dtype = theano.config.floatX) self.W_prime = theano.shared(value = initial_W_prime, name = "W_prime") - #import pdb;pdb.set_trace() - - conv2_out = conv.conv2d(self.y, self.W_prime, \ - filter_shape = da_filter_shape, image_shape = da_image_shape ,\ - border_mode='full') + conv2_out = conv.conv2d(self.y, self.W_prime, + filter_shape = da_filter_shape, + border_mode='full') self.z = (T.tanh(conv2_out + self.b_prime.dimshuffle('x', 0, 'x', 'x'))+center) / scale @@ -134,19 +94,16 @@ self.cost = T.mean(self.L) self.params = [ self.W, self.b, self.b_prime ] - - class LeNetConvPoolLayer(object): - def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2,2)): - assert image_shape[1]==filter_shape[1] + def __init__(self, rng, input, filter_shape, image_shape=None, poolsize=(2,2)): self.input = input W_values = numpy.zeros(filter_shape, dtype=theano.config.floatX) - self.W = theano.shared(value = W_values) + self.W = theano.shared(value=W_values) - b_values = numpy.zeros((filter_shape[0],), dtype= theano.config.floatX) - self.b = theano.shared(value= b_values) + b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) + self.b = theano.shared(value=b_values) conv_out = conv.conv2d(input, self.W, filter_shape=filter_shape, image_shape=image_shape) @@ -168,67 +125,60 @@ class SdA(): - def __init__(self, input, n_ins_conv, n_ins_mlp, train_set_x, train_set_y, batch_size, \ - conv_hidden_layers_sizes, mlp_hidden_layers_sizes, corruption_levels, \ - rng, n_out, pretrain_lr, finetune_lr): - + def __init__(self, input, n_ins_mlp, conv_hidden_layers_sizes, + mlp_hidden_layers_sizes, corruption_levels, rng, n_out, + pretrain_lr, finetune_lr): + self.layers = [] self.pretrain_functions = [] self.params = [] self.conv_n_layers = len(conv_hidden_layers_sizes) self.mlp_n_layers = len(mlp_hidden_layers_sizes) - - index = T.lscalar() # index to a [mini]batch - self.x = T.dmatrix('x') # the data is presented as rasterized images + + self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector of - - for i in xrange( self.conv_n_layers ): - filter_shape=conv_hidden_layers_sizes[i][0] image_shape=conv_hidden_layers_sizes[i][1] max_poolsize=conv_hidden_layers_sizes[i][2] if i == 0 : - layer_input=self.x.reshape((batch_size,1,28,28)) + layer_input=self.x.reshape((self.x.shape[0], 1, 32, 32)) else: layer_input=self.layers[-1].output - - layer = LeNetConvPoolLayer(rng, input=layer_input, \ - image_shape=image_shape, \ - filter_shape=filter_shape,poolsize=max_poolsize) - print 'Convolutional layer '+str(i+1)+' created' - + + layer = LeNetConvPoolLayer(rng, input=layer_input, + image_shape=image_shape, + filter_shape=filter_shape, + poolsize=max_poolsize) + print 'Convolutional layer', str(i+1), 'created' + self.layers += [layer] self.params += layer.params - - da_layer = dA_conv(corruption_level = corruption_levels[0],\ - input = layer_input, \ - shared_W = layer.W, shared_b = layer.b,\ - filter_shape = filter_shape , image_shape = image_shape ) 
- - + + da_layer = dA_conv(corruption_level = corruption_levels[0], + input = layer_input, + shared_W = layer.W, shared_b = layer.b, + filter_shape = filter_shape, + image_shape = image_shape ) + gparams = T.grad(da_layer.cost, da_layer.params) - + updates = {} for param, gparam in zip(da_layer.params, gparams): - updates[param] = param - gparam * pretrain_lr - - - update_fn = theano.function([index], da_layer.cost, \ - updates = updates, - givens = { - self.x : train_set_x[index*batch_size:(index+1)*batch_size]} ) - + updates[param] = param - gparam * pretrain_lr + + update_fn = theano.function([self.x], da_layer.cost, updates = updates) + self.pretrain_functions += [update_fn] - + for i in xrange( self.mlp_n_layers ): if i == 0 : input_size = n_ins_mlp else: input_size = mlp_hidden_layers_sizes[i-1] - + if i == 0 : if len( self.layers ) == 0 : layer_input=self.x @@ -236,72 +186,43 @@ layer_input = self.layers[-1].output.flatten(2) else: layer_input = self.layers[-1].output - + layer = SigmoidalLayer(rng, layer_input, input_size, mlp_hidden_layers_sizes[i] ) - + self.layers += [layer] self.params += layer.params - - print 'MLP layer '+str(i+1)+' created' + print 'MLP layer', str(i+1), 'created' self.logLayer = LogisticRegression(input=self.layers[-1].output, \ n_in=mlp_hidden_layers_sizes[-1], n_out=n_out) self.params += self.logLayer.params - + cost = self.logLayer.negative_log_likelihood(self.y) + + gparams = T.grad(cost, self.params) - gparams = T.grad(cost, self.params) updates = {} - for param,gparam in zip(self.params, gparams): updates[param] = param - gparam*finetune_lr - - self.finetune = theano.function([index], cost, - updates = updates, - givens = { - self.x : train_set_x[index*batch_size:(index+1)*batch_size], - self.y : train_set_y[index*batch_size:(index+1)*batch_size]} ) - + + self.finetune = theano.function([self.x, self.y], cost, updates = updates) + + self.errors = self.logLayer.errors(self.y) - self.errors = self.logLayer.errors(self.y) - - - def sgd_optimization_mnist( learning_rate=0.1, pretraining_epochs = 2, \ pretrain_lr = 0.01, training_epochs = 1000, \ - dataset='mnist.pkl.gz'): - - f = gzip.open(dataset,'rb') - train_set, valid_set, test_set = cPickle.load(f) - f.close() - - - def shared_dataset(data_xy): - data_x, data_y = data_xy - shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX)) - shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX)) - return shared_x, T.cast(shared_y, 'int32') - - - test_set_x, test_set_y = shared_dataset(test_set) - valid_set_x, valid_set_y = shared_dataset(valid_set) - train_set_x, train_set_y = shared_dataset(train_set) - + dataset=datasets.nist_digits): + batch_size = 500 # size of the minibatch - - n_train_batches = train_set_x.value.shape[0] / batch_size - n_valid_batches = valid_set_x.value.shape[0] / batch_size - n_test_batches = test_set_x.value.shape[0] / batch_size - # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1d vector of - # [int] labels - layer0_input = x.reshape((batch_size,1,28,28)) + # [int] labels + layer0_input = x.reshape((x.shape[0],1,32,32)) # Setup the convolutional layers with their DAs(add as many as you want) @@ -310,45 +231,34 @@ ker1=2 ker2=2 conv_layers=[] - conv_layers.append([[ker1,1,5,5], [batch_size,1,28,28], [2,2] ]) - conv_layers.append([[ker2,ker1,5,5], [batch_size,ker1,12,12], [2,2] ]) + 
conv_layers.append([[ker1,1,5,5], None, [2,2] ]) + conv_layers.append([[ker2,ker1,5,5], None, [2,2] ]) # Setup the MLP layers of the network mlp_layers=[500] - network = SdA(input = layer0_input, n_ins_conv = 28*28, n_ins_mlp = ker2*4*4, \ - train_set_x = train_set_x, train_set_y = train_set_y, batch_size = batch_size, - conv_hidden_layers_sizes = conv_layers, \ - mlp_hidden_layers_sizes = mlp_layers, \ - corruption_levels = corruption_levels , n_out = 10, \ - rng = rng , pretrain_lr = pretrain_lr , finetune_lr = learning_rate ) + network = SdA(input = layer0_input, n_ins_mlp = ker2*4*4, + conv_hidden_layers_sizes = conv_layers, + mlp_hidden_layers_sizes = mlp_layers, + corruption_levels = corruption_levels , n_out = 10, + rng = rng , pretrain_lr = pretrain_lr , + finetune_lr = learning_rate ) - test_model = theano.function([index], network.errors, - givens = { - network.x: test_set_x[index*batch_size:(index+1)*batch_size], - network.y: test_set_y[index*batch_size:(index+1)*batch_size]}) + test_model = theano.function([network.x, network.y], network.errors) - validate_model = theano.function([index], network.errors, - givens = { - network.x: valid_set_x[index*batch_size:(index+1)*batch_size], - network.y: valid_set_y[index*batch_size:(index+1)*batch_size]}) - - - start_time = time.clock() for i in xrange(len(network.layers)-len(mlp_layers)): for epoch in xrange(pretraining_epochs): - for batch_index in xrange(n_train_batches): - c = network.pretrain_functions[i](batch_index) - print 'pre-training convolution layer %i, epoch %d, cost '%(i,epoch),c + for x, y in dataset.train(batch_size): + c = network.pretrain_functions[i](x) + print 'pre-training convolution layer %i, epoch %d, cost '%(i,epoch), c patience = 10000 # look as this many examples regardless patience_increase = 2. 
# WAIT THIS MUCH LONGER WHEN A NEW BEST IS # FOUND improvement_threshold = 0.995 # a relative improvement of this much is - validation_frequency = min(n_train_batches, patience/2) - + validation_frequency = patience/2 best_params = None best_validation_loss = float('inf') @@ -357,23 +267,21 @@ done_looping = False epoch = 0 - + iter = 0 + while (epoch < training_epochs) and (not done_looping): epoch = epoch + 1 - for minibatch_index in xrange(n_train_batches): + for x, y in dataset.train(batch_size): - cost_ij = network.finetune(minibatch_index) - iter = epoch * n_train_batches + minibatch_index - - if (iter+1) % validation_frequency == 0: + cost_ij = network.finetune(x, y) + iter += 1 + + if iter % validation_frequency == 0: + validation_losses = [test_model(xv, yv) for xv, yv in dataset.valid(batch_size)] + this_validation_loss = numpy.mean(validation_losses) + print('epoch %i, iter %i, validation error %f %%' % \ + (epoch, iter, this_validation_loss*100.)) - validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] - this_validation_loss = numpy.mean(validation_losses) - print('epoch %i, minibatch %i/%i, validation error %f %%' % \ - (epoch, minibatch_index+1, n_train_batches, \ - this_validation_loss*100.)) - - # if we got the best validation score until now if this_validation_loss < best_validation_loss: @@ -381,35 +289,28 @@ if this_validation_loss < best_validation_loss * \ improvement_threshold : patience = max(patience, iter * patience_increase) - + # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter - + # test it on the test set - test_losses = [test_model(i) for i in xrange(n_test_batches)] + test_losses = [test_model(xt, yt) for xt, yt in dataset.test(batch_size)] test_score = numpy.mean(test_losses) - print((' epoch %i, minibatch %i/%i, test error of best ' + print((' epoch %i, iter %i, test error of best ' 'model %f %%') % - (epoch, minibatch_index+1, n_train_batches, - test_score*100.)) - - + (epoch, iter, test_score*100.)) + if patience <= iter : - done_looping = True - break - + done_looping = True + break + end_time = time.clock() print(('Optimization complete with best validation score of %f %%,' 'with test performance %f %%') % (best_validation_loss * 100., test_score*100.)) print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) - - - - - if __name__ == '__main__': sgd_optimization_mnist() diff -r 0de14b2034c6 -r 2024368a8d3d deep/stacked_dae/nist_sda.py --- a/deep/stacked_dae/nist_sda.py Tue Mar 16 12:13:49 2010 -0400 +++ b/deep/stacked_dae/nist_sda.py Tue Mar 16 12:14:10 2010 -0400 @@ -21,28 +21,35 @@ import jobman, jobman.sql from pylearn.io import filetensor -from utils import produit_croise_jobs +from utils import produit_cartesien_jobs from sgd_optimization import SdaSgdOptimizer from ift6266.utils.scalar_series import * +############################################################################## +# GLOBALS + TEST_CONFIG = False NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all' - -JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/fsavard_sda2' +JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_db/fsavard_sda4' +EXPERIMENT_PATH = "ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint" REDUCE_TRAIN_TO = None MAX_FINETUNING_EPOCHS = 1000 -REDUCE_EVERY = 1000 # number of minibatches before taking means for valid error etc. +# number of minibatches before taking means for valid error etc. 
+REDUCE_EVERY = 1000 + if TEST_CONFIG: REDUCE_TRAIN_TO = 1000 MAX_FINETUNING_EPOCHS = 2 REDUCE_EVERY = 10 -EXPERIMENT_PATH = "ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint" - +# Possible values the hyperparameters can take. These are then +# combined with produit_cartesien_jobs so we get a list of all +# possible combinations, each one resulting in a job inserted +# in the jobman DB. JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001], 'pretraining_epochs_per_layer': [10,20], 'hidden_layers_sizes': [300,800], @@ -57,13 +64,19 @@ 'pretraining_lr':0.1, 'pretraining_epochs_per_layer':20, 'max_finetuning_epochs':2, - 'hidden_layers_sizes':300, + 'hidden_layers_sizes':800, 'corruption_levels':0.2, 'minibatch_size':20, #'reduce_train_to':300, 'num_hidden_layers':2}) +''' +Function called by jobman upon launching each job +Its path is the one given when inserting jobs: +ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint +''' def jobman_entrypoint(state, channel): + # record mercurial versions of each package pylearn.version.record_versions(state,[theano,ift6266,pylearn]) channel.save() @@ -71,10 +84,12 @@ print "Will load NIST" - nist = NIST(20) + nist = NIST(minibatch_size=20) print "NIST loaded" + # For test runs, we don't want to use the whole dataset so + # reduce it to fewer elements if asked to. rtt = None if state.has_key('reduce_train_to'): rtt = state['reduce_train_to'] @@ -82,7 +97,7 @@ rtt = REDUCE_TRAIN_TO if rtt: - print "Reducing training set to "+str( rtt)+ " examples" + print "Reducing training set to "+str(rtt)+ " examples" nist.reduce_train_set(rtt) train,valid,test = nist.get_tvt() @@ -91,14 +106,9 @@ n_ins = 32*32 n_outs = 62 # 10 digits, 26*2 (lower, capitals) - hls = state.hidden_layers_sizes - cl = state.corruption_levels - nhl = state.num_hidden_layers - state.hidden_layers_sizes = [hls] * nhl - state.corruption_levels = [cl] * nhl - - # b,b',W for each hidden layer + b,W of last layer (logreg) - numparams = nhl * 3 + 2 + # b,b',W for each hidden layer + # + b,W of last layer (logreg) + numparams = state.num_hidden_layers * 3 + 2 series_mux = None series_mux = create_series(workingdir, numparams) @@ -114,11 +124,10 @@ optimizer.finetune() channel.save() - pylearn.version.record_versions(state,[theano,ift6266,pylearn]) - channel.save() - return channel.COMPLETE +# These Series objects are used to save various statistics +# during the training. def create_series(basedir, numparams): mux = SeriesMultiplexer() @@ -140,8 +149,11 @@ return mux +# Perform insertion into the Postgre DB based on combination +# of hyperparameter values above +# (see comment for produit_cartesien_jobs() to know how it works) def jobman_insert_nist(): - jobs = produit_croise_jobs(JOB_VALS) + jobs = produit_cartesien_jobs(JOB_VALS) db = jobman.sql.db(JOBDB) for job in jobs: @@ -227,35 +239,6 @@ raw_input("Press any key") -# hp for hyperparameters -def sgd_optimization_nist(hp=None, dataset_dir='/data/lisa/data/nist'): - global DEFAULT_HP_NIST - hp = hp and hp or DEFAULT_HP_NIST - - print "Will load NIST" - - import time - t1 = time.time() - nist = NIST(20, reduce_train_to=100) - t2 = time.time() - - print "NIST loaded. 
time delta = ", t2-t1 - - train,valid,test = nist.get_tvt() - dataset = (train,valid,test) - - print train[0][15] - print type(train[0][1]) - - - print "Lengths train, valid, test: ", len(train[0]), len(valid[0]), len(test[0]) - - n_ins = 32*32 - n_outs = 62 # 10 digits, 26*2 (lower, capitals) - - optimizer = SdaSgdOptimizer(dataset, hp, n_ins, n_outs, input_divider=255.0) - optimizer.train() - if __name__ == '__main__': import sys @@ -269,13 +252,9 @@ jobman_insert_nist() elif len(args) > 0 and args[0] == 'test_jobman_entrypoint': - def f(): - pass - chanmock = DD({'COMPLETE':0,'save':f}) + chanmock = DD({'COMPLETE':0,'save':(lambda:None)}) jobman_entrypoint(DEFAULT_HP_NIST, chanmock) - elif len(args) > 0 and args[0] == 'estimate': - estimate_total_time() else: - sgd_optimization_nist() + print "Bad arguments" diff -r 0de14b2034c6 -r 2024368a8d3d deep/stacked_dae/sgd_optimization.py --- a/deep/stacked_dae/sgd_optimization.py Tue Mar 16 12:13:49 2010 -0400 +++ b/deep/stacked_dae/sgd_optimization.py Tue Mar 16 12:14:10 2010 -0400 @@ -60,25 +60,34 @@ # compute number of minibatches for training, validation and testing self.n_train_batches = self.train_set_x.value.shape[0] / self.hp.minibatch_size self.n_valid_batches = self.valid_set_x.value.shape[0] / self.hp.minibatch_size - self.n_test_batches = self.test_set_x.value.shape[0] / self.hp.minibatch_size + # remove last batch in case it's incomplete + self.n_test_batches = (self.test_set_x.value.shape[0] / self.hp.minibatch_size) - 1 def init_classifier(self): print "Constructing classifier" + # we don't want to save arrays in DD objects, so + # we recreate those arrays here + nhl = self.hp.num_hidden_layers + layers_sizes = [self.hp.hidden_layers_sizes] * nhl + corruption_levels = [self.hp.corruption_levels] * nhl + # construct the stacked denoising autoencoder class self.classifier = SdA( \ train_set_x= self.train_set_x, \ train_set_y = self.train_set_y,\ batch_size = self.hp.minibatch_size, \ n_ins= self.n_ins, \ - hidden_layers_sizes = self.hp.hidden_layers_sizes, \ + hidden_layers_sizes = layers_sizes, \ n_outs = self.n_outs, \ - corruption_levels = self.hp.corruption_levels,\ + corruption_levels = corruption_levels,\ rng = self.rng,\ pretrain_lr = self.hp.pretraining_lr, \ finetune_lr = self.hp.finetuning_lr,\ input_divider = self.input_divider ) + #theano.printing.pydotprint(self.classifier.pretrain_functions[0], "function.graph") + sys.stdout.flush() def train(self): @@ -89,6 +98,9 @@ print "STARTING PRETRAINING, time = ", datetime.datetime.now() sys.stdout.flush() + #time_acc_func = 0.0 + #time_acc_total = 0.0 + start_time = time.clock() ## Pre-train layer-wise for i in xrange(self.classifier.n_layers): @@ -96,7 +108,14 @@ for epoch in xrange(self.hp.pretraining_epochs_per_layer): # go through the training set for batch_index in xrange(self.n_train_batches): + #t1 = time.clock() c = self.classifier.pretrain_functions[i](batch_index) + #t2 = time.clock() + + #time_acc_func += t2 - t1 + + #if batch_index % 500 == 0: + # print "acc / total", time_acc_func / (t2 - start_time), time_acc_func self.series_mux.append("reconstruction_error", c) diff -r 0de14b2034c6 -r 2024368a8d3d deep/stacked_dae/stacked_dae.py --- a/deep/stacked_dae/stacked_dae.py Tue Mar 16 12:13:49 2010 -0400 +++ b/deep/stacked_dae/stacked_dae.py Tue Mar 16 12:14:10 2010 -0400 @@ -10,6 +10,15 @@ from utils import update_locals +# taken from LeDeepNet/daa.py +# has a special case when taking log(0) (defined =0) +# modified to not take the mean anymore +from 
theano.tensor.xlogx import xlogx, xlogy0 +# it's target*log(output) +def binary_cross_entropy(target, output, sum_axis=1): + XE = xlogy0(target, output) + xlogy0((1 - target), (1 - output)) + return -T.sum(XE, axis=sum_axis) + class LogisticRegression(object): def __init__(self, input, n_in, n_out): # initialize with 0 the weights W as a matrix of shape (n_in, n_out) @@ -128,7 +137,21 @@ # Equation (4) # note : we sum over the size of a datapoint; if we are using minibatches, # L will be a vector, with one entry per example in minibatch - self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) + #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) + #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1) + + # bypassing z to avoid running to log(0) + #self.z_a = T.dot(self.y, self.W_prime) + self.b_prime) + #self.L = -T.sum( self.x * (T.log(1)-T.log(1+T.exp(-self.z_a))) \ + # + (1.0-self.x) * (T.log(1)-T.log(1+T.exp(-self.z_a))), axis=1 ) + + # I added this epsilon to avoid getting log(0) and 1/0 in grad + # This means conceptually that there'd be no probability of 0, but that + # doesn't seem to me as important (maybe I'm wrong?). + eps = 0.00000001 + eps_1 = 1-eps + self.L = - T.sum( self.x * T.log(eps + eps_1*self.z) \ + + (1-self.x)*T.log(eps + eps_1*(1-self.z)), axis=1 ) # note : L is now a vector, where each element is the cross-entropy cost # of the reconstruction of the corresponding example of the # minibatch. We need to compute the average of all these to get @@ -156,6 +179,17 @@ self.all_params = [] self.n_layers = len(hidden_layers_sizes) + print "Creating SdA with params:" + print "batch_size", batch_size + print "hidden_layers_sizes", hidden_layers_sizes + print "corruption_levels", corruption_levels + print "n_ins", n_ins + print "n_outs", n_outs + print "pretrain_lr", pretrain_lr + print "finetune_lr", finetune_lr + print "input_divider", input_divider + print "----" + self.shared_divider = theano.shared(numpy.asarray(input_divider, dtype=theano.config.floatX)) if len(hidden_layers_sizes) < 1 : diff -r 0de14b2034c6 -r 2024368a8d3d deep/stacked_dae/utils.py --- a/deep/stacked_dae/utils.py Tue Mar 16 12:13:49 2010 -0400 +++ b/deep/stacked_dae/utils.py Tue Mar 16 12:14:10 2010 -0400 @@ -6,12 +6,21 @@ from jobman import DD # from pylearn codebase +# useful in __init__(param1, param2, etc.) to save +# values in self.param1, self.param2... just call +# update_locals(self, locals()) def update_locals(obj, dct): if 'self' in dct: del dct['self'] obj.__dict__.update(dct) -def produit_croise_jobs(val_dict): +# from a dictionary of possible values for hyperparameters, e.g. +# hp_values = {'learning_rate':[0.1, 0.01], 'num_layers': [1,2]} +# create a list of other dictionaries representing all the possible +# combinations, thus in this example creating: +# [{'learning_rate': 0.1, 'num_layers': 1}, ...] 
+# (similarly for combinations (0.1, 2), (0.01, 1), (0.01, 2)) +def produit_cartesien_jobs(val_dict): job_list = [DD()] all_keys = val_dict.keys() @@ -27,9 +36,9 @@ return job_list -def test_produit_croise_jobs(): +def test_produit_cartesien_jobs(): vals = {'a': [1,2], 'b': [3,4,5]} - print produit_croise_jobs(vals) + print produit_cartesien_jobs(vals) # taken from http://stackoverflow.com/questions/276052/how-to-get-current-cpu-and-ram-usage-in-python diff -r 0de14b2034c6 -r 2024368a8d3d deep/stacked_dae/v2/__init__.py diff -r 0de14b2034c6 -r 2024368a8d3d deep/stacked_dae/v2/config.py.example --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/v2/config.py.example Tue Mar 16 12:14:10 2010 -0400 @@ -0,0 +1,64 @@ +''' +These are parameters used by nist_sda.py. They'll end up as globals in there. + +Rename this file to config.py and configure as needed. +DON'T add the renamed file to the repository, as others might use it +without realizing it, with dire consequences. +''' + +# Set this to True when you want to run cluster tests, ie. you want +# to run on the cluster, many jobs, but want to reduce the training +# set size and the number of epochs, so you know everything runs +# fine on the cluster. +# Set this PRIOR to inserting your test jobs in the DB. +TEST_CONFIG = False + +NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all' +NIST_ALL_TRAIN_SIZE = 649081 +# valid et test =82587 82587 + +# change "sandbox" when you're ready +JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/yourtablenamehere' +EXPERIMENT_PATH = "ift6266.deep.stacked_dae.v2.nist_sda.jobman_entrypoint" + +# reduce training set to that many examples +REDUCE_TRAIN_TO = None +# that's a max, it usually doesn't get to that point +MAX_FINETUNING_EPOCHS = 1000 +# number of minibatches before taking means for valid error etc. +REDUCE_EVERY = 100 + +if TEST_CONFIG: + REDUCE_TRAIN_TO = 1000 + MAX_FINETUNING_EPOCHS = 2 + REDUCE_EVERY = 10 + + +# This is to configure insertion of jobs on the cluster. +# Possible values the hyperparameters can take. These are then +# combined with produit_cartesien_jobs so we get a list of all +# possible combinations, each one resulting in a job inserted +# in the jobman DB. +JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001], + 'pretraining_epochs_per_layer': [10,20], + 'hidden_layers_sizes': [300,800], + 'corruption_levels': [0.1,0.2,0.3], + 'minibatch_size': [20], + 'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS], + 'finetuning_lr':[0.1, 0.01], #0.001 was very bad, so we leave it out + 'num_hidden_layers':[2,3]} + +# Just useful for tests... 
minimal number of epochs +# (This is used when running a single job, locally, when +# calling ./nist_sda.py test_jobman_entrypoint +DEFAULT_HP_NIST = DD({'finetuning_lr':0.1, + 'pretraining_lr':0.1, + 'pretraining_epochs_per_layer':2, + 'max_finetuning_epochs':2, + 'hidden_layers_sizes':800, + 'corruption_levels':0.2, + 'minibatch_size':20, + 'reduce_train_to':10000, + 'num_hidden_layers':1}) + + diff -r 0de14b2034c6 -r 2024368a8d3d deep/stacked_dae/v2/nist_sda.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/v2/nist_sda.py Tue Mar 16 12:14:10 2010 -0400 @@ -0,0 +1,169 @@ +#!/usr/bin/python +# coding: utf-8 + +import ift6266 +import pylearn + +import numpy +import theano +import time + +import pylearn.version +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +import copy +import sys +import os +import os.path + +from jobman import DD +import jobman, jobman.sql +from pylearn.io import filetensor + +from utils import produit_cartesien_jobs + +from sgd_optimization import SdaSgdOptimizer + +#from ift6266.utils.scalar_series import * +from ift6266.utils.seriestables import * +import tables + +from ift6266 import datasets +from config import * + +''' +Function called by jobman upon launching each job +Its path is the one given when inserting jobs: see EXPERIMENT_PATH +''' +def jobman_entrypoint(state, channel): + # record mercurial versions of each package + pylearn.version.record_versions(state,[theano,ift6266,pylearn]) + # TODO: remove this, bad for number of simultaneous requests on DB + channel.save() + + # For test runs, we don't want to use the whole dataset so + # reduce it to fewer elements if asked to. + rtt = None + if state.has_key('reduce_train_to'): + rtt = state['reduce_train_to'] + elif REDUCE_TRAIN_TO: + rtt = REDUCE_TRAIN_TO + + n_ins = 32*32 + n_outs = 62 # 10 digits, 26*2 (lower, capitals) + + examples_per_epoch = NIST_ALL_TRAIN_SIZE + + series = create_series(state.num_hidden_layers) + + print "Creating optimizer with state, ", state + + optimizer = SdaSgdOptimizer(dataset=datasets.nist_all, + hyperparameters=state, \ + n_ins=n_ins, n_outs=n_outs,\ + examples_per_epoch=examples_per_epoch, \ + series=series, + max_minibatches=rtt) + + optimizer.pretrain(datasets.nist_all) + channel.save() + + optimizer.finetune(datasets.nist_all) + channel.save() + + return channel.COMPLETE + +# These Series objects are used to save various statistics +# during the training. +def create_series(num_hidden_layers): + + # Replace series we don't want to save with DummySeries, e.g. 
+ # series['training_error'] = DummySeries() + + series = {} + + basedir = os.getcwd() + + h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w") + + # reconstruction + reconstruction_base = \ + ErrorSeries(error_name="reconstruction_error", + table_name="reconstruction_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Reconstruction error (mean over "+str(REDUCE_EVERY)+" minibatches)") + series['reconstruction_error'] = \ + AccumulatorSeriesWrapper(base_series=reconstruction_base, + reduce_every=REDUCE_EVERY) + + # train + training_base = \ + ErrorSeries(error_name="training_error", + table_name="training_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Training error (mean over "+str(REDUCE_EVERY)+" minibatches)") + series['training_error'] = \ + AccumulatorSeriesWrapper(base_series=training_base, + reduce_every=REDUCE_EVERY) + + # valid and test are not accumulated/mean, saved directly + series['validation_error'] = \ + ErrorSeries(error_name="validation_error", + table_name="validation_error", + hdf5_file=h5f, + index_names=('epoch','minibatch')) + + series['test_error'] = \ + ErrorSeries(error_name="test_error", + table_name="test_error", + hdf5_file=h5f, + index_names=('epoch','minibatch')) + + param_names = [] + for i in range(num_hidden_layers): + param_names += ['layer%d_W'%i, 'layer%d_b'%i, 'layer%d_bprime'%i] + param_names += ['logreg_layer_W', 'logreg_layer_b'] + + # comment out series we don't want to save + series['params'] = SharedParamsStatisticsWrapper( + new_group_name="params", + base_group="/", + arrays_names=param_names, + hdf5_file=h5f, + index_names=('epoch',)) + + return series + +# Perform insertion into the Postgre DB based on combination +# of hyperparameter values above +# (see comment for produit_cartesien_jobs() to know how it works) +def jobman_insert_nist(): + jobs = produit_cartesien_jobs(JOB_VALS) + + db = jobman.sql.db(JOBDB) + for job in jobs: + job.update({jobman.sql.EXPERIMENT: EXPERIMENT_PATH}) + jobman.sql.insert_dict(job, db) + + print "inserted" + +if __name__ == '__main__': + + args = sys.argv[1:] + + #if len(args) > 0 and args[0] == 'load_nist': + # test_load_nist() + + if len(args) > 0 and args[0] == 'jobman_insert': + jobman_insert_nist() + + elif len(args) > 0 and args[0] == 'test_jobman_entrypoint': + chanmock = DD({'COMPLETE':0,'save':(lambda:None)}) + jobman_entrypoint(DEFAULT_HP_NIST, chanmock) + + else: + print "Bad arguments" + diff -r 0de14b2034c6 -r 2024368a8d3d deep/stacked_dae/v2/sgd_optimization.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/v2/sgd_optimization.py Tue Mar 16 12:14:10 2010 -0400 @@ -0,0 +1,243 @@ +#!/usr/bin/python +# coding: utf-8 + +# Generic SdA optimization loop, adapted from the deeplearning.net tutorial + +import numpy +import theano +import time +import datetime +import theano.tensor as T +import sys + +from jobman import DD +import jobman, jobman.sql + +from stacked_dae import SdA + +from ift6266.utils.seriestables import * + +default_series = { \ + 'reconstruction_error' : DummySeries(), + 'training_error' : DummySeries(), + 'validation_error' : DummySeries(), + 'test_error' : DummySeries(), + 'params' : DummySeries() + } + +def itermax(iter, max): + for i,it in enumerate(iter): + if i >= max: + break + yield it + +class SdaSgdOptimizer: + def __init__(self, dataset, hyperparameters, n_ins, n_outs, + examples_per_epoch, series=default_series, max_minibatches=None): + self.dataset = dataset + self.hp = hyperparameters + self.n_ins = 
n_ins + self.n_outs = n_outs + + self.max_minibatches = max_minibatches + print "SdaSgdOptimizer, max_minibatches =", max_minibatches + + self.ex_per_epoch = examples_per_epoch + self.mb_per_epoch = examples_per_epoch / self.hp.minibatch_size + + self.series = series + + self.rng = numpy.random.RandomState(1234) + + self.init_classifier() + + sys.stdout.flush() + + def init_classifier(self): + print "Constructing classifier" + + # we don't want to save arrays in DD objects, so + # we recreate those arrays here + nhl = self.hp.num_hidden_layers + layers_sizes = [self.hp.hidden_layers_sizes] * nhl + corruption_levels = [self.hp.corruption_levels] * nhl + + # construct the stacked denoising autoencoder class + self.classifier = SdA( \ + batch_size = self.hp.minibatch_size, \ + n_ins= self.n_ins, \ + hidden_layers_sizes = layers_sizes, \ + n_outs = self.n_outs, \ + corruption_levels = corruption_levels,\ + rng = self.rng,\ + pretrain_lr = self.hp.pretraining_lr, \ + finetune_lr = self.hp.finetuning_lr) + + #theano.printing.pydotprint(self.classifier.pretrain_functions[0], "function.graph") + + sys.stdout.flush() + + def train(self): + self.pretrain(self.dataset) + self.finetune(self.dataset) + + def pretrain(self,dataset): + print "STARTING PRETRAINING, time = ", datetime.datetime.now() + sys.stdout.flush() + + start_time = time.clock() + ## Pre-train layer-wise + for i in xrange(self.classifier.n_layers): + # go through pretraining epochs + for epoch in xrange(self.hp.pretraining_epochs_per_layer): + # go through the training set + batch_index=0 + for x,y in dataset.train(self.hp.minibatch_size): + c = self.classifier.pretrain_functions[i](x) + + self.series["reconstruction_error"].append((epoch, batch_index), c) + batch_index+=1 + + #if batch_index % 100 == 0: + # print "100 batches" + + # useful when doing tests + if self.max_minibatches and batch_index >= self.max_minibatches: + break + + print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c + sys.stdout.flush() + + self.series['params'].append((epoch,), self.classifier.all_params) + + end_time = time.clock() + + print ('Pretraining took %f minutes' %((end_time-start_time)/60.)) + self.hp.update({'pretraining_time': end_time-start_time}) + + sys.stdout.flush() + + def finetune(self,dataset): + print "STARTING FINETUNING, time = ", datetime.datetime.now() + + minibatch_size = self.hp.minibatch_size + + # create a function to compute the mistakes that are made by the model + # on the validation set, or testing set + test_model = \ + theano.function( + [self.classifier.x,self.classifier.y], self.classifier.errors) + # givens = { + # self.classifier.x: ensemble_x, + # self.classifier.y: ensemble_y]}) + + validate_model = \ + theano.function( + [self.classifier.x,self.classifier.y], self.classifier.errors) + # givens = { + # self.classifier.x: , + # self.classifier.y: ]}) + + + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2. # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(self.mb_per_epoch, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + if self.max_minibatches and validation_frequency > self.max_minibatches: + validation_frequency = self.max_minibatches / 2 + + best_params = None + best_validation_loss = float('inf') + test_score = 0. 
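As an aside, the patience bookkeeping set up above can be summarized in isolation. The constants mirror the ones just defined; the helper name and the example minibatch index are hypothetical, sketch only:

    patience = 10000                # look at this many minibatches regardless
    patience_increase = 2.          # extension factor when a new best is found
    improvement_threshold = 0.995   # relative improvement considered significant

    def update_patience(patience, best_loss, new_loss, minibatch_index):
        # only a significantly better validation loss extends patience
        if new_loss < best_loss * improvement_threshold:
            patience = max(patience, minibatch_index * patience_increase)
        return patience

    # e.g. a new best found at minibatch 7000 gives max(10000, 7000 * 2.) = 14000.,
    # and the loop below stops as soon as patience <= total_mb_index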
+ start_time = time.clock() + + done_looping = False + epoch = 0 + + total_mb_index = 0 + + while (epoch < self.hp.max_finetuning_epochs) and (not done_looping): + epoch = epoch + 1 + minibatch_index = -1 + for x,y in dataset.train(minibatch_size): + minibatch_index += 1 + cost_ij = self.classifier.finetune(x,y) + total_mb_index += 1 + + self.series["training_error"].append((epoch, minibatch_index), cost_ij) + + if (total_mb_index+1) % validation_frequency == 0: + + iter = dataset.valid(minibatch_size) + if self.max_minibatches: + iter = itermax(iter, self.max_minibatches) + validation_losses = [validate_model(x,y) for x,y in iter] + this_validation_loss = numpy.mean(validation_losses) + + self.series["validation_error"].\ + append((epoch, minibatch_index), this_validation_loss*100.) + + print('epoch %i, minibatch %i/%i, validation error %f %%' % \ + (epoch, minibatch_index+1, self.mb_per_epoch, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, total_mb_index * patience_increase) + + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = total_mb_index + + # test it on the test set + iter = dataset.test(minibatch_size) + if self.max_minibatches: + iter = itermax(iter, self.max_minibatches) + test_losses = [test_model(x,y) for x,y in iter] + test_score = numpy.mean(test_losses) + + self.series["test_error"].\ + append((epoch, minibatch_index), test_score*100.) + + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, self.mb_per_epoch, + test_score*100.)) + + sys.stdout.flush() + + # useful when doing tests + if self.max_minibatches and minibatch_index >= self.max_minibatches: + break + + self.series['params'].append((epoch,), self.classifier.all_params) + + if patience <= total_mb_index: + done_looping = True + break + + end_time = time.clock() + self.hp.update({'finetuning_time':end_time-start_time,\ + 'best_validation_error':best_validation_loss,\ + 'test_score':test_score, + 'num_finetuning_epochs':epoch}) + + print(('Optimization complete with best validation score of %f %%,' + 'with test performance %f %%') % + (best_validation_loss * 100., test_score*100.)) + print ('The finetuning ran for %f minutes' % ((end_time-start_time)/60.)) + + + diff -r 0de14b2034c6 -r 2024368a8d3d deep/stacked_dae/v2/stacked_dae.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/v2/stacked_dae.py Tue Mar 16 12:14:10 2010 -0400 @@ -0,0 +1,292 @@ +#!/usr/bin/python +# coding: utf-8 + +import numpy +import theano +import time +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams +import copy + +from utils import update_locals + +# taken from LeDeepNet/daa.py +# has a special case when taking log(0) (defined =0) +# modified to not take the mean anymore +from theano.tensor.xlogx import xlogx, xlogy0 +# it's target*log(output) +def binary_cross_entropy(target, output, sum_axis=1): + XE = xlogy0(target, output) + xlogy0((1 - target), (1 - output)) + return -T.sum(XE, axis=sum_axis) + +class LogisticRegression(object): + def __init__(self, input, n_in, n_out): + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) + self.W = theano.shared( value=numpy.zeros((n_in,n_out), + dtype = 
theano.config.floatX) ) + # initialize the baises b as a vector of n_out 0s + self.b = theano.shared( value=numpy.zeros((n_out,), + dtype = theano.config.floatX) ) + # compute vector of class-membership probabilities in symbolic form + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred=T.argmax(self.p_y_given_x, axis=1) + + # list of parameters for this layer + self.params = [self.W, self.b] + + def negative_log_likelihood(self, y): + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + def errors(self, y): + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + + +class SigmoidalLayer(object): + def __init__(self, rng, input, n_in, n_out): + self.input = input + + W_values = numpy.asarray( rng.uniform( \ + low = -numpy.sqrt(6./(n_in+n_out)), \ + high = numpy.sqrt(6./(n_in+n_out)), \ + size = (n_in, n_out)), dtype = theano.config.floatX) + self.W = theano.shared(value = W_values) + + b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + self.output = T.nnet.sigmoid(T.dot(input, self.W) + self.b) + self.params = [self.W, self.b] + + + +class dA(object): + def __init__(self, n_visible= 784, n_hidden= 500, corruption_level = 0.1,\ + input = None, shared_W = None, shared_b = None): + self.n_visible = n_visible + self.n_hidden = n_hidden + + # create a Theano random generator that gives symbolic random values + theano_rng = RandomStreams() + + if shared_W != None and shared_b != None : + self.W = shared_W + self.b = shared_b + else: + # initial values for weights and biases + # note : W' was written as `W_prime` and b' as `b_prime` + + # W is initialized with `initial_W` which is uniformely sampled + # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + initial_W = numpy.asarray( numpy.random.uniform( \ + low = -numpy.sqrt(6./(n_hidden+n_visible)), \ + high = numpy.sqrt(6./(n_hidden+n_visible)), \ + size = (n_visible, n_hidden)), dtype = theano.config.floatX) + initial_b = numpy.zeros(n_hidden, dtype = theano.config.floatX) + + + # theano shared variables for weights and biases + self.W = theano.shared(value = initial_W, name = "W") + self.b = theano.shared(value = initial_b, name = "b") + + + initial_b_prime= numpy.zeros(n_visible) + # tied weights, therefore W_prime is W transpose + self.W_prime = self.W.T + self.b_prime = theano.shared(value = initial_b_prime, name = "b'") + + # if no input is given, generate a variable representing the input + if input == None : + # we use a matrix because we expect a minibatch of several examples, + # each example being a row + self.x = T.dmatrix(name = 'input') + else: + self.x = input + # Equation (1) + # keep 90% of the inputs the same and zero-out randomly selected subset of 10% of the inputs + # note : first argument of theano.rng.binomial is the shape(size) of + # random numbers that it should produce + # second argument is the number of trials + # third 
argument is the probability of success of any trial + # + # this will produce an array of 0s and 1s where 1 has a + # probability of 1 - ``corruption_level`` and 0 with + # ``corruption_level`` + self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level, dtype=theano.config.floatX) * self.x + # Equation (2) + # note : y is stored as an attribute of the class so that it can be + # used later when stacking dAs. + self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b) + # Equation (3) + #self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime) + # Equation (4) + # note : we sum over the size of a datapoint; if we are using minibatches, + # L will be a vector, with one entry per example in minibatch + #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) + #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1) + + # bypassing z to avoid running to log(0) + z_a = T.dot(self.y, self.W_prime) + self.b_prime + log_sigmoid = T.log(1.) - T.log(1.+T.exp(-z_a)) + # log(1-sigmoid(z_a)) + log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a)) + self.L = -T.sum( self.x * (log_sigmoid) \ + + (1.0-self.x) * (log_1_sigmoid), axis=1 ) + + # I added this epsilon to avoid getting log(0) and 1/0 in grad + # This means conceptually that there'd be no probability of 0, but that + # doesn't seem to me as important (maybe I'm wrong?). + #eps = 0.00000001 + #eps_1 = 1-eps + #self.L = - T.sum( self.x * T.log(eps + eps_1*self.z) \ + # + (1-self.x)*T.log(eps + eps_1*(1-self.z)), axis=1 ) + # note : L is now a vector, where each element is the cross-entropy cost + # of the reconstruction of the corresponding example of the + # minibatch. We need to compute the average of all these to get + # the cost of the minibatch + self.cost = T.mean(self.L) + + self.params = [ self.W, self.b, self.b_prime ] + + +class SdA(object): + def __init__(self, batch_size, n_ins, + hidden_layers_sizes, n_outs, + corruption_levels, rng, pretrain_lr, finetune_lr): + # Just to make sure those are not modified somewhere else afterwards + hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes) + corruption_levels = copy.deepcopy(corruption_levels) + + update_locals(self, locals()) + + self.layers = [] + self.pretrain_functions = [] + self.params = [] + # MODIF: added this so we also get the b_primes + # (not used for finetuning... 
still using ".params") + self.all_params = [] + self.n_layers = len(hidden_layers_sizes) + + print "Creating SdA with params:" + print "batch_size", batch_size + print "hidden_layers_sizes", hidden_layers_sizes + print "corruption_levels", corruption_levels + print "n_ins", n_ins + print "n_outs", n_outs + print "pretrain_lr", pretrain_lr + print "finetune_lr", finetune_lr + print "----" + + if len(hidden_layers_sizes) < 1 : + raiseException (' You must have at least one hidden layer ') + + + # allocate symbolic variables for the data + #index = T.lscalar() # index to a [mini]batch + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + + for i in xrange( self.n_layers ): + # construct the sigmoidal layer + + # the size of the input is either the number of hidden units of + # the layer below or the input size if we are on the first layer + if i == 0 : + input_size = n_ins + else: + input_size = hidden_layers_sizes[i-1] + + # the input to this layer is either the activation of the hidden + # layer below or the input of the SdA if you are on the first + # layer + if i == 0 : + layer_input = self.x + else: + layer_input = self.layers[-1].output + + layer = SigmoidalLayer(rng, layer_input, input_size, + hidden_layers_sizes[i] ) + # add the layer to the + self.layers += [layer] + self.params += layer.params + + # Construct a denoising autoencoder that shared weights with this + # layer + dA_layer = dA(input_size, hidden_layers_sizes[i], \ + corruption_level = corruption_levels[0],\ + input = layer_input, \ + shared_W = layer.W, shared_b = layer.b) + + self.all_params += dA_layer.params + + # Construct a function that trains this dA + # compute gradients of layer parameters + gparams = T.grad(dA_layer.cost, dA_layer.params) + # compute the list of updates + updates = {} + for param, gparam in zip(dA_layer.params, gparams): + updates[param] = param - gparam * pretrain_lr + + # create a function that trains the dA + update_fn = theano.function([self.x], dA_layer.cost, \ + updates = updates)#, + # givens = { + # self.x : ensemble}) + # collect this function into a list + #update_fn = theano.function([index], dA_layer.cost, \ + # updates = updates, + # givens = { + # self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider}) + # collect this function into a list + self.pretrain_functions += [update_fn] + + + # We now need to add a logistic layer on top of the MLP + self.logLayer = LogisticRegression(\ + input = self.layers[-1].output,\ + n_in = hidden_layers_sizes[-1], n_out = n_outs) + + self.params += self.logLayer.params + self.all_params += self.logLayer.params + # construct a function that implements one step of finetunining + + # compute the cost, defined as the negative log likelihood + cost = self.logLayer.negative_log_likelihood(self.y) + # compute the gradients with respect to the model parameters + gparams = T.grad(cost, self.params) + # compute list of updates + updates = {} + for param,gparam in zip(self.params, gparams): + updates[param] = param - gparam*finetune_lr + + self.finetune = theano.function([self.x,self.y], cost, + updates = updates)#, + # givens = { + # self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider, + # self.y : train_set_y[index*batch_size:(index+1)*batch_size]} ) + + # symbolic variable that points to the number of errors made on the + # minibatch given by self.x and self.y + + self.errors = 
self.logLayer.errors(self.y) + +if __name__ == '__main__': + import sys + args = sys.argv[1:] + diff -r 0de14b2034c6 -r 2024368a8d3d deep/stacked_dae/v2/utils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/v2/utils.py Tue Mar 16 12:14:10 2010 -0400 @@ -0,0 +1,69 @@ +#!/usr/bin/python +# coding: utf-8 + +from __future__ import with_statement + +from jobman import DD + +# from pylearn codebase +# useful in __init__(param1, param2, etc.) to save +# values in self.param1, self.param2... just call +# update_locals(self, locals()) +def update_locals(obj, dct): + if 'self' in dct: + del dct['self'] + obj.__dict__.update(dct) + +# from a dictionary of possible values for hyperparameters, e.g. +# hp_values = {'learning_rate':[0.1, 0.01], 'num_layers': [1,2]} +# create a list of other dictionaries representing all the possible +# combinations, thus in this example creating: +# [{'learning_rate': 0.1, 'num_layers': 1}, ...] +# (similarly for combinations (0.1, 2), (0.01, 1), (0.01, 2)) +def produit_cartesien_jobs(val_dict): + job_list = [DD()] + all_keys = val_dict.keys() + + for key in all_keys: + possible_values = val_dict[key] + new_job_list = [] + for val in possible_values: + for job in job_list: + to_insert = job.copy() + to_insert.update({key: val}) + new_job_list.append(to_insert) + job_list = new_job_list + + return job_list + +def test_produit_cartesien_jobs(): + vals = {'a': [1,2], 'b': [3,4,5]} + print produit_cartesien_jobs(vals) + + +# taken from http://stackoverflow.com/questions/276052/how-to-get-current-cpu-and-ram-usage-in-python +"""Simple module for getting amount of memory used by a specified user's +processes on a UNIX system. +It uses UNIX ps utility to get the memory usage for a specified username and +pipe it to awk for summing up per application memory usage and return the total. +Python's Popen() from subprocess module is used for spawning ps and awk. 
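A minimal usage sketch for the MemoryMonitor class defined below (the username is hypothetical; usage() returns the summed RSS reported by ps, which is typically in kilobytes):

    monitor = MemoryMonitor('someuser')
    print monitor.usage()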
+ +""" + +import subprocess + +class MemoryMonitor(object): + + def __init__(self, username): + """Create new MemoryMonitor instance.""" + self.username = username + + def usage(self): + """Return int containing memory used by user's processes.""" + self.process = subprocess.Popen("ps -u %s -o rss | awk '{sum+=$1} END {print sum}'" % self.username, + shell=True, + stdout=subprocess.PIPE, + ) + self.stdout_list = self.process.communicate()[0].split('\n') + return int(self.stdout_list[0]) + diff -r 0de14b2034c6 -r 2024368a8d3d deep/stacked_dae/v_sylvain/__init__.py diff -r 0de14b2034c6 -r 2024368a8d3d deep/stacked_dae/v_sylvain/nist_sda.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/v_sylvain/nist_sda.py Tue Mar 16 12:14:10 2010 -0400 @@ -0,0 +1,305 @@ +#!/usr/bin/python +# coding: utf-8 + +import ift6266 +import pylearn + +import numpy +import theano +import time + +import pylearn.version +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +import copy +import sys +import os +import os.path + +from jobman import DD +import jobman, jobman.sql +from pylearn.io import filetensor + +from ift6266 import datasets + +from utils import produit_cartesien_jobs + +from sgd_optimization import SdaSgdOptimizer + +#from ift6266.utils.scalar_series import * +from ift6266.utils.seriestables import * +import tables + +############################################################################## +# GLOBALS + +TEST_CONFIG = False + +#NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all' +JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/sylvainpl_sda_vsylvain' +EXPERIMENT_PATH = "ift6266.deep.stacked_dae.v_sylvain.nist_sda.jobman_entrypoint" + +REDUCE_TRAIN_TO = None +MAX_FINETUNING_EPOCHS = 1000 +# number of minibatches before taking means for valid error etc. +REDUCE_EVERY = 100 + +if TEST_CONFIG: + REDUCE_TRAIN_TO = 1000 + MAX_FINETUNING_EPOCHS = 2 + REDUCE_EVERY = 10 + MINIBATCH_SIZE=20 + +# Possible values the hyperparameters can take. These are then +# combined with produit_cartesien_jobs so we get a list of all +# possible combinations, each one resulting in a job inserted +# in the jobman DB. +JOB_VALS = {'pretraining_lr': [0.1],#, 0.01],#, 0.001],#, 0.0001], + 'pretraining_epochs_per_layer': [10], + 'hidden_layers_sizes': [500], + 'corruption_levels': [0.1], + 'minibatch_size': [20], + 'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS], + 'finetuning_lr':[0.1], #0.001 was very bad, so we leave it out + 'num_hidden_layers':[1,1]} + +# Just useful for tests... minimal number of epochs +DEFAULT_HP_NIST = DD({'finetuning_lr':0.1, + 'pretraining_lr':0.1, + 'pretraining_epochs_per_layer':2, + 'max_finetuning_epochs':2, + 'hidden_layers_sizes':500, + 'corruption_levels':0.2, + 'minibatch_size':20, + 'reduce_train_to':10000, + 'num_hidden_layers':1}) + +''' +Function called by jobman upon launching each job +Its path is the one given when inserting jobs: see EXPERIMENT_PATH +''' +def jobman_entrypoint(state, channel): + # record mercurial versions of each package + pylearn.version.record_versions(state,[theano,ift6266,pylearn]) + # TODO: remove this, bad for number of simultaneous requests on DB + channel.save() + + workingdir = os.getcwd() + + ########### Il faudrait arranger ici pour train plus petit + +## print "Will load NIST" +## +## nist = NIST(minibatch_size=20) +## +## print "NIST loaded" +## + # For test runs, we don't want to use the whole dataset so + # reduce it to fewer elements if asked to. 
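    # For example (hypothetical values): with reduce_train_to = 1000 and
    # minibatch_size = 20, rtt below becomes 1000/20 = 50, i.e. the cap is
    # expressed as a number of minibatches rather than a number of examples.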
+ rtt = None + if state.has_key('reduce_train_to'): + rtt = int(state['reduce_train_to']/state['minibatch_size']) + elif REDUCE_TRAIN_TO: + rtt = int(REDUCE_TRAIN_TO/MINIBATCH_SIZE) + + if rtt: + print "Reducing training set to "+str(rtt*state['minibatch_size'])+ " examples" + else: + rtt=float('inf') #No reduction +## nist.reduce_train_set(rtt) +## +## train,valid,test = nist.get_tvt() +## dataset = (train,valid,test) + + n_ins = 32*32 + n_outs = 62 # 10 digits, 26*2 (lower, capitals) + + series = create_series(state.num_hidden_layers) + + print "Creating optimizer with state, ", state + + optimizer = SdaSgdOptimizer(dataset=datasets.nist_all, hyperparameters=state, \ + n_ins=n_ins, n_outs=n_outs,\ + series=series) + + optimizer.pretrain(datasets.nist_all,rtt) + channel.save() + + optimizer.finetune(datasets.nist_all,rtt) + channel.save() + + return channel.COMPLETE + +# These Series objects are used to save various statistics +# during the training. +def create_series(num_hidden_layers): + + # Replace series we don't want to save with DummySeries, e.g. + # series['training_error'] = DummySeries() + + series = {} + + basedir = os.getcwd() + + h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w") + + # reconstruction + reconstruction_base = \ + ErrorSeries(error_name="reconstruction_error", + table_name="reconstruction_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Reconstruction error (mean over "+str(REDUCE_EVERY)+" minibatches)") + series['reconstruction_error'] = \ + AccumulatorSeriesWrapper(base_series=reconstruction_base, + reduce_every=REDUCE_EVERY) + + # train + training_base = \ + ErrorSeries(error_name="training_error", + table_name="training_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Training error (mean over "+str(REDUCE_EVERY)+" minibatches)") + series['training_error'] = \ + AccumulatorSeriesWrapper(base_series=training_base, + reduce_every=REDUCE_EVERY) + + # valid and test are not accumulated/mean, saved directly + series['validation_error'] = \ + ErrorSeries(error_name="validation_error", + table_name="validation_error", + hdf5_file=h5f, + index_names=('epoch','minibatch')) + + series['test_error'] = \ + ErrorSeries(error_name="test_error", + table_name="test_error", + hdf5_file=h5f, + index_names=('epoch','minibatch')) + + param_names = [] + for i in range(num_hidden_layers): + param_names += ['layer%d_W'%i, 'layer%d_b'%i, 'layer%d_bprime'%i] + param_names += ['logreg_layer_W', 'logreg_layer_b'] + + # comment out series we don't want to save + series['params'] = SharedParamsStatisticsWrapper( + new_group_name="params", + base_group="/", + arrays_names=param_names, + hdf5_file=h5f, + index_names=('epoch',)) + + return series + +# Perform insertion into the Postgre DB based on combination +# of hyperparameter values above +# (see comment for produit_cartesien_jobs() to know how it works) +def jobman_insert_nist(): + jobs = produit_cartesien_jobs(JOB_VALS) + + db = jobman.sql.db(JOBDB) + for job in jobs: + job.update({jobman.sql.EXPERIMENT: EXPERIMENT_PATH}) + jobman.sql.insert_dict(job, db) + + print "inserted" + +class NIST: + def __init__(self, minibatch_size, basepath=None, reduce_train_to=None): + global NIST_ALL_LOCATION + + self.minibatch_size = minibatch_size + self.basepath = basepath and basepath or NIST_ALL_LOCATION + + self.set_filenames() + + # arrays of 2 elements: .x, .y + self.train = [None, None] + self.test = [None, None] + + self.load_train_test() + + self.valid = [[], []] + 
self.split_train_valid() + if reduce_train_to: + self.reduce_train_set(reduce_train_to) + + def get_tvt(self): + return self.train, self.valid, self.test + + def set_filenames(self): + self.train_files = ['all_train_data.ft', + 'all_train_labels.ft'] + + self.test_files = ['all_test_data.ft', + 'all_test_labels.ft'] + + def load_train_test(self): + self.load_data_labels(self.train_files, self.train) + self.load_data_labels(self.test_files, self.test) + + def load_data_labels(self, filenames, pair): + for i, fn in enumerate(filenames): + f = open(os.path.join(self.basepath, fn)) + pair[i] = filetensor.read(f) + f.close() + + def reduce_train_set(self, max): + self.train[0] = self.train[0][:max] + self.train[1] = self.train[1][:max] + + if max < len(self.test[0]): + for ar in (self.test, self.valid): + ar[0] = ar[0][:max] + ar[1] = ar[1][:max] + + def split_train_valid(self): + test_len = len(self.test[0]) + + new_train_x = self.train[0][:-test_len] + new_train_y = self.train[1][:-test_len] + + self.valid[0] = self.train[0][-test_len:] + self.valid[1] = self.train[1][-test_len:] + + self.train[0] = new_train_x + self.train[1] = new_train_y + +def test_load_nist(): + print "Will load NIST" + + import time + t1 = time.time() + nist = NIST(20) + t2 = time.time() + + print "NIST loaded. time delta = ", t2-t1 + + tr,v,te = nist.get_tvt() + + print "Lenghts: ", len(tr[0]), len(v[0]), len(te[0]) + + raw_input("Press any key") + +if __name__ == '__main__': + + import sys + + args = sys.argv[1:] + + if len(args) > 0 and args[0] == 'load_nist': + test_load_nist() + + elif len(args) > 0 and args[0] == 'jobman_insert': + jobman_insert_nist() + + elif len(args) > 0 and args[0] == 'test_jobman_entrypoint': + chanmock = DD({'COMPLETE':0,'save':(lambda:None)}) + jobman_entrypoint(DEFAULT_HP_NIST, chanmock) + + else: + print "Bad arguments" + diff -r 0de14b2034c6 -r 2024368a8d3d deep/stacked_dae/v_sylvain/sgd_optimization.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/v_sylvain/sgd_optimization.py Tue Mar 16 12:14:10 2010 -0400 @@ -0,0 +1,274 @@ +#!/usr/bin/python +# coding: utf-8 + +# Generic SdA optimization loop, adapted from the deeplearning.net tutorial + +import numpy +import theano +import time +import datetime +import theano.tensor as T +import sys + +from jobman import DD +import jobman, jobman.sql + +from stacked_dae import SdA + +from ift6266.utils.seriestables import * + +##def shared_dataset(data_xy): +## data_x, data_y = data_xy +## if theano.config.device.startswith("gpu"): +## print "TRANSFERING DATASETS (via shared()) TO GPU" +## shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX)) +## shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX)) +## shared_y = T.cast(shared_y, 'int32') +## else: +## print "WILL RUN ON CPU, NOT GPU, SO DATASETS REMAIN IN BYTES" +## shared_x = theano.shared(data_x) +## shared_y = theano.shared(data_y) +## return shared_x, shared_y + + ######Les shared seront remplacees utilisant "given" dans les enonces de fonction plus loin +def shared_dataset(batch_size, n_in): + + shared_x = theano.shared(numpy.asarray(numpy.zeros((batch_size,n_in)), dtype=theano.config.floatX)) + shared_y = theano.shared(numpy.asarray(numpy.zeros(batch_size), dtype=theano.config.floatX)) + return shared_x, shared_y + +default_series = { \ + 'reconstruction_error' : DummySeries(), + 'training_error' : DummySeries(), + 'validation_error' : DummySeries(), + 'test_error' : DummySeries(), + 'params' : DummySeries() + } + +class 
SdaSgdOptimizer: + def __init__(self, dataset, hyperparameters, n_ins, n_outs, input_divider=1.0, series=default_series): + self.dataset = dataset + self.hp = hyperparameters + self.n_ins = n_ins + self.n_outs = n_outs + self.input_divider = input_divider + + self.series = series + + self.rng = numpy.random.RandomState(1234) + + self.init_datasets() + self.init_classifier() + + sys.stdout.flush() + + def init_datasets(self): + print "init_datasets" + sys.stdout.flush() + + #train_set, valid_set, test_set = self.dataset + self.test_set_x, self.test_set_y = shared_dataset(self.hp.minibatch_size,self.n_ins) + self.valid_set_x, self.valid_set_y = shared_dataset(self.hp.minibatch_size,self.n_ins) + self.train_set_x, self.train_set_y = shared_dataset(self.hp.minibatch_size,self.n_ins) + + # compute number of minibatches for training, validation and testing + self.n_train_batches = self.train_set_x.value.shape[0] / self.hp.minibatch_size + self.n_valid_batches = self.valid_set_x.value.shape[0] / self.hp.minibatch_size + # remove last batch in case it's incomplete + self.n_test_batches = (self.test_set_x.value.shape[0] / self.hp.minibatch_size) - 1 + + def init_classifier(self): + print "Constructing classifier" + + # we don't want to save arrays in DD objects, so + # we recreate those arrays here + nhl = self.hp.num_hidden_layers + layers_sizes = [self.hp.hidden_layers_sizes] * nhl + corruption_levels = [self.hp.corruption_levels] * nhl + + # construct the stacked denoising autoencoder class + self.classifier = SdA( \ + train_set_x= self.train_set_x, \ + train_set_y = self.train_set_y,\ + batch_size = self.hp.minibatch_size, \ + n_ins= self.n_ins, \ + hidden_layers_sizes = layers_sizes, \ + n_outs = self.n_outs, \ + corruption_levels = corruption_levels,\ + rng = self.rng,\ + pretrain_lr = self.hp.pretraining_lr, \ + finetune_lr = self.hp.finetuning_lr,\ + input_divider = self.input_divider ) + + #theano.printing.pydotprint(self.classifier.pretrain_functions[0], "function.graph") + + sys.stdout.flush() + + def train(self): + self.pretrain(self.dataset) + self.finetune(self.dataset) + + def pretrain(self,dataset,reduce): + print "STARTING PRETRAINING, time = ", datetime.datetime.now() + sys.stdout.flush() + + start_time = time.clock() + ## Pre-train layer-wise + for i in xrange(self.classifier.n_layers): + # go through pretraining epochs + for epoch in xrange(self.hp.pretraining_epochs_per_layer): + # go through the training set + batch_index=int(0) + for x,y in dataset.train(self.hp.minibatch_size): + batch_index+=1 + if batch_index > reduce: #If maximum number of mini-batch is used + break + c = self.classifier.pretrain_functions[i](x) + + + self.series["reconstruction_error"].append((epoch, batch_index), c) + + print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c + sys.stdout.flush() + + self.series['params'].append((epoch,), self.classifier.all_params) + + end_time = time.clock() + + print ('Pretraining took %f minutes' %((end_time-start_time)/60.)) + self.hp.update({'pretraining_time': end_time-start_time}) + + sys.stdout.flush() + + def finetune(self,dataset,reduce): + print "STARTING FINETUNING, time = ", datetime.datetime.now() + + #index = T.lscalar() # index to a [mini]batch + minibatch_size = self.hp.minibatch_size + ensemble_x = T.matrix('ensemble_x') + ensemble_y = T.ivector('ensemble_y') + + # create a function to compute the mistakes that are made by the model + # on the validation set, or testing set + shared_divider = theano.shared(numpy.asarray(self.input_divider, 
dtype=theano.config.floatX)) + test_model = theano.function([ensemble_x,ensemble_y], self.classifier.errors, + givens = { + #self.classifier.x: self.test_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider, + #self.classifier.y: self.test_set_y[index*minibatch_size:(index+1)*minibatch_size]}) + self.classifier.x: ensemble_x, + self.classifier.y: ensemble_y}) + + validate_model = theano.function([ensemble_x,ensemble_y], self.classifier.errors, + givens = { + #self.classifier.x: self.valid_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider, + #self.classifier.y: self.valid_set_y[index*minibatch_size:(index+1)*minibatch_size]}) + self.classifier.x: ensemble_x, + self.classifier.y: ensemble_y}) + + + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2. # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(self.n_train_batches, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + best_params = None + best_validation_loss = float('inf') + test_score = 0. + start_time = time.clock() + + done_looping = False + epoch = 0 + + while (epoch < self.hp.max_finetuning_epochs) and (not done_looping): + epoch = epoch + 1 + minibatch_index=int(0) + for x,y in dataset.train(minibatch_size): + minibatch_index +=1 + + if minibatch_index > reduce: #If maximum number of mini-batchs is used + break + + cost_ij = self.classifier.finetune(x,y) + iter = epoch * self.n_train_batches + minibatch_index + + self.series["training_error"].append((epoch, minibatch_index), cost_ij) + + if (iter+1) % validation_frequency == 0: + + #validation_losses = [validate_model(x,y) for x,y in dataset.valid(minibatch_size)] + test_index=int(0) + validation_losses=[] + for x,y in dataset.valid(minibatch_size): + test_index+=1 + if test_index > reduce: + break + validation_losses.append(validate_model(x,y)) + this_validation_loss = numpy.mean(validation_losses) + + self.series["validation_error"].\ + append((epoch, minibatch_index), this_validation_loss*100.) + + print('epoch %i, minibatch %i, validation error %f %%' % \ + (epoch, minibatch_index, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, iter * patience_increase) + + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = iter + + # test it on the test set + #test_losses = [test_model(x,y) for x,y in dataset.test(minibatch_size)] + test_losses=[] + i=0 + for x,y in dataset.test(minibatch_size): + i+=1 + if i > reduce: + break + test_losses.append(test_model(x,y)) + test_score = numpy.mean(test_losses) + + self.series["test_error"].\ + append((epoch, minibatch_index), test_score*100.) 
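    # The counter-and-break pattern used above for the validation and test
    # loops simply caps how many minibatches are consulted; the same idea as
    # a standalone sketch, assuming any iterable of (x, y) minibatches:
    #
    #     import itertools
    #     def capped(minibatch_iter, max_minibatches):
    #         # yield at most max_minibatches minibatches
    #         return itertools.islice(minibatch_iter, max_minibatches)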
+ + print((' epoch %i, minibatch %i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index, + test_score*100.)) + + sys.stdout.flush() + + self.series['params'].append((epoch,), self.classifier.all_params) + + if patience <= iter : + done_looping = True + break + + end_time = time.clock() + self.hp.update({'finetuning_time':end_time-start_time,\ + 'best_validation_error':best_validation_loss,\ + 'test_score':test_score, + 'num_finetuning_epochs':epoch}) + + print(('Optimization complete with best validation score of %f %%,' + 'with test performance %f %%') % + (best_validation_loss * 100., test_score*100.)) + print ('The finetuning ran for %f minutes' % ((end_time-start_time)/60.)) + + + diff -r 0de14b2034c6 -r 2024368a8d3d deep/stacked_dae/v_sylvain/stacked_dae.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/v_sylvain/stacked_dae.py Tue Mar 16 12:14:10 2010 -0400 @@ -0,0 +1,295 @@ +#!/usr/bin/python +# coding: utf-8 + +import numpy +import theano +import time +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams +import copy + +from utils import update_locals + +# taken from LeDeepNet/daa.py +# has a special case when taking log(0) (defined =0) +# modified to not take the mean anymore +from theano.tensor.xlogx import xlogx, xlogy0 +# it's target*log(output) +def binary_cross_entropy(target, output, sum_axis=1): + XE = xlogy0(target, output) + xlogy0((1 - target), (1 - output)) + return -T.sum(XE, axis=sum_axis) + +class LogisticRegression(object): + def __init__(self, input, n_in, n_out): + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) + self.W = theano.shared( value=numpy.zeros((n_in,n_out), + dtype = theano.config.floatX) ) + # initialize the baises b as a vector of n_out 0s + self.b = theano.shared( value=numpy.zeros((n_out,), + dtype = theano.config.floatX) ) + # compute vector of class-membership probabilities in symbolic form + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred=T.argmax(self.p_y_given_x, axis=1) + + # list of parameters for this layer + self.params = [self.W, self.b] + + def negative_log_likelihood(self, y): + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + def errors(self, y): + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + + +class SigmoidalLayer(object): + def __init__(self, rng, input, n_in, n_out): + self.input = input + + W_values = numpy.asarray( rng.uniform( \ + low = -numpy.sqrt(6./(n_in+n_out)), \ + high = numpy.sqrt(6./(n_in+n_out)), \ + size = (n_in, n_out)), dtype = theano.config.floatX) + self.W = theano.shared(value = W_values) + + b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + self.output = T.nnet.sigmoid(T.dot(input, self.W) + self.b) + self.params = [self.W, self.b] + + + +class dA(object): + def __init__(self, n_visible= 784, n_hidden= 500, corruption_level = 0.1,\ + input = None, shared_W = None, shared_b = None): + self.n_visible = n_visible + self.n_hidden = n_hidden + + # 
create a Theano random generator that gives symbolic random values + theano_rng = RandomStreams() + + if shared_W != None and shared_b != None : + self.W = shared_W + self.b = shared_b + else: + # initial values for weights and biases + # note : W' was written as `W_prime` and b' as `b_prime` + + # W is initialized with `initial_W` which is uniformely sampled + # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + initial_W = numpy.asarray( numpy.random.uniform( \ + low = -numpy.sqrt(6./(n_hidden+n_visible)), \ + high = numpy.sqrt(6./(n_hidden+n_visible)), \ + size = (n_visible, n_hidden)), dtype = theano.config.floatX) + initial_b = numpy.zeros(n_hidden, dtype = theano.config.floatX) + + + # theano shared variables for weights and biases + self.W = theano.shared(value = initial_W, name = "W") + self.b = theano.shared(value = initial_b, name = "b") + + + initial_b_prime= numpy.zeros(n_visible) + # tied weights, therefore W_prime is W transpose + self.W_prime = self.W.T + self.b_prime = theano.shared(value = initial_b_prime, name = "b'") + + # if no input is given, generate a variable representing the input + if input == None : + # we use a matrix because we expect a minibatch of several examples, + # each example being a row + self.x = T.dmatrix(name = 'input') + else: + self.x = input + # Equation (1) + # keep 90% of the inputs the same and zero-out randomly selected subset of 10% of the inputs + # note : first argument of theano.rng.binomial is the shape(size) of + # random numbers that it should produce + # second argument is the number of trials + # third argument is the probability of success of any trial + # + # this will produce an array of 0s and 1s where 1 has a + # probability of 1 - ``corruption_level`` and 0 with + # ``corruption_level`` + self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level, dtype=theano.config.floatX) * self.x + # Equation (2) + # note : y is stored as an attribute of the class so that it can be + # used later when stacking dAs. + self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b) + # Equation (3) + #self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime) + # Equation (4) + # note : we sum over the size of a datapoint; if we are using minibatches, + # L will be a vector, with one entry per example in minibatch + #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) + #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1) + + # bypassing z to avoid running to log(0) + z_a = T.dot(self.y, self.W_prime) + self.b_prime + log_sigmoid = T.log(1.) - T.log(1.+T.exp(-z_a)) + # log(1-sigmoid(z_a)) + log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a)) + self.L = -T.sum( self.x * (log_sigmoid) \ + + (1.0-self.x) * (log_1_sigmoid), axis=1 ) + + # I added this epsilon to avoid getting log(0) and 1/0 in grad + # This means conceptually that there'd be no probability of 0, but that + # doesn't seem to me as important (maybe I'm wrong?). + #eps = 0.00000001 + #eps_1 = 1-eps + #self.L = - T.sum( self.x * T.log(eps + eps_1*self.z) \ + # + (1-self.x)*T.log(eps + eps_1*(1-self.z)), axis=1 ) + # note : L is now a vector, where each element is the cross-entropy cost + # of the reconstruction of the corresponding example of the + # minibatch. 
We need to compute the average of all these to get + # the cost of the minibatch + self.cost = T.mean(self.L) + + self.params = [ self.W, self.b, self.b_prime ] + + +class SdA(object): + def __init__(self, train_set_x, train_set_y, batch_size, n_ins, + hidden_layers_sizes, n_outs, + corruption_levels, rng, pretrain_lr, finetune_lr, input_divider=1.0): + # Just to make sure those are not modified somewhere else afterwards + hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes) + corruption_levels = copy.deepcopy(corruption_levels) + + update_locals(self, locals()) + + self.layers = [] + self.pretrain_functions = [] + self.params = [] + # MODIF: added this so we also get the b_primes + # (not used for finetuning... still using ".params") + self.all_params = [] + self.n_layers = len(hidden_layers_sizes) + + print "Creating SdA with params:" + print "batch_size", batch_size + print "hidden_layers_sizes", hidden_layers_sizes + print "corruption_levels", corruption_levels + print "n_ins", n_ins + print "n_outs", n_outs + print "pretrain_lr", pretrain_lr + print "finetune_lr", finetune_lr + print "input_divider", input_divider + print "----" + + #self.shared_divider = theano.shared(numpy.asarray(input_divider, dtype=theano.config.floatX)) + + if len(hidden_layers_sizes) < 1 : + raiseException (' You must have at least one hidden layer ') + + + # allocate symbolic variables for the data + ##index = T.lscalar() # index to a [mini]batch + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + ensemble = T.matrix('ensemble') + ensemble_x = T.matrix('ensemble_x') + ensemble_y = T.ivector('ensemble_y') + + for i in xrange( self.n_layers ): + # construct the sigmoidal layer + + # the size of the input is either the number of hidden units of + # the layer below or the input size if we are on the first layer + if i == 0 : + input_size = n_ins + else: + input_size = hidden_layers_sizes[i-1] + + # the input to this layer is either the activation of the hidden + # layer below or the input of the SdA if you are on the first + # layer + if i == 0 : + layer_input = self.x + else: + layer_input = self.layers[-1].output + + layer = SigmoidalLayer(rng, layer_input, input_size, + hidden_layers_sizes[i] ) + # add the layer to the + self.layers += [layer] + self.params += layer.params + + # Construct a denoising autoencoder that shared weights with this + # layer + dA_layer = dA(input_size, hidden_layers_sizes[i], \ + corruption_level = corruption_levels[0],\ + input = layer_input, \ + shared_W = layer.W, shared_b = layer.b) + + self.all_params += dA_layer.params + + # Construct a function that trains this dA + # compute gradients of layer parameters + gparams = T.grad(dA_layer.cost, dA_layer.params) + # compute the list of updates + updates = {} + for param, gparam in zip(dA_layer.params, gparams): + updates[param] = param - gparam * pretrain_lr + + # create a function that trains the dA + update_fn = theano.function([ensemble], dA_layer.cost, \ + updates = updates, + givens = { + self.x : ensemble}) + # collect this function into a list + self.pretrain_functions += [update_fn] + + + # We now need to add a logistic layer on top of the MLP + self.logLayer = LogisticRegression(\ + input = self.layers[-1].output,\ + n_in = hidden_layers_sizes[-1], n_out = n_outs) + + self.params += self.logLayer.params + self.all_params += self.logLayer.params + # construct a function that implements one step of finetunining + + # 
compute the cost, defined as the negative log likelihood + cost = self.logLayer.negative_log_likelihood(self.y) + # compute the gradients with respect to the model parameters + gparams = T.grad(cost, self.params) + # compute list of updates + updates = {} + for param,gparam in zip(self.params, gparams): + updates[param] = param - gparam*finetune_lr + + self.finetune = theano.function([ensemble_x,ensemble_y], cost, + updates = updates, + givens = { + #self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider, + #self.y : train_set_y[index*batch_size:(index+1)*batch_size]} ) + self.x : ensemble_x, + self.y : ensemble_y} ) + + # symbolic variable that points to the number of errors made on the + # minibatch given by self.x and self.y + + self.errors = self.logLayer.errors(self.y) + +if __name__ == '__main__': + import sys + args = sys.argv[1:] + diff -r 0de14b2034c6 -r 2024368a8d3d deep/stacked_dae/v_sylvain/utils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/v_sylvain/utils.py Tue Mar 16 12:14:10 2010 -0400 @@ -0,0 +1,69 @@ +#!/usr/bin/python +# coding: utf-8 + +from __future__ import with_statement + +from jobman import DD + +# from pylearn codebase +# useful in __init__(param1, param2, etc.) to save +# values in self.param1, self.param2... just call +# update_locals(self, locals()) +def update_locals(obj, dct): + if 'self' in dct: + del dct['self'] + obj.__dict__.update(dct) + +# from a dictionary of possible values for hyperparameters, e.g. +# hp_values = {'learning_rate':[0.1, 0.01], 'num_layers': [1,2]} +# create a list of other dictionaries representing all the possible +# combinations, thus in this example creating: +# [{'learning_rate': 0.1, 'num_layers': 1}, ...] +# (similarly for combinations (0.1, 2), (0.01, 1), (0.01, 2)) +def produit_cartesien_jobs(val_dict): + job_list = [DD()] + all_keys = val_dict.keys() + + for key in all_keys: + possible_values = val_dict[key] + new_job_list = [] + for val in possible_values: + for job in job_list: + to_insert = job.copy() + to_insert.update({key: val}) + new_job_list.append(to_insert) + job_list = new_job_list + + return job_list + +def test_produit_cartesien_jobs(): + vals = {'a': [1,2], 'b': [3,4,5]} + print produit_cartesien_jobs(vals) + + +# taken from http://stackoverflow.com/questions/276052/how-to-get-current-cpu-and-ram-usage-in-python +"""Simple module for getting amount of memory used by a specified user's +processes on a UNIX system. +It uses UNIX ps utility to get the memory usage for a specified username and +pipe it to awk for summing up per application memory usage and return the total. +Python's Popen() from subprocess module is used for spawning ps and awk. 
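As an aside, produit_cartesien_jobs() defined earlier in this module is essentially a Cartesian product over the value lists, returned as DD objects; a rough plain-dict equivalent, for illustration only:

    import itertools

    def cartesian_dicts(val_dict):
        # same combinations as produit_cartesien_jobs(), but as plain dicts
        keys = sorted(val_dict)
        return [dict(zip(keys, combo))
                for combo in itertools.product(*[val_dict[k] for k in keys])]

    # cartesian_dicts({'a': [1, 2], 'b': [3, 4, 5]}) yields 6 dicts,
    # the first being {'a': 1, 'b': 3}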
+ +""" + +import subprocess + +class MemoryMonitor(object): + + def __init__(self, username): + """Create new MemoryMonitor instance.""" + self.username = username + + def usage(self): + """Return int containing memory used by user's processes.""" + self.process = subprocess.Popen("ps -u %s -o rss | awk '{sum+=$1} END {print sum}'" % self.username, + shell=True, + stdout=subprocess.PIPE, + ) + self.stdout_list = self.process.communicate()[0].split('\n') + return int(self.stdout_list[0]) + diff -r 0de14b2034c6 -r 2024368a8d3d scripts/launch_generate100.py --- a/scripts/launch_generate100.py Tue Mar 16 12:13:49 2010 -0400 +++ b/scripts/launch_generate100.py Tue Mar 16 12:14:10 2010 -0400 @@ -3,10 +3,12 @@ import os dir1 = "/data/lisa/data/ift6266h10/" +mach = "brams0c.iro.umontreal.ca,brams02.iro.umontreal.ca,brams03.iro.umontreal.ca,maggie22.iro.umontreal.ca" + for i,s in enumerate(['valid','test']): for j,c in enumerate([0.3,0.5,0.7,1]): l = str(c).replace('.','') - os.system("dbidispatch --condor --os=fc9 --machine=brams0c.iro.umontreal.ca ./run_pipeline.sh -o %sdata/P%s_%s_data.ft -p %sdata/P%s_%s_params -x %sdata/P%s_%s_labels.ft -f %s%s_data.ft -l %s%s_labels.ft -c %socr_%s_data.ft -d %socr_%s_labels.ft -m 0.3 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s %d -y %d" % (dir1, l, s, dir1, l, s, dir1, l, s, dir1, s, dir1, s, dir1, s, dir1, s, [20000,80000][i], 200+i*4+j)) + os.system("dbidispatch --condor --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/P%s_%s_data.ft -p %sdata/P%s_%s_params -x %sdata/P%s_%s_labels.ft -f %s%s_data.ft -l %s%s_labels.ft -c %socr_%s_data.ft -d %socr_%s_labels.ft -m 0.3 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s %d -y %d" % (mach, dir1, l, s, dir1, l, s, dir1, l, s, dir1, s, dir1, s, dir1, s, dir1, s, [20000,80000][i], 200+i*4+j)) for i in range(100): - os.system("dbidispatch --condor --os=fc9 --machine=brams0c.iro.umontreal.ca ./run_pipeline.sh -o %sdata/P07_train%d_data.ft -p %sdata/P07_train%d_params -x %sdata/P07_train%d_labels.ft -f %strain_data.ft -l %strain_labels.ft -c %socr_train_data.ft -d %socr_train_labels.ft -m 0.7 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s 819200 -y %d" % (dir1, i, dir1, i, dir1, i, dir1, dir1, dir1, dir1, 100+i)) + os.system("dbidispatch --condor --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/P07_train%d_data.ft -p %sdata/P07_train%d_params -x %sdata/P07_train%d_labels.ft -f %strain_data.ft -l %strain_labels.ft -c %socr_train_data.ft -d %socr_train_labels.ft -m 0.7 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s 819200 -y %d" % (mach, dir1, i, dir1, i, dir1, i, dir1, dir1, dir1, dir1, 100+i)) diff -r 0de14b2034c6 -r 2024368a8d3d test.py --- a/test.py Tue Mar 16 12:13:49 2010 -0400 +++ b/test.py Tue Mar 16 12:14:10 2010 -0400 @@ -1,8 +1,7 @@ import doctest, sys, pkgutil -def runTests(options = doctest.ELLIPSIS or doctest.DONT_ACCEPT_TRUE_FOR_1): +def runTests(): import ift6266 - predefs = ift6266.__dict__ for (_, name, ispkg) in pkgutil.walk_packages(ift6266.__path__, ift6266.__name__+'.'): if not ispkg: if name.startswith('ift6266.scripts.') or \ @@ -11,9 +10,21 @@ 'ift6266.data_generation.transformations.testmod', 'ift6266.data_generation.transformations.gimp_script']: continue - print "Testing:", name - __import__(name) - doctest.testmod(sys.modules[name], extraglobs=predefs, optionflags=options) + test(name) + +def test(name): + import ift6266 + predefs = ift6266.__dict__ + options = doctest.ELLIPSIS or doctest.DONT_ACCEPT_TRUE_FOR_1 + print "Testing:", name + __import__(name) + doctest.testmod(sys.modules[name], extraglobs=predefs, optionflags=options) 
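One note on the option flags used in test() above: doctest option flags are integer bit masks meant to be combined with the bitwise OR operator, while an expression of the form a or b evaluates to its first truthy operand, so doctest.ELLIPSIS or doctest.DONT_ACCEPT_TRUE_FOR_1 leaves only ELLIPSIS in effect. The combination presumably intended is:

    options = doctest.ELLIPSIS | doctest.DONT_ACCEPT_TRUE_FOR_1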
if __name__ == '__main__': - runTests() + if len(sys.argv) > 1: + for mod in sys.argv[1:]: + if mod.endswith('.py'): + mod = mod[:-3] + test(mod) + else: + runTests() diff -r 0de14b2034c6 -r 2024368a8d3d utils/seriestables/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/seriestables/__init__.py Tue Mar 16 12:14:10 2010 -0400 @@ -0,0 +1,2 @@ +from series import ErrorSeries, BasicStatisticsSeries, AccumulatorSeriesWrapper, SeriesArrayWrapper, SharedParamsStatisticsWrapper, DummySeries + diff -r 0de14b2034c6 -r 2024368a8d3d utils/seriestables/series.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/seriestables/series.py Tue Mar 16 12:14:10 2010 -0400 @@ -0,0 +1,605 @@ +import tables + +import numpy +import time + +############################################################################## +# Utility functions to create IsDescription objects (pytables data types) + +''' +The way these "IsDescription constructor" work is simple: write the +code as if it were in a file, then exec()ute it, leaving us with +a local-scoped LocalDescription which may be used to call createTable. + +It's a small hack, but it's necessary as the names of the columns +are retrieved based on the variable name, which we can't programmatically set +otherwise. +''' + +def _get_description_timestamp_cpuclock_columns(store_timestamp, store_cpuclock, pos=0): + toexec = "" + + if store_timestamp: + toexec += "\ttimestamp = tables.Time32Col(pos="+str(pos)+")\n" + pos += 1 + + if store_cpuclock: + toexec += "\tcpuclock = tables.Float64Col(pos="+str(pos)+")\n" + pos += 1 + + return toexec, pos + +def _get_description_n_ints(int_names, int_width=64, pos=0): + """ + Begins construction of a class inheriting from IsDescription + to construct an HDF5 table with index columns named with int_names. + + See Series().__init__ to see how those are used. + """ + int_constructor = "tables.Int64Col" + if int_width == 32: + int_constructor = "tables.Int32Col" + elif not int_width in (32, 64): + raise "int_width must be left unspecified, or should equal 32 or 64" + + toexec = "" + + for n in int_names: + toexec += "\t" + n + " = " + int_constructor + "(pos=" + str(pos) + ")\n" + pos += 1 + + return toexec, pos + +def _get_description_with_n_ints_n_floats(int_names, float_names, + int_width=64, float_width=32, + store_timestamp=True, store_cpuclock=True): + """ + Constructs a class to be used when constructing a table with PyTables. + + This is useful to construct a series with an index with multiple levels. + E.g. if you want to index your "validation error" with "epoch" first, then + "minibatch_index" second, you'd use two "int_names". + + Parameters + ---------- + int_names : tuple of str + Names of the int (e.g. index) columns + float_names : tuple of str + Names of the float (e.g. error) columns + int_width : {'32', '64'} + Type of ints. + float_width : {'32', '64'} + Type of floats. 
+ store_timestamp : bool + See __init__ of Series + store_cpuclock : bool + See __init__ of Series + + Returns + ------- + A class object, to pass to createTable() + """ + + toexec = "class LocalDescription(tables.IsDescription):\n" + + toexec_, pos = _get_description_timestamp_cpuclock_columns(store_timestamp, store_cpuclock) + toexec += toexec_ + + toexec_, pos = _get_description_n_ints(int_names, int_width=int_width, pos=pos) + toexec += toexec_ + + float_constructor = "tables.Float32Col" + if float_width == 64: + float_constructor = "tables.Float64Col" + elif not float_width in (32, 64): + raise "float_width must be left unspecified, or should equal 32 or 64" + + for n in float_names: + toexec += "\t" + n + " = " + float_constructor + "(pos=" + str(pos) + ")\n" + pos += 1 + + exec(toexec) + + return LocalDescription + +############################################################################## +# Series classes + +# Shortcut to allow passing a single int as index, instead of a tuple +def _index_to_tuple(index): + if type(index) == tuple: + return index + + if type(index) == list: + index = tuple(index) + return index + + try: + if index % 1 > 0.001 and index % 1 < 0.999: + raise + idx = long(index) + return (idx,) + except: + raise TypeError("index must be a tuple of integers, or at least a single integer") + +class Series(): + """ + Base Series class, with minimal arguments and type checks. + + Yet cannot be used by itself (it's append() method raises an error) + """ + + def __init__(self, table_name, hdf5_file, index_names=('epoch',), + title="", hdf5_group='/', + store_timestamp=True, store_cpuclock=True): + """Basic arguments each Series must get. + + Parameters + ---------- + table_name : str + Name of the table to create under group "hd5_group" (other + parameter). No spaces, ie. follow variable naming restrictions. + hdf5_file : open HDF5 file + File opened with openFile() in PyTables (ie. return value of + openFile). + index_names : tuple of str + Columns to use as index for elements in the series, other + example would be ('epoch', 'minibatch'). This would then allow + you to call append(index, element) with index made of two ints, + one for epoch index, one for minibatch index in epoch. + title : str + Title to attach to this table as metadata. Can contain spaces + and be longer then the table_name. + hdf5_group : str + Path of the group (kind of a file) in the HDF5 file under which + to create the table. + store_timestamp : bool + Whether to create a column for timestamps and store them with + each record. + store_cpuclock : bool + Whether to create a column for cpu clock and store it with + each record. + """ + + ######################################### + # checks + + if type(table_name) != str: + raise TypeError("table_name must be a string") + if table_name == "": + raise ValueError("table_name must not be empty") + + if not isinstance(hdf5_file, tables.file.File): + raise TypeError("hdf5_file must be an open HDF5 file (use tables.openFile)") + #if not ('w' in hdf5_file.mode or 'a' in hdf5_file.mode): + # raise ValueError("hdf5_file must be opened in write or append mode") + + if type(index_names) != tuple: + raise TypeError("index_names must be a tuple of strings." + \ + "If you have only one element in the tuple, don't forget " +\ + "to add a comma, e.g. 
('epoch',).") + for name in index_names: + if type(name) != str: + raise TypeError("index_names must only contain strings, but also"+\ + "contains a "+str(type(name))+".") + + if type(title) != str: + raise TypeError("title must be a string, even if empty") + + if type(hdf5_group) != str: + raise TypeError("hdf5_group must be a string") + + if type(store_timestamp) != bool: + raise TypeError("store_timestamp must be a bool") + + if type(store_cpuclock) != bool: + raise TypeError("store_timestamp must be a bool") + + ######################################### + + self.table_name = table_name + self.hdf5_file = hdf5_file + self.index_names = index_names + self.title = title + self.hdf5_group = hdf5_group + + self.store_timestamp = store_timestamp + self.store_cpuclock = store_cpuclock + + def append(self, index, element): + raise NotImplementedError + + def _timestamp_cpuclock(self, newrow): + if self.store_timestamp: + newrow["timestamp"] = time.time() + + if self.store_cpuclock: + newrow["cpuclock"] = time.clock() + +class DummySeries(): + """ + To put in a series dictionary instead of a real series, to do nothing + when we don't want a given series to be saved. + + E.g. if we'd normally have a "training_error" series in a dictionary + of series, the training loop would have something like this somewhere: + + series["training_error"].append((15,), 20.0) + + but if we don't want to save the training errors this time, we simply + do + + series["training_error"] = DummySeries() + """ + def append(self, index, element): + pass + +class ErrorSeries(Series): + """ + Most basic Series: saves a single float (called an Error as this is + the most common use case I foresee) along with an index (epoch, for + example) and timestamp/cpu.clock for each of these floats. + """ + + def __init__(self, error_name, table_name, + hdf5_file, index_names=('epoch',), + title="", hdf5_group='/', + store_timestamp=True, store_cpuclock=True): + """ + For most parameters, see Series.__init__ + + Parameters + ---------- + error_name : str + In the HDF5 table, column name for the error float itself. + """ + + # most type/value checks are performed in Series.__init__ + Series.__init__(self, table_name, hdf5_file, index_names, title, + store_timestamp=store_timestamp, + store_cpuclock=store_cpuclock) + + if type(error_name) != str: + raise TypeError("error_name must be a string") + if error_name == "": + raise ValueError("error_name must not be empty") + + self.error_name = error_name + + self._create_table() + + def _create_table(self): + table_description = _get_description_with_n_ints_n_floats( \ + self.index_names, (self.error_name,), + store_timestamp=self.store_timestamp, + store_cpuclock=self.store_cpuclock) + + self._table = self.hdf5_file.createTable(self.hdf5_group, + self.table_name, + table_description, + title=self.title) + + + def append(self, index, error): + """ + Parameters + ---------- + index : tuple of int + Following index_names passed to __init__, e.g. (12, 15) if + index_names were ('epoch', 'minibatch_size'). + A single int (not tuple) is acceptable if index_names has a single + element. + An array will be casted to a tuple, as a convenience. + + error : float + Next error in the series. + """ + index = _index_to_tuple(index) + + if len(index) != len(self.index_names): + raise ValueError("index provided does not have the right length (expected " \ + + str(len(self.index_names)) + " got " + str(len(index))) + + # other checks are implicit when calling newrow[..] 
=, + # which should throw an error if not of the right type + + newrow = self._table.row + + # Columns for index in table are based on index_names + for col_name, value in zip(self.index_names, index): + newrow[col_name] = value + newrow[self.error_name] = error + + # adds timestamp and cpuclock to newrow if necessary + self._timestamp_cpuclock(newrow) + + newrow.append() + + self.hdf5_file.flush() + +# Does not inherit from Series because it does not itself need to +# access the hdf5_file and does not need a series_name (provided +# by the base_series.) +class AccumulatorSeriesWrapper(): + ''' + Wraps a Series by accumulating objects passed its Accumulator.append() + method and "reducing" (e.g. calling numpy.mean(list)) once in a while, + every "reduce_every" calls in fact. + ''' + + def __init__(self, base_series, reduce_every, reduce_function=numpy.mean): + """ + Parameters + ---------- + base_series : Series + This object must have an append(index, value) function. + + reduce_every : int + Apply the reduction function (e.g. mean()) every time we get this + number of elements. E.g. if this is 100, then every 100 numbers + passed to append(), we'll take the mean and call append(this_mean) + on the BaseSeries. + + reduce_function : function + Must take as input an array of "elements", as passed to (this + accumulator's) append(). Basic case would be to take an array of + floats and sum them into one float, for example. + """ + self.base_series = base_series + self.reduce_function = reduce_function + self.reduce_every = reduce_every + + self._buffer = [] + + + def append(self, index, element): + """ + Parameters + ---------- + index : tuple of int + The index used is the one of the last element reduced. E.g. if + you accumulate over the first 1000 minibatches, the index + passed to the base_series.append() function will be 1000. + A single int (not tuple) is acceptable if index_names has a single + element. + An array will be casted to a tuple, as a convenience. + + element : float + Element that will be accumulated. + """ + self._buffer.append(element) + + if len(self._buffer) == self.reduce_every: + reduced = self.reduce_function(self._buffer) + self.base_series.append(index, reduced) + self._buffer = [] + + # The >= case should never happen, except if lists + # were appended by accessing _buffer externally (when it's + # intended to be private), which should be a red flag. + assert len(self._buffer) < self.reduce_every + +# Outside of class to fix an issue with exec in Python 2.6. +# My sorries to the god of pretty code. +def _BasicStatisticsSeries_construct_table_toexec(index_names, store_timestamp, store_cpuclock): + toexec = "class LocalDescription(tables.IsDescription):\n" + + toexec_, pos = _get_description_timestamp_cpuclock_columns(store_timestamp, store_cpuclock) + toexec += toexec_ + + toexec_, pos = _get_description_n_ints(index_names, pos=pos) + toexec += toexec_ + + toexec += "\tmean = tables.Float32Col(pos=" + str(pos) + ")\n" + toexec += "\tmin = tables.Float32Col(pos=" + str(pos+1) + ")\n" + toexec += "\tmax = tables.Float32Col(pos=" + str(pos+2) + ")\n" + toexec += "\tstd = tables.Float32Col(pos=" + str(pos+3) + ")\n" + + # This creates "LocalDescription", which we may then use + exec(toexec) + + return LocalDescription + +# Defaults functions for BasicStatsSeries. These can be replaced. 
+_basic_stats_functions = {'mean': lambda(x): numpy.mean(x), + 'min': lambda(x): numpy.min(x), + 'max': lambda(x): numpy.max(x), + 'std': lambda(x): numpy.std(x)} + +class BasicStatisticsSeries(Series): + + def __init__(self, table_name, hdf5_file, + stats_functions=_basic_stats_functions, + index_names=('epoch',), title="", hdf5_group='/', + store_timestamp=True, store_cpuclock=True): + """ + For most parameters, see Series.__init__ + + Parameters + ---------- + series_name : str + Not optional here. Will be prepended with "Basic statistics for " + + stats_functions : dict, optional + Dictionary with a function for each key "mean", "min", "max", + "std". The function must take whatever is passed to append(...) + and return a single number (float). + """ + + # Most type/value checks performed in Series.__init__ + Series.__init__(self, table_name, hdf5_file, index_names, title, + store_timestamp=store_timestamp, + store_cpuclock=store_cpuclock) + + if type(hdf5_group) != str: + raise TypeError("hdf5_group must be a string") + + if type(stats_functions) != dict: + # just a basic check. We'll suppose caller knows what he's doing. + raise TypeError("stats_functions must be a dict") + + self.hdf5_group = hdf5_group + + self.stats_functions = stats_functions + + self._create_table() + + def _create_table(self): + table_description = \ + _BasicStatisticsSeries_construct_table_toexec( \ + self.index_names, + self.store_timestamp, self.store_cpuclock) + + self._table = self.hdf5_file.createTable(self.hdf5_group, + self.table_name, table_description) + + def append(self, index, array): + """ + Parameters + ---------- + index : tuple of int + Following index_names passed to __init__, e.g. (12, 15) + if index_names were ('epoch', 'minibatch_size') + A single int (not tuple) is acceptable if index_names has a single + element. + An array will be casted to a tuple, as a convenience. + + array + Is of whatever type the stats_functions passed to + __init__ can take. Default is anything numpy.mean(), + min(), max(), std() can take. + """ + index = _index_to_tuple(index) + + if len(index) != len(self.index_names): + raise ValueError("index provided does not have the right length (expected " \ + + str(len(self.index_names)) + " got " + str(len(index))) + + newrow = self._table.row + + for col_name, value in zip(self.index_names, index): + newrow[col_name] = value + + newrow["mean"] = self.stats_functions['mean'](array) + newrow["min"] = self.stats_functions['min'](array) + newrow["max"] = self.stats_functions['max'](array) + newrow["std"] = self.stats_functions['std'](array) + + self._timestamp_cpuclock(newrow) + + newrow.append() + + self.hdf5_file.flush() + +class SeriesArrayWrapper(): + """ + Simply redistributes any number of elements to sub-series to respective + append()s. + + To use if you have many elements to append in similar series, e.g. if you + have an array containing [train_error, valid_error, test_error], and 3 + corresponding series, this allows you to simply pass this array of 3 + values to append() instead of passing each element to each individual + series in turn. + """ + + def __init__(self, base_series_list): + """ + Parameters + ---------- + base_series_list : array or tuple of Series + You must have previously created and configured each of those + series, then put them in an array. This array must follow the + same order as the array passed as ``elements`` parameter of + append(). 
+ """ + self.base_series_list = base_series_list + + def append(self, index, elements): + """ + Parameters + ---------- + index : tuple of int + See for example ErrorSeries.append() + + elements : array or tuple + Array or tuple of elements that will be passed down to + the base_series passed to __init__, in the same order. + """ + if len(elements) != len(self.base_series_list): + raise ValueError("not enough or too much elements provided (expected " \ + + str(len(self.base_series_list)) + " got " + str(len(elements))) + + for series, el in zip(self.base_series_list, elements): + series.append(index, el) + +class SharedParamsStatisticsWrapper(SeriesArrayWrapper): + ''' + Save mean, min/max, std of shared parameters place in an array. + + Here "shared" means "theano.shared", which means elements of the + array will have a .value to use for numpy.mean(), etc. + + This inherits from SeriesArrayWrapper, which provides the append() + method. + ''' + + def __init__(self, arrays_names, new_group_name, hdf5_file, + base_group='/', index_names=('epoch',), title="", + store_timestamp=True, store_cpuclock=True): + """ + For other parameters, see Series.__init__ + + Parameters + ---------- + array_names : array or tuple of str + Name of each array, in order of the array passed to append(). E.g. + ('layer1_b', 'layer1_W', 'layer2_b', 'layer2_W') + + new_group_name : str + Name of a new HDF5 group which will be created under base_group to + store the new series. + + base_group : str + Path of the group under which to create the new group which will + store the series. + + title : str + Here the title is attached to the new group, not a table. + + store_timestamp : bool + Here timestamp and cpuclock are stored in *each* table + + store_cpuclock : bool + Here timestamp and cpuclock are stored in *each* table + """ + + # most other checks done when calling BasicStatisticsSeries + if type(new_group_name) != str: + raise TypeError("new_group_name must be a string") + if new_group_name == "": + raise ValueError("new_group_name must not be empty") + + base_series_list = [] + + new_group = hdf5_file.createGroup(base_group, new_group_name, title=title) + + stats_functions = {'mean': lambda(x): numpy.mean(x.value), + 'min': lambda(x): numpy.min(x.value), + 'max': lambda(x): numpy.max(x.value), + 'std': lambda(x): numpy.std(x.value)} + + for name in arrays_names: + base_series_list.append( + BasicStatisticsSeries( + table_name=name, + hdf5_file=hdf5_file, + index_names=index_names, + stats_functions=stats_functions, + hdf5_group=new_group._v_pathname, + store_timestamp=store_timestamp, + store_cpuclock=store_cpuclock)) + + SeriesArrayWrapper.__init__(self, base_series_list) + + diff -r 0de14b2034c6 -r 2024368a8d3d utils/seriestables/test_series.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/seriestables/test_series.py Tue Mar 16 12:14:10 2010 -0400 @@ -0,0 +1,311 @@ +import tempfile + +import numpy +import numpy.random + +from jobman import DD + +import tables + +from series import * +import series + +################################################# +# Utils + +def compare_floats(f1,f2): + if f1-f2 < 1e-3: + return True + return False + +def compare_lists(it1, it2, floats=False): + if len(it1) != len(it2): + return False + + for el1, el2 in zip(it1, it2): + if floats: + if not compare_floats(el1,el2): + return False + elif el1 != el2: + return False + + return True + +################################################# +# Basic Series class tests + +def test_Series_types(): + pass + 
+################################################# +# ErrorSeries tests + +def test_ErrorSeries_common_case(h5f=None): + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + validation_error = series.ErrorSeries(error_name="validation_error", table_name="validation_error", + hdf5_file=h5f, index_names=('epoch','minibatch'), + title="Validation error indexed by epoch and minibatch") + + # (1,1), (1,2) etc. are (epoch, minibatch) index + validation_error.append((1,1), 32.0) + validation_error.append((1,2), 30.0) + validation_error.append((2,1), 28.0) + validation_error.append((2,2), 26.0) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + table = h5f.getNode('/', 'validation_error') + + assert compare_lists(table.cols.epoch[:], [1,1,2,2]) + assert compare_lists(table.cols.minibatch[:], [1,2,1,2]) + assert compare_lists(table.cols.validation_error[:], [32.0, 30.0, 28.0, 26.0]) + +def test_ErrorSeries_no_index(h5f=None): + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + validation_error = series.ErrorSeries(error_name="validation_error", + table_name="validation_error", + hdf5_file=h5f, + # empty tuple + index_names=tuple(), + title="Validation error with no index") + + # (1,1), (1,2) etc. are (epoch, minibatch) index + validation_error.append(tuple(), 32.0) + validation_error.append(tuple(), 30.0) + validation_error.append(tuple(), 28.0) + validation_error.append(tuple(), 26.0) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + table = h5f.getNode('/', 'validation_error') + + assert compare_lists(table.cols.validation_error[:], [32.0, 30.0, 28.0, 26.0]) + assert not ("epoch" in dir(table.cols)) + +def test_ErrorSeries_notimestamp(h5f=None): + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + validation_error = series.ErrorSeries(error_name="validation_error", table_name="validation_error", + hdf5_file=h5f, index_names=('epoch','minibatch'), + title="Validation error indexed by epoch and minibatch", + store_timestamp=False) + + # (1,1), (1,2) etc. are (epoch, minibatch) index + validation_error.append((1,1), 32.0) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + table = h5f.getNode('/', 'validation_error') + + assert compare_lists(table.cols.epoch[:], [1]) + assert not ("timestamp" in dir(table.cols)) + assert "cpuclock" in dir(table.cols) + +def test_ErrorSeries_nocpuclock(h5f=None): + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + validation_error = series.ErrorSeries(error_name="validation_error", table_name="validation_error", + hdf5_file=h5f, index_names=('epoch','minibatch'), + title="Validation error indexed by epoch and minibatch", + store_cpuclock=False) + + # (1,1), (1,2) etc. 
are (epoch, minibatch) index + validation_error.append((1,1), 32.0) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + table = h5f.getNode('/', 'validation_error') + + assert compare_lists(table.cols.epoch[:], [1]) + assert not ("cpuclock" in dir(table.cols)) + assert "timestamp" in dir(table.cols) + +def test_AccumulatorSeriesWrapper_common_case(h5f=None): + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + validation_error = ErrorSeries(error_name="accumulated_validation_error", + table_name="accumulated_validation_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Validation error, summed every 3 minibatches, indexed by epoch and minibatch") + + accumulator = AccumulatorSeriesWrapper(base_series=validation_error, + reduce_every=3, reduce_function=numpy.sum) + + # (1,1), (1,2) etc. are (epoch, minibatch) index + accumulator.append((1,1), 32.0) + accumulator.append((1,2), 30.0) + accumulator.append((2,1), 28.0) + accumulator.append((2,2), 26.0) + accumulator.append((3,1), 24.0) + accumulator.append((3,2), 22.0) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + table = h5f.getNode('/', 'accumulated_validation_error') + + assert compare_lists(table.cols.epoch[:], [2,3]) + assert compare_lists(table.cols.minibatch[:], [1,2]) + assert compare_lists(table.cols.accumulated_validation_error[:], [90.0,72.0], floats=True) + +def test_BasicStatisticsSeries_common_case(h5f=None): + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + stats_series = BasicStatisticsSeries(table_name="b_vector_statistics", + hdf5_file=h5f, index_names=('epoch','minibatch'), + title="Basic statistics for b vector indexed by epoch and minibatch") + + # (1,1), (1,2) etc. 
are (epoch, minibatch) index + stats_series.append((1,1), [0.15, 0.20, 0.30]) + stats_series.append((1,2), [-0.18, 0.30, 0.58]) + stats_series.append((2,1), [0.18, -0.38, -0.68]) + stats_series.append((2,2), [0.15, 0.02, 1.9]) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + table = h5f.getNode('/', 'b_vector_statistics') + + assert compare_lists(table.cols.epoch[:], [1,1,2,2]) + assert compare_lists(table.cols.minibatch[:], [1,2,1,2]) + assert compare_lists(table.cols.mean[:], [0.21666667, 0.23333333, -0.29333332, 0.69], floats=True) + assert compare_lists(table.cols.min[:], [0.15000001, -0.18000001, -0.68000001, 0.02], floats=True) + assert compare_lists(table.cols.max[:], [0.30, 0.58, 0.18, 1.9], floats=True) + assert compare_lists(table.cols.std[:], [0.06236095, 0.31382939, 0.35640177, 0.85724366], floats=True) + +def test_SharedParamsStatisticsWrapper_commoncase(h5f=None): + import numpy.random + + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + stats = SharedParamsStatisticsWrapper(new_group_name="params", base_group="/", + arrays_names=('b1','b2','b3'), hdf5_file=h5f, + index_names=('epoch','minibatch')) + + b1 = DD({'value':numpy.random.rand(5)}) + b2 = DD({'value':numpy.random.rand(5)}) + b3 = DD({'value':numpy.random.rand(5)}) + stats.append((1,1), [b1,b2,b3]) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + b1_table = h5f.getNode('/params', 'b1') + b3_table = h5f.getNode('/params', 'b3') + + assert b1_table.cols.mean[0] - numpy.mean(b1.value) < 1e-3 + assert b3_table.cols.mean[0] - numpy.mean(b3.value) < 1e-3 + assert b1_table.cols.min[0] - numpy.min(b1.value) < 1e-3 + assert b3_table.cols.min[0] - numpy.min(b3.value) < 1e-3 + +def test_SharedParamsStatisticsWrapper_notimestamp(h5f=None): + import numpy.random + + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + stats = SharedParamsStatisticsWrapper(new_group_name="params", base_group="/", + arrays_names=('b1','b2','b3'), hdf5_file=h5f, + index_names=('epoch','minibatch'), + store_timestamp=False) + + b1 = DD({'value':numpy.random.rand(5)}) + b2 = DD({'value':numpy.random.rand(5)}) + b3 = DD({'value':numpy.random.rand(5)}) + stats.append((1,1), [b1,b2,b3]) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + b1_table = h5f.getNode('/params', 'b1') + b3_table = h5f.getNode('/params', 'b3') + + assert b1_table.cols.mean[0] - numpy.mean(b1.value) < 1e-3 + assert b3_table.cols.mean[0] - numpy.mean(b3.value) < 1e-3 + assert b1_table.cols.min[0] - numpy.min(b1.value) < 1e-3 + assert b3_table.cols.min[0] - numpy.min(b3.value) < 1e-3 + + assert not ('timestamp' in dir(b1_table.cols)) + +def test_get_desc(): + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + desc = series._get_description_with_n_ints_n_floats(("col1","col2"), ("col3","col4")) + + mytable = h5f.createTable('/', 'mytable', desc) + + # just make sure the columns are there... otherwise this will throw an exception + mytable.cols.col1 + mytable.cols.col2 + mytable.cols.col3 + mytable.cols.col4 + + try: + # this should fail... 
LocalDescription must be local to get_desc_etc
+        test = LocalDescription
+        assert False
+    except NameError:
+        # LocalDescription must be undefined here; an AssertionError from the
+        # line above must propagate, so only NameError is caught
+        assert True
+
+    assert True
+
+def test_index_to_tuple_floaterror():
+    try:
+        series._index_to_tuple(5.1)
+        assert False
+    except TypeError:
+        assert True
+
+def test_index_to_tuple_arrayok():
+    tpl = series._index_to_tuple([1,2,3])
+    assert type(tpl) == tuple and tpl[1] == 2 and tpl[2] == 3
+
+def test_index_to_tuple_intbecomestuple():
+    tpl = series._index_to_tuple(32)
+
+    assert type(tpl) == tuple and tpl == (32,)
+
+def test_index_to_tuple_longbecomestuple():
+    tpl = series._index_to_tuple(928374928374928L)
+
+    assert type(tpl) == tuple and tpl == (928374928374928L,)
+
+if __name__ == '__main__':
+    import tempfile
+    test_get_desc()
+    test_ErrorSeries_common_case()
+    test_BasicStatisticsSeries_common_case()
+    test_AccumulatorSeriesWrapper_common_case()
+    test_SharedParamsStatisticsWrapper_commoncase()
+
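# Editor's sketch (illustrative only, not part of the changeset): recording
# per-epoch statistics of model parameters with SharedParamsStatisticsWrapper.
# The parameter names are invented; any objects exposing a .value ndarray
# (e.g. jobman's DD as in the tests above, or theano.shared variables of that
# era) will do, since the statistics are computed on element.value.
def _example_shared_params_monitoring(h5f, params):
    # params: dict mapping a name to a parameter object,
    # e.g. {'layer1_W': W, 'layer1_b': b}
    names = tuple(sorted(params.keys()))
    stats = SharedParamsStatisticsWrapper(arrays_names=names,
                                          new_group_name="params",
                                          hdf5_file=h5f,
                                          base_group='/',
                                          index_names=('epoch',))
    for epoch in range(10):
        # ... one training epoch would run here ...
        stats.append((epoch,), [params[n] for n in names])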