# HG changeset patch
# User fsavard
# Date 1268692221 14400
# Node ID 42005ec87747360de11e3a7313053bc00baed41a
# Parent 9fc641d7adda89273fb5272805e1c6279266ea3e
Merged (manually) Sylvain's changes to use Arnaud's dataset code, except that
I don't use the givens. I probably also take a different approach to limiting
the size of the dataset while debugging.
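
To illustrate the "no givens" part of this change: the sketch below is not
code from this changeset; the model (a bare logistic regression) and the
names build_trainer / n_ins / n_outs / lr are placeholders. The point is only
that the compiled Theano functions now take the minibatch arrays as ordinary
inputs, instead of taking a minibatch index and slicing dataset-sized shared
variables through givens, so they can be fed from any iterator of (x, y)
pairs.

    # Illustrative sketch only (assumed names, simplified model).
    import numpy
    import theano
    import theano.tensor as T

    def build_trainer(n_ins, n_outs, lr=0.1):
        x = T.matrix('x')    # a minibatch of examples, one row per example
        y = T.ivector('y')   # the integer labels of that minibatch
        W = theano.shared(numpy.zeros((n_ins, n_outs), dtype=theano.config.floatX))
        b = theano.shared(numpy.zeros((n_outs,), dtype=theano.config.floatX))
        p_y_given_x = T.nnet.softmax(T.dot(x, W) + b)
        cost = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
        gW, gb = T.grad(cost, [W, b])
        updates = [(W, W - lr * gW), (b, b - lr * gb)]
        # No givens: x and y are plain inputs, so the caller decides where the
        # minibatches come from (here, the ift6266 dataset iterators).
        return theano.function([x, y], cost, updates=updates)

This is why pretrain() and finetune() below can simply loop over
dataset.train(minibatch_size) and call the compiled functions on each (x, y)
pair.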
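
For the "limiting the dataset size" part: instead of truncating the arrays
(as the old NIST.reduce_train_set did), the number of minibatches drawn from
the iterator is capped. Again only a sketch under assumed names (run_epoch,
trainer), with dataset.train standing in for the ift6266 dataset iterators:

    # Illustrative sketch only: stop an arbitrary minibatch iterator after
    # max_count minibatches, so test runs stay short without touching the
    # underlying data files.
    def itermax(it, max_count):
        for i, minibatch in enumerate(it):
            if i >= max_count:
                break
            yield minibatch

    def run_epoch(trainer, dataset, minibatch_size=20, max_minibatches=None):
        it = dataset.train(minibatch_size)        # yields (x, y) minibatches
        if max_minibatches:
            it = itermax(it, max_minibatches)     # debugging: shorter epoch
        costs = [trainer(x, y) for x, y in it]
        return sum(costs) / len(costs)
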
diff -r 9fc641d7adda -r 42005ec87747 deep/stacked_dae/v2/config.py.example
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/stacked_dae/v2/config.py.example	Mon Mar 15 18:30:21 2010 -0400
@@ -0,0 +1,64 @@
+'''
+These are parameters used by nist_sda.py. They'll end up as globals in there.
+
+Rename this file to config.py and configure as needed.
+DON'T add the renamed file to the repository, as others might use it
+without realizing it, with dire consequences.
+'''
+
+# Set this to True when you want to run cluster tests, ie. you want
+# to run on the cluster, many jobs, but want to reduce the training
+# set size and the number of epochs, so you know everything runs
+# fine on the cluster.
+# Set this PRIOR to inserting your test jobs in the DB.
+TEST_CONFIG = False
+
+NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
+NIST_ALL_TRAIN_SIZE = 649081
+# valid and test = 82587 82587
+
+# change "sandbox" when you're ready
+JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/yourtablenamehere'
+EXPERIMENT_PATH = "ift6266.deep.stacked_dae.v2.nist_sda.jobman_entrypoint"
+
+# reduce training set to that many examples
+REDUCE_TRAIN_TO = None
+# that's a max, it usually doesn't get to that point
+MAX_FINETUNING_EPOCHS = 1000
+# number of minibatches before taking means for valid error etc.
+REDUCE_EVERY = 100
+
+if TEST_CONFIG:
+    REDUCE_TRAIN_TO = 1000
+    MAX_FINETUNING_EPOCHS = 2
+    REDUCE_EVERY = 10
+
+
+# This is to configure insertion of jobs on the cluster.
+# Possible values the hyperparameters can take. These are then
+# combined with produit_cartesien_jobs so we get a list of all
+# possible combinations, each one resulting in a job inserted
+# in the jobman DB.
+JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001],
+        'pretraining_epochs_per_layer': [10,20],
+        'hidden_layers_sizes': [300,800],
+        'corruption_levels': [0.1,0.2,0.3],
+        'minibatch_size': [20],
+        'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS],
+        'finetuning_lr':[0.1, 0.01], #0.001 was very bad, so we leave it out
+        'num_hidden_layers':[2,3]}
+
+# Just useful for tests... minimal number of epochs
+# (This is used when running a single job, locally, when
+# calling ./nist_sda.py test_jobman_entrypoint)
+DEFAULT_HP_NIST = DD({'finetuning_lr':0.1,
+                       'pretraining_lr':0.1,
+                       'pretraining_epochs_per_layer':2,
+                       'max_finetuning_epochs':2,
+                       'hidden_layers_sizes':800,
+                       'corruption_levels':0.2,
+                       'minibatch_size':20,
+                       'reduce_train_to':10000,
+                       'num_hidden_layers':1})
+
+
diff -r 9fc641d7adda -r 42005ec87747 deep/stacked_dae/v2/nist_sda.py
--- a/deep/stacked_dae/v2/nist_sda.py	Mon Mar 15 13:22:20 2010 -0400
+++ b/deep/stacked_dae/v2/nist_sda.py	Mon Mar 15 18:30:21 2010 -0400
@@ -29,48 +29,8 @@
 from ift6266.utils.seriestables import *
 import tables
 
-##############################################################################
-# GLOBALS
-
-TEST_CONFIG = False
-
-NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
-JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/fsavard_sda_v2'
-EXPERIMENT_PATH = "ift6266.deep.stacked_dae.v2.nist_sda.jobman_entrypoint"
-
-REDUCE_TRAIN_TO = None
-MAX_FINETUNING_EPOCHS = 1000
-# number of minibatches before taking means for valid error etc.
-REDUCE_EVERY = 100
-
-if TEST_CONFIG:
-    REDUCE_TRAIN_TO = 1000
-    MAX_FINETUNING_EPOCHS = 2
-    REDUCE_EVERY = 10
-
-# Possible values the hyperparameters can take. These are then
-# combined with produit_cartesien_jobs so we get a list of all
-# possible combinations, each one resulting in a job inserted
-# in the jobman DB.
-JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001],
-        'pretraining_epochs_per_layer': [10,20],
-        'hidden_layers_sizes': [300,800],
-        'corruption_levels': [0.1,0.2,0.3],
-        'minibatch_size': [20],
-        'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS],
-        'finetuning_lr':[0.1, 0.01], #0.001 was very bad, so we leave it out
-        'num_hidden_layers':[2,3]}
-
-# Just useful for tests... minimal number of epochs
-DEFAULT_HP_NIST = DD({'finetuning_lr':0.1,
-                       'pretraining_lr':0.1,
-                       'pretraining_epochs_per_layer':2,
-                       'max_finetuning_epochs':2,
-                       'hidden_layers_sizes':800,
-                       'corruption_levels':0.2,
-                       'minibatch_size':20,
-                       'reduce_train_to':10000,
-                       'num_hidden_layers':1})
+from ift6266 import datasets
+from config import *
 
 '''
 Function called by jobman upon launching each job
@@ -82,14 +42,6 @@
     # TODO: remove this, bad for number of simultaneous requests on DB
     channel.save()
 
-    workingdir = os.getcwd()
-
-    print "Will load NIST"
-
-    nist = NIST(minibatch_size=20)
-
-    print "NIST loaded"
-
     # For test runs, we don't want to use the whole dataset so
     # reduce it to fewer elements if asked to.
     rtt = None
@@ -97,29 +49,27 @@
         rtt = state['reduce_train_to']
     elif REDUCE_TRAIN_TO:
         rtt = REDUCE_TRAIN_TO
-
-    if rtt:
-        print "Reducing training set to "+str(rtt)+ " examples"
-        nist.reduce_train_set(rtt)
-
-    train,valid,test = nist.get_tvt()
-    dataset = (train,valid,test)
-
+    
     n_ins = 32*32
     n_outs = 62 # 10 digits, 26*2 (lower, capitals)
+
+    examples_per_epoch = NIST_ALL_TRAIN_SIZE
 
     series = create_series(state.num_hidden_layers)
 
     print "Creating optimizer with state, ", state
 
-    optimizer = SdaSgdOptimizer(dataset=dataset, hyperparameters=state, \
+    optimizer = SdaSgdOptimizer(dataset=datasets.nist_all,
+                                    hyperparameters=state, \
                                     n_ins=n_ins, n_outs=n_outs,\
-                                    input_divider=255.0, series=series)
+                                    examples_per_epoch=examples_per_epoch, \
+                                    series=series,
+                                    max_minibatches=rtt)
 
-    optimizer.pretrain()
+    optimizer.pretrain(datasets.nist_all)
     channel.save()
 
-    optimizer.finetune()
+    optimizer.finetune(datasets.nist_all)
     channel.save()
 
     return channel.COMPLETE
@@ -200,93 +150,14 @@
 
     print "inserted"
 
-class NIST:
-    def __init__(self, minibatch_size, basepath=None, reduce_train_to=None):
-        global NIST_ALL_LOCATION
-
-        self.minibatch_size = minibatch_size
-        self.basepath = basepath and basepath or NIST_ALL_LOCATION
-
-        self.set_filenames()
-
-        # arrays of 2 elements: .x, .y
-        self.train = [None, None]
-        self.test = [None, None]
-
-        self.load_train_test()
-
-        self.valid = [[], []]
-        self.split_train_valid()
-        if reduce_train_to:
-            self.reduce_train_set(reduce_train_to)
-
-    def get_tvt(self):
-        return self.train, self.valid, self.test
-
-    def set_filenames(self):
-        self.train_files = ['all_train_data.ft',
-                            'all_train_labels.ft']
-
-        self.test_files = ['all_test_data.ft',
-                            'all_test_labels.ft']
-
-    def load_train_test(self):
-        self.load_data_labels(self.train_files, self.train)
-        self.load_data_labels(self.test_files, self.test)
-
-    def load_data_labels(self, filenames, pair):
-        for i, fn in enumerate(filenames):
-            f = open(os.path.join(self.basepath, fn))
-            pair[i] = filetensor.read(f)
-            f.close()
-
-    def reduce_train_set(self, max):
-        self.train[0] = self.train[0][:max]
-        self.train[1] = self.train[1][:max]
-
-        if max < len(self.test[0]):
-            for ar in (self.test, self.valid):
-                ar[0] = ar[0][:max]
-                ar[1] = ar[1][:max]
-
-    def split_train_valid(self):
-        test_len = len(self.test[0])
-
-        new_train_x = self.train[0][:-test_len]
-        new_train_y = self.train[1][:-test_len]
-
-        self.valid[0] = self.train[0][-test_len:]
-        self.valid[1] = self.train[1][-test_len:]
-
-        self.train[0] = new_train_x
-        self.train[1] = new_train_y
-
-def test_load_nist():
-    print "Will load NIST"
-
-    import time
-    t1 = time.time()
-    nist = NIST(20)
-    t2 = time.time()
-
-    print "NIST loaded. time delta = ", t2-t1
-
-    tr,v,te = nist.get_tvt()
-
-    print "Lenghts: ", len(tr[0]), len(v[0]), len(te[0])
-
-    raw_input("Press any key")
-
 if __name__ == '__main__':
 
-    import sys
-
     args = sys.argv[1:]
 
-    if len(args) > 0 and args[0] == 'load_nist':
-        test_load_nist()
+    #if len(args) > 0 and args[0] == 'load_nist':
+    #    test_load_nist()
 
-    elif len(args) > 0 and args[0] == 'jobman_insert':
+    if len(args) > 0 and args[0] == 'jobman_insert':
         jobman_insert_nist()
 
     elif len(args) > 0 and args[0] == 'test_jobman_entrypoint':
diff -r 9fc641d7adda -r 42005ec87747 deep/stacked_dae/v2/sgd_optimization.py
--- a/deep/stacked_dae/v2/sgd_optimization.py	Mon Mar 15 13:22:20 2010 -0400
+++ b/deep/stacked_dae/v2/sgd_optimization.py	Mon Mar 15 18:30:21 2010 -0400
@@ -17,19 +17,6 @@
 
 from ift6266.utils.seriestables import *
 
-def shared_dataset(data_xy):
-    data_x, data_y = data_xy
-    if theano.config.device.startswith("gpu"):
-        print "TRANSFERING DATASETS (via shared()) TO GPU"
-        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX))
-        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
-        shared_y = T.cast(shared_y, 'int32')
-    else:
-        print "WILL RUN ON CPU, NOT GPU, SO DATASETS REMAIN IN BYTES"
-        shared_x = theano.shared(data_x)
-        shared_y = theano.shared(data_y)
-    return shared_x, shared_y
-
 default_series = { \
     'reconstruction_error' : DummySeries(),
     'training_error' : DummySeries(),
@@ -38,37 +25,33 @@
     'params' : DummySeries()
     }
 
 
+def itermax(iter, max):
+    for i,it in enumerate(iter):
+        if i >= max:
+            break
+        yield it
+
 class SdaSgdOptimizer:
-    def __init__(self, dataset, hyperparameters, n_ins, n_outs, input_divider=1.0, series=default_series):
+    def __init__(self, dataset, hyperparameters, n_ins, n_outs,
+                    examples_per_epoch, series=default_series, max_minibatches=None):
         self.dataset = dataset
         self.hp = hyperparameters
         self.n_ins = n_ins
         self.n_outs = n_outs
-        self.input_divider = input_divider
+
+        self.max_minibatches = max_minibatches
+        print "SdaSgdOptimizer, max_minibatches =", max_minibatches
+
+        self.ex_per_epoch = examples_per_epoch
+        self.mb_per_epoch = examples_per_epoch / self.hp.minibatch_size
+
         self.series = series
 
         self.rng = numpy.random.RandomState(1234)
 
-        self.init_datasets()
         self.init_classifier()
 
         sys.stdout.flush()
-
-    def init_datasets(self):
-        print "init_datasets"
-        sys.stdout.flush()
-
-        train_set, valid_set, test_set = self.dataset
-        self.test_set_x, self.test_set_y = shared_dataset(test_set)
-        self.valid_set_x, self.valid_set_y = shared_dataset(valid_set)
-        self.train_set_x, self.train_set_y = shared_dataset(train_set)
-
-        # compute number of minibatches for training, validation and testing
-        self.n_train_batches = self.train_set_x.value.shape[0] / self.hp.minibatch_size
-        self.n_valid_batches = self.valid_set_x.value.shape[0] / self.hp.minibatch_size
-        # remove last batch in case it's incomplete
-        self.n_test_batches = (self.test_set_x.value.shape[0] / self.hp.minibatch_size) - 1
 
     def init_classifier(self):
         print "Constructing classifier"
@@ -81,8 +64,6 @@
 
         # construct the stacked denoising autoencoder class
         self.classifier = SdA( \
-                      train_set_x= self.train_set_x, \
-                      train_set_y = self.train_set_y,\
                       batch_size = self.hp.minibatch_size, \
                       n_ins= self.n_ins, \
                       hidden_layers_sizes = layers_sizes, \
@@ -90,18 +71,17 @@
                      corruption_levels = corruption_levels,\
                       rng = self.rng,\
                       pretrain_lr = self.hp.pretraining_lr, \
-                      finetune_lr = self.hp.finetuning_lr,\
-                      input_divider = self.input_divider )
+                      finetune_lr = self.hp.finetuning_lr)
 
         #theano.printing.pydotprint(self.classifier.pretrain_functions[0], "function.graph")
 
         sys.stdout.flush()
 
     def train(self):
-        self.pretrain()
-        self.finetune()
+        self.pretrain(self.dataset)
+        self.finetune(self.dataset)
 
-    def pretrain(self):
+    def pretrain(self,dataset):
         print "STARTING PRETRAINING, time = ", datetime.datetime.now()
         sys.stdout.flush()
 
@@ -111,10 +91,19 @@
         # go through pretraining epochs
         for epoch in xrange(self.hp.pretraining_epochs_per_layer):
             # go through the training set
-            for batch_index in xrange(self.n_train_batches):
-                c = self.classifier.pretrain_functions[i](batch_index)
+            batch_index=0
+            for x,y in dataset.train(self.hp.minibatch_size):
+                c = self.classifier.pretrain_functions[i](x)
 
                 self.series["reconstruction_error"].append((epoch, batch_index), c)
+                batch_index+=1
+
+                if batch_index % 10000 == 0:
+                    print "10000 batches"
+
+                # useful when doing tests
+                if self.max_minibatches and batch_index >= self.max_minibatches:
+                    break
 
            print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c
            sys.stdout.flush()
@@ -128,24 +117,26 @@
 
        sys.stdout.flush()
 
-    def finetune(self):
+    def finetune(self,dataset):
        print "STARTING FINETUNING, time = ", datetime.datetime.now()
 
-       index   = T.lscalar()    # index to a [mini]batch
        minibatch_size = self.hp.minibatch_size
 
        # create a function to compute the mistakes that are made by the model
        # on the validation set, or testing set
-       shared_divider = theano.shared(numpy.asarray(self.input_divider, dtype=theano.config.floatX))
-       test_model = theano.function([index], self.classifier.errors,
-                givens = {
-                  self.classifier.x: self.test_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider,
-                  self.classifier.y: self.test_set_y[index*minibatch_size:(index+1)*minibatch_size]})
+       test_model = \
+           theano.function(
+               [self.classifier.x,self.classifier.y], self.classifier.errors)
+       #         givens = {
+       #           self.classifier.x: ensemble_x,
+       #           self.classifier.y: ensemble_y]})
 
-       validate_model = theano.function([index], self.classifier.errors,
-               givens = {
-                  self.classifier.x: self.valid_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider,
-                  self.classifier.y: self.valid_set_y[index*minibatch_size:(index+1)*minibatch_size]})
+       validate_model = \
+           theano.function(
+               [self.classifier.x,self.classifier.y], self.classifier.errors)
+       #         givens = {
+       #           self.classifier.x: ,
+       #           self.classifier.y: ]})
 
 
       # early-stopping parameters
@@ -154,7 +145,7 @@
                                      # found
       improvement_threshold = 0.995 # a relative improvement of this much is
                                     # considered significant
-       validation_frequency  = min(self.n_train_batches, patience/2)
+       validation_frequency  = min(self.mb_per_epoch, patience/2)
                                     # go through this many
                                     # minibatche before checking the network
                                     # on the validation set; in this case we
@@ -168,18 +159,24 @@
 
        done_looping = False
        epoch = 0
 
+       total_mb_index = 0
+
        while (epoch < self.hp.max_finetuning_epochs) and (not done_looping):
            epoch = epoch + 1
-           for minibatch_index in xrange(self.n_train_batches):
-
-               cost_ij = self.classifier.finetune(minibatch_index)
-               iter    = epoch * self.n_train_batches + minibatch_index
+           minibatch_index = -1
+           for x,y in dataset.train(minibatch_size):
+               minibatch_index += 1
+               cost_ij = self.classifier.finetune(x,y)
+               total_mb_index += 1
 
               self.series["training_error"].append((epoch, minibatch_index), cost_ij)
 
-               if (iter+1) % validation_frequency == 0:
+               if (total_mb_index+1) % validation_frequency == 0:
 
-                   validation_losses = [validate_model(i) for i in xrange(self.n_valid_batches)]
+                   iter = dataset.valid(minibatch_size)
+                   if self.max_minibatches:
+                       iter = itermax(iter, self.max_minibatches)
+                   validation_losses = [validate_model(x,y) for x,y in iter]
                   this_validation_loss = numpy.mean(validation_losses)
 
                   self.series["validation_error"].\
@@ -196,14 +193,17 @@
                   #improve patience if loss improvement is good enough
                   if this_validation_loss < best_validation_loss *  \
                          improvement_threshold :
-                       patience = max(patience, iter * patience_increase)
+                       patience = max(patience, total_mb_index * patience_increase)
 
                       # save best validation score and iteration number
                       best_validation_loss = this_validation_loss
-                       best_iter = iter
+                       best_iter = total_mb_index
 
                       # test it on the test set
-                       test_losses = [test_model(i) for i in xrange(self.n_test_batches)]
+                       iter = dataset.test(minibatch_size)
+                       if self.max_minibatches:
+                           iter = itermax(iter, self.max_minibatches)
+                       test_losses = [test_model(x,y) for x,y in iter]
                       test_score = numpy.mean(test_losses)
 
                       self.series["test_error"].\
@@ -216,9 +216,13 @@
 
 
                   sys.stdout.flush()
 
+               # useful when doing tests
+               if self.max_minibatches and minibatch_index >= self.max_minibatches:
+                   break
+
           self.series['params'].append((epoch,), self.classifier.all_params)
 
-           if patience <= iter :
+           if patience <= total_mb_index:
               done_looping = True
               break
 
diff -r 9fc641d7adda -r 42005ec87747 deep/stacked_dae/v2/stacked_dae.py
--- a/deep/stacked_dae/v2/stacked_dae.py	Mon Mar 15 13:22:20 2010 -0400
+++ b/deep/stacked_dae/v2/stacked_dae.py	Mon Mar 15 18:30:21 2010 -0400
@@ -165,9 +165,9 @@
 
 
 class SdA(object):
-    def __init__(self, train_set_x, train_set_y, batch_size, n_ins,
+    def __init__(self, batch_size, n_ins,
                  hidden_layers_sizes, n_outs,
-                 corruption_levels, rng, pretrain_lr, finetune_lr, input_divider=1.0):
+                 corruption_levels, rng, pretrain_lr, finetune_lr):
         # Just to make sure those are not modified somewhere else afterwards
         hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes)
         corruption_levels = copy.deepcopy(corruption_levels)
@@ -190,17 +190,14 @@
         print "n_outs", n_outs
         print "pretrain_lr", pretrain_lr
         print "finetune_lr", finetune_lr
-        print "input_divider", input_divider
         print "----"
 
-        self.shared_divider = theano.shared(numpy.asarray(input_divider, dtype=theano.config.floatX))
-
         if len(hidden_layers_sizes) < 1 :
             raiseException (' You must have at least one hidden layer ')
 
 
         # allocate symbolic variables for the data
-        index   = T.lscalar()    # index to a [mini]batch
+        #index   = T.lscalar()    # index to a [mini]batch
         self.x  = T.matrix('x')  # the data is presented as rasterized images
         self.y  = T.ivector('y') # the labels are presented as 1D vector of
                                  # [int] labels
@@ -247,10 +244,15 @@
                 updates[param] = param - gparam * pretrain_lr
 
             # create a function that trains the dA
-            update_fn = theano.function([index], dA_layer.cost, \
-                  updates = updates,
-                  givens = {
-                     self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider})
+            update_fn = theano.function([self.x], dA_layer.cost, \
+                  updates = updates)#,
+            #      givens = {
+            #         self.x : ensemble})
+            # collect this function into a list
+            #update_fn = theano.function([index], dA_layer.cost, \
+            #      updates = updates,
+            #      givens = {
+            #         self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider})
             # collect this function into a list
             self.pretrain_functions += [update_fn]
 
@@ -273,11 +275,11 @@
         for param,gparam in zip(self.params, gparams):
             updates[param] = param - gparam*finetune_lr
 
-        self.finetune = theano.function([index], cost,
-                updates = updates,
-                givens = {
-                  self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider,
-                  self.y : train_set_y[index*batch_size:(index+1)*batch_size]} )
+        self.finetune = theano.function([self.x,self.y], cost,
+                updates = updates)#,
+        #        givens = {
+        #          self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider,
+        #          self.y : train_set_y[index*batch_size:(index+1)*batch_size]} )
 
         # symbolic variable that points to the number of errors made on the
         # minibatch given by self.x and self.y