# HG changeset patch
# User fsavard
# Date 1267224352 18000
# Node ID b9ea8e2d071a515a3d4901510e26a75f1154b8ed
# Parent  4d3d3627df3e8064e570d2a950812815288f3d38
Removed the code for reusing pretraining results (too complicated for too little benefit: it's the finetuning that really takes long)

diff -r 4d3d3627df3e -r b9ea8e2d071a deep/stacked_dae/nist_sda.py
--- a/deep/stacked_dae/nist_sda.py	Fri Feb 26 15:25:44 2010 -0500
+++ b/deep/stacked_dae/nist_sda.py	Fri Feb 26 17:45:52 2010 -0500
@@ -1,71 +1,86 @@
 #!/usr/bin/python
 # coding: utf-8
 
+import ift6266
+import pylearn
+
 import numpy
 import theano
 import time
+
+import pylearn.version
 import theano.tensor as T
 from theano.tensor.shared_randomstreams import RandomStreams
+
 import copy
-
 import sys
+import os
 import os.path
 
-from sgd_optimization import SdaSgdOptimizer
-
 from jobman import DD
 import jobman, jobman.sql
 from pylearn.io import filetensor
 
 from utils import produit_croise_jobs
 
-TEST_CONFIG = False
+from sgd_optimization import SdaSgdOptimizer
+
+SERIES_AVAILABLE = False
+try:
+    from scalar_series import *
+    SERIES_AVAILABLE = True
+except ImportError:
+    print "Could not import Series"
+
+TEST_CONFIG = True
 
 NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
 
-JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_db/'
+JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_db/fsavard_sda2'
+
 REDUCE_TRAIN_TO = None
 MAX_FINETUNING_EPOCHS = 1000
+REDUCE_EVERY = 1000 # number of minibatches before taking means for valid error etc.
 if TEST_CONFIG:
-    JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/'
     REDUCE_TRAIN_TO = 1000
     MAX_FINETUNING_EPOCHS = 2
+    REDUCE_EVERY = 10
 
-JOBDB_JOBS = JOBDB + 'fsavard_sda1_jobs'
-JOBDB_RESULTS = JOBDB + 'fsavard_sda1_results'
 EXPERIMENT_PATH = "ift6266.scripts.stacked_dae.nist_sda.jobman_entrypoint"
 
-# There used to be
-# 'finetuning_lr': [0.00001, 0.0001, 0.001, 0.01, 0.1]
-# and
-# 'num_hidden_layers':[1,2,3]
-# but this is now handled by a special mechanism in SgdOptimizer
-# to reuse intermediate results (for the same training of lower layers,
-# we can test many finetuning_lr)
-JOB_VALS = {'pretraining_lr': [0.1, 0.01, 0.001],#, 0.0001],
+JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001],
         'pretraining_epochs_per_layer': [10,20],
         'hidden_layers_sizes': [300,800],
-        'corruption_levels': [0.1,0.2],
+        'corruption_levels': [0.1,0.2,0.3],
         'minibatch_size': [20],
-        'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS]}
-FINETUNING_LR_VALS = [0.1, 0.01, 0.001]#, 0.0001]
-NUM_HIDDEN_LAYERS_VALS = [1,2,3]
+        'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS],
+        'finetuning_lr':[0.1, 0.01], #0.001 was very bad, so we leave it out
+        'num_hidden_layers':[2,3]}
 
 # Just useful for tests... minimal number of epochs
 DEFAULT_HP_NIST = DD({'finetuning_lr':0.01,
                       'pretraining_lr':0.01,
                       'pretraining_epochs_per_layer':1,
                       'max_finetuning_epochs':1,
-                      'hidden_layers_sizes':[1000],
-                      'corruption_levels':[0.2],
-                      'minibatch_size':20})
+                      'hidden_layers_sizes':1000,
+                      'corruption_levels':0.2,
+                      'minibatch_size':20,
+                      'reduce_train_to':1000,
+                      'num_hidden_layers':1})
 
 def jobman_entrypoint(state, channel):
-    state = copy.copy(state)
+    pylearn.version.record_versions(state,[theano,ift6266,pylearn])
+    channel.save()
+
+    workingdir = os.getcwd()
 
     print "Will load NIST"
+    sys.stdout.flush()
+
     nist = NIST(20)
+
     print "NIST loaded"
+    sys.stdout.flush()
 
     rtt = None
     if state.has_key('reduce_train_to'):
@@ -83,50 +98,58 @@
     n_ins = 32*32
     n_outs = 62 # 10 digits, 26*2 (lower, capitals)
 
-    db = jobman.sql.db(JOBDB_RESULTS)
-    optimizer = SdaSgdOptimizer(dataset, state, n_ins, n_outs,\
-                    input_divider=255.0, job_tree=True, results_db=db, \
-                    experiment=EXPERIMENT_PATH, \
-                    finetuning_lr_to_try=FINETUNING_LR_VALS, \
-                    num_hidden_layers_to_try=NUM_HIDDEN_LAYERS_VALS)
-    optimizer.train()
+    hls = state.hidden_layers_sizes
+    cl = state.corruption_levels
+    nhl = state.num_hidden_layers
+    state.hidden_layers_sizes = [hls] * nhl
+    state.corruption_levels = [cl] * nhl
+
+    # b,b',W for each hidden layer + b,W of last layer (logreg)
+    numparams = nhl * 3 + 2
+    series_mux = None
+    if SERIES_AVAILABLE:
+        series_mux = create_series(workingdir, numparams)
+
+    optimizer = SdaSgdOptimizer(dataset=dataset, hyperparameters=state, \
+                    n_ins=n_ins, n_outs=n_outs,\
+                    input_divider=255.0, series_mux=series_mux)
+
+    optimizer.pretrain()
+    channel.save()
+
+    optimizer.finetune()
+    channel.save()
+
+    pylearn.version.record_versions(state,[theano,ift6266,pylearn])
+    channel.save()
 
     return channel.COMPLETE
 
 
-def estimate_pretraining_time(job):
-    job = DD(job)
-    # time spent on pretraining estimated as O(n^2) where n=num hidens
-    # no need to multiply by num_hidden_layers, as results from num=1
-    # is reused for num=2, or 3, so in the end we get the same time
-    # as if we were training 3 times a single layer
-    # constants:
-    # - 20 mins to pretrain a layer with 1000 units (per 1 epoch)
-    # - 12 mins to finetune (per 1 epoch)
-    # basically the job_tree trick gives us a 5 times speedup on the
-    # pretraining time due to reusing for finetuning_lr
-    # and gives us a second x2 speedup for reusing previous layers
-    # to explore num_hidden_layers
-    return (job.pretraining_epochs_per_layer * 20 / (1000.0*1000) \
-            * job.hidden_layer_sizes * job.hidden_layer_sizes)
+def create_series(basedir, numparams):
+    mux = SeriesMultiplexer()
+
+    # comment out series we don't want to save
+    mux.add_series(AccumulatorSeries(name="reconstruction_error",
+                    reduce_every=REDUCE_EVERY, # every 1000 batches, we take the mean and save
+                    mean=True,
+                    directory=basedir, flush_every=1))
 
-def estimate_total_time():
-    jobs = produit_croise_jobs(JOB_VALS)
-    sumtime = 0.0
-    sum_without = 0.0
-    for job in jobs:
-        sumtime += estimate_pretraining_time(job)
-        # 12 mins per epoch * 30 epochs
-        # 5 finetuning_lr per pretraining combination
-        sum_without = (12*20*len(jobs) + sumtime*2) * len(FINETUNING_LR_VALS)
-    sumtime += len(FINETUNING_LR_VALS) * len(jobs) * 12 * 20
-    print "num jobs=", len(jobs)
-    print "estimate", sumtime/60, " hours"
-    print "estimate without tree optimization", sum_without/60, "ratio", sumtime / sum_without
+    mux.add_series(AccumulatorSeries(name="training_error",
+                    reduce_every=REDUCE_EVERY, # every 1000 batches, we take the mean and save
+                    mean=True,
+                    directory=basedir, flush_every=1))
+
+    mux.add_series(BaseSeries(name="validation_error", directory=basedir, flush_every=1))
+    mux.add_series(BaseSeries(name="test_error", directory=basedir, flush_every=1))
+
+    mux.add_series(ParamsArrayStats(numparams,name="params",directory=basedir))
+
+    return mux
 
 def jobman_insert_nist():
     jobs = produit_croise_jobs(JOB_VALS)
-    db = jobman.sql.db(JOBDB_JOBS)
+    db = jobman.sql.db(JOBDB)
     for job in jobs:
         job.update({jobman.sql.EXPERIMENT: EXPERIMENT_PATH})
         jobman.sql.insert_dict(job, db)
@@ -250,13 +273,11 @@
     elif len(args) > 0 and args[0] == 'jobman_insert':
         jobman_insert_nist()
 
-    elif len(args) > 0 and args[0] == 'test_job_tree':
-        # dont forget to comment out sql.inserts and make reduce_train_to=100
-        print "TESTING JOB TREE"
-        chanmock = {'COMPLETE':0}
-        hp = copy.copy(DEFAULT_HP_NIST)
-        hp.update({'reduce_train_to':100})
-        jobman_entrypoint(hp, chanmock)
+
+    elif len(args) > 0 and args[0] == 'test_jobman_entrypoint':
+        chanmock = DD({'COMPLETE':0})
+        jobman_entrypoint(DEFAULT_HP_NIST, chanmock)
+
     elif len(args) > 0 and args[0] == 'estimate':
         estimate_total_time()
     else:
diff -r 4d3d3627df3e -r b9ea8e2d071a deep/stacked_dae/sgd_optimization.py
--- a/deep/stacked_dae/sgd_optimization.py	Fri Feb 26 15:25:44 2010 -0500
+++ b/deep/stacked_dae/sgd_optimization.py	Fri Feb 26 17:45:52 2010 -0500
@@ -7,7 +7,6 @@
 import theano
 import time
 import theano.tensor as T
-import copy
 import sys
 
 from jobman import DD
@@ -24,44 +23,34 @@
     shared_y = theano.shared(data_y)
     return shared_x, shared_y
 
+class DummyMux():
+    def append(self, param1, param2):
+        pass
+
 class SdaSgdOptimizer:
-    def __init__(self, dataset, hyperparameters, n_ins, n_outs, input_divider=1.0,\
-                    job_tree=False, results_db=None,\
-                    experiment="",\
-                    num_hidden_layers_to_try=[1,2,3], \
-                    finetuning_lr_to_try=[0.1, 0.01, 0.001, 0.0001, 0.00001]):
-
+    def __init__(self, dataset, hyperparameters, n_ins, n_outs, input_divider=1.0, series_mux=None):
         self.dataset = dataset
-        self.hp = copy.copy(hyperparameters)
+        self.hp = hyperparameters
         self.n_ins = n_ins
         self.n_outs = n_outs
         self.input_divider = numpy.asarray(input_divider, dtype=theano.config.floatX)
-
-        self.job_tree = job_tree
-        self.results_db = results_db
-        self.experiment = experiment
-        if self.job_tree:
-            assert(not results_db is None)
-            # these hp should not be there, so we insert default values
-            # we use 3 hidden layers as we'll iterate through 1,2,3
-            self.hp.finetuning_lr = 0.1 # dummy value, will be replaced anyway
-            cl = self.hp.corruption_levels
-            nh = self.hp.hidden_layers_sizes
-            self.hp.corruption_levels = [cl,cl,cl]
-            self.hp.hidden_layers_sizes = [nh,nh,nh]
-
-        self.num_hidden_layers_to_try = num_hidden_layers_to_try
-        self.finetuning_lr_to_try = finetuning_lr_to_try
-
-        self.printout_frequency = 1000
+
+        if not series_mux:
+            series_mux = DummyMux()
+            print "No series multiplexer set"
+        self.series_mux = series_mux
 
         self.rng = numpy.random.RandomState(1234)
 
         self.init_datasets()
         self.init_classifier()
+
+        sys.stdout.flush()
 
     def init_datasets(self):
         print "init_datasets"
+        sys.stdout.flush()
+
         train_set, valid_set, test_set = self.dataset
         self.test_set_x, self.test_set_y = shared_dataset(test_set)
         self.valid_set_x, self.valid_set_y = shared_dataset(valid_set)
@@ -74,6 +63,7 @@
 
     def init_classifier(self):
         print "Constructing classifier"
+
        # construct the stacked denoising autoencoder class
        self.classifier = SdA( \
                       train_set_x= self.train_set_x, \
@@ -88,17 +78,15 @@
                       finetune_lr = self.hp.finetuning_lr,\
                       input_divider = self.input_divider )
 
+        sys.stdout.flush()
+
     def train(self):
         self.pretrain()
-        if not self.job_tree:
-            # if job_tree is True, finetuning was already performed
-            self.finetune()
+        self.finetune()
 
     def pretrain(self):
         print "STARTING PRETRAINING"
-
-        printout_acc = 0.0
-        last_error = 0.0
+        sys.stdout.flush()
 
         start_time = time.clock()
         ## Pre-train layer-wise
@@ -109,62 +97,17 @@
             for batch_index in xrange(self.n_train_batches):
                 c = self.classifier.pretrain_functions[i](batch_index)
 
-                printout_acc += c / self.printout_frequency
-                if (batch_index+1) % self.printout_frequency == 0:
-                    print batch_index, "reconstruction cost avg=", printout_acc
-                    last_error = printout_acc
-                    printout_acc = 0.0
+                self.series_mux.append("reconstruction_error", c)
 
             print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c
-
-            self.job_splitter(i+1, time.clock()-start_time, last_error)
+            sys.stdout.flush()
 
         end_time = time.clock()
 
         print ('Pretraining took %f minutes' %((end_time-start_time)/60.))
-
-    # Save time by reusing intermediate results
-    def job_splitter(self, current_pretraining_layer, pretraining_time, last_error):
-
-        state_copy = None
-        original_classifier = None
-
-        if self.job_tree and current_pretraining_layer in self.num_hidden_layers_to_try:
-            for lr in self.finetuning_lr_to_try:
-                sys.stdout.flush()
-                sys.stderr.flush()
-
-                state_copy = copy.copy(self.hp)
-
-                self.hp.update({'num_hidden_layers':current_pretraining_layer, \
-                            'finetuning_lr':lr,\
-                            'pretraining_time':pretraining_time,\
-                            'last_reconstruction_error':last_error})
+        self.hp.update({'pretraining_time': end_time-start_time})
 
-                original_classifier = self.classifier
-                print "ORIGINAL CLASSIFIER MEANS",original_classifier.get_params_means()
-                self.classifier = SdA.copy_reusing_lower_layers(original_classifier, current_pretraining_layer, new_finetuning_lr=lr)
-
-                self.finetune()
-
-                self.insert_finished_job()
-
-                print "NEW CLASSIFIER MEANS AFTERWARDS",self.classifier.get_params_means()
-                print "ORIGINAL CLASSIFIER MEANS AFTERWARDS",original_classifier.get_params_means()
-                self.classifier = original_classifier
-                self.hp = state_copy
-
-    def insert_finished_job(self):
-        job = copy.copy(self.hp)
-        job[jobman.sql.STATUS] = jobman.sql.DONE
-        job[jobman.sql.EXPERIMENT] = self.experiment
-
-        # don,t try to store arrays in db
-        job['hidden_layers_sizes'] = job.hidden_layers_sizes[0]
-        job['corruption_levels'] = job.corruption_levels[0]
-
-        print "Will insert finished job", job
-        jobman.sql.insert_dict(jobman.flatten(job), self.results_db)
+        sys.stdout.flush()
 
     def finetune(self):
         print "STARTING FINETUNING"
@@ -205,11 +148,6 @@
         done_looping = False
         epoch = 0
 
-        printout_acc = 0.0
-
-        if not self.hp.has_key('max_finetuning_epochs'):
-            self.hp.max_finetuning_epochs = 1000
-
         while (epoch < self.hp.max_finetuning_epochs) and (not done_looping):
             epoch = epoch + 1
             for minibatch_index in xrange(self.n_train_batches):
@@ -217,10 +155,7 @@
                 cost_ij = self.classifier.finetune(minibatch_index)
                 iter = epoch * self.n_train_batches + minibatch_index
 
-                printout_acc += cost_ij / float(self.printout_frequency * minibatch_size)
-                if (iter+1) % self.printout_frequency == 0:
-                    print iter, "cost avg=", printout_acc
-                    printout_acc = 0.0
+                self.series_mux.append("training_error", cost_ij)
 
 
                 if (iter+1) % validation_frequency == 0:
@@ -251,6 +186,9 @@
                            (epoch, minibatch_index+1, self.n_train_batches,
                             test_score*100.))
 
+                    sys.stdout.flush()
+
+                self.series_mux.append("params", self.classifier.params)
 
                 if patience <= iter :
                     done_looping = True
@@ -261,6 +199,7 @@
                        'best_validation_error':best_validation_loss,\
                        'test_score':test_score,
                        'num_finetuning_epochs':epoch})
+
         print(('Optimization complete with best validation score of %f %%,'
                'with test performance %f %%') %
                      (best_validation_loss * 100., test_score*100.))
diff -r 4d3d3627df3e -r b9ea8e2d071a deep/stacked_dae/stacked_dae.py
--- a/deep/stacked_dae/stacked_dae.py	Fri Feb 26 15:25:44 2010 -0500
+++ b/deep/stacked_dae/stacked_dae.py	Fri Feb 26 17:45:52 2010 -0500
@@ -144,6 +144,9 @@
     def __init__(self, train_set_x, train_set_y, batch_size, n_ins,
                  hidden_layers_sizes, n_outs,
                  corruption_levels, rng, pretrain_lr, finetune_lr, input_divider=1.0):
+        # Just to make sure those are not modified somewhere else afterwards
+        hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes)
+        corruption_levels = copy.deepcopy(corruption_levels)
         update_locals(self, locals())
 
         self.layers = []
@@ -239,48 +242,6 @@
 
         self.errors = self.logLayer.errors(self.y)
 
-    @classmethod
-    def copy_reusing_lower_layers(cls, obj, num_hidden_layers, new_finetuning_lr=None):
-        assert(num_hidden_layers <= obj.n_layers)
-
-        if not new_finetuning_lr:
-            new_finetuning_lr = obj.finetune_lr
-
-        new_sda = cls(train_set_x= obj.train_set_x, \
-                      train_set_y = obj.train_set_y,\
-                      batch_size = obj.batch_size, \
-                      n_ins= obj.n_ins, \
-                      hidden_layers_sizes = obj.hidden_layers_sizes[:num_hidden_layers], \
-                      n_outs = obj.n_outs, \
-                      corruption_levels = obj.corruption_levels[:num_hidden_layers],\
-                      rng = obj.rng,\
-                      pretrain_lr = obj.pretrain_lr, \
-                      finetune_lr = new_finetuning_lr, \
-                      input_divider = obj.input_divider )
-
-        # new_sda.layers contains only the hidden layers actually
-        for i, layer in enumerate(new_sda.layers):
-            original_layer = obj.layers[i]
-            for p1,p2 in zip(layer.params, original_layer.params):
-                p1.value = p2.value.copy()
-
-        return new_sda
-
-    def get_params_copy(self):
-        return copy.deepcopy(self.params)
-
-    def set_params_from_copy(self, copy):
-        # We don't want to replace the var, as the functions have pointers in there
-        # We only want to replace values.
-        for i, p in enumerate(self.params):
-            p.value = copy[i].value
-
-    def get_params_means(self):
-        s = []
-        for p in self.params:
-            s.append(numpy.mean(p.value))
-        return s
-
-
 if __name__ == '__main__':
     import sys
     args = sys.argv[1:]
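A note on the hyperparameter handling introduced in jobman_entrypoint above: the jobman database stores hidden_layers_sizes and corruption_levels as scalars, so they are replicated into per-layer lists, and the number of parameter arrays tracked by the "params" series follows from the architecture (W, b and b' for each denoising autoencoder layer, plus W and b for the final logistic regression layer). A minimal sketch of that expansion, using illustrative values in place of a real jobman state:

    # Illustrative values only -- a real job would come from the jobman database.
    nhl = 2      # state.num_hidden_layers
    hls = 800    # state.hidden_layers_sizes, stored as a scalar in the DB
    cl = 0.2     # state.corruption_levels, stored as a scalar in the DB

    hidden_layers_sizes = [hls] * nhl   # -> [800, 800], one size per hidden layer
    corruption_levels = [cl] * nhl      # -> [0.2, 0.2]

    # W, b, b' per hidden layer, plus W, b for the logistic regression layer
    numparams = nhl * 3 + 2             # -> 8 parameter arrays for ParamsArrayStats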
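The optimizer only relies on a narrow interface from the scalar_series module: a multiplexer that registers named series (add_series) and receives raw values by name (append), with DummyMux standing in as a no-op when the import fails. The sketch below is a hypothetical rendering of that assumed interface, not the actual SeriesMultiplexer/AccumulatorSeries implementation:

    # Hypothetical sketch of the series interface this patch assumes.
    class SketchSeries(object):
        def __init__(self, name, directory=None, flush_every=1):
            self.name = name
            self.directory = directory
            self.flush_every = flush_every
            self.values = []

        def append(self, value):
            # a real series would periodically reduce/write values to directory
            self.values.append(value)

    class SketchSeriesMultiplexer(object):
        def __init__(self):
            self.series = {}

        def add_series(self, series):
            # series are looked up by their name attribute
            self.series[series.name] = series

        def append(self, name, value):
            # values pushed under an unregistered name are silently dropped,
            # mirroring the DummyMux fallback in sgd_optimization.py
            if name in self.series:
                self.series[name].append(value)

With this shape, the three calls made by SdaSgdOptimizer (append("reconstruction_error", c), append("training_error", cost_ij) and append("params", self.classifier.params)) all go through the same two-argument append.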
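jobman_insert_nist inserts the full cross product of JOB_VALS into the jobman table. produit_croise_jobs itself is defined in the utils module and is not shown in this changeset; a hypothetical equivalent, shown only to make the size of the job grid explicit:

    # Hypothetical stand-in for utils.produit_croise_jobs (not the real helper).
    import itertools

    def cross_product_jobs(job_vals):
        keys = sorted(job_vals.keys())
        return [dict(zip(keys, combo))
                for combo in itertools.product(*[job_vals[k] for k in keys])]

With the JOB_VALS above this yields 2 pretraining_lr x 2 pretraining_epochs_per_layer x 2 hidden_layers_sizes x 3 corruption_levels x 1 minibatch_size x 1 max_finetuning_epochs x 2 finetuning_lr x 2 num_hidden_layers = 96 jobs.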