# HG changeset patch # User fsavard # Date 1269010479 14400 # Node ID c8fe09a6503970bdbcbaed7c729010f33f3a19ca # Parent 3c54cb3713ef1b1ab708d2dcb28f86ade57949e3 Déplacer le nouveau code de stacked_dae de v2 vers le répertoire de base 'stacked_dae', et bougé le vieux code vers le répertoire 'old' diff -r 3c54cb3713ef -r c8fe09a65039 deep/stacked_dae/__init__.py diff -r 3c54cb3713ef -r c8fe09a65039 deep/stacked_dae/config.py.example --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/config.py.example Fri Mar 19 10:54:39 2010 -0400 @@ -0,0 +1,105 @@ +# ---------------------------------------------------------------------------- +# BEGIN EXPERIMENT ISOLATION CODE + +''' +This makes sure we use the codebase clone created for this experiment. +I.e. if you want to make modifications to the codebase but don't want your +running experiment code to be impacted by those changes, first copy the +codebase somewhere, and configure this section. It will make sure we import +from the right place. + +MUST BE DONE BEFORE IMPORTING ANYTHING ELSE +(Leave this comment there so others will understand what's going on) +''' + +# Place where you copied modules that should be fixed for this experiment +codebase_clone_path = "/u/savardf/ift6266/experiment_clones/ift6266_experiment10" + +# Places where there might be conflicting modules from your $PYTHONPATH +remove_these_from_pythonpath = ["/u/savardf/ift6266/dev_code"] + +import sys +sys.path[0:0] = [codebase_clone_path] + +# remove paths we specifically don't want in $PYTHONPATH +for bad_path in remove_these_from_pythonpath: + sys.path[:] = [el for el in sys.path if el not in (bad_path, bad_path+"/")] + +# Make the imports +import ift6266 + +# Just making sure we're importing from the right place +modules_to_check = [ift6266] +for module in modules_to_check: + assert codebase_clone_path in module.__path__ + +# Path to pass to jobman sqlschedule. IMPORTANT TO CHANGE TO REFLECT YOUR CLONE. 
+# Make sure this is accessible from the default $PYTHONPATH (in your .bashrc) +EXPERIMENT_PATH = "ift6266_experiment10.deep.stacked_dae.nist_sda.jobman_entrypoint" + +# END EXPERIMENT ISOLATION CODE +# ---------------------------------------------------------------------------- + +''' +These are parameters used by nist_sda.py. They'll end up as globals in there. + +Rename this file to config.py and configure as needed. +DON'T add the renamed file to the repository, as others might use it +without realizing it, with dire consequences. +''' + +# Set this to True when you want to run cluster tests, ie. you want +# to run on the cluster, many jobs, but want to reduce the training +# set size and the number of epochs, so you know everything runs +# fine on the cluster. +# Set this PRIOR to inserting your test jobs in the DB. +TEST_CONFIG = False + +NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all' +NIST_ALL_TRAIN_SIZE = 649081 +# valid et test =82587 82587 + +# change "sandbox" when you're ready +JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/yourtablenamehere' + +# reduce training set to that many examples +REDUCE_TRAIN_TO = None +# that's a max, it usually doesn't get to that point +MAX_FINETUNING_EPOCHS = 1000 +# number of minibatches before taking means for valid error etc. +REDUCE_EVERY = 100 + +if TEST_CONFIG: + REDUCE_TRAIN_TO = 1000 + MAX_FINETUNING_EPOCHS = 2 + REDUCE_EVERY = 10 + + +# This is to configure insertion of jobs on the cluster. +# Possible values the hyperparameters can take. These are then +# combined with produit_cartesien_jobs so we get a list of all +# possible combinations, each one resulting in a job inserted +# in the jobman DB. 
+JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001], + 'pretraining_epochs_per_layer': [10,20], + 'hidden_layers_sizes': [300,800], + 'corruption_levels': [0.1,0.2,0.3], + 'minibatch_size': [20], + 'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS], + 'finetuning_lr':[0.1, 0.01], #0.001 was very bad, so we leave it out + 'num_hidden_layers':[2,3]} + +# Just useful for tests... minimal number of epochs +# (This is used when running a single job, locally, when +# calling ./nist_sda.py test_jobman_entrypoint +DEFAULT_HP_NIST = DD({'finetuning_lr':0.1, + 'pretraining_lr':0.1, + 'pretraining_epochs_per_layer':2, + 'max_finetuning_epochs':2, + 'hidden_layers_sizes':800, + 'corruption_levels':0.2, + 'minibatch_size':20, + 'reduce_train_to':10000, + 'num_hidden_layers':1}) + + diff -r 3c54cb3713ef -r c8fe09a65039 deep/stacked_dae/mnist_sda.py --- a/deep/stacked_dae/mnist_sda.py Tue Mar 16 12:01:31 2010 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ -#!/usr/bin/python -# coding: utf-8 - -# TODO: This probably doesn't work anymore, adapt to new code in sgd_opt -# Parameterize call to sgd_optimization for MNIST - -import numpy -import theano -import time -import theano.tensor as T -from theano.tensor.shared_randomstreams import RandomStreams - -from sgd_optimization import SdaSgdOptimizer -import cPickle, gzip -from jobman import DD - -MNIST_LOCATION = '/u/savardf/datasets/mnist.pkl.gz' - -def sgd_optimization_mnist(learning_rate=0.1, pretraining_epochs = 2, \ - pretrain_lr = 0.1, training_epochs = 5, \ - dataset='mnist.pkl.gz'): - # Load the dataset - f = gzip.open(dataset,'rb') - # this gives us train, valid, test (each with .x, .y) - dataset = cPickle.load(f) - f.close() - - n_ins = 28*28 - n_outs = 10 - - hyperparameters = DD({'finetuning_lr':learning_rate, - 'pretraining_lr':pretrain_lr, - 'pretraining_epochs_per_layer':pretraining_epochs, - 'max_finetuning_epochs':training_epochs, - 'hidden_layers_sizes':[100], - 'corruption_levels':[0.2], 
- 'minibatch_size':20}) - - optimizer = SdaSgdOptimizer(dataset, hyperparameters, n_ins, n_outs) - optimizer.pretrain() - optimizer.finetune() - -if __name__ == '__main__': - sgd_optimization_mnist(dataset=MNIST_LOCATION) - diff -r 3c54cb3713ef -r c8fe09a65039 deep/stacked_dae/nist_sda.py --- a/deep/stacked_dae/nist_sda.py Tue Mar 16 12:01:31 2010 -0400 +++ b/deep/stacked_dae/nist_sda.py Fri Mar 19 10:54:39 2010 -0400 @@ -25,69 +25,23 @@ from sgd_optimization import SdaSgdOptimizer -from ift6266.utils.scalar_series import * - -############################################################################## -# GLOBALS - -TEST_CONFIG = False - -NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all' -JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_db/fsavard_sda4' -EXPERIMENT_PATH = "ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint" - -REDUCE_TRAIN_TO = None -MAX_FINETUNING_EPOCHS = 1000 -# number of minibatches before taking means for valid error etc. -REDUCE_EVERY = 1000 - -if TEST_CONFIG: - REDUCE_TRAIN_TO = 1000 - MAX_FINETUNING_EPOCHS = 2 - REDUCE_EVERY = 10 +#from ift6266.utils.scalar_series import * +from ift6266.utils.seriestables import * +import tables -# Possible values the hyperparameters can take. These are then -# combined with produit_cartesien_jobs so we get a list of all -# possible combinations, each one resulting in a job inserted -# in the jobman DB. -JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001], - 'pretraining_epochs_per_layer': [10,20], - 'hidden_layers_sizes': [300,800], - 'corruption_levels': [0.1,0.2,0.3], - 'minibatch_size': [20], - 'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS], - 'finetuning_lr':[0.1, 0.01], #0.001 was very bad, so we leave it out - 'num_hidden_layers':[2,3]} - -# Just useful for tests... 
minimal number of epochs -DEFAULT_HP_NIST = DD({'finetuning_lr':0.1, - 'pretraining_lr':0.1, - 'pretraining_epochs_per_layer':20, - 'max_finetuning_epochs':2, - 'hidden_layers_sizes':800, - 'corruption_levels':0.2, - 'minibatch_size':20, - #'reduce_train_to':300, - 'num_hidden_layers':2}) +from ift6266 import datasets +from config import * ''' Function called by jobman upon launching each job -Its path is the one given when inserting jobs: -ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint +Its path is the one given when inserting jobs: see EXPERIMENT_PATH ''' def jobman_entrypoint(state, channel): # record mercurial versions of each package pylearn.version.record_versions(state,[theano,ift6266,pylearn]) + # TODO: remove this, bad for number of simultaneous requests on DB channel.save() - workingdir = os.getcwd() - - print "Will load NIST" - - nist = NIST(minibatch_size=20) - - print "NIST loaded" - # For test runs, we don't want to use the whole dataset so # reduce it to fewer elements if asked to. 
rtt = None @@ -95,59 +49,93 @@ rtt = state['reduce_train_to'] elif REDUCE_TRAIN_TO: rtt = REDUCE_TRAIN_TO - - if rtt: - print "Reducing training set to "+str(rtt)+ " examples" - nist.reduce_train_set(rtt) - - train,valid,test = nist.get_tvt() - dataset = (train,valid,test) - + n_ins = 32*32 n_outs = 62 # 10 digits, 26*2 (lower, capitals) + + examples_per_epoch = NIST_ALL_TRAIN_SIZE - # b,b',W for each hidden layer - # + b,W of last layer (logreg) - numparams = state.num_hidden_layers * 3 + 2 - series_mux = None - series_mux = create_series(workingdir, numparams) + series = create_series(state.num_hidden_layers) print "Creating optimizer with state, ", state - optimizer = SdaSgdOptimizer(dataset=dataset, hyperparameters=state, \ + optimizer = SdaSgdOptimizer(dataset=datasets.nist_all, + hyperparameters=state, \ n_ins=n_ins, n_outs=n_outs,\ - input_divider=255.0, series_mux=series_mux) + examples_per_epoch=examples_per_epoch, \ + series=series, + max_minibatches=rtt) - optimizer.pretrain() + optimizer.pretrain(datasets.nist_all) channel.save() - optimizer.finetune() + optimizer.finetune(datasets.nist_all) channel.save() return channel.COMPLETE # These Series objects are used to save various statistics # during the training. -def create_series(basedir, numparams): - mux = SeriesMultiplexer() +def create_series(num_hidden_layers): + + # Replace series we don't want to save with DummySeries, e.g. 
+ # series['training_error'] = DummySeries() + + series = {} + + basedir = os.getcwd() + + h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w") + + # reconstruction + reconstruction_base = \ + ErrorSeries(error_name="reconstruction_error", + table_name="reconstruction_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Reconstruction error (mean over "+str(REDUCE_EVERY)+" minibatches)") + series['reconstruction_error'] = \ + AccumulatorSeriesWrapper(base_series=reconstruction_base, + reduce_every=REDUCE_EVERY) + + # train + training_base = \ + ErrorSeries(error_name="training_error", + table_name="training_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Training error (mean over "+str(REDUCE_EVERY)+" minibatches)") + series['training_error'] = \ + AccumulatorSeriesWrapper(base_series=training_base, + reduce_every=REDUCE_EVERY) + + # valid and test are not accumulated/mean, saved directly + series['validation_error'] = \ + ErrorSeries(error_name="validation_error", + table_name="validation_error", + hdf5_file=h5f, + index_names=('epoch','minibatch')) + + series['test_error'] = \ + ErrorSeries(error_name="test_error", + table_name="test_error", + hdf5_file=h5f, + index_names=('epoch','minibatch')) + + param_names = [] + for i in range(num_hidden_layers): + param_names += ['layer%d_W'%i, 'layer%d_b'%i, 'layer%d_bprime'%i] + param_names += ['logreg_layer_W', 'logreg_layer_b'] # comment out series we don't want to save - mux.add_series(AccumulatorSeries(name="reconstruction_error", - reduce_every=REDUCE_EVERY, # every 1000 batches, we take the mean and save - mean=True, - directory=basedir, flush_every=1)) + series['params'] = SharedParamsStatisticsWrapper( + new_group_name="params", + base_group="/", + arrays_names=param_names, + hdf5_file=h5f, + index_names=('epoch',)) - mux.add_series(AccumulatorSeries(name="training_error", - reduce_every=REDUCE_EVERY, # every 1000 batches, we take the mean and save - mean=True, - 
directory=basedir, flush_every=1)) - - mux.add_series(BaseSeries(name="validation_error", directory=basedir, flush_every=1)) - mux.add_series(BaseSeries(name="test_error", directory=basedir, flush_every=1)) - - mux.add_series(ParamsArrayStats(numparams,name="params",directory=basedir)) - - return mux + return series # Perform insertion into the Postgre DB based on combination # of hyperparameter values above @@ -162,93 +150,14 @@ print "inserted" -class NIST: - def __init__(self, minibatch_size, basepath=None, reduce_train_to=None): - global NIST_ALL_LOCATION - - self.minibatch_size = minibatch_size - self.basepath = basepath and basepath or NIST_ALL_LOCATION - - self.set_filenames() - - # arrays of 2 elements: .x, .y - self.train = [None, None] - self.test = [None, None] - - self.load_train_test() - - self.valid = [[], []] - self.split_train_valid() - if reduce_train_to: - self.reduce_train_set(reduce_train_to) - - def get_tvt(self): - return self.train, self.valid, self.test - - def set_filenames(self): - self.train_files = ['all_train_data.ft', - 'all_train_labels.ft'] - - self.test_files = ['all_test_data.ft', - 'all_test_labels.ft'] - - def load_train_test(self): - self.load_data_labels(self.train_files, self.train) - self.load_data_labels(self.test_files, self.test) - - def load_data_labels(self, filenames, pair): - for i, fn in enumerate(filenames): - f = open(os.path.join(self.basepath, fn)) - pair[i] = filetensor.read(f) - f.close() - - def reduce_train_set(self, max): - self.train[0] = self.train[0][:max] - self.train[1] = self.train[1][:max] - - if max < len(self.test[0]): - for ar in (self.test, self.valid): - ar[0] = ar[0][:max] - ar[1] = ar[1][:max] - - def split_train_valid(self): - test_len = len(self.test[0]) - - new_train_x = self.train[0][:-test_len] - new_train_y = self.train[1][:-test_len] - - self.valid[0] = self.train[0][-test_len:] - self.valid[1] = self.train[1][-test_len:] - - self.train[0] = new_train_x - self.train[1] = new_train_y - 
-def test_load_nist(): - print "Will load NIST" - - import time - t1 = time.time() - nist = NIST(20) - t2 = time.time() - - print "NIST loaded. time delta = ", t2-t1 - - tr,v,te = nist.get_tvt() - - print "Lenghts: ", len(tr[0]), len(v[0]), len(te[0]) - - raw_input("Press any key") - if __name__ == '__main__': - import sys - args = sys.argv[1:] - if len(args) > 0 and args[0] == 'load_nist': - test_load_nist() + #if len(args) > 0 and args[0] == 'load_nist': + # test_load_nist() - elif len(args) > 0 and args[0] == 'jobman_insert': + if len(args) > 0 and args[0] == 'jobman_insert': jobman_insert_nist() elif len(args) > 0 and args[0] == 'test_jobman_entrypoint': diff -r 3c54cb3713ef -r c8fe09a65039 deep/stacked_dae/old/__init__.py diff -r 3c54cb3713ef -r c8fe09a65039 deep/stacked_dae/old/mnist_sda.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/old/mnist_sda.py Fri Mar 19 10:54:39 2010 -0400 @@ -0,0 +1,45 @@ +#!/usr/bin/python +# coding: utf-8 + +# TODO: This probably doesn't work anymore, adapt to new code in sgd_opt +# Parameterize call to sgd_optimization for MNIST + +import numpy +import theano +import time +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +from sgd_optimization import SdaSgdOptimizer +import cPickle, gzip +from jobman import DD + +MNIST_LOCATION = '/u/savardf/datasets/mnist.pkl.gz' + +def sgd_optimization_mnist(learning_rate=0.1, pretraining_epochs = 2, \ + pretrain_lr = 0.1, training_epochs = 5, \ + dataset='mnist.pkl.gz'): + # Load the dataset + f = gzip.open(dataset,'rb') + # this gives us train, valid, test (each with .x, .y) + dataset = cPickle.load(f) + f.close() + + n_ins = 28*28 + n_outs = 10 + + hyperparameters = DD({'finetuning_lr':learning_rate, + 'pretraining_lr':pretrain_lr, + 'pretraining_epochs_per_layer':pretraining_epochs, + 'max_finetuning_epochs':training_epochs, + 'hidden_layers_sizes':[100], + 'corruption_levels':[0.2], + 'minibatch_size':20}) + + optimizer = 
SdaSgdOptimizer(dataset, hyperparameters, n_ins, n_outs) + optimizer.pretrain() + optimizer.finetune() + +if __name__ == '__main__': + sgd_optimization_mnist(dataset=MNIST_LOCATION) + diff -r 3c54cb3713ef -r c8fe09a65039 deep/stacked_dae/old/nist_sda.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/old/nist_sda.py Fri Mar 19 10:54:39 2010 -0400 @@ -0,0 +1,260 @@ +#!/usr/bin/python +# coding: utf-8 + +import ift6266 +import pylearn + +import numpy +import theano +import time + +import pylearn.version +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +import copy +import sys +import os +import os.path + +from jobman import DD +import jobman, jobman.sql +from pylearn.io import filetensor + +from utils import produit_cartesien_jobs + +from sgd_optimization import SdaSgdOptimizer + +from ift6266.utils.scalar_series import * + +############################################################################## +# GLOBALS + +TEST_CONFIG = False + +NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all' +JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_db/fsavard_sda4' +EXPERIMENT_PATH = "ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint" + +REDUCE_TRAIN_TO = None +MAX_FINETUNING_EPOCHS = 1000 +# number of minibatches before taking means for valid error etc. +REDUCE_EVERY = 1000 + +if TEST_CONFIG: + REDUCE_TRAIN_TO = 1000 + MAX_FINETUNING_EPOCHS = 2 + REDUCE_EVERY = 10 + +# Possible values the hyperparameters can take. These are then +# combined with produit_cartesien_jobs so we get a list of all +# possible combinations, each one resulting in a job inserted +# in the jobman DB. 
+JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001], + 'pretraining_epochs_per_layer': [10,20], + 'hidden_layers_sizes': [300,800], + 'corruption_levels': [0.1,0.2,0.3], + 'minibatch_size': [20], + 'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS], + 'finetuning_lr':[0.1, 0.01], #0.001 was very bad, so we leave it out + 'num_hidden_layers':[2,3]} + +# Just useful for tests... minimal number of epochs +DEFAULT_HP_NIST = DD({'finetuning_lr':0.1, + 'pretraining_lr':0.1, + 'pretraining_epochs_per_layer':20, + 'max_finetuning_epochs':2, + 'hidden_layers_sizes':800, + 'corruption_levels':0.2, + 'minibatch_size':20, + #'reduce_train_to':300, + 'num_hidden_layers':2}) + +''' +Function called by jobman upon launching each job +Its path is the one given when inserting jobs: +ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint +''' +def jobman_entrypoint(state, channel): + # record mercurial versions of each package + pylearn.version.record_versions(state,[theano,ift6266,pylearn]) + channel.save() + + workingdir = os.getcwd() + + print "Will load NIST" + + nist = NIST(minibatch_size=20) + + print "NIST loaded" + + # For test runs, we don't want to use the whole dataset so + # reduce it to fewer elements if asked to. 
+ rtt = None + if state.has_key('reduce_train_to'): + rtt = state['reduce_train_to'] + elif REDUCE_TRAIN_TO: + rtt = REDUCE_TRAIN_TO + + if rtt: + print "Reducing training set to "+str(rtt)+ " examples" + nist.reduce_train_set(rtt) + + train,valid,test = nist.get_tvt() + dataset = (train,valid,test) + + n_ins = 32*32 + n_outs = 62 # 10 digits, 26*2 (lower, capitals) + + # b,b',W for each hidden layer + # + b,W of last layer (logreg) + numparams = state.num_hidden_layers * 3 + 2 + series_mux = None + series_mux = create_series(workingdir, numparams) + + print "Creating optimizer with state, ", state + + optimizer = SdaSgdOptimizer(dataset=dataset, hyperparameters=state, \ + n_ins=n_ins, n_outs=n_outs,\ + input_divider=255.0, series_mux=series_mux) + + optimizer.pretrain() + channel.save() + + optimizer.finetune() + channel.save() + + return channel.COMPLETE + +# These Series objects are used to save various statistics +# during the training. +def create_series(basedir, numparams): + mux = SeriesMultiplexer() + + # comment out series we don't want to save + mux.add_series(AccumulatorSeries(name="reconstruction_error", + reduce_every=REDUCE_EVERY, # every 1000 batches, we take the mean and save + mean=True, + directory=basedir, flush_every=1)) + + mux.add_series(AccumulatorSeries(name="training_error", + reduce_every=REDUCE_EVERY, # every 1000 batches, we take the mean and save + mean=True, + directory=basedir, flush_every=1)) + + mux.add_series(BaseSeries(name="validation_error", directory=basedir, flush_every=1)) + mux.add_series(BaseSeries(name="test_error", directory=basedir, flush_every=1)) + + mux.add_series(ParamsArrayStats(numparams,name="params",directory=basedir)) + + return mux + +# Perform insertion into the Postgre DB based on combination +# of hyperparameter values above +# (see comment for produit_cartesien_jobs() to know how it works) +def jobman_insert_nist(): + jobs = produit_cartesien_jobs(JOB_VALS) + + db = jobman.sql.db(JOBDB) + for job in jobs: + 
job.update({jobman.sql.EXPERIMENT: EXPERIMENT_PATH}) + jobman.sql.insert_dict(job, db) + + print "inserted" + +class NIST: + def __init__(self, minibatch_size, basepath=None, reduce_train_to=None): + global NIST_ALL_LOCATION + + self.minibatch_size = minibatch_size + self.basepath = basepath and basepath or NIST_ALL_LOCATION + + self.set_filenames() + + # arrays of 2 elements: .x, .y + self.train = [None, None] + self.test = [None, None] + + self.load_train_test() + + self.valid = [[], []] + self.split_train_valid() + if reduce_train_to: + self.reduce_train_set(reduce_train_to) + + def get_tvt(self): + return self.train, self.valid, self.test + + def set_filenames(self): + self.train_files = ['all_train_data.ft', + 'all_train_labels.ft'] + + self.test_files = ['all_test_data.ft', + 'all_test_labels.ft'] + + def load_train_test(self): + self.load_data_labels(self.train_files, self.train) + self.load_data_labels(self.test_files, self.test) + + def load_data_labels(self, filenames, pair): + for i, fn in enumerate(filenames): + f = open(os.path.join(self.basepath, fn)) + pair[i] = filetensor.read(f) + f.close() + + def reduce_train_set(self, max): + self.train[0] = self.train[0][:max] + self.train[1] = self.train[1][:max] + + if max < len(self.test[0]): + for ar in (self.test, self.valid): + ar[0] = ar[0][:max] + ar[1] = ar[1][:max] + + def split_train_valid(self): + test_len = len(self.test[0]) + + new_train_x = self.train[0][:-test_len] + new_train_y = self.train[1][:-test_len] + + self.valid[0] = self.train[0][-test_len:] + self.valid[1] = self.train[1][-test_len:] + + self.train[0] = new_train_x + self.train[1] = new_train_y + +def test_load_nist(): + print "Will load NIST" + + import time + t1 = time.time() + nist = NIST(20) + t2 = time.time() + + print "NIST loaded. 
time delta = ", t2-t1 + + tr,v,te = nist.get_tvt() + + print "Lenghts: ", len(tr[0]), len(v[0]), len(te[0]) + + raw_input("Press any key") + +if __name__ == '__main__': + + import sys + + args = sys.argv[1:] + + if len(args) > 0 and args[0] == 'load_nist': + test_load_nist() + + elif len(args) > 0 and args[0] == 'jobman_insert': + jobman_insert_nist() + + elif len(args) > 0 and args[0] == 'test_jobman_entrypoint': + chanmock = DD({'COMPLETE':0,'save':(lambda:None)}) + jobman_entrypoint(DEFAULT_HP_NIST, chanmock) + + else: + print "Bad arguments" + diff -r 3c54cb3713ef -r c8fe09a65039 deep/stacked_dae/old/sgd_optimization.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/old/sgd_optimization.py Fri Mar 19 10:54:39 2010 -0400 @@ -0,0 +1,234 @@ +#!/usr/bin/python +# coding: utf-8 + +# Generic SdA optimization loop, adapted from the deeplearning.net tutorial + +import numpy +import theano +import time +import datetime +import theano.tensor as T +import sys + +from jobman import DD +import jobman, jobman.sql + +from stacked_dae import SdA + +def shared_dataset(data_xy): + data_x, data_y = data_xy + #shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX)) + #shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX)) + #shared_y = T.cast(shared_y, 'int32') + shared_x = theano.shared(data_x) + shared_y = theano.shared(data_y) + return shared_x, shared_y + +class DummyMux(): + def append(self, param1, param2): + pass + +class SdaSgdOptimizer: + def __init__(self, dataset, hyperparameters, n_ins, n_outs, input_divider=1.0, series_mux=None): + self.dataset = dataset + self.hp = hyperparameters + self.n_ins = n_ins + self.n_outs = n_outs + self.input_divider = input_divider + + if not series_mux: + series_mux = DummyMux() + print "No series multiplexer set" + self.series_mux = series_mux + + self.rng = numpy.random.RandomState(1234) + + self.init_datasets() + self.init_classifier() + + sys.stdout.flush() + + def 
init_datasets(self): + print "init_datasets" + sys.stdout.flush() + + train_set, valid_set, test_set = self.dataset + self.test_set_x, self.test_set_y = shared_dataset(test_set) + self.valid_set_x, self.valid_set_y = shared_dataset(valid_set) + self.train_set_x, self.train_set_y = shared_dataset(train_set) + + # compute number of minibatches for training, validation and testing + self.n_train_batches = self.train_set_x.value.shape[0] / self.hp.minibatch_size + self.n_valid_batches = self.valid_set_x.value.shape[0] / self.hp.minibatch_size + # remove last batch in case it's incomplete + self.n_test_batches = (self.test_set_x.value.shape[0] / self.hp.minibatch_size) - 1 + + def init_classifier(self): + print "Constructing classifier" + + # we don't want to save arrays in DD objects, so + # we recreate those arrays here + nhl = self.hp.num_hidden_layers + layers_sizes = [self.hp.hidden_layers_sizes] * nhl + corruption_levels = [self.hp.corruption_levels] * nhl + + # construct the stacked denoising autoencoder class + self.classifier = SdA( \ + train_set_x= self.train_set_x, \ + train_set_y = self.train_set_y,\ + batch_size = self.hp.minibatch_size, \ + n_ins= self.n_ins, \ + hidden_layers_sizes = layers_sizes, \ + n_outs = self.n_outs, \ + corruption_levels = corruption_levels,\ + rng = self.rng,\ + pretrain_lr = self.hp.pretraining_lr, \ + finetune_lr = self.hp.finetuning_lr,\ + input_divider = self.input_divider ) + + #theano.printing.pydotprint(self.classifier.pretrain_functions[0], "function.graph") + + sys.stdout.flush() + + def train(self): + self.pretrain() + self.finetune() + + def pretrain(self): + print "STARTING PRETRAINING, time = ", datetime.datetime.now() + sys.stdout.flush() + + #time_acc_func = 0.0 + #time_acc_total = 0.0 + + start_time = time.clock() + ## Pre-train layer-wise + for i in xrange(self.classifier.n_layers): + # go through pretraining epochs + for epoch in xrange(self.hp.pretraining_epochs_per_layer): + # go through the training set + for 
batch_index in xrange(self.n_train_batches): + #t1 = time.clock() + c = self.classifier.pretrain_functions[i](batch_index) + #t2 = time.clock() + + #time_acc_func += t2 - t1 + + #if batch_index % 500 == 0: + # print "acc / total", time_acc_func / (t2 - start_time), time_acc_func + + self.series_mux.append("reconstruction_error", c) + + print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c + sys.stdout.flush() + + self.series_mux.append("params", self.classifier.all_params) + + end_time = time.clock() + + print ('Pretraining took %f minutes' %((end_time-start_time)/60.)) + self.hp.update({'pretraining_time': end_time-start_time}) + + sys.stdout.flush() + + def finetune(self): + print "STARTING FINETUNING, time = ", datetime.datetime.now() + + index = T.lscalar() # index to a [mini]batch + minibatch_size = self.hp.minibatch_size + + # create a function to compute the mistakes that are made by the model + # on the validation set, or testing set + shared_divider = theano.shared(numpy.asarray(self.input_divider, dtype=theano.config.floatX)) + test_model = theano.function([index], self.classifier.errors, + givens = { + self.classifier.x: self.test_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider, + self.classifier.y: self.test_set_y[index*minibatch_size:(index+1)*minibatch_size]}) + + validate_model = theano.function([index], self.classifier.errors, + givens = { + self.classifier.x: self.valid_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider, + self.classifier.y: self.valid_set_y[index*minibatch_size:(index+1)*minibatch_size]}) + + + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2. 
# wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(self.n_train_batches, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + best_params = None + best_validation_loss = float('inf') + test_score = 0. + start_time = time.clock() + + done_looping = False + epoch = 0 + + while (epoch < self.hp.max_finetuning_epochs) and (not done_looping): + epoch = epoch + 1 + for minibatch_index in xrange(self.n_train_batches): + + cost_ij = self.classifier.finetune(minibatch_index) + iter = epoch * self.n_train_batches + minibatch_index + + self.series_mux.append("training_error", cost_ij) + + if (iter+1) % validation_frequency == 0: + + validation_losses = [validate_model(i) for i in xrange(self.n_valid_batches)] + this_validation_loss = numpy.mean(validation_losses) + self.series_mux.append("validation_error", this_validation_loss) + print('epoch %i, minibatch %i/%i, validation error %f %%' % \ + (epoch, minibatch_index+1, self.n_train_batches, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, iter * patience_increase) + + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = iter + + # test it on the test set + test_losses = [test_model(i) for i in xrange(self.n_test_batches)] + test_score = numpy.mean(test_losses) + self.series_mux.append("test_error", test_score) + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, self.n_train_batches, + test_score*100.)) + + sys.stdout.flush() + + 
self.series_mux.append("params", self.classifier.all_params) + + if patience <= iter : + done_looping = True + break + + end_time = time.clock() + self.hp.update({'finetuning_time':end_time-start_time,\ + 'best_validation_error':best_validation_loss,\ + 'test_score':test_score, + 'num_finetuning_epochs':epoch}) + + print(('Optimization complete with best validation score of %f %%,' + 'with test performance %f %%') % + (best_validation_loss * 100., test_score*100.)) + print ('The finetuning ran for %f minutes' % ((end_time-start_time)/60.)) + + + diff -r 3c54cb3713ef -r c8fe09a65039 deep/stacked_dae/old/stacked_dae.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/old/stacked_dae.py Fri Mar 19 10:54:39 2010 -0400 @@ -0,0 +1,287 @@ +#!/usr/bin/python +# coding: utf-8 + +import numpy +import theano +import time +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams +import copy + +from utils import update_locals + +# taken from LeDeepNet/daa.py +# has a special case when taking log(0) (defined =0) +# modified to not take the mean anymore +from theano.tensor.xlogx import xlogx, xlogy0 +# it's target*log(output) +def binary_cross_entropy(target, output, sum_axis=1): + XE = xlogy0(target, output) + xlogy0((1 - target), (1 - output)) + return -T.sum(XE, axis=sum_axis) + +class LogisticRegression(object): + def __init__(self, input, n_in, n_out): + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) + self.W = theano.shared( value=numpy.zeros((n_in,n_out), + dtype = theano.config.floatX) ) + # initialize the baises b as a vector of n_out 0s + self.b = theano.shared( value=numpy.zeros((n_out,), + dtype = theano.config.floatX) ) + # compute vector of class-membership probabilities in symbolic form + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred=T.argmax(self.p_y_given_x, axis=1) + + # list 
of parameters for this layer + self.params = [self.W, self.b] + + def negative_log_likelihood(self, y): + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + def errors(self, y): + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + + +class SigmoidalLayer(object): + def __init__(self, rng, input, n_in, n_out): + self.input = input + + W_values = numpy.asarray( rng.uniform( \ + low = -numpy.sqrt(6./(n_in+n_out)), \ + high = numpy.sqrt(6./(n_in+n_out)), \ + size = (n_in, n_out)), dtype = theano.config.floatX) + self.W = theano.shared(value = W_values) + + b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + self.output = T.nnet.sigmoid(T.dot(input, self.W) + self.b) + self.params = [self.W, self.b] + + + +class dA(object): + def __init__(self, n_visible= 784, n_hidden= 500, corruption_level = 0.1,\ + input = None, shared_W = None, shared_b = None): + self.n_visible = n_visible + self.n_hidden = n_hidden + + # create a Theano random generator that gives symbolic random values + theano_rng = RandomStreams() + + if shared_W != None and shared_b != None : + self.W = shared_W + self.b = shared_b + else: + # initial values for weights and biases + # note : W' was written as `W_prime` and b' as `b_prime` + + # W is initialized with `initial_W` which is uniformely sampled + # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + initial_W = numpy.asarray( numpy.random.uniform( \ + low 
= -numpy.sqrt(6./(n_hidden+n_visible)), \ + high = numpy.sqrt(6./(n_hidden+n_visible)), \ + size = (n_visible, n_hidden)), dtype = theano.config.floatX) + initial_b = numpy.zeros(n_hidden, dtype = theano.config.floatX) + + + # theano shared variables for weights and biases + self.W = theano.shared(value = initial_W, name = "W") + self.b = theano.shared(value = initial_b, name = "b") + + + initial_b_prime= numpy.zeros(n_visible) + # tied weights, therefore W_prime is W transpose + self.W_prime = self.W.T + self.b_prime = theano.shared(value = initial_b_prime, name = "b'") + + # if no input is given, generate a variable representing the input + if input == None : + # we use a matrix because we expect a minibatch of several examples, + # each example being a row + self.x = T.dmatrix(name = 'input') + else: + self.x = input + # Equation (1) + # keep 90% of the inputs the same and zero-out randomly selected subset of 10% of the inputs + # note : first argument of theano.rng.binomial is the shape(size) of + # random numbers that it should produce + # second argument is the number of trials + # third argument is the probability of success of any trial + # + # this will produce an array of 0s and 1s where 1 has a + # probability of 1 - ``corruption_level`` and 0 with + # ``corruption_level`` + self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level) * self.x + # Equation (2) + # note : y is stored as an attribute of the class so that it can be + # used later when stacking dAs. 
+ self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b) + # Equation (3) + self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime) + # Equation (4) + # note : we sum over the size of a datapoint; if we are using minibatches, + # L will be a vector, with one entry per example in minibatch + #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) + #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1) + + # bypassing z to avoid running to log(0) + #self.z_a = T.dot(self.y, self.W_prime) + self.b_prime) + #self.L = -T.sum( self.x * (T.log(1)-T.log(1+T.exp(-self.z_a))) \ + # + (1.0-self.x) * (T.log(1)-T.log(1+T.exp(-self.z_a))), axis=1 ) + + # I added this epsilon to avoid getting log(0) and 1/0 in grad + # This means conceptually that there'd be no probability of 0, but that + # doesn't seem to me as important (maybe I'm wrong?). + eps = 0.00000001 + eps_1 = 1-eps + self.L = - T.sum( self.x * T.log(eps + eps_1*self.z) \ + + (1-self.x)*T.log(eps + eps_1*(1-self.z)), axis=1 ) + # note : L is now a vector, where each element is the cross-entropy cost + # of the reconstruction of the corresponding example of the + # minibatch. We need to compute the average of all these to get + # the cost of the minibatch + self.cost = T.mean(self.L) + + self.params = [ self.W, self.b, self.b_prime ] + + +class SdA(object): + def __init__(self, train_set_x, train_set_y, batch_size, n_ins, + hidden_layers_sizes, n_outs, + corruption_levels, rng, pretrain_lr, finetune_lr, input_divider=1.0): + # Just to make sure those are not modified somewhere else afterwards + hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes) + corruption_levels = copy.deepcopy(corruption_levels) + + update_locals(self, locals()) + + self.layers = [] + self.pretrain_functions = [] + self.params = [] + # MODIF: added this so we also get the b_primes + # (not used for finetuning... 
still using ".params") + self.all_params = [] + self.n_layers = len(hidden_layers_sizes) + + print "Creating SdA with params:" + print "batch_size", batch_size + print "hidden_layers_sizes", hidden_layers_sizes + print "corruption_levels", corruption_levels + print "n_ins", n_ins + print "n_outs", n_outs + print "pretrain_lr", pretrain_lr + print "finetune_lr", finetune_lr + print "input_divider", input_divider + print "----" + + self.shared_divider = theano.shared(numpy.asarray(input_divider, dtype=theano.config.floatX)) + + if len(hidden_layers_sizes) < 1 : + raiseException (' You must have at least one hidden layer ') + + + # allocate symbolic variables for the data + index = T.lscalar() # index to a [mini]batch + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + + for i in xrange( self.n_layers ): + # construct the sigmoidal layer + + # the size of the input is either the number of hidden units of + # the layer below or the input size if we are on the first layer + if i == 0 : + input_size = n_ins + else: + input_size = hidden_layers_sizes[i-1] + + # the input to this layer is either the activation of the hidden + # layer below or the input of the SdA if you are on the first + # layer + if i == 0 : + layer_input = self.x + else: + layer_input = self.layers[-1].output + + layer = SigmoidalLayer(rng, layer_input, input_size, + hidden_layers_sizes[i] ) + # add the layer to the + self.layers += [layer] + self.params += layer.params + + # Construct a denoising autoencoder that shared weights with this + # layer + dA_layer = dA(input_size, hidden_layers_sizes[i], \ + corruption_level = corruption_levels[0],\ + input = layer_input, \ + shared_W = layer.W, shared_b = layer.b) + + self.all_params += dA_layer.params + + # Construct a function that trains this dA + # compute gradients of layer parameters + gparams = T.grad(dA_layer.cost, dA_layer.params) + # compute 
the list of updates + updates = {} + for param, gparam in zip(dA_layer.params, gparams): + updates[param] = param - gparam * pretrain_lr + + # create a function that trains the dA + update_fn = theano.function([index], dA_layer.cost, \ + updates = updates, + givens = { + self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider}) + # collect this function into a list + self.pretrain_functions += [update_fn] + + + # We now need to add a logistic layer on top of the MLP + self.logLayer = LogisticRegression(\ + input = self.layers[-1].output,\ + n_in = hidden_layers_sizes[-1], n_out = n_outs) + + self.params += self.logLayer.params + self.all_params += self.logLayer.params + # construct a function that implements one step of finetunining + + # compute the cost, defined as the negative log likelihood + cost = self.logLayer.negative_log_likelihood(self.y) + # compute the gradients with respect to the model parameters + gparams = T.grad(cost, self.params) + # compute list of updates + updates = {} + for param,gparam in zip(self.params, gparams): + updates[param] = param - gparam*finetune_lr + + self.finetune = theano.function([index], cost, + updates = updates, + givens = { + self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider, + self.y : train_set_y[index*batch_size:(index+1)*batch_size]} ) + + # symbolic variable that points to the number of errors made on the + # minibatch given by self.x and self.y + + self.errors = self.logLayer.errors(self.y) + +if __name__ == '__main__': + import sys + args = sys.argv[1:] + diff -r 3c54cb3713ef -r c8fe09a65039 deep/stacked_dae/old/utils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/old/utils.py Fri Mar 19 10:54:39 2010 -0400 @@ -0,0 +1,69 @@ +#!/usr/bin/python +# coding: utf-8 + +from __future__ import with_statement + +from jobman import DD + +# from pylearn codebase +# useful in __init__(param1, param2, etc.) 
to save +# values in self.param1, self.param2... just call +# update_locals(self, locals()) +def update_locals(obj, dct): + if 'self' in dct: + del dct['self'] + obj.__dict__.update(dct) + +# from a dictionary of possible values for hyperparameters, e.g. +# hp_values = {'learning_rate':[0.1, 0.01], 'num_layers': [1,2]} +# create a list of other dictionaries representing all the possible +# combinations, thus in this example creating: +# [{'learning_rate': 0.1, 'num_layers': 1}, ...] +# (similarly for combinations (0.1, 2), (0.01, 1), (0.01, 2)) +def produit_cartesien_jobs(val_dict): + job_list = [DD()] + all_keys = val_dict.keys() + + for key in all_keys: + possible_values = val_dict[key] + new_job_list = [] + for val in possible_values: + for job in job_list: + to_insert = job.copy() + to_insert.update({key: val}) + new_job_list.append(to_insert) + job_list = new_job_list + + return job_list + +def test_produit_cartesien_jobs(): + vals = {'a': [1,2], 'b': [3,4,5]} + print produit_cartesien_jobs(vals) + + +# taken from http://stackoverflow.com/questions/276052/how-to-get-current-cpu-and-ram-usage-in-python +"""Simple module for getting amount of memory used by a specified user's +processes on a UNIX system. +It uses UNIX ps utility to get the memory usage for a specified username and +pipe it to awk for summing up per application memory usage and return the total. +Python's Popen() from subprocess module is used for spawning ps and awk. 
+ +""" + +import subprocess + +class MemoryMonitor(object): + + def __init__(self, username): + """Create new MemoryMonitor instance.""" + self.username = username + + def usage(self): + """Return int containing memory used by user's processes.""" + self.process = subprocess.Popen("ps -u %s -o rss | awk '{sum+=$1} END {print sum}'" % self.username, + shell=True, + stdout=subprocess.PIPE, + ) + self.stdout_list = self.process.communicate()[0].split('\n') + return int(self.stdout_list[0]) + diff -r 3c54cb3713ef -r c8fe09a65039 deep/stacked_dae/sgd_optimization.py --- a/deep/stacked_dae/sgd_optimization.py Tue Mar 16 12:01:31 2010 -0400 +++ b/deep/stacked_dae/sgd_optimization.py Fri Mar 19 10:54:39 2010 -0400 @@ -15,53 +15,43 @@ from stacked_dae import SdA -def shared_dataset(data_xy): - data_x, data_y = data_xy - #shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX)) - #shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX)) - #shared_y = T.cast(shared_y, 'int32') - shared_x = theano.shared(data_x) - shared_y = theano.shared(data_y) - return shared_x, shared_y +from ift6266.utils.seriestables import * -class DummyMux(): - def append(self, param1, param2): - pass +default_series = { \ + 'reconstruction_error' : DummySeries(), + 'training_error' : DummySeries(), + 'validation_error' : DummySeries(), + 'test_error' : DummySeries(), + 'params' : DummySeries() + } + +def itermax(iter, max): + for i,it in enumerate(iter): + if i >= max: + break + yield it class SdaSgdOptimizer: - def __init__(self, dataset, hyperparameters, n_ins, n_outs, input_divider=1.0, series_mux=None): + def __init__(self, dataset, hyperparameters, n_ins, n_outs, + examples_per_epoch, series=default_series, max_minibatches=None): self.dataset = dataset self.hp = hyperparameters self.n_ins = n_ins self.n_outs = n_outs - self.input_divider = input_divider - if not series_mux: - series_mux = DummyMux() - print "No series multiplexer set" - self.series_mux = 
series_mux + self.max_minibatches = max_minibatches + print "SdaSgdOptimizer, max_minibatches =", max_minibatches + + self.ex_per_epoch = examples_per_epoch + self.mb_per_epoch = examples_per_epoch / self.hp.minibatch_size + + self.series = series self.rng = numpy.random.RandomState(1234) - self.init_datasets() self.init_classifier() sys.stdout.flush() - - def init_datasets(self): - print "init_datasets" - sys.stdout.flush() - - train_set, valid_set, test_set = self.dataset - self.test_set_x, self.test_set_y = shared_dataset(test_set) - self.valid_set_x, self.valid_set_y = shared_dataset(valid_set) - self.train_set_x, self.train_set_y = shared_dataset(train_set) - - # compute number of minibatches for training, validation and testing - self.n_train_batches = self.train_set_x.value.shape[0] / self.hp.minibatch_size - self.n_valid_batches = self.valid_set_x.value.shape[0] / self.hp.minibatch_size - # remove last batch in case it's incomplete - self.n_test_batches = (self.test_set_x.value.shape[0] / self.hp.minibatch_size) - 1 def init_classifier(self): print "Constructing classifier" @@ -74,8 +64,6 @@ # construct the stacked denoising autoencoder class self.classifier = SdA( \ - train_set_x= self.train_set_x, \ - train_set_y = self.train_set_y,\ batch_size = self.hp.minibatch_size, \ n_ins= self.n_ins, \ hidden_layers_sizes = layers_sizes, \ @@ -83,46 +71,44 @@ corruption_levels = corruption_levels,\ rng = self.rng,\ pretrain_lr = self.hp.pretraining_lr, \ - finetune_lr = self.hp.finetuning_lr,\ - input_divider = self.input_divider ) + finetune_lr = self.hp.finetuning_lr) #theano.printing.pydotprint(self.classifier.pretrain_functions[0], "function.graph") sys.stdout.flush() def train(self): - self.pretrain() - self.finetune() + self.pretrain(self.dataset) + self.finetune(self.dataset) - def pretrain(self): + def pretrain(self,dataset): print "STARTING PRETRAINING, time = ", datetime.datetime.now() sys.stdout.flush() - #time_acc_func = 0.0 - #time_acc_total = 0.0 - 
start_time = time.clock() ## Pre-train layer-wise for i in xrange(self.classifier.n_layers): # go through pretraining epochs for epoch in xrange(self.hp.pretraining_epochs_per_layer): # go through the training set - for batch_index in xrange(self.n_train_batches): - #t1 = time.clock() - c = self.classifier.pretrain_functions[i](batch_index) - #t2 = time.clock() + batch_index=0 + for x,y in dataset.train(self.hp.minibatch_size): + c = self.classifier.pretrain_functions[i](x) + + self.series["reconstruction_error"].append((epoch, batch_index), c) + batch_index+=1 - #time_acc_func += t2 - t1 + #if batch_index % 100 == 0: + # print "100 batches" - #if batch_index % 500 == 0: - # print "acc / total", time_acc_func / (t2 - start_time), time_acc_func - - self.series_mux.append("reconstruction_error", c) + # useful when doing tests + if self.max_minibatches and batch_index >= self.max_minibatches: + break print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c sys.stdout.flush() - self.series_mux.append("params", self.classifier.all_params) + self.series['params'].append((epoch,), self.classifier.all_params) end_time = time.clock() @@ -131,24 +117,26 @@ sys.stdout.flush() - def finetune(self): + def finetune(self,dataset): print "STARTING FINETUNING, time = ", datetime.datetime.now() - index = T.lscalar() # index to a [mini]batch minibatch_size = self.hp.minibatch_size # create a function to compute the mistakes that are made by the model # on the validation set, or testing set - shared_divider = theano.shared(numpy.asarray(self.input_divider, dtype=theano.config.floatX)) - test_model = theano.function([index], self.classifier.errors, - givens = { - self.classifier.x: self.test_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider, - self.classifier.y: self.test_set_y[index*minibatch_size:(index+1)*minibatch_size]}) + test_model = \ + theano.function( + [self.classifier.x,self.classifier.y], self.classifier.errors) + # givens = { + # self.classifier.x: 
ensemble_x, + # self.classifier.y: ensemble_y]}) - validate_model = theano.function([index], self.classifier.errors, - givens = { - self.classifier.x: self.valid_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider, - self.classifier.y: self.valid_set_y[index*minibatch_size:(index+1)*minibatch_size]}) + validate_model = \ + theano.function( + [self.classifier.x,self.classifier.y], self.classifier.errors) + # givens = { + # self.classifier.x: , + # self.classifier.y: ]}) # early-stopping parameters @@ -157,11 +145,13 @@ # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant - validation_frequency = min(self.n_train_batches, patience/2) + validation_frequency = min(self.mb_per_epoch, patience/2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch + if self.max_minibatches and validation_frequency > self.max_minibatches: + validation_frequency = self.max_minibatches / 2 best_params = None best_validation_loss = float('inf') @@ -171,22 +161,31 @@ done_looping = False epoch = 0 + total_mb_index = 0 + while (epoch < self.hp.max_finetuning_epochs) and (not done_looping): epoch = epoch + 1 - for minibatch_index in xrange(self.n_train_batches): + minibatch_index = -1 + for x,y in dataset.train(minibatch_size): + minibatch_index += 1 + cost_ij = self.classifier.finetune(x,y) + total_mb_index += 1 - cost_ij = self.classifier.finetune(minibatch_index) - iter = epoch * self.n_train_batches + minibatch_index - - self.series_mux.append("training_error", cost_ij) + self.series["training_error"].append((epoch, minibatch_index), cost_ij) - if (iter+1) % validation_frequency == 0: + if (total_mb_index+1) % validation_frequency == 0: - validation_losses = [validate_model(i) for i in xrange(self.n_valid_batches)] + iter = dataset.valid(minibatch_size) + if self.max_minibatches: + iter = itermax(iter, self.max_minibatches) + validation_losses = 
[validate_model(x,y) for x,y in iter] this_validation_loss = numpy.mean(validation_losses) - self.series_mux.append("validation_error", this_validation_loss) + + self.series["validation_error"].\ + append((epoch, minibatch_index), this_validation_loss*100.) + print('epoch %i, minibatch %i/%i, validation error %f %%' % \ - (epoch, minibatch_index+1, self.n_train_batches, \ + (epoch, minibatch_index+1, self.mb_per_epoch, \ this_validation_loss*100.)) @@ -196,26 +195,36 @@ #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold : - patience = max(patience, iter * patience_increase) + patience = max(patience, total_mb_index * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss - best_iter = iter + best_iter = total_mb_index # test it on the test set - test_losses = [test_model(i) for i in xrange(self.n_test_batches)] + iter = dataset.test(minibatch_size) + if self.max_minibatches: + iter = itermax(iter, self.max_minibatches) + test_losses = [test_model(x,y) for x,y in iter] test_score = numpy.mean(test_losses) - self.series_mux.append("test_error", test_score) + + self.series["test_error"].\ + append((epoch, minibatch_index), test_score*100.) 
+ print((' epoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % - (epoch, minibatch_index+1, self.n_train_batches, + (epoch, minibatch_index+1, self.mb_per_epoch, test_score*100.)) sys.stdout.flush() - self.series_mux.append("params", self.classifier.all_params) + # useful when doing tests + if self.max_minibatches and minibatch_index >= self.max_minibatches: + break - if patience <= iter : + self.series['params'].append((epoch,), self.classifier.all_params) + + if patience <= total_mb_index: done_looping = True break diff -r 3c54cb3713ef -r c8fe09a65039 deep/stacked_dae/stacked_dae.py --- a/deep/stacked_dae/stacked_dae.py Tue Mar 16 12:01:31 2010 -0400 +++ b/deep/stacked_dae/stacked_dae.py Fri Mar 19 10:54:39 2010 -0400 @@ -127,13 +127,13 @@ # this will produce an array of 0s and 1s where 1 has a # probability of 1 - ``corruption_level`` and 0 with # ``corruption_level`` - self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level) * self.x + self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level, dtype=theano.config.floatX) * self.x # Equation (2) # note : y is stored as an attribute of the class so that it can be # used later when stacking dAs. self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b) # Equation (3) - self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime) + #self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime) # Equation (4) # note : we sum over the size of a datapoint; if we are using minibatches, # L will be a vector, with one entry per example in minibatch @@ -141,17 +141,20 @@ #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1) # bypassing z to avoid running to log(0) - #self.z_a = T.dot(self.y, self.W_prime) + self.b_prime) - #self.L = -T.sum( self.x * (T.log(1)-T.log(1+T.exp(-self.z_a))) \ - # + (1.0-self.x) * (T.log(1)-T.log(1+T.exp(-self.z_a))), axis=1 ) + z_a = T.dot(self.y, self.W_prime) + self.b_prime + log_sigmoid = T.log(1.) 
- T.log(1.+T.exp(-z_a)) + # log(1-sigmoid(z_a)) + log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a)) + self.L = -T.sum( self.x * (log_sigmoid) \ + + (1.0-self.x) * (log_1_sigmoid), axis=1 ) # I added this epsilon to avoid getting log(0) and 1/0 in grad # This means conceptually that there'd be no probability of 0, but that # doesn't seem to me as important (maybe I'm wrong?). - eps = 0.00000001 - eps_1 = 1-eps - self.L = - T.sum( self.x * T.log(eps + eps_1*self.z) \ - + (1-self.x)*T.log(eps + eps_1*(1-self.z)), axis=1 ) + #eps = 0.00000001 + #eps_1 = 1-eps + #self.L = - T.sum( self.x * T.log(eps + eps_1*self.z) \ + # + (1-self.x)*T.log(eps + eps_1*(1-self.z)), axis=1 ) # note : L is now a vector, where each element is the cross-entropy cost # of the reconstruction of the corresponding example of the # minibatch. We need to compute the average of all these to get @@ -162,9 +165,9 @@ class SdA(object): - def __init__(self, train_set_x, train_set_y, batch_size, n_ins, + def __init__(self, batch_size, n_ins, hidden_layers_sizes, n_outs, - corruption_levels, rng, pretrain_lr, finetune_lr, input_divider=1.0): + corruption_levels, rng, pretrain_lr, finetune_lr): # Just to make sure those are not modified somewhere else afterwards hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes) corruption_levels = copy.deepcopy(corruption_levels) @@ -187,17 +190,14 @@ print "n_outs", n_outs print "pretrain_lr", pretrain_lr print "finetune_lr", finetune_lr - print "input_divider", input_divider print "----" - self.shared_divider = theano.shared(numpy.asarray(input_divider, dtype=theano.config.floatX)) - if len(hidden_layers_sizes) < 1 : raiseException (' You must have at least one hidden layer ') # allocate symbolic variables for the data - index = T.lscalar() # index to a [mini]batch + #index = T.lscalar() # index to a [mini]batch self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels @@ 
-244,10 +244,15 @@ updates[param] = param - gparam * pretrain_lr # create a function that trains the dA - update_fn = theano.function([index], dA_layer.cost, \ - updates = updates, - givens = { - self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider}) + update_fn = theano.function([self.x], dA_layer.cost, \ + updates = updates)#, + # givens = { + # self.x : ensemble}) + # collect this function into a list + #update_fn = theano.function([index], dA_layer.cost, \ + # updates = updates, + # givens = { + # self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider}) # collect this function into a list self.pretrain_functions += [update_fn] @@ -270,11 +275,11 @@ for param,gparam in zip(self.params, gparams): updates[param] = param - gparam*finetune_lr - self.finetune = theano.function([index], cost, - updates = updates, - givens = { - self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider, - self.y : train_set_y[index*batch_size:(index+1)*batch_size]} ) + self.finetune = theano.function([self.x,self.y], cost, + updates = updates)#, + # givens = { + # self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider, + # self.y : train_set_y[index*batch_size:(index+1)*batch_size]} ) # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y diff -r 3c54cb3713ef -r c8fe09a65039 deep/stacked_dae/utils.py diff -r 3c54cb3713ef -r c8fe09a65039 deep/stacked_dae/v2/__init__.py diff -r 3c54cb3713ef -r c8fe09a65039 deep/stacked_dae/v2/config.py.example --- a/deep/stacked_dae/v2/config.py.example Tue Mar 16 12:01:31 2010 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ -''' -These are parameters used by nist_sda.py. They'll end up as globals in there. - -Rename this file to config.py and configure as needed. -DON'T add the renamed file to the repository, as others might use it -without realizing it, with dire consequences. 
-''' - -# Set this to True when you want to run cluster tests, ie. you want -# to run on the cluster, many jobs, but want to reduce the training -# set size and the number of epochs, so you know everything runs -# fine on the cluster. -# Set this PRIOR to inserting your test jobs in the DB. -TEST_CONFIG = False - -NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all' -NIST_ALL_TRAIN_SIZE = 649081 -# valid et test =82587 82587 - -# change "sandbox" when you're ready -JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/yourtablenamehere' -EXPERIMENT_PATH = "ift6266.deep.stacked_dae.v2.nist_sda.jobman_entrypoint" - -# reduce training set to that many examples -REDUCE_TRAIN_TO = None -# that's a max, it usually doesn't get to that point -MAX_FINETUNING_EPOCHS = 1000 -# number of minibatches before taking means for valid error etc. -REDUCE_EVERY = 100 - -if TEST_CONFIG: - REDUCE_TRAIN_TO = 1000 - MAX_FINETUNING_EPOCHS = 2 - REDUCE_EVERY = 10 - - -# This is to configure insertion of jobs on the cluster. -# Possible values the hyperparameters can take. These are then -# combined with produit_cartesien_jobs so we get a list of all -# possible combinations, each one resulting in a job inserted -# in the jobman DB. -JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001], - 'pretraining_epochs_per_layer': [10,20], - 'hidden_layers_sizes': [300,800], - 'corruption_levels': [0.1,0.2,0.3], - 'minibatch_size': [20], - 'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS], - 'finetuning_lr':[0.1, 0.01], #0.001 was very bad, so we leave it out - 'num_hidden_layers':[2,3]} - -# Just useful for tests... 
minimal number of epochs -# (This is used when running a single job, locally, when -# calling ./nist_sda.py test_jobman_entrypoint -DEFAULT_HP_NIST = DD({'finetuning_lr':0.1, - 'pretraining_lr':0.1, - 'pretraining_epochs_per_layer':2, - 'max_finetuning_epochs':2, - 'hidden_layers_sizes':800, - 'corruption_levels':0.2, - 'minibatch_size':20, - 'reduce_train_to':10000, - 'num_hidden_layers':1}) - - diff -r 3c54cb3713ef -r c8fe09a65039 deep/stacked_dae/v2/nist_sda.py --- a/deep/stacked_dae/v2/nist_sda.py Tue Mar 16 12:01:31 2010 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,169 +0,0 @@ -#!/usr/bin/python -# coding: utf-8 - -import ift6266 -import pylearn - -import numpy -import theano -import time - -import pylearn.version -import theano.tensor as T -from theano.tensor.shared_randomstreams import RandomStreams - -import copy -import sys -import os -import os.path - -from jobman import DD -import jobman, jobman.sql -from pylearn.io import filetensor - -from utils import produit_cartesien_jobs - -from sgd_optimization import SdaSgdOptimizer - -#from ift6266.utils.scalar_series import * -from ift6266.utils.seriestables import * -import tables - -from ift6266 import datasets -from config import * - -''' -Function called by jobman upon launching each job -Its path is the one given when inserting jobs: see EXPERIMENT_PATH -''' -def jobman_entrypoint(state, channel): - # record mercurial versions of each package - pylearn.version.record_versions(state,[theano,ift6266,pylearn]) - # TODO: remove this, bad for number of simultaneous requests on DB - channel.save() - - # For test runs, we don't want to use the whole dataset so - # reduce it to fewer elements if asked to. 
- rtt = None - if state.has_key('reduce_train_to'): - rtt = state['reduce_train_to'] - elif REDUCE_TRAIN_TO: - rtt = REDUCE_TRAIN_TO - - n_ins = 32*32 - n_outs = 62 # 10 digits, 26*2 (lower, capitals) - - examples_per_epoch = NIST_ALL_TRAIN_SIZE - - series = create_series(state.num_hidden_layers) - - print "Creating optimizer with state, ", state - - optimizer = SdaSgdOptimizer(dataset=datasets.nist_all, - hyperparameters=state, \ - n_ins=n_ins, n_outs=n_outs,\ - examples_per_epoch=examples_per_epoch, \ - series=series, - max_minibatches=rtt) - - optimizer.pretrain(datasets.nist_all) - channel.save() - - optimizer.finetune(datasets.nist_all) - channel.save() - - return channel.COMPLETE - -# These Series objects are used to save various statistics -# during the training. -def create_series(num_hidden_layers): - - # Replace series we don't want to save with DummySeries, e.g. - # series['training_error'] = DummySeries() - - series = {} - - basedir = os.getcwd() - - h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w") - - # reconstruction - reconstruction_base = \ - ErrorSeries(error_name="reconstruction_error", - table_name="reconstruction_error", - hdf5_file=h5f, - index_names=('epoch','minibatch'), - title="Reconstruction error (mean over "+str(REDUCE_EVERY)+" minibatches)") - series['reconstruction_error'] = \ - AccumulatorSeriesWrapper(base_series=reconstruction_base, - reduce_every=REDUCE_EVERY) - - # train - training_base = \ - ErrorSeries(error_name="training_error", - table_name="training_error", - hdf5_file=h5f, - index_names=('epoch','minibatch'), - title="Training error (mean over "+str(REDUCE_EVERY)+" minibatches)") - series['training_error'] = \ - AccumulatorSeriesWrapper(base_series=training_base, - reduce_every=REDUCE_EVERY) - - # valid and test are not accumulated/mean, saved directly - series['validation_error'] = \ - ErrorSeries(error_name="validation_error", - table_name="validation_error", - hdf5_file=h5f, - 
index_names=('epoch','minibatch')) - - series['test_error'] = \ - ErrorSeries(error_name="test_error", - table_name="test_error", - hdf5_file=h5f, - index_names=('epoch','minibatch')) - - param_names = [] - for i in range(num_hidden_layers): - param_names += ['layer%d_W'%i, 'layer%d_b'%i, 'layer%d_bprime'%i] - param_names += ['logreg_layer_W', 'logreg_layer_b'] - - # comment out series we don't want to save - series['params'] = SharedParamsStatisticsWrapper( - new_group_name="params", - base_group="/", - arrays_names=param_names, - hdf5_file=h5f, - index_names=('epoch',)) - - return series - -# Perform insertion into the Postgre DB based on combination -# of hyperparameter values above -# (see comment for produit_cartesien_jobs() to know how it works) -def jobman_insert_nist(): - jobs = produit_cartesien_jobs(JOB_VALS) - - db = jobman.sql.db(JOBDB) - for job in jobs: - job.update({jobman.sql.EXPERIMENT: EXPERIMENT_PATH}) - jobman.sql.insert_dict(job, db) - - print "inserted" - -if __name__ == '__main__': - - args = sys.argv[1:] - - #if len(args) > 0 and args[0] == 'load_nist': - # test_load_nist() - - if len(args) > 0 and args[0] == 'jobman_insert': - jobman_insert_nist() - - elif len(args) > 0 and args[0] == 'test_jobman_entrypoint': - chanmock = DD({'COMPLETE':0,'save':(lambda:None)}) - jobman_entrypoint(DEFAULT_HP_NIST, chanmock) - - else: - print "Bad arguments" - diff -r 3c54cb3713ef -r c8fe09a65039 deep/stacked_dae/v2/sgd_optimization.py --- a/deep/stacked_dae/v2/sgd_optimization.py Tue Mar 16 12:01:31 2010 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,243 +0,0 @@ -#!/usr/bin/python -# coding: utf-8 - -# Generic SdA optimization loop, adapted from the deeplearning.net tutorial - -import numpy -import theano -import time -import datetime -import theano.tensor as T -import sys - -from jobman import DD -import jobman, jobman.sql - -from stacked_dae import SdA - -from ift6266.utils.seriestables import * - -default_series = { \ - 'reconstruction_error' 
: DummySeries(), - 'training_error' : DummySeries(), - 'validation_error' : DummySeries(), - 'test_error' : DummySeries(), - 'params' : DummySeries() - } - -def itermax(iter, max): - for i,it in enumerate(iter): - if i >= max: - break - yield it - -class SdaSgdOptimizer: - def __init__(self, dataset, hyperparameters, n_ins, n_outs, - examples_per_epoch, series=default_series, max_minibatches=None): - self.dataset = dataset - self.hp = hyperparameters - self.n_ins = n_ins - self.n_outs = n_outs - - self.max_minibatches = max_minibatches - print "SdaSgdOptimizer, max_minibatches =", max_minibatches - - self.ex_per_epoch = examples_per_epoch - self.mb_per_epoch = examples_per_epoch / self.hp.minibatch_size - - self.series = series - - self.rng = numpy.random.RandomState(1234) - - self.init_classifier() - - sys.stdout.flush() - - def init_classifier(self): - print "Constructing classifier" - - # we don't want to save arrays in DD objects, so - # we recreate those arrays here - nhl = self.hp.num_hidden_layers - layers_sizes = [self.hp.hidden_layers_sizes] * nhl - corruption_levels = [self.hp.corruption_levels] * nhl - - # construct the stacked denoising autoencoder class - self.classifier = SdA( \ - batch_size = self.hp.minibatch_size, \ - n_ins= self.n_ins, \ - hidden_layers_sizes = layers_sizes, \ - n_outs = self.n_outs, \ - corruption_levels = corruption_levels,\ - rng = self.rng,\ - pretrain_lr = self.hp.pretraining_lr, \ - finetune_lr = self.hp.finetuning_lr) - - #theano.printing.pydotprint(self.classifier.pretrain_functions[0], "function.graph") - - sys.stdout.flush() - - def train(self): - self.pretrain(self.dataset) - self.finetune(self.dataset) - - def pretrain(self,dataset): - print "STARTING PRETRAINING, time = ", datetime.datetime.now() - sys.stdout.flush() - - start_time = time.clock() - ## Pre-train layer-wise - for i in xrange(self.classifier.n_layers): - # go through pretraining epochs - for epoch in xrange(self.hp.pretraining_epochs_per_layer): - # go 
through the training set - batch_index=0 - for x,y in dataset.train(self.hp.minibatch_size): - c = self.classifier.pretrain_functions[i](x) - - self.series["reconstruction_error"].append((epoch, batch_index), c) - batch_index+=1 - - #if batch_index % 100 == 0: - # print "100 batches" - - # useful when doing tests - if self.max_minibatches and batch_index >= self.max_minibatches: - break - - print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c - sys.stdout.flush() - - self.series['params'].append((epoch,), self.classifier.all_params) - - end_time = time.clock() - - print ('Pretraining took %f minutes' %((end_time-start_time)/60.)) - self.hp.update({'pretraining_time': end_time-start_time}) - - sys.stdout.flush() - - def finetune(self,dataset): - print "STARTING FINETUNING, time = ", datetime.datetime.now() - - minibatch_size = self.hp.minibatch_size - - # create a function to compute the mistakes that are made by the model - # on the validation set, or testing set - test_model = \ - theano.function( - [self.classifier.x,self.classifier.y], self.classifier.errors) - # givens = { - # self.classifier.x: ensemble_x, - # self.classifier.y: ensemble_y]}) - - validate_model = \ - theano.function( - [self.classifier.x,self.classifier.y], self.classifier.errors) - # givens = { - # self.classifier.x: , - # self.classifier.y: ]}) - - - # early-stopping parameters - patience = 10000 # look as this many examples regardless - patience_increase = 2. 
# wait this much longer when a new best is - # found - improvement_threshold = 0.995 # a relative improvement of this much is - # considered significant - validation_frequency = min(self.mb_per_epoch, patience/2) - # go through this many - # minibatche before checking the network - # on the validation set; in this case we - # check every epoch - if self.max_minibatches and validation_frequency > self.max_minibatches: - validation_frequency = self.max_minibatches / 2 - - best_params = None - best_validation_loss = float('inf') - test_score = 0. - start_time = time.clock() - - done_looping = False - epoch = 0 - - total_mb_index = 0 - - while (epoch < self.hp.max_finetuning_epochs) and (not done_looping): - epoch = epoch + 1 - minibatch_index = -1 - for x,y in dataset.train(minibatch_size): - minibatch_index += 1 - cost_ij = self.classifier.finetune(x,y) - total_mb_index += 1 - - self.series["training_error"].append((epoch, minibatch_index), cost_ij) - - if (total_mb_index+1) % validation_frequency == 0: - - iter = dataset.valid(minibatch_size) - if self.max_minibatches: - iter = itermax(iter, self.max_minibatches) - validation_losses = [validate_model(x,y) for x,y in iter] - this_validation_loss = numpy.mean(validation_losses) - - self.series["validation_error"].\ - append((epoch, minibatch_index), this_validation_loss*100.) 
- - print('epoch %i, minibatch %i/%i, validation error %f %%' % \ - (epoch, minibatch_index+1, self.mb_per_epoch, \ - this_validation_loss*100.)) - - - # if we got the best validation score until now - if this_validation_loss < best_validation_loss: - - #improve patience if loss improvement is good enough - if this_validation_loss < best_validation_loss * \ - improvement_threshold : - patience = max(patience, total_mb_index * patience_increase) - - # save best validation score and iteration number - best_validation_loss = this_validation_loss - best_iter = total_mb_index - - # test it on the test set - iter = dataset.test(minibatch_size) - if self.max_minibatches: - iter = itermax(iter, self.max_minibatches) - test_losses = [test_model(x,y) for x,y in iter] - test_score = numpy.mean(test_losses) - - self.series["test_error"].\ - append((epoch, minibatch_index), test_score*100.) - - print((' epoch %i, minibatch %i/%i, test error of best ' - 'model %f %%') % - (epoch, minibatch_index+1, self.mb_per_epoch, - test_score*100.)) - - sys.stdout.flush() - - # useful when doing tests - if self.max_minibatches and minibatch_index >= self.max_minibatches: - break - - self.series['params'].append((epoch,), self.classifier.all_params) - - if patience <= total_mb_index: - done_looping = True - break - - end_time = time.clock() - self.hp.update({'finetuning_time':end_time-start_time,\ - 'best_validation_error':best_validation_loss,\ - 'test_score':test_score, - 'num_finetuning_epochs':epoch}) - - print(('Optimization complete with best validation score of %f %%,' - 'with test performance %f %%') % - (best_validation_loss * 100., test_score*100.)) - print ('The finetuning ran for %f minutes' % ((end_time-start_time)/60.)) - - - diff -r 3c54cb3713ef -r c8fe09a65039 deep/stacked_dae/v2/stacked_dae.py --- a/deep/stacked_dae/v2/stacked_dae.py Tue Mar 16 12:01:31 2010 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,292 +0,0 @@ -#!/usr/bin/python -# coding: utf-8 - -import 
numpy -import theano -import time -import theano.tensor as T -from theano.tensor.shared_randomstreams import RandomStreams -import copy - -from utils import update_locals - -# taken from LeDeepNet/daa.py -# has a special case when taking log(0) (defined =0) -# modified to not take the mean anymore -from theano.tensor.xlogx import xlogx, xlogy0 -# it's target*log(output) -def binary_cross_entropy(target, output, sum_axis=1): - XE = xlogy0(target, output) + xlogy0((1 - target), (1 - output)) - return -T.sum(XE, axis=sum_axis) - -class LogisticRegression(object): - def __init__(self, input, n_in, n_out): - # initialize with 0 the weights W as a matrix of shape (n_in, n_out) - self.W = theano.shared( value=numpy.zeros((n_in,n_out), - dtype = theano.config.floatX) ) - # initialize the baises b as a vector of n_out 0s - self.b = theano.shared( value=numpy.zeros((n_out,), - dtype = theano.config.floatX) ) - # compute vector of class-membership probabilities in symbolic form - self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) - - # compute prediction as class whose probability is maximal in - # symbolic form - self.y_pred=T.argmax(self.p_y_given_x, axis=1) - - # list of parameters for this layer - self.params = [self.W, self.b] - - def negative_log_likelihood(self, y): - return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) - - def errors(self, y): - # check if y has same dimension of y_pred - if y.ndim != self.y_pred.ndim: - raise TypeError('y should have the same shape as self.y_pred', - ('y', target.type, 'y_pred', self.y_pred.type)) - - # check if y is of the correct datatype - if y.dtype.startswith('int'): - # the T.neq operator returns a vector of 0s and 1s, where 1 - # represents a mistake in prediction - return T.mean(T.neq(self.y_pred, y)) - else: - raise NotImplementedError() - - -class SigmoidalLayer(object): - def __init__(self, rng, input, n_in, n_out): - self.input = input - - W_values = numpy.asarray( rng.uniform( \ - low = 
-numpy.sqrt(6./(n_in+n_out)), \ - high = numpy.sqrt(6./(n_in+n_out)), \ - size = (n_in, n_out)), dtype = theano.config.floatX) - self.W = theano.shared(value = W_values) - - b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) - self.b = theano.shared(value= b_values) - - self.output = T.nnet.sigmoid(T.dot(input, self.W) + self.b) - self.params = [self.W, self.b] - - - -class dA(object): - def __init__(self, n_visible= 784, n_hidden= 500, corruption_level = 0.1,\ - input = None, shared_W = None, shared_b = None): - self.n_visible = n_visible - self.n_hidden = n_hidden - - # create a Theano random generator that gives symbolic random values - theano_rng = RandomStreams() - - if shared_W != None and shared_b != None : - self.W = shared_W - self.b = shared_b - else: - # initial values for weights and biases - # note : W' was written as `W_prime` and b' as `b_prime` - - # W is initialized with `initial_W` which is uniformely sampled - # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible) - # the output of uniform if converted using asarray to dtype - # theano.config.floatX so that the code is runable on GPU - initial_W = numpy.asarray( numpy.random.uniform( \ - low = -numpy.sqrt(6./(n_hidden+n_visible)), \ - high = numpy.sqrt(6./(n_hidden+n_visible)), \ - size = (n_visible, n_hidden)), dtype = theano.config.floatX) - initial_b = numpy.zeros(n_hidden, dtype = theano.config.floatX) - - - # theano shared variables for weights and biases - self.W = theano.shared(value = initial_W, name = "W") - self.b = theano.shared(value = initial_b, name = "b") - - - initial_b_prime= numpy.zeros(n_visible) - # tied weights, therefore W_prime is W transpose - self.W_prime = self.W.T - self.b_prime = theano.shared(value = initial_b_prime, name = "b'") - - # if no input is given, generate a variable representing the input - if input == None : - # we use a matrix because we expect a minibatch of several examples, - # each example being a row - self.x = T.dmatrix(name 
= 'input') - else: - self.x = input - # Equation (1) - # keep 90% of the inputs the same and zero-out randomly selected subset of 10% of the inputs - # note : first argument of theano.rng.binomial is the shape(size) of - # random numbers that it should produce - # second argument is the number of trials - # third argument is the probability of success of any trial - # - # this will produce an array of 0s and 1s where 1 has a - # probability of 1 - ``corruption_level`` and 0 with - # ``corruption_level`` - self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level, dtype=theano.config.floatX) * self.x - # Equation (2) - # note : y is stored as an attribute of the class so that it can be - # used later when stacking dAs. - self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b) - # Equation (3) - #self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime) - # Equation (4) - # note : we sum over the size of a datapoint; if we are using minibatches, - # L will be a vector, with one entry per example in minibatch - #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) - #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1) - - # bypassing z to avoid running to log(0) - z_a = T.dot(self.y, self.W_prime) + self.b_prime - log_sigmoid = T.log(1.) - T.log(1.+T.exp(-z_a)) - # log(1-sigmoid(z_a)) - log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a)) - self.L = -T.sum( self.x * (log_sigmoid) \ - + (1.0-self.x) * (log_1_sigmoid), axis=1 ) - - # I added this epsilon to avoid getting log(0) and 1/0 in grad - # This means conceptually that there'd be no probability of 0, but that - # doesn't seem to me as important (maybe I'm wrong?). 
- #eps = 0.00000001 - #eps_1 = 1-eps - #self.L = - T.sum( self.x * T.log(eps + eps_1*self.z) \ - # + (1-self.x)*T.log(eps + eps_1*(1-self.z)), axis=1 ) - # note : L is now a vector, where each element is the cross-entropy cost - # of the reconstruction of the corresponding example of the - # minibatch. We need to compute the average of all these to get - # the cost of the minibatch - self.cost = T.mean(self.L) - - self.params = [ self.W, self.b, self.b_prime ] - - -class SdA(object): - def __init__(self, batch_size, n_ins, - hidden_layers_sizes, n_outs, - corruption_levels, rng, pretrain_lr, finetune_lr): - # Just to make sure those are not modified somewhere else afterwards - hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes) - corruption_levels = copy.deepcopy(corruption_levels) - - update_locals(self, locals()) - - self.layers = [] - self.pretrain_functions = [] - self.params = [] - # MODIF: added this so we also get the b_primes - # (not used for finetuning... still using ".params") - self.all_params = [] - self.n_layers = len(hidden_layers_sizes) - - print "Creating SdA with params:" - print "batch_size", batch_size - print "hidden_layers_sizes", hidden_layers_sizes - print "corruption_levels", corruption_levels - print "n_ins", n_ins - print "n_outs", n_outs - print "pretrain_lr", pretrain_lr - print "finetune_lr", finetune_lr - print "----" - - if len(hidden_layers_sizes) < 1 : - raiseException (' You must have at least one hidden layer ') - - - # allocate symbolic variables for the data - #index = T.lscalar() # index to a [mini]batch - self.x = T.matrix('x') # the data is presented as rasterized images - self.y = T.ivector('y') # the labels are presented as 1D vector of - # [int] labels - - for i in xrange( self.n_layers ): - # construct the sigmoidal layer - - # the size of the input is either the number of hidden units of - # the layer below or the input size if we are on the first layer - if i == 0 : - input_size = n_ins - else: - input_size = 
hidden_layers_sizes[i-1] - - # the input to this layer is either the activation of the hidden - # layer below or the input of the SdA if you are on the first - # layer - if i == 0 : - layer_input = self.x - else: - layer_input = self.layers[-1].output - - layer = SigmoidalLayer(rng, layer_input, input_size, - hidden_layers_sizes[i] ) - # add the layer to the - self.layers += [layer] - self.params += layer.params - - # Construct a denoising autoencoder that shared weights with this - # layer - dA_layer = dA(input_size, hidden_layers_sizes[i], \ - corruption_level = corruption_levels[0],\ - input = layer_input, \ - shared_W = layer.W, shared_b = layer.b) - - self.all_params += dA_layer.params - - # Construct a function that trains this dA - # compute gradients of layer parameters - gparams = T.grad(dA_layer.cost, dA_layer.params) - # compute the list of updates - updates = {} - for param, gparam in zip(dA_layer.params, gparams): - updates[param] = param - gparam * pretrain_lr - - # create a function that trains the dA - update_fn = theano.function([self.x], dA_layer.cost, \ - updates = updates)#, - # givens = { - # self.x : ensemble}) - # collect this function into a list - #update_fn = theano.function([index], dA_layer.cost, \ - # updates = updates, - # givens = { - # self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider}) - # collect this function into a list - self.pretrain_functions += [update_fn] - - - # We now need to add a logistic layer on top of the MLP - self.logLayer = LogisticRegression(\ - input = self.layers[-1].output,\ - n_in = hidden_layers_sizes[-1], n_out = n_outs) - - self.params += self.logLayer.params - self.all_params += self.logLayer.params - # construct a function that implements one step of finetunining - - # compute the cost, defined as the negative log likelihood - cost = self.logLayer.negative_log_likelihood(self.y) - # compute the gradients with respect to the model parameters - gparams = T.grad(cost, 
self.params) - # compute list of updates - updates = {} - for param,gparam in zip(self.params, gparams): - updates[param] = param - gparam*finetune_lr - - self.finetune = theano.function([self.x,self.y], cost, - updates = updates)#, - # givens = { - # self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider, - # self.y : train_set_y[index*batch_size:(index+1)*batch_size]} ) - - # symbolic variable that points to the number of errors made on the - # minibatch given by self.x and self.y - - self.errors = self.logLayer.errors(self.y) - -if __name__ == '__main__': - import sys - args = sys.argv[1:] - diff -r 3c54cb3713ef -r c8fe09a65039 deep/stacked_dae/v2/utils.py --- a/deep/stacked_dae/v2/utils.py Tue Mar 16 12:01:31 2010 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ -#!/usr/bin/python -# coding: utf-8 - -from __future__ import with_statement - -from jobman import DD - -# from pylearn codebase -# useful in __init__(param1, param2, etc.) to save -# values in self.param1, self.param2... just call -# update_locals(self, locals()) -def update_locals(obj, dct): - if 'self' in dct: - del dct['self'] - obj.__dict__.update(dct) - -# from a dictionary of possible values for hyperparameters, e.g. -# hp_values = {'learning_rate':[0.1, 0.01], 'num_layers': [1,2]} -# create a list of other dictionaries representing all the possible -# combinations, thus in this example creating: -# [{'learning_rate': 0.1, 'num_layers': 1}, ...] 
-# (similarly for combinations (0.1, 2), (0.01, 1), (0.01, 2)) -def produit_cartesien_jobs(val_dict): - job_list = [DD()] - all_keys = val_dict.keys() - - for key in all_keys: - possible_values = val_dict[key] - new_job_list = [] - for val in possible_values: - for job in job_list: - to_insert = job.copy() - to_insert.update({key: val}) - new_job_list.append(to_insert) - job_list = new_job_list - - return job_list - -def test_produit_cartesien_jobs(): - vals = {'a': [1,2], 'b': [3,4,5]} - print produit_cartesien_jobs(vals) - - -# taken from http://stackoverflow.com/questions/276052/how-to-get-current-cpu-and-ram-usage-in-python -"""Simple module for getting amount of memory used by a specified user's -processes on a UNIX system. -It uses UNIX ps utility to get the memory usage for a specified username and -pipe it to awk for summing up per application memory usage and return the total. -Python's Popen() from subprocess module is used for spawning ps and awk. - -""" - -import subprocess - -class MemoryMonitor(object): - - def __init__(self, username): - """Create new MemoryMonitor instance.""" - self.username = username - - def usage(self): - """Return int containing memory used by user's processes.""" - self.process = subprocess.Popen("ps -u %s -o rss | awk '{sum+=$1} END {print sum}'" % self.username, - shell=True, - stdout=subprocess.PIPE, - ) - self.stdout_list = self.process.communicate()[0].split('\n') - return int(self.stdout_list[0]) -