diff scripts/stacked_dae/nist_sda.py @ 131:5c79a2557f2f
A bit of cleanup in the stacked DAE code, split into files in a new subdirectory.
author | savardf |
---|---|
date | Fri, 19 Feb 2010 08:43:10 -0500 |
parents | |
children | 7d8366fb90bf |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/stacked_dae/nist_sda.py	Fri Feb 19 08:43:10 2010 -0500
@@ -0,0 +1,150 @@
+#!/usr/bin/python
+# coding: utf-8
+
+import numpy
+import theano
+import time
+import theano.tensor as T
+from theano.tensor.shared_randomstreams import RandomStreams
+
+import os.path
+
+from sgd_optimization import sgd_optimization
+
+from jobman import DD
+from pylearn.io import filetensor
+
+from utils import produit_croise_jobs
+
+NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
+
+# Just useful for tests... minimal number of epochs
+DEFAULT_HP_NIST = DD({'finetuning_lr':0.1,
+                      'pretraining_lr':0.1,
+                      'pretraining_epochs_per_layer':1,
+                      'max_finetuning_epochs':1,
+                      'hidden_layers_sizes':[1000,1000],
+                      'corruption_levels':[0.2,0.2],
+                      'minibatch_size':20})
+
+def jobman_entrypoint_nist(state, channel):
+    sgd_optimization_nist(state)
+
+def jobman_insert_nist():
+    vals = {'finetuning_lr': [0.00001, 0.0001, 0.001, 0.01, 0.1],
+            'pretraining_lr': [0.00001, 0.0001, 0.001, 0.01, 0.1],
+            'pretraining_epochs_per_layer': [2,5,20],
+            'hidden_layer_sizes': [100,300,1000],
+            'num_hidden_layers':[1,2,3],
+            'corruption_levels': [0.1,0.2,0.4],
+            'minibatch_size': [5,20,100]}
+
+    jobs = produit_croise_jobs(vals)
+
+    for job in jobs:
+        insert_job(job)
+
+
+class NIST:
+    def __init__(self, minibatch_size, basepath=None):
+        global NIST_ALL_LOCATION
+
+        self.minibatch_size = minibatch_size
+        self.basepath = basepath and basepath or NIST_ALL_LOCATION
+
+        self.set_filenames()
+
+        # arrays of 2 elements: .x, .y
+        self.train = [None, None]
+        self.test = [None, None]
+
+        self.load_train_test()
+
+        self.valid = [[], []]
+        #self.split_train_valid()
+
+
+    def get_tvt(self):
+        return self.train, self.valid, self.test
+
+    def set_filenames(self):
+        self.train_files = ['all_train_data.ft',
+                            'all_train_labels.ft']
+
+        self.test_files = ['all_test_data.ft',
+                           'all_test_labels.ft']
+
+    def load_train_test(self):
+        self.load_data_labels(self.train_files, self.train)
+        self.load_data_labels(self.test_files, self.test)
+
+    def load_data_labels(self, filenames, pair):
+        for i, fn in enumerate(filenames):
+            f = open(os.path.join(self.basepath, fn))
+            pair[i] = filetensor.read(f)
+            f.close()
+
+    def split_train_valid(self):
+        test_len = len(self.test[0])
+
+        new_train_x = self.train[0][:-test_len]
+        new_train_y = self.train[1][:-test_len]
+
+        self.valid[0] = self.train[0][-test_len:]
+        self.valid[1] = self.train[1][-test_len:]
+
+        self.train[0] = new_train_x
+        self.train[1] = new_train_y
+
+def test_load_nist():
+    print "Will load NIST"
+
+    import time
+    t1 = time.time()
+    nist = NIST(20)
+    t2 = time.time()
+
+    print "NIST loaded. time delta = ", t2-t1
+
+    tr,v,te = nist.get_tvt()
+
+    print "Lengths: ", len(tr[0]), len(v[0]), len(te[0])
+
+    raw_input("Press any key")
+
+# hp for hyperparameters
+def sgd_optimization_nist(hp=None, dataset_dir='/data/lisa/data/nist'):
+    global DEFAULT_HP_NIST
+    hp = hp and hp or DEFAULT_HP_NIST
+
+    print "Will load NIST"
+
+    import time
+    t1 = time.time()
+    nist = NIST(20)
+    t2 = time.time()
+
+    print "NIST loaded. time delta = ", t2-t1
+
+    train,valid,test = nist.get_tvt()
+    dataset = (train,valid,test)
+
+    print "Lengths train, valid, test: ", len(train[0]), len(valid[0]), len(test[0])
+
+    n_ins = 32*32
+    n_outs = 62 # 10 digits, 26*2 (lower, capitals)
+
+    sgd_optimization(dataset, hp, n_ins, n_outs)
+
+if __name__ == '__main__':
+
+    import sys
+
+    args = sys.argv[1:]
+
+    if len(args) > 0 and args[0] == 'load_nist':
+        test_load_nist()
+
+    else:
+        sgd_optimization_nist()
+
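For context, `produit_croise_jobs` is imported from the sibling `utils` module and is not part of this changeset. A minimal sketch of what it presumably does, inferred from its name ("produit croisé", cross product) and from its use in `jobman_insert_nist` above; this is an assumption, not the actual `utils.py` implementation:

```python
# Hypothetical sketch of utils.produit_croise_jobs, inferred from its use
# in jobman_insert_nist; the real implementation in this repository may differ.
import itertools

def produit_croise_jobs(vals):
    # vals maps each hyperparameter name to a list of candidate values.
    # Return one job dict per point of the Cartesian product of those lists.
    keys = sorted(vals.keys())
    return [dict(zip(keys, combo))
            for combo in itertools.product(*(vals[k] for k in keys))]
```

With the grid in `jobman_insert_nist` (5 × 5 × 3 × 3 × 3 × 3 × 3 candidate values), this expands to 6075 job dicts, each then handed to `insert_job`.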