# HG changeset patch
# User fsavard
# Date 1269455815 14400
# Node ID 206374eed2fba7068bfd26e98e0c71f7f37d4750
# Parent  43afd29f3dbd46b8d0d58b47cae8732764423b0e
# Parent  20ebc1f2a9fe569bad3692d860a3fa297713984c
Merge

diff -r 43afd29f3dbd -r 206374eed2fb baseline/conv_mlp/convolutional_mlp.py
--- a/baseline/conv_mlp/convolutional_mlp.py Wed Mar 24 14:35:11 2010 -0400
+++ b/baseline/conv_mlp/convolutional_mlp.py Wed Mar 24 14:36:55 2010 -0400
@@ -24,9 +24,12 @@
 import numpy, theano, cPickle, gzip, time
 import theano.tensor as T
 import theano.sandbox.softsign
+import sys
 import pylearn.datasets.MNIST
 from pylearn.io import filetensor as ft
 from theano.sandbox import conv, downsample
+
+from ift6266 import datasets
 import theano,pylearn.version,ift6266
 
 class LeNetConvPoolLayer(object):
@@ -178,81 +181,16 @@
         raise NotImplementedError()
 
 
-def load_dataset(fname,batch=20):
-
-    # repertoire qui contient les donnees NIST
-    # le repertoire suivant va fonctionner si vous etes connecte sur un ordinateur
-    # du reseau DIRO
-    datapath = '/data/lisa/data/nist/by_class/'
-    # le fichier .ft contient chiffres NIST dans un format efficace. Les chiffres
-    # sont stockes dans une matrice de NxD, ou N est le nombre d'images, est D est
-    # le nombre de pixels par image (32x32 = 1024). Chaque pixel de l'image est une
-    # valeur entre 0 et 255, correspondant a un niveau de gris. Les valeurs sont
-    # stockees comme des uint8, donc des bytes.
-    f = open(datapath+'digits/digits_train_data.ft')
-    # Verifier que vous avez assez de memoire pour loader les donnees au complet
-    # dans le memoire. Sinon, utilisez ft.arraylike, une classe construite
-    # specialement pour des fichiers qu'on ne souhaite pas loader dans RAM.
-    d = ft.read(f)
-
-    # NB: N'oubliez pas de diviser les valeurs des pixels par 255. si jamais vous
-    # utilisez les donnees commes entrees dans un reseaux de neurones et que vous
-    # voulez des entres entre 0 et 1.
-    # digits_train_data.ft contient les images, digits_train_labels.ft contient les
-    # etiquettes
-    f = open(datapath+'digits/digits_train_labels.ft')
-    labels = ft.read(f)
-
-
-    # Load the dataset
-    #f = gzip.open(fname,'rb')
-    #train_set, valid_set, test_set = cPickle.load(f)
-    #f.close()
-
-    # make minibatches of size 20
-    batch_size = batch    # sized of the minibatch
-
-    # Dealing with the training set
-    # get the list of training images (x) and their labels (y)
-    (train_set_x, train_set_y) = (d[:200000,:],labels[:200000])
-    # initialize the list of training minibatches with empty list
-    train_batches = []
-    for i in xrange(0, len(train_set_x), batch_size):
-        # add to the list of minibatches the minibatch starting at
-        # position i, ending at position i+batch_size
-        # a minibatch is a pair ; the first element of the pair is a list
-        # of datapoints, the second element is the list of corresponding
-        # labels
-        train_batches = train_batches + \
-            [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
-
-    #print train_batches[500]
-
-    # Dealing with the validation set
-    (valid_set_x, valid_set_y) = (d[200000:270000,:],labels[200000:270000])
-    # initialize the list of validation minibatches
-    valid_batches = []
-    for i in xrange(0, len(valid_set_x), batch_size):
-        valid_batches = valid_batches + \
-            [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
-
-    # Dealing with the testing set
-    (test_set_x, test_set_y) = (d[270000:340000,:],labels[270000:340000])
-    # initialize the list of testing minibatches
-    test_batches = []
-    for i in xrange(0, len(test_set_x), batch_size):
-        test_batches = test_batches + \
-            [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
-
-
-    return train_batches, valid_batches, test_batches
-
-
-def evaluate_lenet5(learning_rate=0.1, n_iter=200, batch_size=20, n_kern0=20, n_kern1=50, n_layer=3, filter_shape0=5, filter_shape1=5, dataset='mnist.pkl.gz'):
+def evaluate_lenet5(learning_rate=0.1, n_iter=200, batch_size=20, n_kern0=20, n_kern1=50, n_layer=3, filter_shape0=5, filter_shape1=5, sigmoide_size=500, dataset='mnist.pkl.gz'):
     rng = numpy.random.RandomState(23455)
 
     print 'Before load dataset'
-    train_batches, valid_batches, test_batches = load_dataset(dataset,batch_size)
+    dataset=datasets.nist_digits
+    train_batches= dataset.train(batch_size)
+    valid_batches=dataset.valid(batch_size)
+    test_batches=dataset.test(batch_size)
+    #print valid_batches.shape
+    #print test_batches.shape
     print 'After load dataset'
 
     ishape = (32,32)     # this is the size of NIST images
@@ -305,9 +243,9 @@
         fshape0=(32-filter_shape0+1)/2
         layer1_input = layer0.output.flatten(2)
         # construct a fully-connected sigmoidal layer
-        layer1 = SigmoidalLayer(rng, input=layer1_input,n_in=n_kern0*fshape0*fshape0, n_out=500)
+        layer1 = SigmoidalLayer(rng, input=layer1_input,n_in=n_kern0*fshape0*fshape0, n_out=sigmoide_size)
 
-        layer2 = LogisticRegression(input=layer1.output, n_in=500, n_out=10)
+        layer2 = LogisticRegression(input=layer1.output, n_in=sigmoide_size, n_out=10)
         cost = layer2.negative_log_likelihood(y)
         test_model = theano.function([x,y], layer2.errors(y))
         params = layer2.params+ layer1.params + layer0.params
@@ -335,10 +273,10 @@
         layer4_input = layer3.output.flatten(2)
 
         layer4 = SigmoidalLayer(rng, input=layer4_input,
-                                n_in=n_kern3*fshape3*fshape3, n_out=500)
+                                n_in=n_kern3*fshape3*fshape3, n_out=sigmoide_size)
 
-        layer5 = LogisticRegression(input=layer4.output, n_in=500, n_out=10)
+        layer5 = LogisticRegression(input=layer4.output, n_in=sigmoide_size, n_out=10)
 
         cost = layer5.negative_log_likelihood(y)
@@ -354,10 +292,10 @@
         layer3_input = layer2.output.flatten(2)
 
         layer3 = SigmoidalLayer(rng, input=layer3_input,
-                                n_in=n_kern2*fshape2*fshape2, n_out=500)
+                                n_in=n_kern2*fshape2*fshape2, n_out=sigmoide_size)
 
-        layer4 = LogisticRegression(input=layer3.output, n_in=500, n_out=10)
+        layer4 = LogisticRegression(input=layer3.output, n_in=sigmoide_size, n_out=10)
 
         cost = layer4.negative_log_likelihood(y)
 
@@ -378,11 +316,11 @@
 
         # construct a fully-connected sigmoidal layer
         layer2 = SigmoidalLayer(rng, input=layer2_input,
-                                n_in=n_kern1*fshape1*fshape1, n_out=500)
+                                n_in=n_kern1*fshape1*fshape1, n_out=sigmoide_size)
 
         # classify the values of the fully-connected sigmoidal layer
-        layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)
+        layer3 = LogisticRegression(input=layer2.output, n_in=sigmoide_size, n_out=10)
 
         # the cost we minimize during training is the NLL of the model
         cost = layer3.negative_log_likelihood(y)
 
@@ -414,7 +352,28 @@
     # TRAIN MODEL #
     ###############
 
-    n_minibatches = len(train_batches)
+    #n_minibatches = len(train_batches)
+    n_minibatches=0
+    n_valid=0
+    n_test=0
+    for x, y in dataset.train(batch_size):
+        if x.shape[0] == batch_size:
+            n_minibatches+=1
+    n_minibatches*=batch_size
+    print n_minibatches
+
+    for x, y in dataset.valid(batch_size):
+        if x.shape[0] == batch_size:
+            n_valid+=1
+    n_valid*=batch_size
+    print n_valid
+
+    for x, y in dataset.test(batch_size):
+        if x.shape[0] == batch_size:
+            n_test+=1
+    n_test*=batch_size
+    print n_test
+
     # early-stopping parameters
     patience              = 10000 # look as this many examples regardless
@@ -433,60 +392,65 @@
     test_score           = 0.
     start_time = time.clock()
 
-    # have a maximum of `n_iter` iterations through the entire dataset
-    for iter in xrange(n_iter * n_minibatches):
-
-        # get epoch and minibatch index
-        epoch           = iter / n_minibatches
-        minibatch_index = iter % n_minibatches
-        # get the minibatches corresponding to `iter` modulo
-        # `len(train_batches)`
-        x,y = train_batches[ minibatch_index ]
-
-        if iter %100 == 0:
-            print 'training @ iter = ', iter
-        cost_ij = train_model(x,y)
-
-        if (iter+1) % validation_frequency == 0:
+    # have a maximum of `n_iter` iterations through the entire dataset
+    iter=0
+    for epoch in xrange(n_iter):
+        for x, y in train_batches:
+            if x.shape[0] != batch_size:
+                continue
+            iter+=1
-            # compute zero-one loss on validation set
-            this_validation_loss = 0.
-            for x,y in valid_batches:
-                # sum up the errors for each minibatch
-                this_validation_loss += test_model(x,y)
-
-            # get the average by dividing with the number of minibatches
-            this_validation_loss /= len(valid_batches)
-            print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                  (epoch, minibatch_index+1, n_minibatches, \
-                   this_validation_loss*100.))
+            # get epoch and minibatch index
+            #epoch = iter / n_minibatches
+            minibatch_index = iter % n_minibatches
+
+            if iter %100 == 0:
+                print 'training @ iter = ', iter
+            cost_ij = train_model(x,y)
-            # if we got the best validation score until now
-            if this_validation_loss < best_validation_loss:
+            # compute zero-one loss on validation set
+            this_validation_loss = 0.
+            for x,y in valid_batches:
+                if x.shape[0] != batch_size:
+                    continue
+                # sum up the errors for each minibatch
+                this_validation_loss += test_model(x,y)
-                #improve patience if loss improvement is good enough
-                if this_validation_loss < best_validation_loss *  \
-                       improvement_threshold :
-                    patience = max(patience, iter * patience_increase)
+            # get the average by dividing with the number of minibatches
+            this_validation_loss /= n_valid
+            print('epoch %i, minibatch %i/%i, validation error %f %%' % \
+                  (epoch, minibatch_index+1, n_minibatches, \
+                   this_validation_loss*100.))
-                # save best validation score and iteration number
-                best_validation_loss = this_validation_loss
-                best_iter = iter
+
+            # if we got the best validation score until now
+            if this_validation_loss < best_validation_loss:
-                # test it on the test set
-                test_score = 0.
-                for x,y in test_batches:
-                    test_score += test_model(x,y)
-                test_score /= len(test_batches)
-                print(('     epoch %i, minibatch %i/%i, test error of best '
-                      'model %f %%') %
-                      (epoch, minibatch_index+1, n_minibatches,
-                       test_score*100.))
+                #improve patience if loss improvement is good enough
+                if this_validation_loss < best_validation_loss *  \
+                       improvement_threshold :
+                    patience = max(patience, iter * patience_increase)
+
+                # save best validation score and iteration number
+                best_validation_loss = this_validation_loss
+                best_iter = iter
-        if patience <= iter :
-            break
+                # test it on the test set
+                test_score = 0.
+                for x,y in test_batches:
+                    if x.shape[0] != batch_size:
+                        continue
+                    test_score += test_model(x,y)
+                test_score /= n_test
+                print(('     epoch %i, minibatch %i/%i, test error of best '
+                      'model %f %%') %
+                      (epoch, minibatch_index+1, n_minibatches,
+                       test_score*100.))
+
+            if patience <= iter :
+                break
 
     end_time = time.clock()
     print('Optimization complete.')
@@ -502,8 +466,10 @@
 
 def experiment(state, channel):
     print 'start experiment'
-    (best_validation_loss, test_score, minutes_trained, iter) = evaluate_lenet5(state.learning_rate, state.n_iter, state.batch_size, state.n_kern0, state.n_kern1, state.n_layer, state.filter_shape0, state.filter_shape1)
+    (best_validation_loss, test_score, minutes_trained, iter) = evaluate_lenet5(state.learning_rate, state.n_iter, state.batch_size, state.n_kern0, state.n_kern1, state.n_layer, state.filter_shape0, state.filter_shape1,state.sigmoide_size)
    print 'end experiment'
+
+    pylearn.version.record_versions(state,[theano,ift6266,pylearn])
 
     state.best_validation_loss = best_validation_loss
     state.test_score = test_score
diff -r 43afd29f3dbd -r 206374eed2fb baseline/log_reg/log_reg.py
--- a/baseline/log_reg/log_reg.py Wed Mar 24 14:35:11 2010 -0400
+++ b/baseline/log_reg/log_reg.py Wed Mar 24 14:36:55 2010 -0400
@@ -142,7 +142,7 @@
 #--------------------------------------------------------------------------------------------------------------------
 
 def log_reg( learning_rate = 0.13, nb_max_examples =1000000, batch_size = 50, \
-                   dataset=datasets.nist_digits, image_size = 32 * 32, nb_class = 10, \
+                   dataset=datasets.nist_digits(), image_size = 32 * 32, nb_class = 10, \
                    patience = 5000, patience_increase = 2, improvement_threshold = 0.995):
 
     #28 * 28 = 784
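The two baseline files above stop building minibatch lists by hand and instead pull data through the ift6266.datasets module. A minimal sketch of how that interface is consumed in the hunks above, assuming (as the new code does) that train/valid/test take a batch size and yield (x, y) minibatch pairs, with a possibly smaller final minibatch that the training loop simply skips; the comments on shapes are inferred from the 32x32 NIST images mentioned above, not from the datasets module itself:

from ift6266 import datasets

batch_size = 20
dataset = datasets.nist_digits   # convolutional_mlp.py uses the name as-is; log_reg.py calls datasets.nist_digits()
for x, y in dataset.train(batch_size):
    # x: minibatch of flattened 32x32 images, y: the matching label vector
    if x.shape[0] != batch_size:  # ragged last minibatch, skipped just as in evaluate_lenet5
        continue
    # train_model(x, y) would be called here
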
diff -r 43afd29f3dbd -r 206374eed2fb data_generation/transformations/pycaptcha/Captcha/File.py
--- a/data_generation/transformations/pycaptcha/Captcha/File.py Wed Mar 24 14:35:11 2010 -0400
+++ b/data_generation/transformations/pycaptcha/Captcha/File.py Wed Mar 24 14:36:55 2010 -0400
@@ -7,7 +7,7 @@
 # Copyright (C) 2004 Micah Dowty
 #
 
-import os, random
+import os, random, cPickle
 
 # Determine the data directory. This can be overridden after import-time if needed.
 dataDir = os.path.join(os.path.split(os.path.abspath(__file__))[0], "data")
@@ -41,7 +41,10 @@
         else:
             path = os.path.join(dataDir, self.basePath, name)
         if os.path.isdir(path):
-            for content in os.listdir(path):
+            f = open(path + '/filelist.pkl')
+            filelist = cPickle.load(f)
+            f.close()
+            for content in filelist:
                 if self._checkExtension(content):
                     paths.append(os.path.join(path, content))
         else:
diff -r 43afd29f3dbd -r 206374eed2fb data_generation/transformations/ttf2jpg.py
--- a/data_generation/transformations/ttf2jpg.py Wed Mar 24 14:35:11 2010 -0400
+++ b/data_generation/transformations/ttf2jpg.py Wed Mar 24 14:36:55 2010 -0400
@@ -10,6 +10,7 @@
 
 import sys, os, fnmatch, random
 import Image, ImageFont, ImageDraw, numpy
+import cPickle
 
 class ttf2jpg():
     def __init__(self, font_file = ''):
@@ -26,8 +27,9 @@
             self.char_list.append(chr(ord('A') + i) )
         for i in range(0,26):
             self.char_list.append(chr(ord('a') + i) )
-        files = os.listdir(self.font_dir)
-        self.font_files = fnmatch.filter(files, '*.ttf') + fnmatch.filter(files, '*.TTF')
+        f = open( self.font_dir + 'filelist.pkl' ,'r')
+        self.font_files = cPickle.load(f)
+        f.close()
 
     # get font name
     def get_settings_names(self):
diff -r 43afd29f3dbd -r 206374eed2fb datasets/defs.py
--- a/datasets/defs.py Wed Mar 24 14:35:11 2010 -0400
+++ b/datasets/defs.py Wed Mar 24 14:36:55 2010 -0400
@@ -43,8 +43,10 @@
                      valid_lbl = [os.path.join(DATA_PATH,'ocr_valid_labels.ft')],
                      indtype=theano.config.floatX, inscale=255., maxsize=maxsize)
 
-nist_P07 = lambda maxsize=None: FTDataSet(train_data = [os.path.join(DATA_PATH,'data/P07_train'+str(i)+'_data.ft') for i in range(100)],
-                     train_lbl = [os.path.join(DATA_PATH,'data/P07_train'+str(i)+'_labels.ft') for i in range(100)],
+# There are two more arguments here so one can build a smaller dataset based on the file numbers.
+# This is useful to get different data for pre-training and fine-tuning.
+nist_P07 = lambda maxsize=None, min_file=0, max_file=100: FTDataSet(train_data = [os.path.join(DATA_PATH,'data/P07_train'+str(i)+'_data.ft') for i in range(min_file, max_file)],
+                     train_lbl = [os.path.join(DATA_PATH,'data/P07_train'+str(i)+'_labels.ft') for i in range(min_file, max_file)],
                      test_data = [os.path.join(DATA_PATH,'data/P07_test_data.ft')],
                      test_lbl = [os.path.join(DATA_PATH,'data/P07_test_labels.ft')],
                      valid_data = [os.path.join(DATA_PATH,'data/P07_valid_data.ft')],
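The comment added to nist_P07 above is the motivation for the new min_file/max_file arguments: disjoint slices of the 100 P07 training files can feed pre-training and fine-tuning separately. A small sketch of that intended usage; the 90/10 split below is illustrative only, not something the patch prescribes:

from ift6266 import datasets

# Hypothetical split: files 0-89 for unsupervised pre-training,
# files 90-99 for supervised fine-tuning (any disjoint ranges would do).
pretrain_set = datasets.nist_P07(min_file=0, max_file=90)
finetune_set = datasets.nist_P07(min_file=90, max_file=100)

for x, y in pretrain_set.train(20):
    pass   # unsupervised updates see only x
for x, y in finetune_set.train(20):
    pass   # supervised updates see (x, y)
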
""" if valid_data is None: - total_valid_size = min(sum(FTFile(td).size for td in test_data), maxsize) + total_valid_size = sum(FTFile(td).size for td in test_data) + if maxsize is not None: + total_valid_size = min(total_valid_size, maxsize) valid_size = total_valid_size/len(train_data) self._train = FTData(train_data, train_lbl, size=-valid_size, inscale=inscale, outscale=outscale, diff -r 43afd29f3dbd -r 206374eed2fb deep/convolutional_dae/scdae.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/convolutional_dae/scdae.py Wed Mar 24 14:36:55 2010 -0400 @@ -0,0 +1,237 @@ +from pynnet import * +# use hacks also +from pynnet.utils import * + +import numpy +import theano +import theano.tensor as T + +from itertools import izip + +class cdae(LayerStack): + def __init__(self, filter_size, num_filt, num_in, subsampling, corruption, + dtype, img_shape): + LayerStack.__init__(self, [ConvAutoencoder(filter_size=filter_size, + num_filt=num_filt, + num_in=num_in, + noisyness=corruption, + dtype=dtype, + image_shape=img_shape), + MaxPoolLayer(subsampling)]) + + def build(self, input): + LayerStack.build(self, input) + self.cost = self.layers[0].cost + +def cdae_out_size(in_size, filt_size, num_filt, num_in, subs): + out = [None] * 3 + out[0] = num_filt + out[1] = (in_size[1]-filt_size[0]+1)/subs[0] + out[2] = (in_size[2]-filt_size[1]+1)/subs[1] + return out + +def scdae(in_size, num_in, filter_sizes, num_filts, + subsamplings, corruptions, dtype): + layers = [] + old_nfilt = 1 + for fsize, nfilt, subs, corr in izip(filter_sizes, num_filts, + subsamplings, corruptions): + layers.append(cdae(fsize, nfilt, old_nfilt, subs, corr, dtype, + (num_in, in_size[0], in_size[1], in_size[2]))) + in_size = cdae_out_size(in_size, fsize, nfilt, old_nfilt, subs) + old_nfilt = nfilt + return LayerStack(layers), in_size + +def mlp(layer_sizes, dtype): + layers = [] + old_size = layer_sizes[0] + for size in layer_sizes[1:]: + layers.append(SimpleLayer(old_size, size, activation=nlins.tanh, + dtype=dtype)) + old_size = size + return LayerStack(layers) + +def scdae_net(in_size, num_in, filter_sizes, num_filts, subsamplings, + corruptions, layer_sizes, out_size, dtype, batch_size): + rl1 = ReshapeLayer((None,)+in_size) + ls, outs = scdae(in_size, num_in, filter_sizes, num_filts, subsamplings, + corruptions, dtype) + outs = numpy.prod(outs) + rl2 = ReshapeLayer((None, outs)) + layer_sizes = [outs]+layer_sizes + ls2 = mlp(layer_sizes, dtype) + lrl = SimpleLayer(layer_sizes[-1], out_size, activation=nlins.softmax) + return NNet([rl1, ls, rl2, ls2, lrl], error=errors.nll) + +def build_funcs(batch_size, img_size, filter_sizes, num_filters, subs, + noise, mlp_sizes, out_size, dtype, pretrain_lr, train_lr): + + n = scdae_net((1,)+img_size, batch_size, filter_sizes, num_filters, subs, + noise, mlp_sizes, out_size, dtype, batch_size) + x = T.fmatrix('x') + y = T.ivector('y') + + def pretrainfunc(net, alpha): + up = trainers.get_updates(net.params, net.cost, alpha) + return theano.function([x], net.cost, updates=up) + + def trainfunc(net, alpha): + up = trainers.get_updates(net.params, net.cost, alpha) + return theano.function([x, y], net.cost, updates=up) + + n.build(x, y) + pretrain_funcs_opt = [pretrainfunc(l, pretrain_lr) for l in n.layers[1].layers] + trainf_opt = trainfunc(n, train_lr) + evalf_opt = theano.function([x, y], errors.class_error(n.output, y)) + + clear_imgshape(n) + n.build(x, y) + pretrain_funcs_reg = [pretrainfunc(l, 0.01) for l in n.layers[1].layers] + trainf_reg = trainfunc(n, 0.1) + evalf_reg = 
+
+    def select_f(f1, f2, bsize):
+        def f(x):
+            if x.shape[0] == bsize:
+                return f1(x)
+            else:
+                return f2(x)
+        return f
+
+    pretrain_funcs = [select_f(p_opt, p_reg, batch_size) for p_opt, p_reg in zip(pretrain_funcs_opt, pretrain_funcs_reg)]
+
+    def select_f2(f1, f2, bsize):
+        def f(x, y):
+            if x.shape[0] == bsize:
+                return f1(x, y)
+            else:
+                return f2(x, y)
+        return f
+
+    trainf = select_f2(trainf_opt, trainf_reg, batch_size)
+    evalf = select_f2(evalf_opt, evalf_reg, batch_size)
+    return pretrain_funcs, trainf, evalf
+
+def do_pretrain(pretrain_funcs, pretrain_epochs):
+    for f in pretrain_funcs:
+        for i in xrange(pretrain_epochs):
+            f()
+
+def massage_funcs(train_it, dset, batch_size, pretrain_funcs, trainf, evalf):
+    def pretrain_f(f):
+        def res():
+            for x, y in train_it:
+                yield f(x)
+        it = res()
+        return lambda: it.next()
+
+    pretrain_fs = map(pretrain_f, pretrain_funcs)
+
+    def train_f(f):
+        def dset_it():
+            for x, y in train_it:
+                yield f(x, y)
+        it = dset_it()
+        return lambda: it.next()
+
+    train = train_f(trainf)
+
+    def eval_f(f, dsetf):
+        def res():
+            c = 0
+            i = 0
+            for x, y in dsetf(batch_size):
+                i += x.shape[0]
+                c += f(x, y)*x.shape[0]
+            return c/i
+        return res
+
+    test = eval_f(evalf, dset.test)
+    valid = eval_f(evalf, dset.valid)
+
+    return pretrain_fs, train, valid, test
+
+def repeat_itf(itf, *args, **kwargs):
+    while True:
+        for e in itf(*args, **kwargs):
+            yield e
+
+def run_exp(state, channel):
+    from ift6266 import datasets
+    from sgd_opt import sgd_opt
+    import sys, time
+
+    channel.save()
+
+    # params: bsize, pretrain_lr, train_lr, nfilts1, nfilts2, nfilts3, nfilts4
+    #         pretrain_rounds
+
+    dset = datasets.nist_all()
+
+    nfilts = []
+    if state.nfilts1 != 0:
+        nfilts.append(state.nfilts1)
+    if state.nfilts2 != 0:
+        nfilts.append(state.nfilts2)
+    if state.nfilts3 != 0:
+        nfilts.append(state.nfilts3)
+    if state.nfilts4 != 0:
+        nfilts.append(state.nfilts4)
+
+    fsizes = [(5,5)]*len(nfilts)
+    subs = [(2,2)]*len(nfilts)
+    noise = [state.noise]*len(nfilts)
+
+    pretrain_funcs, trainf, evalf = build_funcs(
+        img_size=(32, 32),
+        batch_size=state.bsize,
+        filter_sizes=fsizes,
+        num_filters=nfilts,
+        subs=subs,
+        noise=noise,
+        mlp_sizes=[state.mlp_sz],
+        out_size=62,
+        dtype=numpy.float32,
+        pretrain_lr=state.pretrain_lr,
+        train_lr=state.train_lr)
+
+    pretrain_fs, train, valid, test = massage_funcs(
+        repeat_itf(dset.train, state.bsize),
+        dset, state.bsize, pretrain_funcs, trainf, evalf)
+
+    do_pretrain(pretrain_fs, state.pretrain_rounds)
+
+    sgd_opt(train, valid, test, training_epochs=100000, patience=10000,
+            patience_increase=2., improvement_threshold=0.995,
+            validation_frequency=2500)
+
+if __name__ == '__main__':
+    from ift6266 import datasets
+    from sgd_opt import sgd_opt
+    import sys, time
+
+    batch_size = 100
+    dset = datasets.mnist()
+
+    pretrain_funcs, trainf, evalf = build_funcs(
+        img_size = (28, 28),
+        batch_size=batch_size, filter_sizes=[(5,5), (3,3)],
+        num_filters=[4, 4], subs=[(2,2), (2,2)], noise=[0.2, 0.2],
+        mlp_sizes=[500], out_size=10, dtype=numpy.float32,
+        pretrain_lr=0.01, train_lr=0.1)
+
+    pretrain_fs, train, valid, test = massage_funcs(
+        repeat_itf(dset.train, batch_size),
+        dset, batch_size,
+        pretrain_funcs, trainf, evalf)
+
+    print "pretraining ...",
+    sys.stdout.flush()
+    start = time.time()
+    do_pretrain(pretrain_fs, 2500)
+    end = time.time()
+    print "done (in", end-start, "s)"
+
+    sgd_opt(train, valid, test, training_epochs=10000, patience=1000,
+            patience_increase=2., improvement_threshold=0.995,
+            validation_frequency=250)
+
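To make the reshape between the convolutional stack and the MLP in scdae_net concrete, here is the cdae_out_size arithmetic applied to the MNIST configuration from the __main__ block above (28x28 input, 5x5 then 3x3 filters, 2x2 pooling, 4 filters per stage). This is a stand-alone sketch of the same computation, independent of pynnet:

def out_size(in_size, filt_size, num_filt, subs):
    # same arithmetic as cdae_out_size above: valid convolution, then subsampling
    return [num_filt,
            (in_size[1] - filt_size[0] + 1) / subs[0],
            (in_size[2] - filt_size[1] + 1) / subs[1]]

size = (1, 28, 28)
for fsize, nfilt in [((5, 5), 4), ((3, 3), 4)]:
    size = out_size(size, fsize, nfilt, (2, 2))
    print size
# [4, 12, 12] after the first stage, [4, 5, 5] after the second,
# so the MLP on top of the reshape sees 4*5*5 = 100 inputs.
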
diff -r 43afd29f3dbd -r 206374eed2fb deep/convolutional_dae/sgd_opt.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/convolutional_dae/sgd_opt.py Wed Mar 24 14:36:55 2010 -0400
@@ -0,0 +1,52 @@
+import time
+import sys
+
+def sgd_opt(train, valid, test, training_epochs=10000, patience=10000,
+            patience_increase=2., improvement_threshold=0.995,
+            validation_frequency=None):
+
+    if validation_frequency is None:
+        validation_frequency = patience/2
+
+    start_time = time.clock()
+
+    best_params = None
+    best_validation_loss = float('inf')
+    test_score = 0.
+
+    start_time = time.clock()
+
+    for epoch in xrange(1, training_epochs+1):
+        train()
+
+        if epoch % validation_frequency == 0:
+            this_validation_loss = valid()
+            print('epoch %i, validation error %f %%' % \
+                  (epoch, this_validation_loss*100.))
+
+            # if we got the best validation score until now
+            if this_validation_loss < best_validation_loss:
+
+                #improve patience if loss improvement is good enough
+                if this_validation_loss < best_validation_loss * \
+                       improvement_threshold :
+                    patience = max(patience, epoch * patience_increase)
+
+                # save best validation score and epoch number
+                best_validation_loss = this_validation_loss
+                best_epoch = epoch
+
+                # test it on the test set
+                test_score = test()
+                print(('     epoch %i, test error of best model %f %%') %
+                      (epoch, test_score*100.))
+
+        if patience <= epoch:
+            break
+
+    end_time = time.clock()
+    print(('Optimization complete with best validation score of %f %%,'
+           'with test performance %f %%') %
+          (best_validation_loss * 100., test_score*100.))
+    print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
+
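sgd_opt above only needs three zero-argument callables, so the early-stopping logic can be exercised in isolation, without Theano or a dataset. A purely illustrative sketch with dummy train/valid/test functions and made-up loss values:

from sgd_opt import sgd_opt

losses = iter([0.9, 0.5, 0.4, 0.39, 0.39, 0.38])

def train():
    pass                        # one minibatch update per call in the real setup

def valid():
    return next(losses, 0.38)   # pretend the validation error keeps shrinking

def test():
    return 0.40                 # pretend test error of the current best model

sgd_opt(train, valid, test, training_epochs=60, patience=20,
        patience_increase=2., improvement_threshold=0.995,
        validation_frequency=10)
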
diff -r 43afd29f3dbd -r 206374eed2fb deep/stacked_dae/__init__.py
diff -r 43afd29f3dbd -r 206374eed2fb deep/stacked_dae/nist_sda.py
diff -r 43afd29f3dbd -r 206374eed2fb deep/stacked_dae/sgd_optimization.py
diff -r 43afd29f3dbd -r 206374eed2fb deep/stacked_dae/stacked_dae.py
diff -r 43afd29f3dbd -r 206374eed2fb deep/stacked_dae/utils.py
diff -r 43afd29f3dbd -r 206374eed2fb scripts/launch_generate100.py
--- a/scripts/launch_generate100.py Wed Mar 24 14:35:11 2010 -0400
+++ b/scripts/launch_generate100.py Wed Mar 24 14:36:55 2010 -0400
@@ -3,12 +3,13 @@
 import os
 
 dir1 = "/data/lisa/data/ift6266h10/"
-mach = "maggie16.iro.umontreal.ca,maggie15.iro.umontreal.ca"
+mach = ["maggie16.iro.umontreal.ca,zappa8@iro.umontreal.ca"]
 
+#test and valid sets
 for i,s in enumerate(['valid','test']):
     for j,c in enumerate([0.3,0.5,0.7,1]):
         l = str(c).replace('.','')
-        os.system("dbidispatch --condor --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/P%s_%s_data.ft -p %sdata/P%s_%s_params -x %sdata/P%s_%s_labels.ft -f %s%s_data.ft -l %s%s_labels.ft -c %socr_%s_data.ft -d %socr_%s_labels.ft -m 0.3 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s %d -y %d" % (mach, dir1, l, s, dir1, l, s, dir1, l, s, dir1, s, dir1, s, dir1, s, dir1, s, [20000,80000][i], 200+i*4+j))
+        os.system("dbidispatch --condor --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/P%s_%s_data.ft -p %sdata/P%s_%s_params -x %sdata/P%s_%s_labels.ft -f %s%s_data.ft -l %s%s_labels.ft -c %socr_%s_data.ft -d %socr_%s_labels.ft -m %s -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s %d -y %d" % (mach, dir1, l, s, dir1, l, s, dir1, l, s, dir1, s, dir1, s, dir1, s, dir1, s, c ,[20000,80000][i], 200+i*4+j))
 
 #P07
 for i in range(100):
@@ -17,3 +18,9 @@
 #PNIST07
 for i in range(100):
     os.system("dbidispatch --condor --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/PNIST07_train%d_data.ft -p %sdata/PNIST07_train%d_params -x %sdata/PNIST07_train%d_labels.ft -f %strain_data.ft -l %strain_labels.ft -c %socr_train_data.ft -d %socr_train_labels.ft -m 0.7 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s 819200 -y %d -t %d" % (mach, dir1, i, dir1, i, dir1, i, dir1, dir1, dir1, dir1, 100+i,1))
+
+
+
+#P07
+#for i in [90,94]:#[2,10,13,15,20,49,68,82,86,90,94]:
+    #os.system("dbidispatch --condor --mem=3900 --os=fc4,fc7,fc9 --machine=maggie16.iro.umontreal.ca --machine=maggie15.iro.umontreal.ca --machine=zappa8@iro.umontreal.ca ./run_pipeline.sh -o %sdata2/P07_train%d_data.ft -p %sdata2/P07_train%d_params -x %sdata2/P07_train%d_labels.ft -f %strain_data.ft -l %strain_labels.ft -c %socr_train_data.ft -d %socr_train_labels.ft -m 0.7 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s 819200 -y %d" % (dir1, i, dir1, i, dir1, i, dir1, dir1, dir1, dir1,100+i))
diff -r 43afd29f3dbd -r 206374eed2fb scripts/setup_batches.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/setup_batches.py Wed Mar 24 14:36:55 2010 -0400
@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+
+import random
+from pylearn.io import filetensor as ft
+
+class Batches():
+    def __init__(self):
+        data_path = '/data/lisa/data/nist/by_class/'
+
+        digits_train_data = 'digits/digits_train_data.ft'
+        digits_train_labels = 'digits/digits_train_labels.ft'
+        digits_test_data = 'digits/digits_test_data.ft'
+        digits_test_labels = 'digits/digits_test_labels.ft'
+
+        lower_train_data = 'lower/lower_train_data.ft'
+        lower_train_labels = 'lower/lower_train_labels.ft'
+        #upper_train_data = 'upper/upper_train_data.ft'
+        #upper_train_labels = 'upper/upper_train_labels.ft'
+
+        f_digits_train_data = open(data_path + digits_train_data)
+        f_digits_train_labels = open(data_path + digits_train_labels)
+        f_digits_test_data = open(data_path + digits_test_data)
+        f_digits_test_labels = open(data_path + digits_test_labels)
+
+        f_lower_train_data = open(data_path + lower_train_data)
+        f_lower_train_labels = open(data_path + lower_train_labels)
+        #f_upper_train_data = open(data_path + upper_train_data)
+        #f_upper_train_labels = open(data_path + upper_train_labels)
+
+        self.raw_digits_train_data = ft.read(f_digits_train_data)
+        self.raw_digits_train_labels = ft.read(f_digits_train_labels)
+        self.raw_digits_test_data = ft.read(f_digits_test_data)
+        self.raw_digits_test_labels = ft.read(f_digits_test_labels)
+
+        self.raw_lower_train_data = ft.read(f_lower_train_data)
+        self.raw_lower_train_labels = ft.read(f_lower_train_labels)
+        #self.raw_upper_train_data = ft.read(f_upper_train_data)
+        #self.raw_upper_train_labels = ft.read(f_upper_train_labels)
+
+        f_digits_train_data.close()
+        f_digits_train_labels.close()
+        f_digits_test_data.close()
+        f_digits_test_labels.close()
+
+        f_lower_train_data.close()
+        f_lower_train_labels.close()
+        #f_upper_train_data.close()
+        #f_upper_train_labels.close()
+
+    def set_batches(self, start_ratio = -1, end_ratio = -1, batch_size = 20, verbose = False):
+        self.batch_size = batch_size
+
+        digits_train_size = len(self.raw_digits_train_labels)
+        digits_test_size = len(self.raw_digits_test_labels)
+
+        lower_train_size = len(self.raw_lower_train_labels)
+        #upper_train_size = len(self.raw_upper_train_labels)
+
+        if verbose == True:
+            print 'digits_train_size = %d' %digits_train_size
+            print 'digits_test_size = %d' %digits_test_size
+            print 'lower_train_size = %d' %lower_train_size
+            #print 'upper_train_size = %d' %upper_train_size
+
+        # define main and other datasets
+        raw_main_train_data = self.raw_digits_train_data
+        raw_other_train_data = self.raw_lower_train_data
+        raw_test_data = self.raw_digits_test_data
+
+        raw_main_train_labels = self.raw_digits_train_labels
+        raw_other_train_labels = self.raw_lower_train_labels
+        raw_test_labels = self.raw_digits_test_labels
+
+        main_train_size = len(raw_main_train_data)
+        other_train_size = len(raw_other_train_data)
+        test_size = len(raw_test_data)
+        test_size = int(test_size/batch_size)
+        test_size *= batch_size
+        validation_size = test_size
+
+        # default ratio is actual ratio
+        if start_ratio == -1:
+            self.start_ratio = float(main_train_size) / float(main_train_size + other_train_size)
+        else:
+            self.start_ratio = start_ratio
+
+        if end_ratio == -1:
+            self.end_ratio = float(main_train_size) / float(main_train_size + other_train_size)
+        else:
+            self.end_ratio = end_ratio
+
+        if verbose == True:
+            print 'start_ratio = %f' %self.start_ratio
+            print 'end_ratio = %f' %self.end_ratio
+
+        i_main = 0
+        i_other = 0
+        i_batch = 0
+
+        # compute the number of batches given start and end ratios
+        n_main_batch = (main_train_size - batch_size * (self.end_ratio - self.start_ratio) / 2 ) / (batch_size * (self.start_ratio + (self.end_ratio - self.start_ratio) / 2))
+        n_other_batch = (other_train_size - batch_size * (self.end_ratio - self.start_ratio) / 2 ) / (batch_size - batch_size * (self.start_ratio + (self.end_ratio - self.start_ratio) / 2))
+        n_batches = min([n_main_batch, n_other_batch])
+
+        # train batches
+        self.train_batches = []
+
+        # as long as we have data left in main and other, we create batches
+        while i_main < main_train_size - batch_size - test_size and i_other < other_train_size - batch_size:
+
+            ratio = self.start_ratio + i_batch * (self.end_ratio - self.start_ratio) / n_batches
+            batch_data = []
+            batch_labels = []
+
+            for i in xrange(0, self.batch_size): # randomly choose between main and other, given the current ratio
+                rnd = random.randint(0, 100)
+
+                if rnd < 100 * ratio:
+                    batch_data = batch_data + \
+                        [raw_main_train_data[i_main]]
+                    batch_labels = batch_labels + \
+                        [raw_main_train_labels[i_main]]
+                    i_main += 1
+                else:
+                    batch_data = batch_data + \
+                        [raw_other_train_data[i_other]]
+                    batch_labels = batch_labels + \
+                        [raw_other_train_labels[i_other]]
+                    i_other += 1
+
+            self.train_batches = self.train_batches + \
+                [(batch_data,batch_labels)]
+            i_batch += 1
+
+        offset = i_main
+
+        if verbose == True:
+            print 'n_main = %d' %i_main
+            print 'n_other = %d' %i_other
+            print 'nb_train_batches = %d / %d' %(i_batch,n_batches)
+            print 'offset = %d' %offset
+
+        # test batches
+        self.test_batches = []
+        for i in xrange(0, test_size, batch_size):
+            self.test_batches = self.test_batches + \
+                [(raw_test_data[i:i+batch_size], raw_test_labels[i:i+batch_size])]
+
+        # validation batches
+        self.validation_batches = []
+        for i in xrange(0, test_size, batch_size):
+            self.validation_batches = self.validation_batches + \
+                [(raw_main_train_data[offset+i:offset+i+batch_size], raw_main_train_labels[offset+i:offset+i+batch_size])]
+
+    def get_train_batches(self):
+        return self.train_batches
+
+    def get_test_batches(self):
+        return self.test_batches
+
+    def get_validation_batches(self):
+        return self.validation_batches
+
+    def test_set_batches(self, intervall = 1000):
+        for i in xrange(0, len(self.train_batches) - self.batch_size, intervall):
+            n_main = 0
+
+            for j in xrange(0, self.batch_size):
+                if self.train_batches[i][1][j] < 10:
+                    n_main +=1
+            print 'ratio batch %d : %f' %(i,float(n_main) / float(self.batch_size))
+
+if __name__ == '__main__':
+    batches = Batches()
+    batches.set_batches(0.5,1, 20, True)
+    batches.test_set_batches()
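The __main__ block above asks set_batches for a digit/lowercase mix that slides from start_ratio=0.5 to end_ratio=1. A short sketch of just that schedule, with a made-up n_batches (set_batches derives the real one from the dataset sizes):

start_ratio, end_ratio = 0.5, 1.0
n_batches = 5   # illustrative only

for i_batch in xrange(n_batches):
    ratio = start_ratio + i_batch * (end_ratio - start_ratio) / n_batches
    print 'batch %d: %.0f%% digits, %.0f%% lowercase' % (i_batch, 100*ratio, 100*(1-ratio))
# batch 0: 50% digits, 50% lowercase ... batch 4: 90% digits, 10% lowercase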