# HG changeset patch
# User fsavard
# Date 1269010576 14400
# Node ID 1e4e60ddadb15dfc82ae79f6e89f8f3fe879b9e8
# Parent c8fe09a6503970bdbcbaed7c729010f33f3a19ca
# Parent f14fb56b3f8d3455a2a6eafcbeb4e05e5a5d0d9e
Merge. Ah, and in the last commit I had forgotten to mention that I added code to handle isolating different clones, so experiments can be run while the code is being modified at the same time.
diff -r c8fe09a65039 -r 1e4e60ddadb1 baseline/conv_mlp/convolutional_mlp.py --- a/baseline/conv_mlp/convolutional_mlp.py Fri Mar 19 10:54:39 2010 -0400 +++ b/baseline/conv_mlp/convolutional_mlp.py Fri Mar 19 10:56:16 2010 -0400 @@ -26,8 +26,8 @@ import theano.sandbox.softsign import pylearn.datasets.MNIST from pylearn.io import filetensor as ft -from theano.tensor.signal import downsample -from theano.tensor.nnet import conv +from theano.sandbox import conv, downsample +import theano,pylearn.version,ift6266 class LeNetConvPoolLayer(object): @@ -214,7 +214,7 @@ # Dealing with the training set # get the list of training images (x) and their labels (y) - (train_set_x, train_set_y) = (d[:4000,:],labels[:4000]) + (train_set_x, train_set_y) = (d[:200000,:],labels[:200000]) # initialize the list of training minibatches with empty list train_batches = [] for i in xrange(0, len(train_set_x), batch_size): @@ -229,7 +229,7 @@ #print train_batches[500] # Dealing with the validation set - (valid_set_x, valid_set_y) = (d[4000:5000,:],labels[4000:5000]) + (valid_set_x, valid_set_y) = (d[200000:270000,:],labels[200000:270000]) # initialize the list of validation minibatches valid_batches = [] for i in xrange(0, len(valid_set_x), batch_size): @@ -237,17 +237,18 @@ [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])] # Dealing with the testing set - (test_set_x, test_set_y) = (d[5000:6000,:],labels[5000:6000]) + (test_set_x, test_set_y) = (d[270000:340000,:],labels[270000:340000]) # initialize the list of testing minibatches test_batches = [] for i in xrange(0, len(test_set_x), batch_size): test_batches = test_batches + \ [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])] + return train_batches, valid_batches, test_batches -def evaluate_lenet5(learning_rate=0.1, n_iter=1, batch_size=20, n_kern0=20,n_kern1=50,filter_shape=5,n_layer=3, dataset='mnist.pkl.gz'): +def evaluate_lenet5(learning_rate=0.1, n_iter=200, batch_size=20, n_kern0=20, n_kern1=50, n_layer=3, filter_shape0=5, filter_shape1=5, dataset='mnist.pkl.gz'): rng = numpy.random.RandomState(23455) print 'Before load dataset' @@ -256,6 +257,16 @@ ishape = (32,32) # this is the size of NIST images n_kern2=80 + n_kern3=100 + if n_layer==4: + filter_shape1=3 + filter_shape2=3 + if n_layer==5: + filter_shape0=4 + filter_shape1=2 + filter_shape2=2 + filter_shape3=2 + # allocate symbolic variables for the data x = T.matrix('x') # rasterized images @@ -276,7 +287,7 @@ # 4D output tensor is thus of shape (20,20,14,14) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size,1,32,32), - filter_shape=(n_kern0,1,filter_shape,filter_shape), poolsize=(2,2)) + filter_shape=(n_kern0,1,filter_shape0,filter_shape0), poolsize=(2,2)) if(n_layer>2): @@ -284,17 +295,17 @@ # filtering reduces the image size to (14-5+1,14-5+1)=(10,10) # maxpooling reduces this further to (10/2,10/2) = (5,5) # 4D output tensor is thus of shape (20,50,5,5) - fshape=(32-filter_shape+1)/2 + fshape0=(32-filter_shape0+1)/2 layer1 = LeNetConvPoolLayer(rng, input=layer0.output, - image_shape=(batch_size,n_kern0,fshape,fshape), -
filter_shape=(n_kern1,n_kern0,filter_shape,filter_shape), poolsize=(2,2)) + image_shape=(batch_size,n_kern0,fshape0,fshape0), + filter_shape=(n_kern1,n_kern0,filter_shape1,filter_shape1), poolsize=(2,2)) else: - fshape=(32-filter_shape+1)/2 + fshape0=(32-filter_shape0+1)/2 layer1_input = layer0.output.flatten(2) # construct a fully-connected sigmoidal layer - layer1 = SigmoidalLayer(rng, input=layer1_input,n_in=n_kern0*fshape*fshape, n_out=500) + layer1 = SigmoidalLayer(rng, input=layer1_input,n_in=n_kern0*fshape0*fshape0, n_out=500) layer2 = LogisticRegression(input=layer1.output, n_in=500, n_out=10) cost = layer2.negative_log_likelihood(y) @@ -304,17 +315,46 @@ if(n_layer>3): - fshape=(32-filter_shape+1)/2 - fshape2=(fshape-filter_shape+1)/2 - fshape3=(fshape2-filter_shape+1)/2 + fshape0=(32-filter_shape0+1)/2 + fshape1=(fshape0-filter_shape1+1)/2 layer2 = LeNetConvPoolLayer(rng, input=layer1.output, - image_shape=(batch_size,n_kern1,fshape2,fshape2), - filter_shape=(n_kern2,n_kern1,filter_shape,filter_shape), poolsize=(2,2)) + image_shape=(batch_size,n_kern1,fshape1,fshape1), + filter_shape=(n_kern2,n_kern1,filter_shape2,filter_shape2), poolsize=(2,2)) + + if(n_layer>4): + + + fshape0=(32-filter_shape0+1)/2 + fshape1=(fshape0-filter_shape1+1)/2 + fshape2=(fshape1-filter_shape2+1)/2 + fshape3=(fshape2-filter_shape3+1)/2 + layer3 = LeNetConvPoolLayer(rng, input=layer2.output, + image_shape=(batch_size,n_kern2,fshape2,fshape2), + filter_shape=(n_kern3,n_kern2,filter_shape3,filter_shape3), poolsize=(2,2)) + + layer4_input = layer3.output.flatten(2) + layer4 = SigmoidalLayer(rng, input=layer4_input, + n_in=n_kern3*fshape3*fshape3, n_out=500) + + + layer5 = LogisticRegression(input=layer4.output, n_in=500, n_out=10) + + cost = layer5.negative_log_likelihood(y) + + test_model = theano.function([x,y], layer5.errors(y)) + + params = layer5.params+ layer4.params+ layer3.params+ layer2.params+ layer1.params + layer0.params + + elif(n_layer>3): + + fshape0=(32-filter_shape0+1)/2 + fshape1=(fshape0-filter_shape1+1)/2 + fshape2=(fshape1-filter_shape2+1)/2 layer3_input = layer2.output.flatten(2) layer3 = SigmoidalLayer(rng, input=layer3_input, - n_in=n_kern2*fshape3*fshape3, n_out=500) + n_in=n_kern2*fshape2*fshape2, n_out=500) layer4 = LogisticRegression(input=layer3.output, n_in=500, n_out=10) @@ -328,8 +368,8 @@ elif(n_layer>2): - fshape=(32-filter_shape+1)/2 - fshape2=(fshape-filter_shape+1)/2 + fshape0=(32-filter_shape0+1)/2 + fshape1=(fshape0-filter_shape1+1)/2 # the SigmoidalLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). 
@@ -338,7 +378,7 @@ # construct a fully-connected sigmoidal layer layer2 = SigmoidalLayer(rng, input=layer2_input, - n_in=n_kern1*fshape2*fshape2, n_out=500) + n_in=n_kern1*fshape1*fshape1, n_out=500) # classify the values of the fully-connected sigmoidal layer @@ -462,7 +502,7 @@ def experiment(state, channel): print 'start experiment' - (best_validation_loss, test_score, minutes_trained, iter) = evaluate_lenet5(state.learning_rate, state.n_iter, state.batch_size, state.n_kern0, state.n_kern1, state.filter_shape, state.n_layer) + (best_validation_loss, test_score, minutes_trained, iter) = evaluate_lenet5(state.learning_rate, state.n_iter, state.batch_size, state.n_kern0, state.n_kern1, state.n_layer, state.filter_shape0, state.filter_shape1) print 'end experiment' state.best_validation_loss = best_validation_loss diff -r c8fe09a65039 -r 1e4e60ddadb1 data_generation/pipeline/pipeline.py --- a/data_generation/pipeline/pipeline.py Fri Mar 19 10:54:39 2010 -0400 +++ b/data_generation/pipeline/pipeline.py Fri Mar 19 10:56:16 2010 -0400 @@ -10,6 +10,7 @@ import numpy import ift6266.data_generation.transformations.filetensor as ft import random +import copy # To debug locally, also call with -s 100 (to stop after ~100) # (otherwise we allocate all needed memory, might be loonnng and/or crash @@ -59,11 +60,12 @@ -b, --prob-captcha: probability of using a captcha image -g, --prob-ocr: probability of using an ocr image -y, --seed: the job seed + -t, --type: [default: 0:full transformations], 1:Nist-friendly transformations ''' try: - opts, args = getopt.getopt(get_argv(), "rm:z:o:p:x:s:f:l:c:d:a:b:g:y:", ["reload","max-complexity=", "probability-zero=", "output-file=", "params-output-file=", "labels-output-file=", -"stop-after=", "data-file=", "label-file=", "ocr-file=", "ocrlabel-file=", "prob-font=", "prob-captcha=", "prob-ocr=", "seed="]) + opts, args = getopt.getopt(get_argv(), "r:m:z:o:p:x:s:f:l:c:d:a:b:g:y:t:", ["reload","max-complexity=", "probability-zero=", "output-file=", "params-output-file=", "labels-output-file=", +"stop-after=", "data-file=", "label-file=", "ocr-file=", "ocrlabel-file=", "prob-font=", "prob-captcha=", "prob-ocr=", "seed=","type="]) except getopt.GetoptError, err: # print help information and exit: print str(err) # will print something like "option -a not recognized" @@ -76,6 +78,11 @@ random.seed(int(a)) numpy.random.seed(int(a)) +type_pipeline = 0 +for o, a in opts: + if o in ('-t','--type'): + type_pipeline = int(a) + if DEBUG_X: import pylab pylab.ion() @@ -104,7 +111,17 @@ VISUALIZER = Visualizer(to_dir=DEBUG_OUTPUT_DIR, on_screen=False) ###---------------------order of transformation module -MODULE_INSTANCES = [Slant(),Thick(),AffineTransformation(),LocalElasticDistorter(),GIMP1(),Rature(),Occlusion(), PermutPixel(),DistorsionGauss(),AddBackground(), PoivreSel(), BruitGauss(), Contrast()] +if type_pipeline == 0: + MODULE_INSTANCES = [Slant(),Thick(),AffineTransformation(),LocalElasticDistorter(),GIMP1(),Rature(),Occlusion(), PermutPixel(),DistorsionGauss(),AddBackground(), PoivreSel(), BruitGauss(), Contrast()] + stop_idx = 0 +if type_pipeline == 1: + MODULE_INSTANCES = [Slant(),Thick(),AffineTransformation(),LocalElasticDistorter(),GIMP1(False),Rature(),Occlusion(), PermutPixel(),DistorsionGauss(),AddBackground(), PoivreSel(), BruitGauss(), Contrast()] + stop_idx = 5 + #we disable transformation corresponding to MODULE_INSTANCES[stop_idx:] but we still need to apply them on dummy images + #in order to be sure to have the same random generator state than with the 
default pipeline. + #This is not optimal (we do more calculus than necessary) but it is a quick hack to produce similar results than previous generation + + # These should have a "after_transform_callback(self, image)" method # (called after each call to transform_image in a module) @@ -155,7 +172,7 @@ sys.stdout.flush() global_idx = img_no - + img = img.reshape(img_size) param_idx = 0 @@ -163,7 +180,7 @@ for mod in self.modules: # This used to be done _per batch_, # ie. out of the "for img" loop - complexity = complexity_iterator.next() + complexity = complexity_iterator.next() #better to do a complexity sampling for each transformations in order to have more variability #otherwise a lot of images similar to the source are generated (i.e. when complexity is close to 0 (1/8 of the time)) #we need to save the complexity of each transformations and the sum of these complexity is a good indicator of the overall @@ -174,8 +191,13 @@ p = mod.regenerate_parameters(complexity) self.params[global_idx, param_idx+len(self.modules):param_idx+len(p)+len(self.modules)] = p param_idx += len(p) - - img = mod.transform_image(img) + + if not(stop_idx) or stop_idx > mod_idx: + img = mod.transform_image(img) + else: + tmp = mod.transform_image(copy.copy(img)) + #this is done to be sure to have the same global random generator state + #we don't apply the transformation on the original image but on a copy in case of in-place transformations if should_hook_after_each: for hook in AFTER_EACH_MODULE_HOOK: @@ -192,9 +214,10 @@ def write_output(self, output_file_path, params_output_file_path, labels_output_file_path): with open(output_file_path, 'wb') as f: ft.write(f, self.res_data) - + + #if type_pipeline == 0: #only needed for type 0 pipeline numpy.save(params_output_file_path, self.params) - + with open(labels_output_file_path, 'wb') as f: ft.write(f, self.res_labels) @@ -209,6 +232,7 @@ def range_complexity_iterator(probability_zero, max_complexity): assert max_complexity <= 1.0 n = numpy.random.uniform(0.0, 1.0) + n = 2.0 #hack to bug fix, having a min complexity is not necessary and we need the same seed... 
while True: if n < probability_zero: yield 0.0 @@ -349,6 +373,8 @@ prob_ocr = float(a) elif o in ('-y', "--seed"): pass + elif o in ('-t', "--type"): + pass else: assert False, "unhandled option" diff -r c8fe09a65039 -r 1e4e60ddadb1 data_generation/pipeline/testtransformations.py --- a/data_generation/pipeline/testtransformations.py Fri Mar 19 10:54:39 2010 -0400 +++ b/data_generation/pipeline/testtransformations.py Fri Mar 19 10:56:16 2010 -0400 @@ -28,7 +28,7 @@ from affine_transform import AffineTransformation ###---------------------order of transformation module -MODULE_INSTANCES = [Slant(),Thick(),AffineTransformation(),LocalElasticDistorter(),GIMP1(),Rature(),Occlusion(), PermutPixel(),DistorsionGauss(),AddBackground(), PoivreSel(), BruitGauss(), Contrast()] +MODULE_INSTANCES = [Slant(),Thick(),AffineTransformation(),LocalElasticDistorter(),GIMP1(False)] ###---------------------complexity associated to each of them complexity = 0.7 @@ -111,11 +111,11 @@ d = N.zeros((n,1024)) -datapath = '/data/lisa/data/ocr_breuel/filetensor/unlv-corrected-2010-02-01-shuffled.ft' -f = open(datapath) -d = ft.read(f) -d = d[0:n,:]/255.0 -createimage('/u/glorotxa/transf/OCR',d) +#datapath = '/data/lisa/data/ocr_breuel/filetensor/unlv-corrected-2010-02-01-shuffled.ft' +#f = open(datapath) +#d = ft.read(f) +#d = d[0:n,:]/255.0 +#createimage('/u/glorotxa/transf/OCR',d) @@ -133,18 +133,18 @@ d = d[0:n,:]/255.0 createimage('/u/glorotxa/transf/NIST_upper',d) -from Facade import * +#from Facade import * -for i in range(n): - d[i,:]=N.asarray(N.reshape(generateCaptcha(0.8,0),(1,1024))/255.0,dtype='float32') +#for i in range(n): + #d[i,:]=N.asarray(N.reshape(generateCaptcha(0.8,0),(1,1024))/255.0,dtype='float32') -createimage('/u/glorotxa/transf/capcha',d) +#createimage('/u/glorotxa/transf/capcha',d) -for i in range(n): - myttf2jpg = ttf2jpg() - d[i,:]=N.reshape(myttf2jpg.generate_image()[0],(1,1024)) -createimage('/u/glorotxa/transf/fonts',d) +#for i in range(n): + #myttf2jpg = ttf2jpg() + #d[i,:]=N.reshape(myttf2jpg.generate_image()[0],(1,1024)) +#createimage('/u/glorotxa/transf/fonts',d) datapath = '/data/lisa/data/nist/by_class/' f = open(datapath+'lower/lower_train_data.ft') diff -r c8fe09a65039 -r 1e4e60ddadb1 data_generation/transformations/gimp_script.py --- a/data_generation/transformations/gimp_script.py Fri Mar 19 10:54:39 2010 -0400 +++ b/data_generation/transformations/gimp_script.py Fri Mar 19 10:56:16 2010 -0400 @@ -30,12 +30,16 @@ return numpy.fromstring(dest_rgn[:,:], 'UInt8').astype(numpy.float32).reshape((32,32)).T / 255.0 class GIMP1(): - def get_settings_names(self): + def __init__(self, blur_bool = True): + #This is used to avoid blurring for PNIST + self.blur_bool = blur_bool + + def get_settings_names(self, blur_bool = True): return ['mblur_length', 'mblur_angle', 'pinch'] def regenerate_parameters(self, complexity): if complexity: - self.mblur_length = abs(int(round(numpy.random.normal(0, 3*complexity)))) + self.mblur_length = abs(int(round(numpy.random.normal(0, 3*complexity)))) else: self.mblur_length = 0 self.mblur_angle = int(round(numpy.random.uniform(0,360))) @@ -46,7 +50,7 @@ def transform_image(self, image): if self.mblur_length or self.pinch: setpix(image) - if self.mblur_length: + if self.mblur_length and self.blur_bool: pdb.plug_in_mblur(img, layer1, 0, self.mblur_length, self.mblur_angle, 0, 0) if self.pinch: pdb.plug_in_whirl_pinch(img, layer1, 0.0, self.pinch, 1.0) diff -r c8fe09a65039 -r 1e4e60ddadb1 datasets/defs.py --- a/datasets/defs.py Fri Mar 19 10:54:39 2010 -0400 
+++ b/datasets/defs.py Fri Mar 19 10:56:16 2010 -0400 @@ -11,44 +11,45 @@ NIST_PATH = os.getenv('NIST_PATH','/data/lisa/data/nist/by_class/') DATA_PATH = os.getenv('DATA_PATH','/data/lisa/data/ift6266h10/') -nist_digits = FTDataSet(train_data = [os.path.join(NIST_PATH,'digits/digits_train_data.ft')], +nist_digits = lambda maxsize=None: FTDataSet(train_data = [os.path.join(NIST_PATH,'digits/digits_train_data.ft')], train_lbl = [os.path.join(NIST_PATH,'digits/digits_train_labels.ft')], test_data = [os.path.join(NIST_PATH,'digits/digits_test_data.ft')], test_lbl = [os.path.join(NIST_PATH,'digits/digits_test_labels.ft')], - indtype=theano.config.floatX, inscale=255.) -nist_lower = FTDataSet(train_data = [os.path.join(NIST_PATH,'lower/lower_train_data.ft')], + indtype=theano.config.floatX, inscale=255., maxsize=maxsize) +nist_lower = lambda maxsize=None: FTDataSet(train_data = [os.path.join(NIST_PATH,'lower/lower_train_data.ft')], train_lbl = [os.path.join(NIST_PATH,'lower/lower_train_labels.ft')], test_data = [os.path.join(NIST_PATH,'lower/lower_test_data.ft')], test_lbl = [os.path.join(NIST_PATH,'lower/lower_test_labels.ft')], - indtype=theano.config.floatX, inscale=255.) -nist_upper = FTDataSet(train_data = [os.path.join(NIST_PATH,'upper/upper_train_data.ft')], + indtype=theano.config.floatX, inscale=255., maxsize=maxsize) +nist_upper = lambda maxsize=None: FTDataSet(train_data = [os.path.join(NIST_PATH,'upper/upper_train_data.ft')], train_lbl = [os.path.join(NIST_PATH,'upper/upper_train_labels.ft')], test_data = [os.path.join(NIST_PATH,'upper/upper_test_data.ft')], test_lbl = [os.path.join(NIST_PATH,'upper/upper_test_labels.ft')], - indtype=theano.config.floatX, inscale=255.) + indtype=theano.config.floatX, inscale=255., maxsize=maxsize) -nist_all = FTDataSet(train_data = [os.path.join(DATA_PATH,'train_data.ft')], +nist_all = lambda maxsize=None: FTDataSet(train_data = [os.path.join(DATA_PATH,'train_data.ft')], train_lbl = [os.path.join(DATA_PATH,'train_labels.ft')], test_data = [os.path.join(DATA_PATH,'test_data.ft')], test_lbl = [os.path.join(DATA_PATH,'test_labels.ft')], valid_data = [os.path.join(DATA_PATH,'valid_data.ft')], valid_lbl = [os.path.join(DATA_PATH,'valid_labels.ft')], - indtype=theano.config.floatX, inscale=255.) + indtype=theano.config.floatX, inscale=255., maxsize=maxsize) -ocr = FTDataSet(train_data = [os.path.join(DATA_PATH,'ocr_train_data.ft')], +ocr = lambda maxsize=None: FTDataSet(train_data = [os.path.join(DATA_PATH,'ocr_train_data.ft')], train_lbl = [os.path.join(DATA_PATH,'ocr_train_labels.ft')], test_data = [os.path.join(DATA_PATH,'ocr_test_data.ft')], test_lbl = [os.path.join(DATA_PATH,'ocr_test_labels.ft')], valid_data = [os.path.join(DATA_PATH,'ocr_valid_data.ft')], valid_lbl = [os.path.join(DATA_PATH,'ocr_valid_labels.ft')], - indtype=theano.config.floatX, inscale=255.) 
+ indtype=theano.config.floatX, inscale=255., maxsize=maxsize) -nist_P07 = FTDataSet(train_data = [os.path.join(DATA_PATH,'data/P07_train'+str(i)+'_data.ft') for i in range(100)], +nist_P07 = lambda maxsize=None: FTDataSet(train_data = [os.path.join(DATA_PATH,'data/P07_train'+str(i)+'_data.ft') for i in range(100)], train_lbl = [os.path.join(DATA_PATH,'data/P07_train'+str(i)+'_labels.ft') for i in range(100)], test_data = [os.path.join(DATA_PATH,'data/P07_test_data.ft')], test_lbl = [os.path.join(DATA_PATH,'data/P07_test_labels.ft')], valid_data = [os.path.join(DATA_PATH,'data/P07_valid_data.ft')], valid_lbl = [os.path.join(DATA_PATH,'data/P07_valid_labels.ft')], - indtype=theano.config.floatX, inscale=255.) + indtype=theano.config.floatX, inscale=255., maxsize=maxsize) -mnist = GzpklDataSet(os.path.join(DATA_PATH,'mnist.pkl.gz')) +mnist = lambda maxsize=None: GzpklDataSet(os.path.join(DATA_PATH,'mnist.pkl.gz'), + maxsize=maxsize) diff -r c8fe09a65039 -r 1e4e60ddadb1 datasets/ftfile.py --- a/datasets/ftfile.py Fri Mar 19 10:54:39 2010 -0400 +++ b/datasets/ftfile.py Fri Mar 19 10:56:16 2010 -0400 @@ -89,57 +89,58 @@ return res class FTSource(object): - def __init__(self, file, skip=0, size=None, dtype=None, scale=1): + def __init__(self, file, skip=0, size=None, maxsize=None, + dtype=None, scale=1): r""" Create a data source from a possible subset of a .ft file. Parameters: - `file` (string) -- the filename - `skip` (int, optional) -- amount of examples to skip from - the start of the file. If - negative, skips filesize - skip. - `size` (int, optional) -- truncates number of examples - read (after skipping). If - negative truncates to - filesize - size - (also after skipping). - `dtype` (dtype, optional) -- convert the data to this - dtype after reading. - `scale` (number, optional) -- scale (that is divide) the - data by this number (after - dtype conversion, if any). + `file` -- (string) the filename + `skip` -- (int, optional) amount of examples to skip from + the start of the file. If negative, skips + filesize - skip. + `size` -- (int, optional) truncates number of examples + read (after skipping). If negative truncates to + filesize - size (also after skipping). + `maxsize` -- (int, optional) the maximum size of the file + `dtype` -- (dtype, optional) convert the data to this + dtype after reading. + `scale` -- (number, optional) scale (that is divide) the + data by this number (after dtype conversion, if + any). Tests: - >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft') - >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1000) - >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=10) - >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=100, size=120) + >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft') + >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1000) + >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=10) + >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=100, size=120) """ self.file = file self.skip = skip self.size = size self.dtype = dtype self.scale = scale + self.maxsize = maxsize def open(self): r""" Returns an FTFile that corresponds to this dataset. 
Tests: - >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft') - >>> f = s.open() - >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1) - >>> len(s.open().read(2)) - 1 - >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646) - >>> s.open().size - 1000 - >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646, size=1) - >>> s.open().size - 1 - >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=-10) - >>> s.open().size - 58636 + >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft') + >>> f = s.open() + >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1) + >>> len(s.open().read(2)) + 1 + >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646) + >>> s.open().size + 1000 + >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646, size=1) + >>> s.open().size + 1 + >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=-10) + >>> s.open().size + 58636 """ f = FTFile(self.file, scale=self.scale, dtype=self.dtype) if self.skip != 0: @@ -147,19 +148,25 @@ if self.size is not None and self.size < f.size: if self.size < 0: f.size += self.size + if f.size < 0: + f.size = 0 else: f.size = self.size + if self.maxsize is not None and f.size > self.maxsize: + f.size = self.maxsize return f class FTData(object): r""" This is a list of FTSources. """ - def __init__(self, datafiles, labelfiles, skip=0, size=None, + def __init__(self, datafiles, labelfiles, skip=0, size=None, maxsize=None, inscale=1, indtype=None, outscale=1, outdtype=None): - self.inputs = [FTSource(f, skip, size, scale=inscale, dtype=indtype) + if maxsize is not None: + maxsize /= len(datafiles) + self.inputs = [FTSource(f, skip, size, maxsize, scale=inscale, dtype=indtype) for f in datafiles] - self.outputs = [FTSource(f, skip, size, scale=outscale, dtype=outdtype) + self.outputs = [FTSource(f, skip, size, maxsize, scale=outscale, dtype=outdtype) for f in labelfiles] def open_inputs(self): @@ -170,7 +177,9 @@ class FTDataSet(DataSet): - def __init__(self, train_data, train_lbl, test_data, test_lbl, valid_data=None, valid_lbl=None, indtype=None, outdtype=None, inscale=1, outscale=1): + def __init__(self, train_data, train_lbl, test_data, test_lbl, + valid_data=None, valid_lbl=None, indtype=None, outdtype=None, + inscale=1, outscale=1, maxsize=None): r""" Defines a DataSet from a bunch of files. @@ -184,6 +193,7 @@ (optional) `indtype`, `outdtype`, -- see FTSource.__init__() `inscale`, `outscale` (optional) + `maxsize` -- maximum size of the set returned If `valid_data` and `valid_labels` are not supplied then a sample @@ -191,21 +201,26 @@ set. 
""" if valid_data is None: - total_valid_size = sum(FTFile(td).size for td in test_data) + total_valid_size = min(sum(FTFile(td).size for td in test_data), maxsize) valid_size = total_valid_size/len(train_data) self._train = FTData(train_data, train_lbl, size=-valid_size, - inscale=inscale, outscale=outscale, indtype=indtype, - outdtype=outdtype) + inscale=inscale, outscale=outscale, + indtype=indtype, outdtype=outdtype, + maxsize=maxsize) self._valid = FTData(train_data, train_lbl, skip=-valid_size, - inscale=inscale, outscale=outscale, indtype=indtype, - outdtype=outdtype) + inscale=inscale, outscale=outscale, + indtype=indtype, outdtype=outdtype, + maxsize=maxsize) else: - self._train = FTData(train_data, train_lbl,inscale=inscale, - outscale=outscale, indtype=indtype, outdtype=outdtype) - self._valid = FTData(valid_data, valid_lbl,inscale=inscale, - outscale=outscale, indtype=indtype, outdtype=outdtype) - self._test = FTData(test_data, test_lbl,inscale=inscale, - outscale=outscale, indtype=indtype, outdtype=outdtype) + self._train = FTData(train_data, train_lbl, maxsize=maxsize, + inscale=inscale, outscale=outscale, + indtype=indtype, outdtype=outdtype) + self._valid = FTData(valid_data, valid_lbl, maxsize=maxsize, + inscale=inscale, outscale=outscale, + indtype=indtype, outdtype=outdtype) + self._test = FTData(test_data, test_lbl, maxsize=maxsize, + inscale=inscale, outscale=outscale, + indtype=indtype, outdtype=outdtype) def _return_it(self, batchsize, bufsize, ftdata): return izip(DataIterator(ftdata.open_inputs(), batchsize, bufsize), diff -r c8fe09a65039 -r 1e4e60ddadb1 datasets/gzpklfile.py --- a/datasets/gzpklfile.py Fri Mar 19 10:54:39 2010 -0400 +++ b/datasets/gzpklfile.py Fri Mar 19 10:56:16 2010 -0400 @@ -19,8 +19,9 @@ return res class GzpklDataSet(DataSet): - def __init__(self, fname): + def __init__(self, fname, maxsize): self._fname = fname + self.maxsize = maxsize self._train = 0 self._valid = 1 self._test = 2 @@ -35,5 +36,5 @@ def _return_it(self, batchsz, bufsz, id): if not hasattr(self, 'datas'): self._load() - return izip(DataIterator([ArrayFile(self.datas[id][0])], batchsz, bufsz), - DataIterator([ArrayFile(self.datas[id][1])], batchsz, bufsz)) + return izip(DataIterator([ArrayFile(self.datas[id][0][:self.maxsize])], batchsz, bufsz), + DataIterator([ArrayFile(self.datas[id][1][:self.maxsize])], batchsz, bufsz)) diff -r c8fe09a65039 -r 1e4e60ddadb1 deep/convolutional_dae/stacked_convolutional_dae.py --- a/deep/convolutional_dae/stacked_convolutional_dae.py Fri Mar 19 10:54:39 2010 -0400 +++ b/deep/convolutional_dae/stacked_convolutional_dae.py Fri Mar 19 10:56:16 2010 -0400 @@ -1,17 +1,19 @@ import numpy import theano import time +import sys import theano.tensor as T from theano.tensor.shared_randomstreams import RandomStreams -import theano.sandbox.softsign +#import theano.sandbox.softsign from theano.tensor.signal import downsample from theano.tensor.nnet import conv from ift6266 import datasets +from ift6266.baseline.log_reg.log_reg import LogisticRegression -from ift6266.baseline.log_reg.log_reg import LogisticRegression - +batch_size = 100 + class SigmoidalLayer(object): def __init__(self, rng, input, n_in, n_out): @@ -57,8 +59,6 @@ initial_b_prime= numpy.zeros((filter_shape[1],),dtype=theano.config.floatX) - - self.W_prime=T.dtensor4('W_prime') self.b_prime = theano.shared(value = initial_b_prime, name = "b_prime") @@ -68,13 +68,11 @@ conv1_out = conv.conv2d(self.tilde_x, self.W, filter_shape=filter_shape, image_shape=image_shape, border_mode='valid') - self.y 
= T.tanh(conv1_out + self.b.dimshuffle('x', 0, 'x', 'x')) - - da_filter_shape = [ filter_shape[1], filter_shape[0], filter_shape[2],\ - filter_shape[3] ] + da_filter_shape = [ filter_shape[1], filter_shape[0], + filter_shape[2], filter_shape[3] ] initial_W_prime = numpy.asarray( numpy.random.uniform( \ low = -numpy.sqrt(6./(fan_in+fan_out)), \ high = numpy.sqrt(6./(fan_in+fan_out)), \ @@ -96,6 +94,7 @@ self.params = [ self.W, self.b, self.b_prime ] class LeNetConvPoolLayer(object): + def __init__(self, rng, input, filter_shape, image_shape=None, poolsize=(2,2)): self.input = input @@ -127,7 +126,7 @@ class SdA(): def __init__(self, input, n_ins_mlp, conv_hidden_layers_sizes, mlp_hidden_layers_sizes, corruption_levels, rng, n_out, - pretrain_lr, finetune_lr): + pretrain_lr, finetune_lr, img_shape): self.layers = [] self.pretrain_functions = [] @@ -144,7 +143,7 @@ max_poolsize=conv_hidden_layers_sizes[i][2] if i == 0 : - layer_input=self.x.reshape((self.x.shape[0], 1, 32, 32)) + layer_input=self.x.reshape((self.x.shape[0], 1) + img_shape) else: layer_input=self.layers[-1].output @@ -211,38 +210,46 @@ self.errors = self.logLayer.errors(self.y) -def sgd_optimization_mnist( learning_rate=0.1, pretraining_epochs = 2, \ - pretrain_lr = 0.01, training_epochs = 1000, \ - dataset=datasets.nist_digits): - - batch_size = 500 # size of the minibatch +def sgd_optimization_mnist(learning_rate=0.1, pretraining_epochs = 1, + pretrain_lr = 0.1, training_epochs = 1000, + kernels = [[4,5,5], [4,3,3]], mlp_layers=[500], + corruption_levels = [0.2, 0.2, 0.2], + batch_size = batch_size, img_shape=(28, 28), + max_pool_layers = [[2,2], [2,2]], + dataset=datasets.mnist(5000)): # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1d vector of # [int] labels - layer0_input = x.reshape((x.shape[0],1,32,32)) + + layer0_input = x.reshape((x.shape[0],1)+img_shape) - - # Setup the convolutional layers with their DAs(add as many as you want) - corruption_levels = [ 0.2, 0.2, 0.2] rng = numpy.random.RandomState(1234) - ker1=2 - ker2=2 conv_layers=[] - conv_layers.append([[ker1,1,5,5], None, [2,2] ]) - conv_layers.append([[ker2,ker1,5,5], None, [2,2] ]) + init_layer = [[kernels[0][0],1,kernels[0][1],kernels[0][2]], + None, # do not specify the batch size since it can + # change for the last one and then theano will + # crash. 
+ max_pool_layers[0]] + conv_layers.append(init_layer) + + conv_n_out = (img_shape[0]-kernels[0][2]+1)/max_pool_layers[0][0] - # Setup the MLP layers of the network - mlp_layers=[500] - - network = SdA(input = layer0_input, n_ins_mlp = ker2*4*4, + for i in range(1,len(kernels)): + layer = [[kernels[i][0],kernels[i-1][0],kernels[i][1],kernels[i][2]], + None, # same comment as for init_layer + max_pool_layers[i] ] + conv_layers.append(layer) + conv_n_out = (conv_n_out - kernels[i][2]+1)/max_pool_layers[i][0] + + network = SdA(input = layer0_input, n_ins_mlp = kernels[-1][0]*conv_n_out**2, conv_hidden_layers_sizes = conv_layers, mlp_hidden_layers_sizes = mlp_layers, - corruption_levels = corruption_levels , n_out = 10, - rng = rng , pretrain_lr = pretrain_lr , - finetune_lr = learning_rate ) + corruption_levels = corruption_levels, n_out = 62, + rng = rng , pretrain_lr = pretrain_lr, + finetune_lr = learning_rate, img_shape=img_shape) test_model = theano.function([network.x, network.y], network.errors) diff -r c8fe09a65039 -r 1e4e60ddadb1 deep/stacked_dae/v_sylvain/nist_sda.py --- a/deep/stacked_dae/v_sylvain/nist_sda.py Fri Mar 19 10:54:39 2010 -0400 +++ b/deep/stacked_dae/v_sylvain/nist_sda.py Fri Mar 19 10:56:16 2010 -0400 @@ -21,9 +21,8 @@ import jobman, jobman.sql from pylearn.io import filetensor -from ift6266 import datasets - from utils import produit_cartesien_jobs +from copy import copy from sgd_optimization import SdaSgdOptimizer @@ -31,49 +30,8 @@ from ift6266.utils.seriestables import * import tables -############################################################################## -# GLOBALS - -TEST_CONFIG = False - -#NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all' -JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/sylvainpl_sda_vsylvain' -EXPERIMENT_PATH = "ift6266.deep.stacked_dae.v_sylvain.nist_sda.jobman_entrypoint" - -REDUCE_TRAIN_TO = None -MAX_FINETUNING_EPOCHS = 1000 -# number of minibatches before taking means for valid error etc. -REDUCE_EVERY = 100 - -if TEST_CONFIG: - REDUCE_TRAIN_TO = 1000 - MAX_FINETUNING_EPOCHS = 2 - REDUCE_EVERY = 10 - MINIBATCH_SIZE=20 - -# Possible values the hyperparameters can take. These are then -# combined with produit_cartesien_jobs so we get a list of all -# possible combinations, each one resulting in a job inserted -# in the jobman DB. -JOB_VALS = {'pretraining_lr': [0.1],#, 0.01],#, 0.001],#, 0.0001], - 'pretraining_epochs_per_layer': [10], - 'hidden_layers_sizes': [500], - 'corruption_levels': [0.1], - 'minibatch_size': [20], - 'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS], - 'finetuning_lr':[0.1], #0.001 was very bad, so we leave it out - 'num_hidden_layers':[1,1]} - -# Just useful for tests... minimal number of epochs -DEFAULT_HP_NIST = DD({'finetuning_lr':0.1, - 'pretraining_lr':0.1, - 'pretraining_epochs_per_layer':2, - 'max_finetuning_epochs':2, - 'hidden_layers_sizes':500, - 'corruption_levels':0.2, - 'minibatch_size':20, - 'reduce_train_to':10000, - 'num_hidden_layers':1}) +from ift6266 import datasets +from config import * ''' Function called by jobman upon launching each job @@ -85,48 +43,82 @@ # TODO: remove this, bad for number of simultaneous requests on DB channel.save() - workingdir = os.getcwd() - - ########### Il faudrait arranger ici pour train plus petit - -## print "Will load NIST" -## -## nist = NIST(minibatch_size=20) -## -## print "NIST loaded" -## # For test runs, we don't want to use the whole dataset so # reduce it to fewer elements if asked to. 
rtt = None if state.has_key('reduce_train_to'): - rtt = int(state['reduce_train_to']/state['minibatch_size']) + rtt = state['reduce_train_to'] elif REDUCE_TRAIN_TO: - rtt = int(REDUCE_TRAIN_TO/MINIBATCH_SIZE) - - if rtt: - print "Reducing training set to "+str(rtt*state['minibatch_size'])+ " examples" - else: - rtt=float('inf') #No reduction -## nist.reduce_train_set(rtt) -## -## train,valid,test = nist.get_tvt() -## dataset = (train,valid,test) - + rtt = REDUCE_TRAIN_TO + n_ins = 32*32 n_outs = 62 # 10 digits, 26*2 (lower, capitals) - + + examples_per_epoch = NIST_ALL_TRAIN_SIZE + series = create_series(state.num_hidden_layers) print "Creating optimizer with state, ", state - optimizer = SdaSgdOptimizer(dataset=datasets.nist_all, hyperparameters=state, \ + optimizer = SdaSgdOptimizer(dataset=datasets.nist_all(), + hyperparameters=state, \ n_ins=n_ins, n_outs=n_outs,\ - series=series) + examples_per_epoch=examples_per_epoch, \ + series=series, + max_minibatches=rtt) - optimizer.pretrain(datasets.nist_all,rtt) + parameters=[] + optimizer.pretrain(datasets.nist_P07()) channel.save() + + #Set some of the parameters used for the finetuning + if state.has_key('finetune_set'): + finetune_choice=state['finetune_set'] + else: + finetune_choice=FINETUNE_SET + + if state.has_key('max_finetuning_epochs'): + max_finetune_epoch_NIST=state['max_finetuning_epochs'] + else: + max_finetune_epoch_NIST=MAX_FINETUNING_EPOCHS + + if state.has_key('max_finetuning_epochs_P07'): + max_finetune_epoch_P07=state['max_finetuning_epochs_P07'] + else: + max_finetune_epoch_P07=max_finetune_epoch_NIST + + #Decide how the finetune is done + + if finetune_choice==0: + print('\n\n\tfinetune avec nist\n\n') + optimizer.reload_parameters() + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1) + if finetune_choice==1: + print('\n\n\tfinetune avec P07\n\n') + optimizer.reload_parameters() + optimizer.finetune(datasets.nist_P07(),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0) + if finetune_choice==2: + print('\n\n\tfinetune avec nist suivi de P07\n\n') + optimizer.reload_parameters() + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1) + optimizer.finetune(datasets.nist_P07(),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0) - optimizer.finetune(datasets.nist_all,rtt) + if finetune_choice==-1: + print('\nSerie de 3 essais de fine-tuning') + print('\n\n\tfinetune avec nist\n\n') + optimizer.reload_parameters() + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1) + channel.save() + print('\n\n\tfinetune avec P07\n\n') + optimizer.reload_parameters() + optimizer.finetune(datasets.nist_P07(),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0) + channel.save() + print('\n\n\tfinetune avec nist suivi de P07\n\n') + optimizer.reload_parameters() + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1) + optimizer.finetune(datasets.nist_P07(),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0) + channel.save() + channel.save() return channel.COMPLETE @@ -207,98 +199,19 @@ print "inserted" -class NIST: - def __init__(self, minibatch_size, basepath=None, reduce_train_to=None): - global NIST_ALL_LOCATION - - self.minibatch_size = minibatch_size - self.basepath = basepath and basepath or NIST_ALL_LOCATION - - self.set_filenames() - - # arrays of 2 elements: .x, .y - self.train = [None, None] - self.test = [None, None] - - self.load_train_test() - - 
self.valid = [[], []] - self.split_train_valid() - if reduce_train_to: - self.reduce_train_set(reduce_train_to) - - def get_tvt(self): - return self.train, self.valid, self.test - - def set_filenames(self): - self.train_files = ['all_train_data.ft', - 'all_train_labels.ft'] - - self.test_files = ['all_test_data.ft', - 'all_test_labels.ft'] - - def load_train_test(self): - self.load_data_labels(self.train_files, self.train) - self.load_data_labels(self.test_files, self.test) - - def load_data_labels(self, filenames, pair): - for i, fn in enumerate(filenames): - f = open(os.path.join(self.basepath, fn)) - pair[i] = filetensor.read(f) - f.close() - - def reduce_train_set(self, max): - self.train[0] = self.train[0][:max] - self.train[1] = self.train[1][:max] - - if max < len(self.test[0]): - for ar in (self.test, self.valid): - ar[0] = ar[0][:max] - ar[1] = ar[1][:max] - - def split_train_valid(self): - test_len = len(self.test[0]) - - new_train_x = self.train[0][:-test_len] - new_train_y = self.train[1][:-test_len] - - self.valid[0] = self.train[0][-test_len:] - self.valid[1] = self.train[1][-test_len:] - - self.train[0] = new_train_x - self.train[1] = new_train_y - -def test_load_nist(): - print "Will load NIST" - - import time - t1 = time.time() - nist = NIST(20) - t2 = time.time() - - print "NIST loaded. time delta = ", t2-t1 - - tr,v,te = nist.get_tvt() - - print "Lenghts: ", len(tr[0]), len(v[0]), len(te[0]) - - raw_input("Press any key") - if __name__ == '__main__': - import sys - args = sys.argv[1:] - if len(args) > 0 and args[0] == 'load_nist': - test_load_nist() + #if len(args) > 0 and args[0] == 'load_nist': + # test_load_nist() - elif len(args) > 0 and args[0] == 'jobman_insert': + if len(args) > 0 and args[0] == 'jobman_insert': jobman_insert_nist() elif len(args) > 0 and args[0] == 'test_jobman_entrypoint': chanmock = DD({'COMPLETE':0,'save':(lambda:None)}) - jobman_entrypoint(DEFAULT_HP_NIST, chanmock) + jobman_entrypoint(DD(DEFAULT_HP_NIST), chanmock) else: print "Bad arguments" diff -r c8fe09a65039 -r 1e4e60ddadb1 deep/stacked_dae/v_sylvain/sgd_optimization.py --- a/deep/stacked_dae/v_sylvain/sgd_optimization.py Fri Mar 19 10:54:39 2010 -0400 +++ b/deep/stacked_dae/v_sylvain/sgd_optimization.py Fri Mar 19 10:56:16 2010 -0400 @@ -9,34 +9,16 @@ import datetime import theano.tensor as T import sys +import pickle from jobman import DD import jobman, jobman.sql +from copy import copy from stacked_dae import SdA from ift6266.utils.seriestables import * -##def shared_dataset(data_xy): -## data_x, data_y = data_xy -## if theano.config.device.startswith("gpu"): -## print "TRANSFERING DATASETS (via shared()) TO GPU" -## shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX)) -## shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX)) -## shared_y = T.cast(shared_y, 'int32') -## else: -## print "WILL RUN ON CPU, NOT GPU, SO DATASETS REMAIN IN BYTES" -## shared_x = theano.shared(data_x) -## shared_y = theano.shared(data_y) -## return shared_x, shared_y - - ######Les shared seront remplacees utilisant "given" dans les enonces de fonction plus loin -def shared_dataset(batch_size, n_in): - - shared_x = theano.shared(numpy.asarray(numpy.zeros((batch_size,n_in)), dtype=theano.config.floatX)) - shared_y = theano.shared(numpy.asarray(numpy.zeros(batch_size), dtype=theano.config.floatX)) - return shared_x, shared_y - default_series = { \ 'reconstruction_error' : DummySeries(), 'training_error' : DummySeries(), @@ -45,37 +27,34 @@ 'params' : DummySeries() } 
+def itermax(iter, max): + for i,it in enumerate(iter): + if i >= max: + break + yield it + class SdaSgdOptimizer: - def __init__(self, dataset, hyperparameters, n_ins, n_outs, input_divider=1.0, series=default_series): + def __init__(self, dataset, hyperparameters, n_ins, n_outs, + examples_per_epoch, series=default_series, max_minibatches=None): self.dataset = dataset self.hp = hyperparameters self.n_ins = n_ins self.n_outs = n_outs - self.input_divider = input_divider + self.parameters_pre=[] + self.max_minibatches = max_minibatches + print "SdaSgdOptimizer, max_minibatches =", max_minibatches + + self.ex_per_epoch = examples_per_epoch + self.mb_per_epoch = examples_per_epoch / self.hp.minibatch_size + self.series = series self.rng = numpy.random.RandomState(1234) - self.init_datasets() self.init_classifier() sys.stdout.flush() - - def init_datasets(self): - print "init_datasets" - sys.stdout.flush() - - #train_set, valid_set, test_set = self.dataset - self.test_set_x, self.test_set_y = shared_dataset(self.hp.minibatch_size,self.n_ins) - self.valid_set_x, self.valid_set_y = shared_dataset(self.hp.minibatch_size,self.n_ins) - self.train_set_x, self.train_set_y = shared_dataset(self.hp.minibatch_size,self.n_ins) - - # compute number of minibatches for training, validation and testing - self.n_train_batches = self.train_set_x.value.shape[0] / self.hp.minibatch_size - self.n_valid_batches = self.valid_set_x.value.shape[0] / self.hp.minibatch_size - # remove last batch in case it's incomplete - self.n_test_batches = (self.test_set_x.value.shape[0] / self.hp.minibatch_size) - 1 def init_classifier(self): print "Constructing classifier" @@ -88,8 +67,6 @@ # construct the stacked denoising autoencoder class self.classifier = SdA( \ - train_set_x= self.train_set_x, \ - train_set_y = self.train_set_y,\ batch_size = self.hp.minibatch_size, \ n_ins= self.n_ins, \ hidden_layers_sizes = layers_sizes, \ @@ -97,8 +74,7 @@ corruption_levels = corruption_levels,\ rng = self.rng,\ pretrain_lr = self.hp.pretraining_lr, \ - finetune_lr = self.hp.finetuning_lr,\ - input_divider = self.input_divider ) + finetune_lr = self.hp.finetuning_lr) #theano.printing.pydotprint(self.classifier.pretrain_functions[0], "function.graph") @@ -108,7 +84,7 @@ self.pretrain(self.dataset) self.finetune(self.dataset) - def pretrain(self,dataset,reduce): + def pretrain(self,dataset): print "STARTING PRETRAINING, time = ", datetime.datetime.now() sys.stdout.flush() @@ -118,15 +94,19 @@ # go through pretraining epochs for epoch in xrange(self.hp.pretraining_epochs_per_layer): # go through the training set - batch_index=int(0) + batch_index=0 for x,y in dataset.train(self.hp.minibatch_size): - batch_index+=1 - if batch_index > reduce: #If maximum number of mini-batch is used - break c = self.classifier.pretrain_functions[i](x) - self.series["reconstruction_error"].append((epoch, batch_index), c) + batch_index+=1 + + #if batch_index % 100 == 0: + # print "100 batches" + + # useful when doing tests + if self.max_minibatches and batch_index >= self.max_minibatches: + break print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c sys.stdout.flush() @@ -137,33 +117,41 @@ print ('Pretraining took %f minutes' %((end_time-start_time)/60.)) self.hp.update({'pretraining_time': end_time-start_time}) - + sys.stdout.flush() + + #To be able to load them later for tests on finetune + self.parameters_pre=[copy(x.value) for x in self.classifier.params] + f = open('params_pretrain.txt', 'w') + pickle.dump(self.parameters_pre,f) + f.close() - def 
finetune(self,dataset,reduce): + + def finetune(self,dataset,dataset_test,num_finetune,ind_test): print "STARTING FINETUNING, time = ", datetime.datetime.now() - #index = T.lscalar() # index to a [mini]batch minibatch_size = self.hp.minibatch_size - ensemble_x = T.matrix('ensemble_x') - ensemble_y = T.ivector('ensemble_y') + if ind_test == 0: + nom_test = "NIST" + else: + nom_test = "P07" + # create a function to compute the mistakes that are made by the model # on the validation set, or testing set - shared_divider = theano.shared(numpy.asarray(self.input_divider, dtype=theano.config.floatX)) - test_model = theano.function([ensemble_x,ensemble_y], self.classifier.errors, - givens = { - #self.classifier.x: self.test_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider, - #self.classifier.y: self.test_set_y[index*minibatch_size:(index+1)*minibatch_size]}) - self.classifier.x: ensemble_x, - self.classifier.y: ensemble_y}) + test_model = \ + theano.function( + [self.classifier.x,self.classifier.y], self.classifier.errors) + # givens = { + # self.classifier.x: ensemble_x, + # self.classifier.y: ensemble_y]}) - validate_model = theano.function([ensemble_x,ensemble_y], self.classifier.errors, - givens = { - #self.classifier.x: self.valid_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider, - #self.classifier.y: self.valid_set_y[index*minibatch_size:(index+1)*minibatch_size]}) - self.classifier.x: ensemble_x, - self.classifier.y: ensemble_y}) + validate_model = \ + theano.function( + [self.classifier.x,self.classifier.y], self.classifier.errors) + # givens = { + # self.classifier.x: , + # self.classifier.y: ]}) # early-stopping parameters @@ -172,11 +160,13 @@ # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant - validation_frequency = min(self.n_train_batches, patience/2) + validation_frequency = min(self.mb_per_epoch, patience/2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch + if self.max_minibatches and validation_frequency > self.max_minibatches: + validation_frequency = self.max_minibatches / 2 best_params = None best_validation_loss = float('inf') @@ -186,37 +176,31 @@ done_looping = False epoch = 0 - while (epoch < self.hp.max_finetuning_epochs) and (not done_looping): + total_mb_index = 0 + + while (epoch < num_finetune) and (not done_looping): epoch = epoch + 1 - minibatch_index=int(0) + minibatch_index = -1 for x,y in dataset.train(minibatch_size): - minibatch_index +=1 - - if minibatch_index > reduce: #If maximum number of mini-batchs is used - break - + minibatch_index += 1 cost_ij = self.classifier.finetune(x,y) - iter = epoch * self.n_train_batches + minibatch_index + total_mb_index += 1 self.series["training_error"].append((epoch, minibatch_index), cost_ij) - if (iter+1) % validation_frequency == 0: + if (total_mb_index+1) % validation_frequency == 0: - #validation_losses = [validate_model(x,y) for x,y in dataset.valid(minibatch_size)] - test_index=int(0) - validation_losses=[] - for x,y in dataset.valid(minibatch_size): - test_index+=1 - if test_index > reduce: - break - validation_losses.append(validate_model(x,y)) + iter = dataset.valid(minibatch_size) + if self.max_minibatches: + iter = itermax(iter, self.max_minibatches) + validation_losses = [validate_model(x,y) for x,y in iter] this_validation_loss = numpy.mean(validation_losses) self.series["validation_error"].\ append((epoch, minibatch_index), 
this_validation_loss*100.) print('epoch %i, minibatch %i, validation error %f %%' % \ - (epoch, minibatch_index, \ + (epoch, minibatch_index+1, \ this_validation_loss*100.)) @@ -226,36 +210,48 @@ #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold : - patience = max(patience, iter * patience_increase) + patience = max(patience, total_mb_index * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss - best_iter = iter + best_iter = total_mb_index # test it on the test set - #test_losses = [test_model(x,y) for x,y in dataset.test(minibatch_size)] - test_losses=[] - i=0 - for x,y in dataset.test(minibatch_size): - i+=1 - if i > reduce: - break - test_losses.append(test_model(x,y)) + iter = dataset.test(minibatch_size) + if self.max_minibatches: + iter = itermax(iter, self.max_minibatches) + test_losses = [test_model(x,y) for x,y in iter] test_score = numpy.mean(test_losses) + + #test it on the second test set + iter2 = dataset_test.test(minibatch_size) + if self.max_minibatches: + iter2 = itermax(iter2, self.max_minibatches) + test_losses2 = [test_model(x,y) for x,y in iter2] + test_score2 = numpy.mean(test_losses2) self.series["test_error"].\ append((epoch, minibatch_index), test_score*100.) print((' epoch %i, minibatch %i, test error of best ' 'model %f %%') % - (epoch, minibatch_index, + (epoch, minibatch_index+1, test_score*100.)) + + print((' epoch %i, minibatch %i, test error on dataset %s of best ' + 'model %f %%') % + (epoch, minibatch_index+1,nom_test, + test_score2*100.)) sys.stdout.flush() + # useful when doing tests + if self.max_minibatches and minibatch_index >= self.max_minibatches: + break + self.series['params'].append((epoch,), self.classifier.all_params) - if patience <= iter : + if patience <= total_mb_index: done_looping = True break @@ -268,7 +264,22 @@ print(('Optimization complete with best validation score of %f %%,' 'with test performance %f %%') % (best_validation_loss * 100., test_score*100.)) + print(('The test score on the %s dataset is %f')%(nom_test,test_score2*100.)) + print ('The finetuning ran for %f minutes' % ((end_time-start_time)/60.)) + + + #Set parameters like they where right after pre-train + def reload_parameters(self): + + #self.parameters_pre=pickle.load('params_pretrain.txt') + f = open('params_pretrain.txt') + self.parameters_pre=pickle.load(f) + f.close() + for idx,x in enumerate(self.parameters_pre): + self.classifier.params[idx].value=copy(x) + + diff -r c8fe09a65039 -r 1e4e60ddadb1 deep/stacked_dae/v_sylvain/stacked_dae.py --- a/deep/stacked_dae/v_sylvain/stacked_dae.py Fri Mar 19 10:54:39 2010 -0400 +++ b/deep/stacked_dae/v_sylvain/stacked_dae.py Fri Mar 19 10:56:16 2010 -0400 @@ -165,9 +165,9 @@ class SdA(object): - def __init__(self, train_set_x, train_set_y, batch_size, n_ins, + def __init__(self, batch_size, n_ins, hidden_layers_sizes, n_outs, - corruption_levels, rng, pretrain_lr, finetune_lr, input_divider=1.0): + corruption_levels, rng, pretrain_lr, finetune_lr): # Just to make sure those are not modified somewhere else afterwards hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes) corruption_levels = copy.deepcopy(corruption_levels) @@ -190,23 +190,17 @@ print "n_outs", n_outs print "pretrain_lr", pretrain_lr print "finetune_lr", finetune_lr - print "input_divider", input_divider print "----" - #self.shared_divider = theano.shared(numpy.asarray(input_divider, dtype=theano.config.floatX)) - if 
len(hidden_layers_sizes) < 1 : raiseException (' You must have at least one hidden layer ') # allocate symbolic variables for the data - ##index = T.lscalar() # index to a [mini]batch + #index = T.lscalar() # index to a [mini]batch self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels - ensemble = T.matrix('ensemble') - ensemble_x = T.matrix('ensemble_x') - ensemble_y = T.ivector('ensemble_y') for i in xrange( self.n_layers ): # construct the sigmoidal layer @@ -250,10 +244,15 @@ updates[param] = param - gparam * pretrain_lr # create a function that trains the dA - update_fn = theano.function([ensemble], dA_layer.cost, \ - updates = updates, - givens = { - self.x : ensemble}) + update_fn = theano.function([self.x], dA_layer.cost, \ + updates = updates)#, + # givens = { + # self.x : ensemble}) + # collect this function into a list + #update_fn = theano.function([index], dA_layer.cost, \ + # updates = updates, + # givens = { + # self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider}) # collect this function into a list self.pretrain_functions += [update_fn] @@ -276,18 +275,17 @@ for param,gparam in zip(self.params, gparams): updates[param] = param - gparam*finetune_lr - self.finetune = theano.function([ensemble_x,ensemble_y], cost, - updates = updates, - givens = { - #self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider, - #self.y : train_set_y[index*batch_size:(index+1)*batch_size]} ) - self.x : ensemble_x, - self.y : ensemble_y} ) + self.finetune = theano.function([self.x,self.y], cost, + updates = updates)#, + # givens = { + # self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider, + # self.y : train_set_y[index*batch_size:(index+1)*batch_size]} ) # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y) + if __name__ == '__main__': import sys diff -r c8fe09a65039 -r 1e4e60ddadb1 scripts/launch_generate100.py --- a/scripts/launch_generate100.py Fri Mar 19 10:54:39 2010 -0400 +++ b/scripts/launch_generate100.py Fri Mar 19 10:56:16 2010 -0400 @@ -3,12 +3,17 @@ import os dir1 = "/data/lisa/data/ift6266h10/" -mach = "brams0c.iro.umontreal.ca,brams02.iro.umontreal.ca,brams03.iro.umontreal.ca,maggie22.iro.umontreal.ca" +mach = "maggie16.iro.umontreal.ca,maggie15.iro.umontreal.ca" for i,s in enumerate(['valid','test']): for j,c in enumerate([0.3,0.5,0.7,1]): l = str(c).replace('.','') os.system("dbidispatch --condor --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/P%s_%s_data.ft -p %sdata/P%s_%s_params -x %sdata/P%s_%s_labels.ft -f %s%s_data.ft -l %s%s_labels.ft -c %socr_%s_data.ft -d %socr_%s_labels.ft -m 0.3 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s %d -y %d" % (mach, dir1, l, s, dir1, l, s, dir1, l, s, dir1, s, dir1, s, dir1, s, dir1, s, [20000,80000][i], 200+i*4+j)) +#P07 for i in range(100): os.system("dbidispatch --condor --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/P07_train%d_data.ft -p %sdata/P07_train%d_params -x %sdata/P07_train%d_labels.ft -f %strain_data.ft -l %strain_labels.ft -c %socr_train_data.ft -d %socr_train_labels.ft -m 0.7 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s 819200 -y %d" % (mach, dir1, i, dir1, i, dir1, i, dir1, dir1, dir1, dir1, 100+i)) + +#PNIST07 +for i in range(100): + os.system("dbidispatch --condor --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/PNIST07_train%d_data.ft -p 
%sdata/PNIST07_train%d_params -x %sdata/PNIST07_train%d_labels.ft -f %strain_data.ft -l %strain_labels.ft -c %socr_train_data.ft -d %socr_train_labels.ft -m 0.7 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s 819200 -y %d -t %d" % (mach, dir1, i, dir1, i, dir1, i, dir1, dir1, dir1, dir1, 100+i,1))
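
The datasets/defs.py change turns each dataset definition into a factory taking an optional maxsize, so callers now write datasets.nist_all() or datasets.nist_all(5000) instead of referencing a pre-built FTDataSet. Below is a minimal sketch of that pattern; FakeDataSet is a hypothetical stand-in for FTDataSet, only the size-capping behaviour applied in FTSource/FTFile is mimicked, and the example count of 100000 is an arbitrary placeholder.

class FakeDataSet(object):
    def __init__(self, n_examples, maxsize=None):
        # cap the number of examples when maxsize is given, in the spirit of
        # FTSource.open(), which limits f.size to maxsize
        self.size = n_examples if maxsize is None else min(n_examples, maxsize)

# module-level factory: nothing is built until the dataset is actually requested
nist_all = lambda maxsize=None: FakeDataSet(n_examples=100000, maxsize=maxsize)

full_set = nist_all()        # call sites change from datasets.nist_all ...
small_set = nist_all(5000)   # ... to datasets.nist_all(), optionally capped
print(full_set.size)         # 100000
print(small_set.size)        # 5000

This defers loading until the dataset is needed and lets quick test runs cap every split to a few thousand examples, as stacked_convolutional_dae.py does with datasets.mnist(5000).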
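
sgd_optimization.py adds a small itermax helper that truncates any iterator after a fixed number of items; it replaces the earlier pattern of incrementing batch_index by hand and breaking out of the loop, and is used to cap validation and test passes when max_minibatches is set. The sketch below reproduces the helper (parameter names changed so they do not shadow the iter and max builtins) and drives it with a hypothetical minibatch stream standing in for dataset.valid(minibatch_size).

def itermax(iterator, max_items):
    # yield at most max_items elements from iterator, then stop
    for i, item in enumerate(iterator):
        if i >= max_items:
            break
        yield item

def fake_minibatch_stream():
    # hypothetical endless stream of (x, y) minibatches
    batch_no = 0
    while True:
        yield ('x%d' % batch_no, 'y%d' % batch_no)
        batch_no += 1

capped = itermax(fake_minibatch_stream(), 3)
print(list(capped))   # only the first 3 minibatches are consumed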
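
pipeline.py gains a -t/--type option; for type 1 (Nist-friendly), the modules from stop_idx onward are disabled, but each one is still called on a throw-away copy of the image so the shared random-number stream advances exactly as in the full pipeline. A minimal sketch of that trick, assuming two hypothetical transformation functions in place of the real module classes:

import copy
import random

random.seed(1)

def slant(img):
    # each transform draws from the shared RNG, whether or not its output is kept
    return img + ['slant(%.3f)' % random.random()]

def add_noise(img):
    return img + ['noise(%.3f)' % random.random()]

modules = [slant, add_noise]
stop_idx = 1      # 0 means "apply everything" (type 0); here modules[1:] are disabled

img = ['raw']
for mod_idx, mod in enumerate(modules):
    if not stop_idx or stop_idx > mod_idx:
        img = mod(img)               # transformation really applied
    else:
        _ = mod(copy.copy(img))      # consume the same random numbers, keep the original image
print(img)                           # only the enabled transforms affected the image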