changeset 279:206374eed2fb

Merge
author fsavard
date Wed, 24 Mar 2010 14:36:55 -0400
parents 43afd29f3dbd (current diff) 20ebc1f2a9fe (diff)
children c77ffb11f91d 8a3af19ae272
files deep/stacked_dae/nist_sda.py deep/stacked_dae/utils.py
diffstat 10 files changed, 585 insertions(+), 138 deletions(-)
--- a/baseline/conv_mlp/convolutional_mlp.py	Wed Mar 24 14:35:11 2010 -0400
+++ b/baseline/conv_mlp/convolutional_mlp.py	Wed Mar 24 14:36:55 2010 -0400
@@ -24,9 +24,12 @@
 import numpy, theano, cPickle, gzip, time
 import theano.tensor as T
 import theano.sandbox.softsign
+import sys
 import pylearn.datasets.MNIST
 from pylearn.io import filetensor as ft
 from theano.sandbox import conv, downsample
+
+from ift6266 import datasets
 import theano,pylearn.version,ift6266
 
 class LeNetConvPoolLayer(object):
@@ -178,81 +181,16 @@
             raise NotImplementedError()
 
 
-def load_dataset(fname,batch=20):
-
-    # directory that contains the NIST data
-    # the following path will work if you are connected to a machine
-    # on the DIRO network
-    datapath = '/data/lisa/data/nist/by_class/'
-    # the .ft file contains the NIST digits in an efficient format. The digits
-    # are stored in an NxD matrix, where N is the number of images and D is
-    # the number of pixels per image (32x32 = 1024). Each pixel of the image is
-    # a value between 0 and 255, corresponding to a grey level. The values are
-    # stored as uint8, i.e. as bytes.
-    f = open(datapath+'digits/digits_train_data.ft')
-    # Make sure you have enough memory to load the whole dataset into memory.
-    # Otherwise, use ft.arraylike, a class built specifically for files
-    # that you do not want to load into RAM.
-    d = ft.read(f)
-
-    # NB: do not forget to divide the pixel values by 255. if you ever use
-    # the data as inputs to a neural network and you
-    # want inputs between 0 and 1.
-    # digits_train_data.ft contains the images, digits_train_labels.ft contains
-    # the labels
-    f = open(datapath+'digits/digits_train_labels.ft')
-    labels = ft.read(f)
-
-
-    # Load the dataset 
-    #f = gzip.open(fname,'rb')
-    #train_set, valid_set, test_set = cPickle.load(f)
-    #f.close()
-
-    # make minibatches of size 20 
-    batch_size = batch   # size of the minibatch
-
-    # Dealing with the training set
-    # get the list of training images (x) and their labels (y)
-    (train_set_x, train_set_y) = (d[:200000,:],labels[:200000])
-    # initialize the list of training minibatches with empty list
-    train_batches = []
-    for i in xrange(0, len(train_set_x), batch_size):
-        # add to the list of minibatches the minibatch starting at 
-        # position i, ending at position i+batch_size
-        # a minibatch is a pair ; the first element of the pair is a list 
-        # of datapoints, the second element is the list of corresponding 
-        # labels
-        train_batches = train_batches + \
-               [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
-
-    #print train_batches[500]
-
-    # Dealing with the validation set
-    (valid_set_x, valid_set_y) = (d[200000:270000,:],labels[200000:270000])
-    # initialize the list of validation minibatches 
-    valid_batches = []
-    for i in xrange(0, len(valid_set_x), batch_size):
-        valid_batches = valid_batches + \
-               [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
-
-    # Dealing with the testing set
-    (test_set_x, test_set_y) = (d[270000:340000,:],labels[270000:340000])
-    # initialize the list of testing minibatches 
-    test_batches = []
-    for i in xrange(0, len(test_set_x), batch_size):
-        test_batches = test_batches + \
-              [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
-
-
-    return train_batches, valid_batches, test_batches
-
-
-def evaluate_lenet5(learning_rate=0.1, n_iter=200, batch_size=20, n_kern0=20, n_kern1=50, n_layer=3, filter_shape0=5, filter_shape1=5, dataset='mnist.pkl.gz'):
+def evaluate_lenet5(learning_rate=0.1, n_iter=200, batch_size=20, n_kern0=20, n_kern1=50, n_layer=3, filter_shape0=5, filter_shape1=5, sigmoide_size=500, dataset='mnist.pkl.gz'):
     rng = numpy.random.RandomState(23455)
 
     print 'Before load dataset'
-    train_batches, valid_batches, test_batches = load_dataset(dataset,batch_size)
+    dataset=datasets.nist_digits
+    train_batches= dataset.train(batch_size)
+    valid_batches=dataset.valid(batch_size)
+    test_batches=dataset.test(batch_size)
+    #print valid_batches.shape
+    #print test_batches.shape
     print 'After load dataset'
 
     ishape = (32,32)     # this is the size of NIST images
@@ -305,9 +243,9 @@
 	fshape0=(32-filter_shape0+1)/2
 	layer1_input = layer0.output.flatten(2)
 		# construct a fully-connected sigmoidal layer
-	layer1 = SigmoidalLayer(rng, input=layer1_input,n_in=n_kern0*fshape0*fshape0, n_out=500)
+	layer1 = SigmoidalLayer(rng, input=layer1_input,n_in=n_kern0*fshape0*fshape0, n_out=sigmoide_size)
 
-	layer2 = LogisticRegression(input=layer1.output, n_in=500, n_out=10)
+	layer2 = LogisticRegression(input=layer1.output, n_in=sigmoide_size, n_out=10)
 	cost = layer2.negative_log_likelihood(y)
 	test_model = theano.function([x,y], layer2.errors(y))
 	params = layer2.params+ layer1.params + layer0.params
@@ -335,10 +273,10 @@
 	layer4_input = layer3.output.flatten(2)
 
 	layer4 = SigmoidalLayer(rng, input=layer4_input, 
-					n_in=n_kern3*fshape3*fshape3, n_out=500)
+					n_in=n_kern3*fshape3*fshape3, n_out=sigmoide_size)
 
   
-	layer5 = LogisticRegression(input=layer4.output, n_in=500, n_out=10)
+	layer5 = LogisticRegression(input=layer4.output, n_in=sigmoide_size, n_out=10)
 
 	cost = layer5.negative_log_likelihood(y)
 
@@ -354,10 +292,10 @@
 	layer3_input = layer2.output.flatten(2)
 
 	layer3 = SigmoidalLayer(rng, input=layer3_input, 
-					n_in=n_kern2*fshape2*fshape2, n_out=500)
+					n_in=n_kern2*fshape2*fshape2, n_out=sigmoide_size)
 
   
-	layer4 = LogisticRegression(input=layer3.output, n_in=500, n_out=10)
+	layer4 = LogisticRegression(input=layer3.output, n_in=sigmoide_size, n_out=10)
 
 	cost = layer4.negative_log_likelihood(y)
 
@@ -378,11 +316,11 @@
 
 	# construct a fully-connected sigmoidal layer
 	layer2 = SigmoidalLayer(rng, input=layer2_input, 
-					n_in=n_kern1*fshape1*fshape1, n_out=500)
+					n_in=n_kern1*fshape1*fshape1, n_out=sigmoide_size)
 
   
 	# classify the values of the fully-connected sigmoidal layer
-	layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)
+	layer3 = LogisticRegression(input=layer2.output, n_in=sigmoide_size, n_out=10)
 
 	# the cost we minimize during training is the NLL of the model
 	cost = layer3.negative_log_likelihood(y)
@@ -414,7 +352,28 @@
     # TRAIN MODEL #
     ###############
 
-    n_minibatches        = len(train_batches) 
+    #n_minibatches        = len(train_batches) 
+    n_minibatches=0
+    n_valid=0
+    n_test=0
+    for x, y in dataset.train(batch_size):
+	if x.shape[0] == batch_size:
+	    n_minibatches+=1
+    n_minibatches*=batch_size
+    print n_minibatches
+
+    for x, y in dataset.valid(batch_size):
+	if x.shape[0] == batch_size:
+	    n_valid+=1
+    n_valid*=batch_size
+    print n_valid
+
+    for x, y in dataset.test(batch_size):
+	if x.shape[0] == batch_size:
+	    n_test+=1
+    n_test*=batch_size
+    print n_test
+  
 
     # early-stopping parameters
     patience              = 10000 # look as this many examples regardless
@@ -433,60 +392,65 @@
     test_score           = 0.
     start_time = time.clock()
 
-    # have a maximum of `n_iter` iterations through the entire dataset
-    for iter in xrange(n_iter * n_minibatches):
-
-        # get epoch and minibatch index
-        epoch           = iter / n_minibatches
-        minibatch_index =  iter % n_minibatches
 
-        # get the minibatches corresponding to `iter` modulo
-        # `len(train_batches)`
-        x,y = train_batches[ minibatch_index ]
-	
-        if iter %100 == 0:
-            print 'training @ iter = ', iter
-        cost_ij = train_model(x,y)
-
-        if (iter+1) % validation_frequency == 0: 
+    # have a maximum of `n_iter` iterations through the entire dataset
+    iter=0
+    for epoch in xrange(n_iter):
+	for x, y in train_batches:
+	    if x.shape[0] != batch_size:
+		continue
+	    iter+=1
 
-            # compute zero-one loss on validation set 
-            this_validation_loss = 0.
-            for x,y in valid_batches:
-                # sum up the errors for each minibatch
-                this_validation_loss += test_model(x,y)
-
-            # get the average by dividing with the number of minibatches
-            this_validation_loss /= len(valid_batches)
-            print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                   (epoch, minibatch_index+1, n_minibatches, \
-                    this_validation_loss*100.))
+	    # get epoch and minibatch index
+	    #epoch           = iter / n_minibatches
+	    minibatch_index =  iter % n_minibatches
+	    
+	    if iter %100 == 0:
+		print 'training @ iter = ', iter
+	    cost_ij = train_model(x,y)
 
 
-            # if we got the best validation score until now
-            if this_validation_loss < best_validation_loss:
+	# compute zero-one loss on validation set 
+	this_validation_loss = 0.
+	for x,y in valid_batches:
+	    if x.shape[0] != batch_size:
+		continue
+	    # sum up the errors for each minibatch
+	    this_validation_loss += test_model(x,y)
 
-                #improve patience if loss improvement is good enough
-                if this_validation_loss < best_validation_loss *  \
-                       improvement_threshold :
-                    patience = max(patience, iter * patience_increase)
+	# get the average by dividing with the number of minibatches
+	this_validation_loss /= n_valid
+	print('epoch %i, minibatch %i/%i, validation error %f %%' % \
+	      (epoch, minibatch_index+1, n_minibatches, \
+		this_validation_loss*100.))
 
-                # save best validation score and iteration number
-                best_validation_loss = this_validation_loss
-                best_iter = iter
+
+	# if we got the best validation score until now
+	if this_validation_loss < best_validation_loss:
 
-                # test it on the test set
-                test_score = 0.
-                for x,y in test_batches:
-                    test_score += test_model(x,y)
-                test_score /= len(test_batches)
-                print(('     epoch %i, minibatch %i/%i, test error of best '
-                      'model %f %%') % 
-                             (epoch, minibatch_index+1, n_minibatches,
-                              test_score*100.))
+	    #improve patience if loss improvement is good enough
+	    if this_validation_loss < best_validation_loss *  \
+		  improvement_threshold :
+		patience = max(patience, iter * patience_increase)
+
+	    # save best validation score and iteration number
+	    best_validation_loss = this_validation_loss
+	    best_iter = iter
 
-        if patience <= iter :
-            break
+	    # test it on the test set
+	    test_score = 0.
+	    for x,y in test_batches:
+		if x.shape[0] != batch_size:
+		    continue
+		test_score += test_model(x,y)
+	    test_score /= n_test
+	    print(('     epoch %i, minibatch %i/%i, test error of best '
+		  'model %f %%') % 
+			(epoch, minibatch_index+1, n_minibatches,
+			  test_score*100.))
+
+	if patience <= iter :
+	    break
 
     end_time = time.clock()
     print('Optimization complete.')
@@ -502,8 +466,10 @@
 
 def experiment(state, channel):
     print 'start experiment'
-    (best_validation_loss, test_score, minutes_trained, iter) = evaluate_lenet5(state.learning_rate, state.n_iter, state.batch_size, state.n_kern0, state.n_kern1, state.n_layer, state.filter_shape0, state.filter_shape1)
+    (best_validation_loss, test_score, minutes_trained, iter) = evaluate_lenet5(state.learning_rate, state.n_iter, state.batch_size, state.n_kern0, state.n_kern1, state.n_layer, state.filter_shape0, state.filter_shape1,state.sigmoide_size)
     print 'end experiment'
+
+    pylearn.version.record_versions(state,[theano,ift6266,pylearn])
     
     state.best_validation_loss = best_validation_loss
     state.test_score = test_score
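
Note: the rewritten training code above no longer materialises the NIST splits in memory; it iterates the ift6266 dataset iterators directly, skips the final partial minibatch, and derives each split's size by counting full minibatches and multiplying by batch_size. A minimal sketch of that counting step, assuming an iterator that yields (x, y) numpy minibatches as the ift6266 dataset iterators do:

    # count_examples is an illustrative helper, not part of the changeset
    def count_examples(split_iter, batch_size):
        n_batches = 0
        for x, y in split_iter:
            if x.shape[0] == batch_size:   # ignore the ragged last minibatch
                n_batches += 1
        return n_batches * batch_size
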
--- a/baseline/log_reg/log_reg.py	Wed Mar 24 14:35:11 2010 -0400
+++ b/baseline/log_reg/log_reg.py	Wed Mar 24 14:36:55 2010 -0400
@@ -142,7 +142,7 @@
 #--------------------------------------------------------------------------------------------------------------------
 
 def log_reg( learning_rate = 0.13, nb_max_examples =1000000, batch_size = 50, \
-                    dataset=datasets.nist_digits, image_size = 32 * 32, nb_class = 10,  \
+                    dataset=datasets.nist_digits(), image_size = 32 * 32, nb_class = 10,  \
                     patience = 5000, patience_increase = 2, improvement_threshold = 0.995):
     
     #28 * 28 = 784
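
Note: nist_digits, like nist_P07 in datasets/defs.py, is a factory that must be called to obtain the dataset object, which is why the default argument gains the parentheses. A hedged usage sketch of the resulting object:

    from ift6266 import datasets
    dataset = datasets.nist_digits()    # build the dataset object
    for x, y in dataset.train(50):      # iterate minibatches of 50 examples
        pass
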
--- a/data_generation/transformations/pycaptcha/Captcha/File.py	Wed Mar 24 14:35:11 2010 -0400
+++ b/data_generation/transformations/pycaptcha/Captcha/File.py	Wed Mar 24 14:36:55 2010 -0400
@@ -7,7 +7,7 @@
 # Copyright (C) 2004 Micah Dowty <micah@navi.cx>
 #
 
-import os, random
+import os, random, cPickle
 
 # Determine the data directory. This can be overridden after import-time if needed.
 dataDir = os.path.join(os.path.split(os.path.abspath(__file__))[0], "data")
@@ -41,7 +41,10 @@
             else:
                 path = os.path.join(dataDir, self.basePath, name)
             if os.path.isdir(path):
-                for content in os.listdir(path):
+                f = open(path + '/filelist.pkl')
+                filelist = cPickle.load(f)
+                f.close()
+                for content in filelist:
                     if self._checkExtension(content):
                         paths.append(os.path.join(path, content))
             else:
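
Note: this change (and the ttf2jpg.py change below) replaces os.listdir with a pre-pickled file list, so the directory contents only need to be enumerated once. A hedged sketch of how such a filelist.pkl could be produced (the path is illustrative; this helper is not part of the changeset):

    import os, cPickle
    path = '/path/to/data/dir'                       # hypothetical directory
    f = open(os.path.join(path, 'filelist.pkl'), 'w')
    cPickle.dump(os.listdir(path), f)
    f.close()
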
--- a/data_generation/transformations/ttf2jpg.py	Wed Mar 24 14:35:11 2010 -0400
+++ b/data_generation/transformations/ttf2jpg.py	Wed Mar 24 14:36:55 2010 -0400
@@ -10,6 +10,7 @@
 
 import sys, os, fnmatch, random
 import Image, ImageFont, ImageDraw, numpy
+import cPickle
 
 class ttf2jpg():
     def __init__(self, font_file = ''):
@@ -26,8 +27,9 @@
             self.char_list.append(chr(ord('A') + i) )
         for i in range(0,26):
             self.char_list.append(chr(ord('a') + i) )
-        files = os.listdir(self.font_dir)
-        self.font_files = fnmatch.filter(files, '*.ttf') + fnmatch.filter(files, '*.TTF')
+        f = open( self.font_dir + 'filelist.pkl' ,'r')
+        self.font_files = cPickle.load(f)
+        f.close()
 
     # get font name
     def get_settings_names(self):
--- a/datasets/defs.py	Wed Mar 24 14:35:11 2010 -0400
+++ b/datasets/defs.py	Wed Mar 24 14:36:55 2010 -0400
@@ -43,8 +43,10 @@
                 valid_lbl = [os.path.join(DATA_PATH,'ocr_valid_labels.ft')],
                 indtype=theano.config.floatX, inscale=255., maxsize=maxsize)
 
-nist_P07 = lambda maxsize=None: FTDataSet(train_data = [os.path.join(DATA_PATH,'data/P07_train'+str(i)+'_data.ft') for i in range(100)],
-                     train_lbl = [os.path.join(DATA_PATH,'data/P07_train'+str(i)+'_labels.ft') for i in range(100)],
+# There are two extra arguments here so that smaller datasets can be chosen based on the file number.
+# This is useful to get different data for pre-training and fine-tuning.
+nist_P07 = lambda maxsize=None, min_file=0, max_file=100: FTDataSet(train_data = [os.path.join(DATA_PATH,'data/P07_train'+str(i)+'_data.ft') for i in range(min_file, max_file)],
+                     train_lbl = [os.path.join(DATA_PATH,'data/P07_train'+str(i)+'_labels.ft') for i in range(min_file, max_file)],
                      test_data = [os.path.join(DATA_PATH,'data/P07_test_data.ft')],
                      test_lbl = [os.path.join(DATA_PATH,'data/P07_test_labels.ft')],
                      valid_data = [os.path.join(DATA_PATH,'data/P07_valid_data.ft')],
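
Note: a minimal usage sketch of the new min_file/max_file arguments (the file indices are an example): pre-train on the first 90 P07 training files and fine-tune on the remaining 10, so the two phases see disjoint data.

    from ift6266 import datasets
    pretrain_set = datasets.nist_P07(min_file=0, max_file=90)
    finetune_set = datasets.nist_P07(min_file=90, max_file=100)
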
--- a/datasets/ftfile.py	Wed Mar 24 14:35:11 2010 -0400
+++ b/datasets/ftfile.py	Wed Mar 24 14:36:55 2010 -0400
@@ -201,7 +201,9 @@
         set.
         """
         if valid_data is None:
-            total_valid_size = min(sum(FTFile(td).size for td in test_data), maxsize)
+            total_valid_size = sum(FTFile(td).size for td in test_data)
+            if maxsize is not None:
+                total_valid_size = min(total_valid_size, maxsize) 
             valid_size = total_valid_size/len(train_data)
             self._train = FTData(train_data, train_lbl, size=-valid_size,
                                  inscale=inscale, outscale=outscale,
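
Note: the extra guard is needed because of Python 2 comparison semantics, where None sorts below any number:

    min(70000, None)   # -> None in Python 2
    # so the old code silently made total_valid_size (and hence valid_size)
    # None whenever maxsize was omitted; the cap is now applied only when
    # maxsize is not None.
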
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/convolutional_dae/scdae.py	Wed Mar 24 14:36:55 2010 -0400
@@ -0,0 +1,237 @@
+from pynnet import *
+# use hacks also
+from pynnet.utils import *
+
+import numpy
+import theano
+import theano.tensor as T
+
+from itertools import izip
+
+class cdae(LayerStack):
+    def __init__(self, filter_size, num_filt, num_in, subsampling, corruption,
+                 dtype, img_shape):
+        LayerStack.__init__(self, [ConvAutoencoder(filter_size=filter_size, 
+                                                   num_filt=num_filt,
+                                                   num_in=num_in,
+                                                   noisyness=corruption,
+                                                   dtype=dtype,
+                                                   image_shape=img_shape),
+                                   MaxPoolLayer(subsampling)])
+
+    def build(self, input):
+        LayerStack.build(self, input)
+        self.cost = self.layers[0].cost
+
+def cdae_out_size(in_size, filt_size, num_filt, num_in, subs):
+    out = [None] * 3
+    out[0] = num_filt
+    out[1] = (in_size[1]-filt_size[0]+1)/subs[0]
+    out[2] = (in_size[2]-filt_size[1]+1)/subs[1]
+    return out
+
+def scdae(in_size, num_in, filter_sizes, num_filts,
+          subsamplings, corruptions, dtype):
+    layers = []
+    old_nfilt = 1
+    for fsize, nfilt, subs, corr in izip(filter_sizes, num_filts,
+                                         subsamplings, corruptions):
+        layers.append(cdae(fsize, nfilt, old_nfilt, subs, corr, dtype,
+                           (num_in, in_size[0], in_size[1], in_size[2])))
+        in_size = cdae_out_size(in_size, fsize, nfilt, old_nfilt, subs)
+        old_nfilt = nfilt
+    return LayerStack(layers), in_size
+
+def mlp(layer_sizes, dtype):
+    layers = []
+    old_size = layer_sizes[0]
+    for size in layer_sizes[1:]:
+        layers.append(SimpleLayer(old_size, size, activation=nlins.tanh,
+                                  dtype=dtype))
+        old_size = size
+    return LayerStack(layers)
+
+def scdae_net(in_size, num_in, filter_sizes, num_filts, subsamplings,
+              corruptions, layer_sizes, out_size, dtype, batch_size):
+    rl1 = ReshapeLayer((None,)+in_size)
+    ls, outs = scdae(in_size, num_in, filter_sizes, num_filts, subsamplings, 
+                     corruptions, dtype)
+    outs = numpy.prod(outs)
+    rl2 = ReshapeLayer((None, outs))
+    layer_sizes = [outs]+layer_sizes
+    ls2 = mlp(layer_sizes, dtype)
+    lrl = SimpleLayer(layer_sizes[-1], out_size, activation=nlins.softmax)
+    return NNet([rl1, ls, rl2, ls2, lrl], error=errors.nll)
+
+def build_funcs(batch_size, img_size, filter_sizes, num_filters, subs,
+                noise, mlp_sizes, out_size, dtype, pretrain_lr, train_lr):
+    
+    n = scdae_net((1,)+img_size, batch_size, filter_sizes, num_filters, subs,
+                  noise, mlp_sizes, out_size, dtype, batch_size)
+    x = T.fmatrix('x')
+    y = T.ivector('y')
+    
+    def pretrainfunc(net, alpha):
+        up = trainers.get_updates(net.params, net.cost, alpha)
+        return theano.function([x], net.cost, updates=up)
+
+    def trainfunc(net, alpha):
+        up = trainers.get_updates(net.params, net.cost, alpha)
+        return theano.function([x, y], net.cost, updates=up)
+
+    n.build(x, y)
+    pretrain_funcs_opt = [pretrainfunc(l, pretrain_lr) for l in n.layers[1].layers]
+    trainf_opt = trainfunc(n, train_lr)
+    evalf_opt = theano.function([x, y], errors.class_error(n.output, y))
+    
+    clear_imgshape(n)
+    n.build(x, y)
+    pretrain_funcs_reg = [pretrainfunc(l, 0.01) for l in n.layers[1].layers]
+    trainf_reg = trainfunc(n, 0.1)
+    evalf_reg = theano.function([x, y], errors.class_error(n.output, y))
+
+    def select_f(f1, f2, bsize):
+        def f(x):
+            if x.shape[0] == bsize:
+                return f1(x)
+            else:
+                return f2(x)
+        return f
+    
+    pretrain_funcs = [select_f(p_opt, p_reg, batch_size) for p_opt, p_reg in zip(pretrain_funcs_opt, pretrain_funcs_reg)]
+    
+    def select_f2(f1, f2, bsize):
+        def f(x, y):
+            if x.shape[0] == bsize:
+                return f1(x, y)
+            else:
+                return f2(x, y)
+        return f
+
+    trainf = select_f2(trainf_opt, trainf_reg, batch_size)
+    evalf = select_f2(evalf_opt, evalf_reg, batch_size)
+    return pretrain_funcs, trainf, evalf
+
+def do_pretrain(pretrain_funcs, pretrain_epochs):
+    for f in pretrain_funcs:
+        for i in xrange(pretrain_epochs):
+            f()
+
+def massage_funcs(train_it, dset, batch_size, pretrain_funcs, trainf, evalf):
+    def pretrain_f(f):
+        def res():
+            for x, y in train_it:
+                yield f(x)
+        it = res()
+        return lambda: it.next()
+
+    pretrain_fs = map(pretrain_f, pretrain_funcs)
+
+    def train_f(f):
+        def dset_it():
+            for x, y in train_it:
+                yield f(x, y)
+        it = dset_it()
+        return lambda: it.next()
+    
+    train = train_f(trainf)
+    
+    def eval_f(f, dsetf):
+        def res():
+            c = 0
+            i = 0
+            for x, y in dsetf(batch_size):
+                i += x.shape[0]
+                c += f(x, y)*x.shape[0]
+            return c/i
+        return res
+    
+    test = eval_f(evalf, dset.test)
+    valid = eval_f(evalf, dset.valid)
+
+    return pretrain_fs, train, valid, test
+
+def repeat_itf(itf, *args, **kwargs):
+    while True:
+        for e in itf(*args, **kwargs):
+            yield e
+
+def run_exp(state, channel):
+    from ift6266 import datasets
+    from sgd_opt import sgd_opt
+    import sys, time
+
+    channel.save()
+
+    # params: bsize, pretrain_lr, train_lr, nfilts1, nfilts2, nfilts3, nfilts4
+    #         pretrain_rounds
+
+    dset = datasets.nist_all()
+
+    nfilts = []
+    if state.nfilts1 != 0:
+        nfilts.append(state.nfilts1)
+        if state.nfilts2 != 0:
+            nfilts.append(state.nfilts2)
+            if state.nfilts3 != 0:
+                nfilts.append(state.nfilts3)
+                if state.nfilts4 != 0:
+                    nfilts.append(state.nfilts4)
+
+    fsizes = [(5,5)]*len(nfilts)
+    subs = [(2,2)]*len(nfilts)
+    noise = [state.noise]*len(nfilts)
+
+    pretrain_funcs, trainf, evalf = build_funcs(
+        img_size=(32, 32),
+        batch_size=state.bsize,
+        filter_sizes=fsizes,
+        num_filters=nfilts,
+        subs=subs,
+        noise=noise,
+        mlp_sizes=[state.mlp_sz],
+        out_size=62,
+        dtype=numpy.float32,
+        pretrain_lr=state.pretrain_lr,
+        train_lr=state.train_lr)
+
+    pretrain_fs, train, valid, test = massage_funcs(
+        repeat_itf(dset.train, state.bsize), dset, state.bsize, pretrain_funcs, trainf, evalf)
+
+    do_pretrain(pretrain_fs, state.pretrain_rounds)
+
+    sgd_opt(train, valid, test, training_epochs=100000, patience=10000,
+            patience_increase=2., improvement_threshold=0.995,
+            validation_frequency=2500)
+
+if __name__ == '__main__':
+    from ift6266 import datasets
+    from sgd_opt import sgd_opt
+    import sys, time
+    
+    batch_size = 100
+    dset = datasets.mnist()
+
+    pretrain_funcs, trainf, evalf = build_funcs(
+        img_size = (28, 28),
+        batch_size=batch_size, filter_sizes=[(5,5), (3,3)],
+        num_filters=[4, 4], subs=[(2,2), (2,2)], noise=[0.2, 0.2],
+        mlp_sizes=[500], out_size=10, dtype=numpy.float32,
+        pretrain_lr=0.01, train_lr=0.1)
+    
+    pretrain_fs, train, valid, test = massage_funcs(
+        repeat_itf(dset.train, batch_size),
+        dset, batch_size,
+        pretrain_funcs, trainf, evalf)
+
+    print "pretraining ...",
+    sys.stdout.flush()
+    start = time.time()
+    do_pretrain(pretrain_fs, 2500)
+    end = time.time()
+    print "done (in", end-start, "s)"
+    
+    sgd_opt(train, valid, test, training_epochs=10000, patience=1000,
+            patience_increase=2., improvement_threshold=0.995,
+            validation_frequency=250)
+
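
Note: a worked example of cdae_out_size for the NIST-sized input used in run_exp: a 32x32 single-channel image with a first layer of twenty 5x5 filters and 2x2 max-pooling gives

    # (32 - 5 + 1) = 28 valid convolution outputs per side, pooled 2x2 -> 14
    cdae_out_size((1, 32, 32), (5, 5), 20, 1, (2, 2))   # -> [20, 14, 14]

The select_f/select_f2 helpers then route full minibatches to the first set of compiled functions (built before clear_imgshape(n) is called) and the ragged last minibatch to the second, shape-agnostic compilation.
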
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/convolutional_dae/sgd_opt.py	Wed Mar 24 14:36:55 2010 -0400
@@ -0,0 +1,52 @@
+import time
+import sys
+
+def sgd_opt(train, valid, test, training_epochs=10000, patience=10000,
+            patience_increase=2., improvement_threshold=0.995,
+            validation_frequency=None):
+
+    if validation_frequency is None:
+        validation_frequency = patience/2
+ 
+    start_time = time.clock()
+
+    best_params = None
+    best_validation_loss = float('inf')
+    test_score = 0.
+
+    start_time = time.clock()
+ 
+    for epoch in xrange(1, training_epochs+1):
+        train()
+
+        if epoch % validation_frequency == 0:
+            this_validation_loss = valid()
+            print('epoch %i, validation error %f %%' % \
+                   (epoch, this_validation_loss*100.))
+            
+            # if we got the best validation score until now
+            if this_validation_loss < best_validation_loss:
+ 
+                #improve patience if loss improvement is good enough
+                if this_validation_loss < best_validation_loss * \
+                       improvement_threshold :
+                    patience = max(patience, epoch * patience_increase)
+                
+                # save best validation score and epoch number
+                best_validation_loss = this_validation_loss
+                best_epoch = epoch
+                
+                # test it on the test set
+                test_score = test()
+                print((' epoch %i, test error of best model %f %%') %
+                      (epoch, test_score*100.))
+                
+        if patience <= epoch:
+            break
+    
+    end_time = time.clock()
+    print(('Optimization complete with best validation score of %f %%,'
+           'with test performance %f %%') %
+                 (best_validation_loss * 100., test_score*100.))
+    print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
+
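
Note: a minimal usage sketch of the sgd_opt contract (the three callables below are illustrative stand-ins, not part of the module): train() performs one training step per call, while valid() and test() return error rates as fractions.

    from sgd_opt import sgd_opt

    def train():            # one minibatch update
        pass

    def valid():            # validation error rate in [0, 1]
        return 0.05

    def test():             # test error rate in [0, 1]
        return 0.06

    sgd_opt(train, valid, test, training_epochs=1000, patience=200,
            validation_frequency=50)
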
--- a/scripts/launch_generate100.py	Wed Mar 24 14:35:11 2010 -0400
+++ b/scripts/launch_generate100.py	Wed Mar 24 14:36:55 2010 -0400
@@ -3,12 +3,13 @@
 import os
 dir1 = "/data/lisa/data/ift6266h10/"
 
-mach = "maggie16.iro.umontreal.ca,maggie15.iro.umontreal.ca"
+mach = ["maggie16.iro.umontreal.ca,zappa8@iro.umontreal.ca"]
 
+#test and valid sets
 for i,s in enumerate(['valid','test']):
     for j,c in enumerate([0.3,0.5,0.7,1]):
         l = str(c).replace('.','')
-        os.system("dbidispatch --condor --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/P%s_%s_data.ft -p %sdata/P%s_%s_params -x %sdata/P%s_%s_labels.ft -f %s%s_data.ft -l %s%s_labels.ft -c %socr_%s_data.ft -d %socr_%s_labels.ft -m 0.3 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s %d -y %d" % (mach, dir1, l, s, dir1, l, s, dir1, l, s, dir1, s, dir1, s, dir1, s, dir1, s, [20000,80000][i], 200+i*4+j))
+        os.system("dbidispatch --condor --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/P%s_%s_data.ft -p %sdata/P%s_%s_params -x %sdata/P%s_%s_labels.ft -f %s%s_data.ft -l %s%s_labels.ft -c %socr_%s_data.ft -d %socr_%s_labels.ft -m %s -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s %d -y %d" % (mach, dir1, l, s, dir1, l, s, dir1, l, s, dir1, s, dir1, s, dir1, s, dir1, s, c ,[20000,80000][i], 200+i*4+j))
 
 #P07
 for i in range(100):
@@ -17,3 +18,9 @@
 #PNIST07
 for i in range(100):
     os.system("dbidispatch --condor --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/PNIST07_train%d_data.ft -p %sdata/PNIST07_train%d_params -x %sdata/PNIST07_train%d_labels.ft -f %strain_data.ft -l %strain_labels.ft -c %socr_train_data.ft -d %socr_train_labels.ft -m 0.7 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s 819200 -y %d -t %d" % (mach, dir1, i, dir1, i, dir1, i, dir1, dir1, dir1, dir1, 100+i,1))
+
+
+
+#P07
+#for i in [90,94]:#[2,10,13,15,20,49,68,82,86,90,94]:
+   #os.system("dbidispatch --condor --mem=3900 --os=fc4,fc7,fc9 --machine=maggie16.iro.umontreal.ca --machine=maggie15.iro.umontreal.ca --machine=zappa8@iro.umontreal.ca ./run_pipeline.sh -o %sdata2/P07_train%d_data.ft -p %sdata2/P07_train%d_params -x %sdata2/P07_train%d_labels.ft -f %strain_data.ft -l %strain_labels.ft -c %socr_train_data.ft -d %socr_train_labels.ft -m 0.7 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s 819200 -y %d" % (dir1, i, dir1, i, dir1, i, dir1, dir1, dir1, dir1,100+i))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/setup_batches.py	Wed Mar 24 14:36:55 2010 -0400
@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+
+import random
+from pylearn.io import filetensor as ft
+
+class Batches():
+  def __init__(self):
+    data_path = '/data/lisa/data/nist/by_class/'
+
+    digits_train_data = 'digits/digits_train_data.ft'
+    digits_train_labels = 'digits/digits_train_labels.ft'
+    digits_test_data = 'digits/digits_test_data.ft'
+    digits_test_labels = 'digits/digits_test_labels.ft'
+
+    lower_train_data = 'lower/lower_train_data.ft'
+    lower_train_labels = 'lower/lower_train_labels.ft'
+    #upper_train_data = 'upper/upper_train_data.ft'
+    #upper_train_labels = 'upper/upper_train_labels.ft'
+
+    f_digits_train_data = open(data_path + digits_train_data)
+    f_digits_train_labels = open(data_path + digits_train_labels)
+    f_digits_test_data = open(data_path + digits_test_data)
+    f_digits_test_labels = open(data_path + digits_test_labels)
+
+    f_lower_train_data = open(data_path + lower_train_data)
+    f_lower_train_labels = open(data_path + lower_train_labels)
+    #f_upper_train_data = open(data_path + upper_train_data)
+    #f_upper_train_labels = open(data_path + upper_train_labels)
+
+    self.raw_digits_train_data = ft.read(f_digits_train_data)
+    self.raw_digits_train_labels = ft.read(f_digits_train_labels)
+    self.raw_digits_test_data = ft.read(f_digits_test_data)
+    self.raw_digits_test_labels = ft.read(f_digits_test_labels)
+
+    self.raw_lower_train_data = ft.read(f_lower_train_data)
+    self.raw_lower_train_labels = ft.read(f_lower_train_labels)
+    #self.raw_upper_train_data = ft.read(f_upper_train_data)
+    #self.raw_upper_train_labels = ft.read(f_upper_train_labels)
+
+    f_digits_train_data.close()
+    f_digits_train_labels.close()
+    f_digits_test_data.close()
+    f_digits_test_labels.close()
+
+    f_lower_train_data.close()
+    f_lower_train_labels.close()
+    #f_upper_train_data.close()
+    #f_upper_train_labels.close()
+
+  def set_batches(self, start_ratio = -1, end_ratio = -1, batch_size = 20, verbose = False):
+    self.batch_size = batch_size
+
+    digits_train_size = len(self.raw_digits_train_labels)
+    digits_test_size = len(self.raw_digits_test_labels)
+
+    lower_train_size = len(self.raw_lower_train_labels)
+    #upper_train_size = len(self.raw_upper_train_labels)
+
+    if verbose == True:
+      print 'digits_train_size = %d' %digits_train_size
+      print 'digits_test_size = %d' %digits_test_size
+      print 'lower_train_size = %d' %lower_train_size
+      #print 'upper_train_size = %d' %upper_train_size
+
+    # define main and other datasets
+    raw_main_train_data = self.raw_digits_train_data
+    raw_other_train_data = self.raw_lower_train_data
+    raw_test_data = self.raw_digits_test_data
+
+    raw_main_train_labels = self.raw_digits_train_labels
+    raw_other_train_labels = self.raw_lower_train_labels
+    raw_test_labels = self.raw_digits_test_labels
+
+    main_train_size = len(raw_main_train_data)
+    other_train_size = len(raw_other_train_data)
+    test_size = len(raw_test_data)
+    test_size = int(test_size/batch_size)
+    test_size *= batch_size
+    validation_size = test_size 
+
+    # default ratio is actual ratio
+    if start_ratio == -1:
+      self.start_ratio = float(main_train_size) / float(main_train_size + other_train_size)
+    else:
+      self.start_ratio = start_ratio
+
+    if end_ratio == -1:
+      self.end_ratio = float(main_train_size) / float(main_train_size + other_train_size)
+    else:
+      self.end_ratio = end_ratio
+
+    if verbose == True:
+      print 'start_ratio = %f' %self.start_ratio
+      print 'end_ratio = %f' %self.end_ratio
+
+    i_main = 0
+    i_other = 0
+    i_batch = 0
+
+    # compute the number of batches given start and end ratios
+    n_main_batch = (main_train_size - batch_size * (self.end_ratio - self.start_ratio) / 2 ) / (batch_size * (self.start_ratio + (self.end_ratio - self.start_ratio) / 2))
+    n_other_batch = (other_train_size - batch_size * (self.end_ratio - self.start_ratio) / 2 ) / (batch_size - batch_size * (self.start_ratio + (self.end_ratio - self.start_ratio) / 2))
+    n_batches = min([n_main_batch, n_other_batch])
+
+    # train batches
+    self.train_batches = []
+
+    # as long as we have data left in main and other, we create batches
+    while i_main < main_train_size - batch_size - test_size  and i_other < other_train_size - batch_size:
+
+      ratio = self.start_ratio + i_batch * (self.end_ratio - self.start_ratio) / n_batches
+      batch_data = []
+      batch_labels = []
+
+      for i in xrange(0, self.batch_size): # randomly choose between main and other, given the current ratio
+	rnd = random.randint(0, 100)
+
+	if rnd < 100 * ratio:
+	  batch_data = batch_data + \
+		[raw_main_train_data[i_main]]
+	  batch_labels = batch_labels + \
+		[raw_main_train_labels[i_main]]
+	  i_main += 1
+	else:
+	  batch_data = batch_data + \
+		[raw_other_train_data[i_other]]
+	  batch_labels = batch_labels + \
+		[raw_other_train_labels[i_other]]
+	  i_other += 1
+
+      self.train_batches = self.train_batches + \
+	      [(batch_data,batch_labels)]
+      i_batch += 1
+
+    offset = i_main
+
+    if verbose == True:
+      print 'n_main = %d' %i_main
+      print 'n_other = %d' %i_other
+      print 'nb_train_batches = %d / %d' %(i_batch,n_batches)
+      print 'offset = %d' %offset
+
+    # test batches
+    self.test_batches = []
+    for i in xrange(0, test_size, batch_size):
+        self.test_batches = self.test_batches + \
+            [(raw_test_data[i:i+batch_size], raw_test_labels[i:i+batch_size])]
+
+    # validation batches
+    self.validation_batches = []
+    for i in xrange(0, test_size, batch_size):
+        self.validation_batches = self.validation_batches + \
+            [(raw_main_train_data[offset+i:offset+i+batch_size], raw_main_train_labels[offset+i:offset+i+batch_size])]
+
+  def get_train_batches(self):
+    return self.train_batches
+
+  def get_test_batches(self):
+    return self.test_batches
+
+  def get_validation_batches(self):
+    return self.validation_batches
+
+  def test_set_batches(self, intervall = 1000):
+    for i in xrange(0, len(self.train_batches) - self.batch_size, intervall):
+	n_main = 0
+
+	for j in xrange(0, self.batch_size):
+	  if self.train_batches[i][1][j] < 10:
+	    n_main +=1
+	print 'ratio batch %d : %f' %(i,float(n_main) / float(self.batch_size))
+
+if __name__ == '__main__':
+    batches = Batches()
+    batches.set_batches(0.5,1, 20, True)
+    batches.test_set_batches()
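
Note: a short usage sketch of the ratio schedule implemented by set_batches (the values below are an example): batch i draws each example from the "main" set (digits) with probability start_ratio + i*(end_ratio - start_ratio)/n_batches, so with start_ratio=0.5 and end_ratio=1.0 the first batches are roughly half digits and half lowercase letters while the last ones are almost all digits.

    from setup_batches import Batches

    batches = Batches()
    batches.set_batches(start_ratio=0.5, end_ratio=1.0, batch_size=20, verbose=True)
    train_batches = batches.get_train_batches()
    batches.test_set_batches()   # print the observed digit ratio every 1000 batches
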