changeset 275:7b4507295eba

merge
author Xavier Glorot <glorotxa@iro.umontreal.ca>
date Mon, 22 Mar 2010 10:20:10 -0400
parents 44409b6652aa (current diff) f6d9b6b89c2a (diff)
children 727ed56fad12
files deep/stacked_dae/__init__.py deep/stacked_dae/mnist_sda.py deep/stacked_dae/nist_sda.py deep/stacked_dae/sgd_optimization.py deep/stacked_dae/stacked_dae.py deep/stacked_dae/utils.py deep/stacked_dae/v2/__init__.py deep/stacked_dae/v2/config.py.example deep/stacked_dae/v2/nist_sda.py deep/stacked_dae/v2/sgd_optimization.py deep/stacked_dae/v2/stacked_dae.py deep/stacked_dae/v2/utils.py
diffstat 22 files changed, 1520 insertions(+), 1300 deletions(-)
--- a/baseline/conv_mlp/convolutional_mlp.py	Mon Mar 22 10:19:45 2010 -0400
+++ b/baseline/conv_mlp/convolutional_mlp.py	Mon Mar 22 10:20:10 2010 -0400
@@ -24,9 +24,12 @@
 import numpy, theano, cPickle, gzip, time
 import theano.tensor as T
 import theano.sandbox.softsign
+import sys
 import pylearn.datasets.MNIST
 from pylearn.io import filetensor as ft
 from theano.sandbox import conv, downsample
+
+from ift6266 import datasets
 import theano,pylearn.version,ift6266
 
 class LeNetConvPoolLayer(object):
@@ -178,81 +181,16 @@
             raise NotImplementedError()
 
 
-def load_dataset(fname,batch=20):
-
-    # directory that contains the NIST data
-    # the following path will work if you are connected to a machine
-    # on the DIRO network
-    datapath = '/data/lisa/data/nist/by_class/'
-    # the .ft file contains the NIST digits in an efficient format. The digits
-    # are stored in an NxD matrix, where N is the number of images and D is
-    # the number of pixels per image (32x32 = 1024). Each pixel of the image is
-    # a value between 0 and 255, corresponding to a grey level. The values are
-    # stored as uint8, i.e. as bytes.
-    f = open(datapath+'digits/digits_train_data.ft')
-    # Make sure you have enough memory to load the data in full into
-    # memory. Otherwise, use ft.arraylike, a class built specifically
-    # for files that you do not want to load into RAM.
-    d = ft.read(f)
-
-    # NB: Don't forget to divide the pixel values by 255. if you ever use
-    # the data as inputs to a neural network and you want inputs
-    # between 0 and 1.
-    # digits_train_data.ft contains the images, digits_train_labels.ft
-    # contains the labels
-    f = open(datapath+'digits/digits_train_labels.ft')
-    labels = ft.read(f)
-
-
-    # Load the dataset 
-    #f = gzip.open(fname,'rb')
-    #train_set, valid_set, test_set = cPickle.load(f)
-    #f.close()
-
-    # make minibatches of size 20 
-    batch_size = batch   # size of the minibatch
-
-    # Dealing with the training set
-    # get the list of training images (x) and their labels (y)
-    (train_set_x, train_set_y) = (d[:200000,:],labels[:200000])
-    # initialize the list of training minibatches with empty list
-    train_batches = []
-    for i in xrange(0, len(train_set_x), batch_size):
-        # add to the list of minibatches the minibatch starting at 
-        # position i, ending at position i+batch_size
-        # a minibatch is a pair ; the first element of the pair is a list 
-        # of datapoints, the second element is the list of corresponding 
-        # labels
-        train_batches = train_batches + \
-               [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
-
-    #print train_batches[500]
-
-    # Dealing with the validation set
-    (valid_set_x, valid_set_y) = (d[200000:270000,:],labels[200000:270000])
-    # initialize the list of validation minibatches 
-    valid_batches = []
-    for i in xrange(0, len(valid_set_x), batch_size):
-        valid_batches = valid_batches + \
-               [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
-
-    # Dealing with the testing set
-    (test_set_x, test_set_y) = (d[270000:340000,:],labels[270000:340000])
-    # initialize the list of testing minibatches 
-    test_batches = []
-    for i in xrange(0, len(test_set_x), batch_size):
-        test_batches = test_batches + \
-              [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
-
-
-    return train_batches, valid_batches, test_batches
-
-
-def evaluate_lenet5(learning_rate=0.1, n_iter=200, batch_size=20, n_kern0=20, n_kern1=50, n_layer=3, filter_shape0=5, filter_shape1=5, dataset='mnist.pkl.gz'):
+def evaluate_lenet5(learning_rate=0.1, n_iter=200, batch_size=20, n_kern0=20, n_kern1=50, n_layer=3, filter_shape0=5, filter_shape1=5, sigmoide_size=500, dataset='mnist.pkl.gz'):
     rng = numpy.random.RandomState(23455)
 
     print 'Before load dataset'
-    train_batches, valid_batches, test_batches = load_dataset(dataset,batch_size)
+    dataset=datasets.nist_digits
+    train_batches= dataset.train(batch_size)
+    valid_batches=dataset.valid(batch_size)
+    test_batches=dataset.test(batch_size)
+    #print valid_batches.shape
+    #print test_batches.shape
     print 'After load dataset'
 
     ishape = (32,32)     # this is the size of NIST images
@@ -305,9 +243,9 @@
 	fshape0=(32-filter_shape0+1)/2
 	layer1_input = layer0.output.flatten(2)
 		# construct a fully-connected sigmoidal layer
-	layer1 = SigmoidalLayer(rng, input=layer1_input,n_in=n_kern0*fshape0*fshape0, n_out=500)
+	layer1 = SigmoidalLayer(rng, input=layer1_input,n_in=n_kern0*fshape0*fshape0, n_out=sigmoide_size)
 
-	layer2 = LogisticRegression(input=layer1.output, n_in=500, n_out=10)
+	layer2 = LogisticRegression(input=layer1.output, n_in=sigmoide_size, n_out=10)
 	cost = layer2.negative_log_likelihood(y)
 	test_model = theano.function([x,y], layer2.errors(y))
 	params = layer2.params+ layer1.params + layer0.params
@@ -335,10 +273,10 @@
 	layer4_input = layer3.output.flatten(2)
 
 	layer4 = SigmoidalLayer(rng, input=layer4_input, 
-					n_in=n_kern3*fshape3*fshape3, n_out=500)
+					n_in=n_kern3*fshape3*fshape3, n_out=sigmoide_size)
 
   
-	layer5 = LogisticRegression(input=layer4.output, n_in=500, n_out=10)
+	layer5 = LogisticRegression(input=layer4.output, n_in=sigmoide_size, n_out=10)
 
 	cost = layer5.negative_log_likelihood(y)
 
@@ -354,10 +292,10 @@
 	layer3_input = layer2.output.flatten(2)
 
 	layer3 = SigmoidalLayer(rng, input=layer3_input, 
-					n_in=n_kern2*fshape2*fshape2, n_out=500)
+					n_in=n_kern2*fshape2*fshape2, n_out=sigmoide_size)
 
   
-	layer4 = LogisticRegression(input=layer3.output, n_in=500, n_out=10)
+	layer4 = LogisticRegression(input=layer3.output, n_in=sigmoide_size, n_out=10)
 
 	cost = layer4.negative_log_likelihood(y)
 
@@ -378,11 +316,11 @@
 
 	# construct a fully-connected sigmoidal layer
 	layer2 = SigmoidalLayer(rng, input=layer2_input, 
-					n_in=n_kern1*fshape1*fshape1, n_out=500)
+					n_in=n_kern1*fshape1*fshape1, n_out=sigmoide_size)
 
   
 	# classify the values of the fully-connected sigmoidal layer
-	layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)
+	layer3 = LogisticRegression(input=layer2.output, n_in=sigmoide_size, n_out=10)
 
 	# the cost we minimize during training is the NLL of the model
 	cost = layer3.negative_log_likelihood(y)
@@ -414,7 +352,28 @@
     # TRAIN MODEL #
     ###############
 
-    n_minibatches        = len(train_batches) 
+    #n_minibatches        = len(train_batches) 
+    n_minibatches=0
+    n_valid=0
+    n_test=0
+    for x, y in dataset.train(batch_size):
+	if x.shape[0] == batch_size:
+	    n_minibatches+=1
+    n_minibatches*=batch_size
+    print n_minibatches
+
+    for x, y in dataset.valid(batch_size):
+	if x.shape[0] == batch_size:
+	    n_valid+=1
+    n_valid*=batch_size
+    print n_valid
+
+    for x, y in dataset.test(batch_size):
+	if x.shape[0] == batch_size:
+	    n_test+=1
+    n_test*=batch_size
+    print n_test
+  
 
     # early-stopping parameters
     patience              = 10000 # look at this many examples regardless
@@ -433,60 +392,65 @@
     test_score           = 0.
     start_time = time.clock()
 
-    # have a maximum of `n_iter` iterations through the entire dataset
-    for iter in xrange(n_iter * n_minibatches):
-
-        # get epoch and minibatch index
-        epoch           = iter / n_minibatches
-        minibatch_index =  iter % n_minibatches
 
-        # get the minibatches corresponding to `iter` modulo
-        # `len(train_batches)`
-        x,y = train_batches[ minibatch_index ]
-	
-        if iter %100 == 0:
-            print 'training @ iter = ', iter
-        cost_ij = train_model(x,y)
-
-        if (iter+1) % validation_frequency == 0: 
+    # have a maximum of `n_iter` iterations through the entire dataset
+    iter=0
+    for epoch in xrange(n_iter):
+	for x, y in train_batches:
+	    if x.shape[0] != batch_size:
+		continue
+	    iter+=1
 
-            # compute zero-one loss on validation set 
-            this_validation_loss = 0.
-            for x,y in valid_batches:
-                # sum up the errors for each minibatch
-                this_validation_loss += test_model(x,y)
-
-            # get the average by dividing with the number of minibatches
-            this_validation_loss /= len(valid_batches)
-            print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                   (epoch, minibatch_index+1, n_minibatches, \
-                    this_validation_loss*100.))
+	    # get epoch and minibatch index
+	    #epoch           = iter / n_minibatches
+	    minibatch_index =  iter % n_minibatches
+	    
+	    if iter %100 == 0:
+		print 'training @ iter = ', iter
+	    cost_ij = train_model(x,y)
 
 
-            # if we got the best validation score until now
-            if this_validation_loss < best_validation_loss:
+	# compute zero-one loss on validation set 
+	this_validation_loss = 0.
+	for x,y in valid_batches:
+	    if x.shape[0] != batch_size:
+		continue
+	    # sum up the errors for each minibatch
+	    this_validation_loss += test_model(x,y)
 
-                #improve patience if loss improvement is good enough
-                if this_validation_loss < best_validation_loss *  \
-                       improvement_threshold :
-                    patience = max(patience, iter * patience_increase)
+	# get the average by dividing with the number of minibatches
+	this_validation_loss /= n_valid
+	print('epoch %i, minibatch %i/%i, validation error %f %%' % \
+	      (epoch, minibatch_index+1, n_minibatches, \
+		this_validation_loss*100.))
 
-                # save best validation score and iteration number
-                best_validation_loss = this_validation_loss
-                best_iter = iter
+
+	# if we got the best validation score until now
+	if this_validation_loss < best_validation_loss:
 
-                # test it on the test set
-                test_score = 0.
-                for x,y in test_batches:
-                    test_score += test_model(x,y)
-                test_score /= len(test_batches)
-                print(('     epoch %i, minibatch %i/%i, test error of best '
-                      'model %f %%') % 
-                             (epoch, minibatch_index+1, n_minibatches,
-                              test_score*100.))
+	    #improve patience if loss improvement is good enough
+	    if this_validation_loss < best_validation_loss *  \
+		  improvement_threshold :
+		patience = max(patience, iter * patience_increase)
+
+	    # save best validation score and iteration number
+	    best_validation_loss = this_validation_loss
+	    best_iter = iter
 
-        if patience <= iter :
-            break
+	    # test it on the test set
+	    test_score = 0.
+	    for x,y in test_batches:
+		if x.shape[0] != batch_size:
+		    continue
+		test_score += test_model(x,y)
+	    test_score /= n_test
+	    print(('     epoch %i, minibatch %i/%i, test error of best '
+		  'model %f %%') % 
+			(epoch, minibatch_index+1, n_minibatches,
+			  test_score*100.))
+
+	if patience <= iter :
+	    break
 
     end_time = time.clock()
     print('Optimization complete.')
@@ -502,8 +466,10 @@
 
 def experiment(state, channel):
     print 'start experiment'
-    (best_validation_loss, test_score, minutes_trained, iter) = evaluate_lenet5(state.learning_rate, state.n_iter, state.batch_size, state.n_kern0, state.n_kern1, state.n_layer, state.filter_shape0, state.filter_shape1)
+    (best_validation_loss, test_score, minutes_trained, iter) = evaluate_lenet5(state.learning_rate, state.n_iter, state.batch_size, state.n_kern0, state.n_kern1, state.n_layer, state.filter_shape0, state.filter_shape1,state.sigmoide_size)
     print 'end experiment'
+
+    pylearn.version.record_versions(state,[theano,ift6266,pylearn])
     
     state.best_validation_loss = best_validation_loss
     state.test_score = test_score
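
The refactor in this file drops the in-memory load_dataset in favour of the ift6266.datasets iterator API: dataset.train(batch_size) returns an iterator of (x, y) minibatch pairs, and incomplete trailing batches are skipped via the x.shape[0] != batch_size test. Note that the three counting loops multiply the number of complete batches by batch_size, so n_minibatches, n_valid and n_test actually hold example counts. A minimal sketch of a helper that would factor out this repeated counting (the helper name is hypothetical; it assumes only that each split is a callable returning a fresh iterator of (x, y) pairs):

def count_complete_examples(split, batch_size):
    # number of examples covered by complete minibatches of `split`
    n_batches = 0
    for x, y in split(batch_size):
        if x.shape[0] == batch_size:  # skip the partial trailing batch
            n_batches += 1
    return n_batches * batch_size

# hypothetical usage mirroring the three loops above:
# n_minibatches = count_complete_examples(dataset.train, batch_size)
# n_valid = count_complete_examples(dataset.valid, batch_size)
# n_test = count_complete_examples(dataset.test, batch_size)
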
--- a/baseline/log_reg/log_reg.py	Mon Mar 22 10:19:45 2010 -0400
+++ b/baseline/log_reg/log_reg.py	Mon Mar 22 10:20:10 2010 -0400
@@ -142,7 +142,7 @@
 #--------------------------------------------------------------------------------------------------------------------
 
 def log_reg( learning_rate = 0.13, nb_max_examples =1000000, batch_size = 50, \
-                    dataset=datasets.nist_digits, image_size = 32 * 32, nb_class = 10,  \
+                    dataset=datasets.nist_digits(), image_size = 32 * 32, nb_class = 10,  \
                     patience = 5000, patience_increase = 2, improvement_threshold = 0.995):
     
     #28 * 28 = 784
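
A subtlety in the one-line change above: datasets.nist_digits() is now called where the default argument is evaluated, i.e. once, when the def statement runs at import time, and every call of log_reg that relies on the default then shares that single dataset object. If the dataset should instead be constructed per call, the usual lazy-default pattern would look like this (a sketch, not necessarily what was intended here):

def log_reg(learning_rate=0.13, nb_max_examples=1000000, batch_size=50,
            dataset=None, image_size=32*32, nb_class=10,
            patience=5000, patience_increase=2, improvement_threshold=0.995):
    if dataset is None:
        # build the dataset lazily, once per call that uses the default
        dataset = datasets.nist_digits()
    # ... rest of the function unchanged
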
--- a/datasets/defs.py	Mon Mar 22 10:19:45 2010 -0400
+++ b/datasets/defs.py	Mon Mar 22 10:20:10 2010 -0400
@@ -43,8 +43,10 @@
                 valid_lbl = [os.path.join(DATA_PATH,'ocr_valid_labels.ft')],
                 indtype=theano.config.floatX, inscale=255., maxsize=maxsize)
 
-nist_P07 = lambda maxsize=None: FTDataSet(train_data = [os.path.join(DATA_PATH,'data/P07_train'+str(i)+'_data.ft') for i in range(100)],
-                     train_lbl = [os.path.join(DATA_PATH,'data/P07_train'+str(i)+'_labels.ft') for i in range(100)],
+# There are two extra arguments here so one can choose smaller datasets based on the file number.
+# This is useful for getting different data for pre-training and fine-tuning.
+nist_P07 = lambda maxsize=None, min_file=0, max_file=100: FTDataSet(train_data = [os.path.join(DATA_PATH,'data/P07_train'+str(i)+'_data.ft') for i in range(min_file, max_file)],
+                     train_lbl = [os.path.join(DATA_PATH,'data/P07_train'+str(i)+'_labels.ft') for i in range(min_file, max_file)],
                      test_data = [os.path.join(DATA_PATH,'data/P07_test_data.ft')],
                      test_lbl = [os.path.join(DATA_PATH,'data/P07_test_labels.ft')],
                      valid_data = [os.path.join(DATA_PATH,'data/P07_valid_data.ft')],
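
With the new min_file/max_file arguments, disjoint slices of the 100 P07 training files can feed the two training phases, as the comment above suggests. A sketch of the intended usage, assuming nist_P07 is re-exported by the datasets package as defs.py suggests (the 50-file split point is an arbitrary illustration, not taken from this changeset):

from ift6266 import datasets

# first 50 files for unsupervised pre-training,
# the remaining 50 for supervised fine-tuning
pretrain_set = datasets.nist_P07(min_file=0, max_file=50)
finetune_set = datasets.nist_P07(min_file=50, max_file=100)
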
--- a/datasets/ftfile.py	Mon Mar 22 10:19:45 2010 -0400
+++ b/datasets/ftfile.py	Mon Mar 22 10:20:10 2010 -0400
@@ -201,7 +201,9 @@
         set.
         """
         if valid_data is None:
-            total_valid_size = min(sum(FTFile(td).size for td in test_data), maxsize)
+            total_valid_size = sum(FTFile(td).size for td in test_data)
+            if maxsize is not None:
+                total_valid_size = min(total_valid_size, maxsize) 
             valid_size = total_valid_size/len(train_data)
             self._train = FTData(train_data, train_lbl, size=-valid_size,
                                  inscale=inscale, outscale=outscale,
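
The added guard matters because the old min(sum(...), maxsize) breaks under Python 2 whenever maxsize is None: None compares below every integer, so min() silently returns None and the division on the next line raises a TypeError. In isolation:

# Python 2 pitfall that the change above avoids:
min(12345, None)   # -> None, since None sorts below every int in Python 2
# None / len(train_data) then raises:
# TypeError: unsupported operand type(s) for /: 'NoneType' and 'int'
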
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/stacked_dae/config.py.example	Mon Mar 22 10:20:10 2010 -0400
@@ -0,0 +1,109 @@
+# ----------------------------------------------------------------------------
+# BEGIN EXPERIMENT ISOLATION CODE
+
+'''
+This makes sure we use the codebase clone created for this experiment.
+I.e. if you want to make modifications to the codebase but don't want your
+running experiment code to be impacted by those changes, first copy the
+codebase somewhere, and configure this section. It will make sure we import
+from the right place.
+
+MUST BE DONE BEFORE IMPORTING ANYTHING ELSE
+(Leave this comment there so others will understand what's going on)
+'''
+
+# Place where you copied modules that should be fixed for this experiment
+codebase_clone_path = "/u/savardf/ift6266/experiment_clones/ift6266_experiment10"
+
+# Places where there might be conflicting modules from your $PYTHONPATH
+remove_these_from_pythonpath = ["/u/savardf/ift6266/dev_code"]
+
+import sys
+sys.path[0:0] = [codebase_clone_path]
+
+# remove paths we specifically don't want in $PYTHONPATH
+for bad_path in remove_these_from_pythonpath:
+    sys.path[:] = [el for el in sys.path if not el in (bad_path, bad_path+"/")]
+
+# Make the imports
+import ift6266
+
+# Just making sure we're importing from the right place
+modules_to_check = [ift6266]
+for module in modules_to_check:
+    if not codebase_clone_path in module.__path__[0]:
+        raise RuntimeError("Module loaded from incorrect path "+module.__path__[0])
+
+# Path to pass to jobman sqlschedule. IMPORTANT TO CHANGE TO REFLECT YOUR CLONE.
+# Make sure this is accessible from the default $PYTHONPATH (in your .bashrc)
+# (and make sure every subdirectory has its __init__.py file)
+EXPERIMENT_PATH = "ift6266_experiment10.ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint"
+
+# END EXPERIMENT ISOLATION CODE
+# ----------------------------------------------------------------------------
+
+from jobman import DD
+
+'''
+These are parameters used by nist_sda.py. They'll end up as globals in there.
+
+Rename this file to config.py and configure as needed.
+DON'T add the renamed file to the repository, as others might use it
+without realizing it, with dire consequences.
+'''
+
+# Set this to True when you want to run cluster tests, i.e. you want
+# to run on the cluster, many jobs, but want to reduce the training
+# set size and the number of epochs, so you know everything runs
+# fine on the cluster.
+# Set this PRIOR to inserting your test jobs in the DB.
+TEST_CONFIG = False
+
+NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
+NIST_ALL_TRAIN_SIZE = 649081
+# valid and test = 82587 82587
+
+# change "sandbox" when you're ready
+JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/yourtablenamehere'
+
+# reduce training set to that many examples
+REDUCE_TRAIN_TO = None
+# that's a max, it usually doesn't get to that point
+MAX_FINETUNING_EPOCHS = 1000
+# number of minibatches before taking means for valid error etc.
+REDUCE_EVERY = 100
+
+if TEST_CONFIG:
+    REDUCE_TRAIN_TO = 1000
+    MAX_FINETUNING_EPOCHS = 2
+    REDUCE_EVERY = 10
+
+
+# This is to configure insertion of jobs on the cluster.
+# Possible values the hyperparameters can take. These are then
+# combined with produit_cartesien_jobs so we get a list of all
+# possible combinations, each one resulting in a job inserted
+# in the jobman DB.
+JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001],
+        'pretraining_epochs_per_layer': [10,20],
+        'hidden_layers_sizes': [300,800],
+        'corruption_levels': [0.1,0.2,0.3],
+        'minibatch_size': [20],
+        'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS],
+        'finetuning_lr':[0.1, 0.01], #0.001 was very bad, so we leave it out
+        'num_hidden_layers':[2,3]}
+
+# Just useful for tests... minimal number of epochs
+# (This is used when running a single job, locally, when
+# calling ./nist_sda.py test_jobman_entrypoint)
+DEFAULT_HP_NIST = DD({'finetuning_lr':0.1,
+                       'pretraining_lr':0.1,
+                       'pretraining_epochs_per_layer':2,
+                       'max_finetuning_epochs':2,
+                       'hidden_layers_sizes':800,
+                       'corruption_levels':0.2,
+                       'minibatch_size':20,
+                       'reduce_train_to':10000,
+                       'num_hidden_layers':1})
+
+
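
JOB_VALS maps each hyperparameter to its list of candidate values, and produit_cartesien_jobs expands it into one jobman job per combination; for the values above that is 2*2*2*3*1*1*2*2 = 96 jobs. A rough sketch of what such an expansion presumably looks like, assuming only that a job is a flat dict with one value per key (the real helper lives in utils.py and may differ):

import itertools

def produit_cartesien_jobs(job_vals):
    # one job dict per point of the hyperparameter grid
    keys = sorted(job_vals.keys())
    return [dict(zip(keys, combo))
            for combo in itertools.product(*[job_vals[k] for k in keys])]

# len(produit_cartesien_jobs(JOB_VALS)) == 96 for the values above
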
--- a/deep/stacked_dae/mnist_sda.py	Mon Mar 22 10:19:45 2010 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,45 +0,0 @@
-#!/usr/bin/python
-# coding: utf-8
-
-# TODO: This probably doesn't work anymore, adapt to new code in sgd_opt
-# Parameterize call to sgd_optimization for MNIST
-
-import numpy 
-import theano
-import time
-import theano.tensor as T
-from theano.tensor.shared_randomstreams import RandomStreams
-
-from sgd_optimization import SdaSgdOptimizer
-import cPickle, gzip
-from jobman import DD
-
-MNIST_LOCATION = '/u/savardf/datasets/mnist.pkl.gz'
-
-def sgd_optimization_mnist(learning_rate=0.1, pretraining_epochs = 2, \
-                            pretrain_lr = 0.1, training_epochs = 5, \
-                            dataset='mnist.pkl.gz'):
-    # Load the dataset 
-    f = gzip.open(dataset,'rb')
-    # this gives us train, valid, test (each with .x, .y)
-    dataset = cPickle.load(f)
-    f.close()
-
-    n_ins = 28*28
-    n_outs = 10
-
-    hyperparameters = DD({'finetuning_lr':learning_rate,
-                       'pretraining_lr':pretrain_lr,
-                       'pretraining_epochs_per_layer':pretraining_epochs,
-                       'max_finetuning_epochs':training_epochs,
-                       'hidden_layers_sizes':[100],
-                       'corruption_levels':[0.2],
-                       'minibatch_size':20})
-
-    optimizer = SdaSgdOptimizer(dataset, hyperparameters, n_ins, n_outs)
-    optimizer.pretrain()
-    optimizer.finetune()
-
-if __name__ == '__main__':
-    sgd_optimization_mnist(dataset=MNIST_LOCATION)
-
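
Both this deleted script and its copy under old/ configure the optimizer through jobman's DD, a dict whose keys are also readable and writable as attributes; the changeset relies on both spellings (state.num_hidden_layers, state['reduce_train_to'], state.best_validation_loss = ...). A minimal sketch:

from jobman import DD

hp = DD({'finetuning_lr': 0.1, 'minibatch_size': 20})
assert hp.finetuning_lr == hp['finetuning_lr'] == 0.1
hp.best_validation_loss = 0.05   # attribute writes update the dict, too
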
--- a/deep/stacked_dae/nist_sda.py	Mon Mar 22 10:19:45 2010 -0400
+++ b/deep/stacked_dae/nist_sda.py	Mon Mar 22 10:20:10 2010 -0400
@@ -1,6 +1,9 @@
 #!/usr/bin/python
 # coding: utf-8
 
+# Must be imported first
+from config import *
+
 import ift6266
 import pylearn
 
@@ -25,69 +28,22 @@
 
 from sgd_optimization import SdaSgdOptimizer
 
-from ift6266.utils.scalar_series import *
-
-##############################################################################
-# GLOBALS
-
-TEST_CONFIG = False
-
-NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
-JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_db/fsavard_sda4'
-EXPERIMENT_PATH = "ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint"
-
-REDUCE_TRAIN_TO = None
-MAX_FINETUNING_EPOCHS = 1000
-# number of minibatches before taking means for valid error etc.
-REDUCE_EVERY = 1000
-
-if TEST_CONFIG:
-    REDUCE_TRAIN_TO = 1000
-    MAX_FINETUNING_EPOCHS = 2
-    REDUCE_EVERY = 10
+#from ift6266.utils.scalar_series import *
+from ift6266.utils.seriestables import *
+import tables
 
-# Possible values the hyperparameters can take. These are then
-# combined with produit_cartesien_jobs so we get a list of all
-# possible combinations, each one resulting in a job inserted
-# in the jobman DB.
-JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001],
-        'pretraining_epochs_per_layer': [10,20],
-        'hidden_layers_sizes': [300,800],
-        'corruption_levels': [0.1,0.2,0.3],
-        'minibatch_size': [20],
-        'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS],
-        'finetuning_lr':[0.1, 0.01], #0.001 was very bad, so we leave it out
-        'num_hidden_layers':[2,3]}
-
-# Just useful for tests... minimal number of epochs
-DEFAULT_HP_NIST = DD({'finetuning_lr':0.1,
-                       'pretraining_lr':0.1,
-                       'pretraining_epochs_per_layer':20,
-                       'max_finetuning_epochs':2,
-                       'hidden_layers_sizes':800,
-                       'corruption_levels':0.2,
-                       'minibatch_size':20,
-                       #'reduce_train_to':300,
-                       'num_hidden_layers':2})
+from ift6266 import datasets
 
 '''
 Function called by jobman upon launching each job
-Its path is the one given when inserting jobs:
-ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint
+Its path is the one given when inserting jobs: see EXPERIMENT_PATH
 '''
 def jobman_entrypoint(state, channel):
     # record mercurial versions of each package
     pylearn.version.record_versions(state,[theano,ift6266,pylearn])
+    # TODO: remove this, bad for number of simultaneous requests on DB
     channel.save()
 
-    workingdir = os.getcwd()
-
-    print "Will load NIST"
-
-    nist = NIST(minibatch_size=20)
-
-    print "NIST loaded"
-
     # For test runs, we don't want to use the whole dataset so
     # reduce it to fewer elements if asked to.
     rtt = None
@@ -95,59 +51,93 @@
         rtt = state['reduce_train_to']
     elif REDUCE_TRAIN_TO:
         rtt = REDUCE_TRAIN_TO
-
-    if rtt:
-        print "Reducing training set to "+str(rtt)+ " examples"
-        nist.reduce_train_set(rtt)
-
-    train,valid,test = nist.get_tvt()
-    dataset = (train,valid,test)
-
+ 
     n_ins = 32*32
     n_outs = 62 # 10 digits, 26*2 (lower, capitals)
+     
+    examples_per_epoch = NIST_ALL_TRAIN_SIZE
 
-    # b,b',W for each hidden layer 
-    # + b,W of last layer (logreg)
-    numparams = state.num_hidden_layers * 3 + 2
-    series_mux = None
-    series_mux = create_series(workingdir, numparams)
+    series = create_series(state.num_hidden_layers)
 
     print "Creating optimizer with state, ", state
 
-    optimizer = SdaSgdOptimizer(dataset=dataset, hyperparameters=state, \
+    optimizer = SdaSgdOptimizer(dataset=datasets.nist_all(), 
+                                    hyperparameters=state, \
                                     n_ins=n_ins, n_outs=n_outs,\
-                                    input_divider=255.0, series_mux=series_mux)
+                                    examples_per_epoch=examples_per_epoch, \
+                                    series=series,
+                                    max_minibatches=rtt)
 
-    optimizer.pretrain()
+    optimizer.pretrain(datasets.nist_all())
     channel.save()
 
-    optimizer.finetune()
+    optimizer.finetune(datasets.nist_all())
     channel.save()
 
     return channel.COMPLETE
 
 # These Series objects are used to save various statistics
 # during the training.
-def create_series(basedir, numparams):
-    mux = SeriesMultiplexer()
+def create_series(num_hidden_layers):
+
+    # Replace series we don't want to save with DummySeries, e.g.
+    # series['training_error'] = DummySeries()
+
+    series = {}
+
+    basedir = os.getcwd()
+
+    h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w")
+
+    # reconstruction
+    reconstruction_base = \
+                ErrorSeries(error_name="reconstruction_error",
+                    table_name="reconstruction_error",
+                    hdf5_file=h5f,
+                    index_names=('epoch','minibatch'),
+                    title="Reconstruction error (mean over "+str(REDUCE_EVERY)+" minibatches)")
+    series['reconstruction_error'] = \
+                AccumulatorSeriesWrapper(base_series=reconstruction_base,
+                    reduce_every=REDUCE_EVERY)
+
+    # train
+    training_base = \
+                ErrorSeries(error_name="training_error",
+                    table_name="training_error",
+                    hdf5_file=h5f,
+                    index_names=('epoch','minibatch'),
+                    title="Training error (mean over "+str(REDUCE_EVERY)+" minibatches)")
+    series['training_error'] = \
+                AccumulatorSeriesWrapper(base_series=training_base,
+                    reduce_every=REDUCE_EVERY)
+
+    # valid and test are not accumulated/mean, saved directly
+    series['validation_error'] = \
+                ErrorSeries(error_name="validation_error",
+                    table_name="validation_error",
+                    hdf5_file=h5f,
+                    index_names=('epoch','minibatch'))
+
+    series['test_error'] = \
+                ErrorSeries(error_name="test_error",
+                    table_name="test_error",
+                    hdf5_file=h5f,
+                    index_names=('epoch','minibatch'))
+
+    param_names = []
+    for i in range(num_hidden_layers):
+        param_names += ['layer%d_W'%i, 'layer%d_b'%i, 'layer%d_bprime'%i]
+    param_names += ['logreg_layer_W', 'logreg_layer_b']
 
     # comment out series we don't want to save
-    mux.add_series(AccumulatorSeries(name="reconstruction_error",
-                    reduce_every=REDUCE_EVERY, # every 1000 batches, we take the mean and save
-                    mean=True,
-                    directory=basedir, flush_every=1))
+    series['params'] = SharedParamsStatisticsWrapper(
+                        new_group_name="params",
+                        base_group="/",
+                        arrays_names=param_names,
+                        hdf5_file=h5f,
+                        index_names=('epoch',))
 
-    mux.add_series(AccumulatorSeries(name="training_error",
-                    reduce_every=REDUCE_EVERY, # every 1000 batches, we take the mean and save
-                    mean=True,
-                    directory=basedir, flush_every=1))
-
-    mux.add_series(BaseSeries(name="validation_error", directory=basedir, flush_every=1))
-    mux.add_series(BaseSeries(name="test_error", directory=basedir, flush_every=1))
-
-    mux.add_series(ParamsArrayStats(numparams,name="params",directory=basedir))
-
-    return mux
+    return series
 
 # Perform insertion into the Postgre DB based on combination
 # of hyperparameter values above
@@ -162,93 +152,14 @@
 
     print "inserted"
 
-class NIST:
-    def __init__(self, minibatch_size, basepath=None, reduce_train_to=None):
-        global NIST_ALL_LOCATION
-
-        self.minibatch_size = minibatch_size
-        self.basepath = basepath and basepath or NIST_ALL_LOCATION
-
-        self.set_filenames()
-
-        # arrays of 2 elements: .x, .y
-        self.train = [None, None]
-        self.test = [None, None]
-
-        self.load_train_test()
-
-        self.valid = [[], []]
-        self.split_train_valid()
-        if reduce_train_to:
-            self.reduce_train_set(reduce_train_to)
-
-    def get_tvt(self):
-        return self.train, self.valid, self.test
-
-    def set_filenames(self):
-        self.train_files = ['all_train_data.ft',
-                                'all_train_labels.ft']
-
-        self.test_files = ['all_test_data.ft',
-                            'all_test_labels.ft']
-
-    def load_train_test(self):
-        self.load_data_labels(self.train_files, self.train)
-        self.load_data_labels(self.test_files, self.test)
-
-    def load_data_labels(self, filenames, pair):
-        for i, fn in enumerate(filenames):
-            f = open(os.path.join(self.basepath, fn))
-            pair[i] = filetensor.read(f)
-            f.close()
-
-    def reduce_train_set(self, max):
-        self.train[0] = self.train[0][:max]
-        self.train[1] = self.train[1][:max]
-
-        if max < len(self.test[0]):
-            for ar in (self.test, self.valid):
-                ar[0] = ar[0][:max]
-                ar[1] = ar[1][:max]
-
-    def split_train_valid(self):
-        test_len = len(self.test[0])
-        
-        new_train_x = self.train[0][:-test_len]
-        new_train_y = self.train[1][:-test_len]
-
-        self.valid[0] = self.train[0][-test_len:]
-        self.valid[1] = self.train[1][-test_len:]
-
-        self.train[0] = new_train_x
-        self.train[1] = new_train_y
-
-def test_load_nist():
-    print "Will load NIST"
-
-    import time
-    t1 = time.time()
-    nist = NIST(20)
-    t2 = time.time()
-
-    print "NIST loaded. time delta = ", t2-t1
-
-    tr,v,te = nist.get_tvt()
-
-    print "Lenghts: ", len(tr[0]), len(v[0]), len(te[0])
-
-    raw_input("Press any key")
-
 if __name__ == '__main__':
 
-    import sys
-
     args = sys.argv[1:]
 
-    if len(args) > 0 and args[0] == 'load_nist':
-        test_load_nist()
+    #if len(args) > 0 and args[0] == 'load_nist':
+    #    test_load_nist()
 
-    elif len(args) > 0 and args[0] == 'jobman_insert':
+    if len(args) > 0 and args[0] == 'jobman_insert':
         jobman_insert_nist()
 
     elif len(args) > 0 and args[0] == 'test_jobman_entrypoint':
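
The new create_series writes every statistic into a single series.h5 file via PyTables instead of the old per-series flat files of scalar_series. A sketch of inspecting the result afterwards, assuming each ErrorSeries stores its index columns (epoch, minibatch) alongside the recorded value in the table named above (the exact column layout belongs to ift6266.utils.seriestables and may differ):

import tables

h5f = tables.openFile("series.h5", "r")   # PyTables 2.x spelling, as used above
for row in h5f.root.validation_error:
    # assumed columns: the index_names plus the error value itself
    print row['epoch'], row['minibatch'], row['validation_error']
h5f.close()
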
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/stacked_dae/old/mnist_sda.py	Mon Mar 22 10:20:10 2010 -0400
@@ -0,0 +1,45 @@
+#!/usr/bin/python
+# coding: utf-8
+
+# TODO: This probably doesn't work anymore, adapt to new code in sgd_opt
+# Parameterize call to sgd_optimization for MNIST
+
+import numpy 
+import theano
+import time
+import theano.tensor as T
+from theano.tensor.shared_randomstreams import RandomStreams
+
+from sgd_optimization import SdaSgdOptimizer
+import cPickle, gzip
+from jobman import DD
+
+MNIST_LOCATION = '/u/savardf/datasets/mnist.pkl.gz'
+
+def sgd_optimization_mnist(learning_rate=0.1, pretraining_epochs = 2, \
+                            pretrain_lr = 0.1, training_epochs = 5, \
+                            dataset='mnist.pkl.gz'):
+    # Load the dataset 
+    f = gzip.open(dataset,'rb')
+    # this gives us train, valid, test (each with .x, .y)
+    dataset = cPickle.load(f)
+    f.close()
+
+    n_ins = 28*28
+    n_outs = 10
+
+    hyperparameters = DD({'finetuning_lr':learning_rate,
+                       'pretraining_lr':pretrain_lr,
+                       'pretraining_epochs_per_layer':pretraining_epochs,
+                       'max_finetuning_epochs':training_epochs,
+                       'hidden_layers_sizes':[100],
+                       'corruption_levels':[0.2],
+                       'minibatch_size':20})
+
+    optimizer = SdaSgdOptimizer(dataset, hyperparameters, n_ins, n_outs)
+    optimizer.pretrain()
+    optimizer.finetune()
+
+if __name__ == '__main__':
+    sgd_optimization_mnist(dataset=MNIST_LOCATION)
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/stacked_dae/old/nist_sda.py	Mon Mar 22 10:20:10 2010 -0400
@@ -0,0 +1,260 @@
+#!/usr/bin/python
+# coding: utf-8
+
+import ift6266
+import pylearn
+
+import numpy 
+import theano
+import time
+
+import pylearn.version
+import theano.tensor as T
+from theano.tensor.shared_randomstreams import RandomStreams
+
+import copy
+import sys
+import os
+import os.path
+
+from jobman import DD
+import jobman, jobman.sql
+from pylearn.io import filetensor
+
+from utils import produit_cartesien_jobs
+
+from sgd_optimization import SdaSgdOptimizer
+
+from ift6266.utils.scalar_series import *
+
+##############################################################################
+# GLOBALS
+
+TEST_CONFIG = False
+
+NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
+JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_db/fsavard_sda4'
+EXPERIMENT_PATH = "ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint"
+
+REDUCE_TRAIN_TO = None
+MAX_FINETUNING_EPOCHS = 1000
+# number of minibatches before taking means for valid error etc.
+REDUCE_EVERY = 1000
+
+if TEST_CONFIG:
+    REDUCE_TRAIN_TO = 1000
+    MAX_FINETUNING_EPOCHS = 2
+    REDUCE_EVERY = 10
+
+# Possible values the hyperparameters can take. These are then
+# combined with produit_cartesien_jobs so we get a list of all
+# possible combinations, each one resulting in a job inserted
+# in the jobman DB.
+JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001],
+        'pretraining_epochs_per_layer': [10,20],
+        'hidden_layers_sizes': [300,800],
+        'corruption_levels': [0.1,0.2,0.3],
+        'minibatch_size': [20],
+        'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS],
+        'finetuning_lr':[0.1, 0.01], #0.001 was very bad, so we leave it out
+        'num_hidden_layers':[2,3]}
+
+# Just useful for tests... minimal number of epochs
+DEFAULT_HP_NIST = DD({'finetuning_lr':0.1,
+                       'pretraining_lr':0.1,
+                       'pretraining_epochs_per_layer':20,
+                       'max_finetuning_epochs':2,
+                       'hidden_layers_sizes':800,
+                       'corruption_levels':0.2,
+                       'minibatch_size':20,
+                       #'reduce_train_to':300,
+                       'num_hidden_layers':2})
+
+'''
+Function called by jobman upon launching each job
+Its path is the one given when inserting jobs:
+ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint
+'''
+def jobman_entrypoint(state, channel):
+    # record mercurial versions of each package
+    pylearn.version.record_versions(state,[theano,ift6266,pylearn])
+    channel.save()
+
+    workingdir = os.getcwd()
+
+    print "Will load NIST"
+
+    nist = NIST(minibatch_size=20)
+
+    print "NIST loaded"
+
+    # For test runs, we don't want to use the whole dataset so
+    # reduce it to fewer elements if asked to.
+    rtt = None
+    if state.has_key('reduce_train_to'):
+        rtt = state['reduce_train_to']
+    elif REDUCE_TRAIN_TO:
+        rtt = REDUCE_TRAIN_TO
+
+    if rtt:
+        print "Reducing training set to "+str(rtt)+ " examples"
+        nist.reduce_train_set(rtt)
+
+    train,valid,test = nist.get_tvt()
+    dataset = (train,valid,test)
+
+    n_ins = 32*32
+    n_outs = 62 # 10 digits, 26*2 (lower, capitals)
+
+    # b,b',W for each hidden layer 
+    # + b,W of last layer (logreg)
+    numparams = state.num_hidden_layers * 3 + 2
+    series_mux = None
+    series_mux = create_series(workingdir, numparams)
+
+    print "Creating optimizer with state, ", state
+
+    optimizer = SdaSgdOptimizer(dataset=dataset, hyperparameters=state, \
+                                    n_ins=n_ins, n_outs=n_outs,\
+                                    input_divider=255.0, series_mux=series_mux)
+
+    optimizer.pretrain()
+    channel.save()
+
+    optimizer.finetune()
+    channel.save()
+
+    return channel.COMPLETE
+
+# These Series objects are used to save various statistics
+# during the training.
+def create_series(basedir, numparams):
+    mux = SeriesMultiplexer()
+
+    # comment out series we don't want to save
+    mux.add_series(AccumulatorSeries(name="reconstruction_error",
+                    reduce_every=REDUCE_EVERY, # every 1000 batches, we take the mean and save
+                    mean=True,
+                    directory=basedir, flush_every=1))
+
+    mux.add_series(AccumulatorSeries(name="training_error",
+                    reduce_every=REDUCE_EVERY, # every 1000 batches, we take the mean and save
+                    mean=True,
+                    directory=basedir, flush_every=1))
+
+    mux.add_series(BaseSeries(name="validation_error", directory=basedir, flush_every=1))
+    mux.add_series(BaseSeries(name="test_error", directory=basedir, flush_every=1))
+
+    mux.add_series(ParamsArrayStats(numparams,name="params",directory=basedir))
+
+    return mux
+
+# Perform insertion into the Postgre DB based on combination
+# of hyperparameter values above
+# (see comment for produit_cartesien_jobs() to know how it works)
+def jobman_insert_nist():
+    jobs = produit_cartesien_jobs(JOB_VALS)
+
+    db = jobman.sql.db(JOBDB)
+    for job in jobs:
+        job.update({jobman.sql.EXPERIMENT: EXPERIMENT_PATH})
+        jobman.sql.insert_dict(job, db)
+
+    print "inserted"
+
+class NIST:
+    def __init__(self, minibatch_size, basepath=None, reduce_train_to=None):
+        global NIST_ALL_LOCATION
+
+        self.minibatch_size = minibatch_size
+        self.basepath = basepath and basepath or NIST_ALL_LOCATION
+
+        self.set_filenames()
+
+        # arrays of 2 elements: .x, .y
+        self.train = [None, None]
+        self.test = [None, None]
+
+        self.load_train_test()
+
+        self.valid = [[], []]
+        self.split_train_valid()
+        if reduce_train_to:
+            self.reduce_train_set(reduce_train_to)
+
+    def get_tvt(self):
+        return self.train, self.valid, self.test
+
+    def set_filenames(self):
+        self.train_files = ['all_train_data.ft',
+                                'all_train_labels.ft']
+
+        self.test_files = ['all_test_data.ft',
+                            'all_test_labels.ft']
+
+    def load_train_test(self):
+        self.load_data_labels(self.train_files, self.train)
+        self.load_data_labels(self.test_files, self.test)
+
+    def load_data_labels(self, filenames, pair):
+        for i, fn in enumerate(filenames):
+            f = open(os.path.join(self.basepath, fn))
+            pair[i] = filetensor.read(f)
+            f.close()
+
+    def reduce_train_set(self, max):
+        self.train[0] = self.train[0][:max]
+        self.train[1] = self.train[1][:max]
+
+        if max < len(self.test[0]):
+            for ar in (self.test, self.valid):
+                ar[0] = ar[0][:max]
+                ar[1] = ar[1][:max]
+
+    def split_train_valid(self):
+        test_len = len(self.test[0])
+        
+        new_train_x = self.train[0][:-test_len]
+        new_train_y = self.train[1][:-test_len]
+
+        self.valid[0] = self.train[0][-test_len:]
+        self.valid[1] = self.train[1][-test_len:]
+
+        self.train[0] = new_train_x
+        self.train[1] = new_train_y
+
+def test_load_nist():
+    print "Will load NIST"
+
+    import time
+    t1 = time.time()
+    nist = NIST(20)
+    t2 = time.time()
+
+    print "NIST loaded. time delta = ", t2-t1
+
+    tr,v,te = nist.get_tvt()
+
+    print "Lenghts: ", len(tr[0]), len(v[0]), len(te[0])
+
+    raw_input("Press any key")
+
+if __name__ == '__main__':
+
+    import sys
+
+    args = sys.argv[1:]
+
+    if len(args) > 0 and args[0] == 'load_nist':
+        test_load_nist()
+
+    elif len(args) > 0 and args[0] == 'jobman_insert':
+        jobman_insert_nist()
+
+    elif len(args) > 0 and args[0] == 'test_jobman_entrypoint':
+        chanmock = DD({'COMPLETE':0,'save':(lambda:None)})
+        jobman_entrypoint(DEFAULT_HP_NIST, chanmock)
+
+    else:
+        print "Bad arguments"
+
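
The NIST class preserved above builds its validation set in split_train_valid by slicing the last len(test[0]) examples off the training arrays, so validation and test always end up the same size (consistent with the 'valid and test = 82587 82587' note in config.py.example). In miniature:

train_x = range(10)              # stand-in for the real feature matrix
test_len = 3                     # stand-in for len(self.test[0]) == 82587
valid_x = train_x[-test_len:]    # last test_len examples become validation
train_x = train_x[:-test_len]    # the rest stays in training
assert (len(train_x), len(valid_x)) == (7, 3)
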
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/stacked_dae/old/sgd_optimization.py	Mon Mar 22 10:20:10 2010 -0400
@@ -0,0 +1,234 @@
+#!/usr/bin/python
+# coding: utf-8
+
+# Generic SdA optimization loop, adapted from the deeplearning.net tutorial
+
+import numpy 
+import theano
+import time
+import datetime
+import theano.tensor as T
+import sys
+
+from jobman import DD
+import jobman, jobman.sql
+
+from stacked_dae import SdA
+
+def shared_dataset(data_xy):
+    data_x, data_y = data_xy
+    #shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX))
+    #shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
+    #shared_y = T.cast(shared_y, 'int32')
+    shared_x = theano.shared(data_x)
+    shared_y = theano.shared(data_y)
+    return shared_x, shared_y
+
+class DummyMux():
+    def append(self, param1, param2):
+        pass
+
+class SdaSgdOptimizer:
+    def __init__(self, dataset, hyperparameters, n_ins, n_outs, input_divider=1.0, series_mux=None):
+        self.dataset = dataset
+        self.hp = hyperparameters
+        self.n_ins = n_ins
+        self.n_outs = n_outs
+        self.input_divider = input_divider
+   
+        if not series_mux:
+            series_mux = DummyMux()
+            print "No series multiplexer set"
+        self.series_mux = series_mux
+
+        self.rng = numpy.random.RandomState(1234)
+
+        self.init_datasets()
+        self.init_classifier()
+
+        sys.stdout.flush()
+     
+    def init_datasets(self):
+        print "init_datasets"
+        sys.stdout.flush()
+
+        train_set, valid_set, test_set = self.dataset
+        self.test_set_x, self.test_set_y = shared_dataset(test_set)
+        self.valid_set_x, self.valid_set_y = shared_dataset(valid_set)
+        self.train_set_x, self.train_set_y = shared_dataset(train_set)
+
+        # compute number of minibatches for training, validation and testing
+        self.n_train_batches = self.train_set_x.value.shape[0] / self.hp.minibatch_size
+        self.n_valid_batches = self.valid_set_x.value.shape[0] / self.hp.minibatch_size
+        # remove last batch in case it's incomplete
+        self.n_test_batches  = (self.test_set_x.value.shape[0]  / self.hp.minibatch_size) - 1
+
+    def init_classifier(self):
+        print "Constructing classifier"
+
+        # we don't want to save arrays in DD objects, so
+        # we recreate those arrays here
+        nhl = self.hp.num_hidden_layers
+        layers_sizes = [self.hp.hidden_layers_sizes] * nhl
+        corruption_levels = [self.hp.corruption_levels] * nhl
+
+        # construct the stacked denoising autoencoder class
+        self.classifier = SdA( \
+                          train_set_x= self.train_set_x, \
+                          train_set_y = self.train_set_y,\
+                          batch_size = self.hp.minibatch_size, \
+                          n_ins= self.n_ins, \
+                          hidden_layers_sizes = layers_sizes, \
+                          n_outs = self.n_outs, \
+                          corruption_levels = corruption_levels,\
+                          rng = self.rng,\
+                          pretrain_lr = self.hp.pretraining_lr, \
+                          finetune_lr = self.hp.finetuning_lr,\
+                          input_divider = self.input_divider )
+
+        #theano.printing.pydotprint(self.classifier.pretrain_functions[0], "function.graph")
+
+        sys.stdout.flush()
+
+    def train(self):
+        self.pretrain()
+        self.finetune()
+
+    def pretrain(self):
+        print "STARTING PRETRAINING, time = ", datetime.datetime.now()
+        sys.stdout.flush()
+
+        #time_acc_func = 0.0
+        #time_acc_total = 0.0
+
+        start_time = time.clock()  
+        ## Pre-train layer-wise 
+        for i in xrange(self.classifier.n_layers):
+            # go through pretraining epochs 
+            for epoch in xrange(self.hp.pretraining_epochs_per_layer):
+                # go through the training set
+                for batch_index in xrange(self.n_train_batches):
+                    #t1 = time.clock()
+                    c = self.classifier.pretrain_functions[i](batch_index)
+                    #t2 = time.clock()
+
+                    #time_acc_func += t2 - t1
+
+                    #if batch_index % 500 == 0:
+                    #    print "acc / total", time_acc_func / (t2 - start_time), time_acc_func
+
+                    self.series_mux.append("reconstruction_error", c)
+                        
+                print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c
+                sys.stdout.flush()
+
+                self.series_mux.append("params", self.classifier.all_params)
+     
+        end_time = time.clock()
+
+        print ('Pretraining took %f minutes' %((end_time-start_time)/60.))
+        self.hp.update({'pretraining_time': end_time-start_time})
+
+        sys.stdout.flush()
+
+    def finetune(self):
+        print "STARTING FINETUNING, time = ", datetime.datetime.now()
+
+        index   = T.lscalar()    # index to a [mini]batch 
+        minibatch_size = self.hp.minibatch_size
+
+        # create a function to compute the mistakes that are made by the model
+        # on the validation set, or testing set
+        shared_divider = theano.shared(numpy.asarray(self.input_divider, dtype=theano.config.floatX))
+        test_model = theano.function([index], self.classifier.errors,
+                 givens = {
+                   self.classifier.x: self.test_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider,
+                   self.classifier.y: self.test_set_y[index*minibatch_size:(index+1)*minibatch_size]})
+
+        validate_model = theano.function([index], self.classifier.errors,
+                givens = {
+                   self.classifier.x: self.valid_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider,
+                   self.classifier.y: self.valid_set_y[index*minibatch_size:(index+1)*minibatch_size]})
+
+
+        # early-stopping parameters
+        patience              = 10000 # look at this many examples regardless
+        patience_increase     = 2.    # wait this much longer when a new best is 
+                                      # found
+        improvement_threshold = 0.995 # a relative improvement of this much is 
+                                      # considered significant
+        validation_frequency  = min(self.n_train_batches, patience/2)
+                                      # go through this many 
+                                      # minibatches before checking the network 
+                                      # on the validation set; in this case we 
+                                      # check every epoch 
+
+        best_params          = None
+        best_validation_loss = float('inf')
+        test_score           = 0.
+        start_time = time.clock()
+
+        done_looping = False
+        epoch = 0
+
+        while (epoch < self.hp.max_finetuning_epochs) and (not done_looping):
+            epoch = epoch + 1
+            for minibatch_index in xrange(self.n_train_batches):
+
+                cost_ij = self.classifier.finetune(minibatch_index)
+                iter    = epoch * self.n_train_batches + minibatch_index
+
+                self.series_mux.append("training_error", cost_ij)
+
+                if (iter+1) % validation_frequency == 0: 
+                    
+                    validation_losses = [validate_model(i) for i in xrange(self.n_valid_batches)]
+                    this_validation_loss = numpy.mean(validation_losses)
+                    self.series_mux.append("validation_error", this_validation_loss)
+                    print('epoch %i, minibatch %i/%i, validation error %f %%' % \
+                           (epoch, minibatch_index+1, self.n_train_batches, \
+                            this_validation_loss*100.))
+
+
+                    # if we got the best validation score until now
+                    if this_validation_loss < best_validation_loss:
+
+                        #improve patience if loss improvement is good enough
+                        if this_validation_loss < best_validation_loss *  \
+                               improvement_threshold :
+                            patience = max(patience, iter * patience_increase)
+
+                        # save best validation score and iteration number
+                        best_validation_loss = this_validation_loss
+                        best_iter = iter
+
+                        # test it on the test set
+                        test_losses = [test_model(i) for i in xrange(self.n_test_batches)]
+                        test_score = numpy.mean(test_losses)
+                        self.series_mux.append("test_error", test_score)
+                        print(('     epoch %i, minibatch %i/%i, test error of best '
+                              'model %f %%') % 
+                                     (epoch, minibatch_index+1, self.n_train_batches,
+                                      test_score*100.))
+
+                    sys.stdout.flush()
+
+            self.series_mux.append("params", self.classifier.all_params)
+
+            if patience <= iter :
+                done_looping = True
+                break
+
+        end_time = time.clock()
+        self.hp.update({'finetuning_time':end_time-start_time,\
+                    'best_validation_error':best_validation_loss,\
+                    'test_score':test_score,
+                    'num_finetuning_epochs':epoch})
+
+        print(('Optimization complete with best validation score of %f %%,'
+               'with test performance %f %%') %  
+                     (best_validation_loss * 100., test_score*100.))
+        print ('The finetuning ran for %f minutes' % ((end_time-start_time)/60.))
+
+
+
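
The finetuning loop above is driven by three early-stopping numbers: training stops once patience <= iter, and patience only grows when the validation loss beats the previous best by more than the 0.5% relative margin encoded by improvement_threshold = 0.995, in which case it is bumped to iter * patience_increase. A worked instance with the defaults used here:

patience, patience_increase, improvement_threshold = 10000, 2., 0.995

# suppose a significant improvement is observed at iteration 7000:
iter, best, this = 7000, 0.20, 0.19   # 0.19 < 0.995 * 0.20 = 0.199
if this < best * improvement_threshold:
    patience = max(patience, iter * patience_increase)
# patience is now 14000.0, so training continues at least to iter 14000
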
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/stacked_dae/old/stacked_dae.py	Mon Mar 22 10:20:10 2010 -0400
@@ -0,0 +1,287 @@
+#!/usr/bin/python
+# coding: utf-8
+
+import numpy 
+import theano
+import time
+import theano.tensor as T
+from theano.tensor.shared_randomstreams import RandomStreams
+import copy
+
+from utils import update_locals
+
+# taken from LeDeepNet/daa.py
+# has a special case when taking log(0) (defined =0)
+# modified to not take the mean anymore
+from theano.tensor.xlogx import xlogx, xlogy0
+# it's target*log(output)
+def binary_cross_entropy(target, output, sum_axis=1):
+    XE = xlogy0(target, output) + xlogy0((1 - target), (1 - output))
+    return -T.sum(XE, axis=sum_axis)
+
+class LogisticRegression(object):
+    def __init__(self, input, n_in, n_out):
+        # initialize with 0 the weights W as a matrix of shape (n_in, n_out) 
+        self.W = theano.shared( value=numpy.zeros((n_in,n_out),
+                                            dtype = theano.config.floatX) )
+        # initialize the baises b as a vector of n_out 0s
+        self.b = theano.shared( value=numpy.zeros((n_out,), 
+                                            dtype = theano.config.floatX) )
+        # compute vector of class-membership probabilities in symbolic form
+        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b)
+        
+        # compute prediction as class whose probability is maximal in 
+        # symbolic form
+        self.y_pred=T.argmax(self.p_y_given_x, axis=1)
+
+        # list of parameters for this layer
+        self.params = [self.W, self.b]
+
+    def negative_log_likelihood(self, y):
+        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
+
+    def errors(self, y):
+        # check if y has same dimension of y_pred 
+        if y.ndim != self.y_pred.ndim:
+            raise TypeError('y should have the same shape as self.y_pred', 
+                ('y', y.type, 'y_pred', self.y_pred.type))
+
+        # check if y is of the correct datatype        
+        if y.dtype.startswith('int'):
+            # the T.neq operator returns a vector of 0s and 1s, where 1
+            # represents a mistake in prediction
+            return T.mean(T.neq(self.y_pred, y))
+        else:
+            raise NotImplementedError()
+
+
+class SigmoidalLayer(object):
+    def __init__(self, rng, input, n_in, n_out):
+        self.input = input
+
+        W_values = numpy.asarray( rng.uniform( \
+              low = -numpy.sqrt(6./(n_in+n_out)), \
+              high = numpy.sqrt(6./(n_in+n_out)), \
+              size = (n_in, n_out)), dtype = theano.config.floatX)
+        self.W = theano.shared(value = W_values)
+
+        b_values = numpy.zeros((n_out,), dtype= theano.config.floatX)
+        self.b = theano.shared(value= b_values)
+
+        self.output = T.nnet.sigmoid(T.dot(input, self.W) + self.b)
+        self.params = [self.W, self.b]
+
+
+
+class dA(object):
+  def __init__(self, n_visible= 784, n_hidden= 500, corruption_level = 0.1,\
+               input = None, shared_W = None, shared_b = None):
+    self.n_visible = n_visible
+    self.n_hidden  = n_hidden
+    
+    # create a Theano random generator that gives symbolic random values
+    theano_rng = RandomStreams()
+    
+    if shared_W != None and shared_b != None : 
+        self.W = shared_W
+        self.b = shared_b
+    else:
+        # initial values for weights and biases
+        # note : W' was written as `W_prime` and b' as `b_prime`
+
+        # W is initialized with `initial_W`, which is uniformly sampled
+        # from -6./sqrt(n_visible+n_hidden) to 6./sqrt(n_hidden+n_visible);
+        # the output of uniform is converted using asarray to dtype 
+        # theano.config.floatX so that the code is runnable on GPU
+        initial_W = numpy.asarray( numpy.random.uniform( \
+              low = -numpy.sqrt(6./(n_hidden+n_visible)), \
+              high = numpy.sqrt(6./(n_hidden+n_visible)), \
+              size = (n_visible, n_hidden)), dtype = theano.config.floatX)
+        initial_b       = numpy.zeros(n_hidden, dtype = theano.config.floatX)
+    
+    
+        # theano shared variables for weights and biases
+        self.W       = theano.shared(value = initial_W,       name = "W")
+        self.b       = theano.shared(value = initial_b,       name = "b")
+    
+ 
+    initial_b_prime= numpy.zeros(n_visible)
+    # tied weights, therefore W_prime is W transpose
+    self.W_prime = self.W.T 
+    self.b_prime = theano.shared(value = initial_b_prime, name = "b'")
+
+    # if no input is given, generate a variable representing the input
+    if input == None : 
+        # we use a matrix because we expect a minibatch of several examples,
+        # each example being a row
+        self.x = T.dmatrix(name = 'input') 
+    else:
+        self.x = input
+    # Equation (1)
+    # keep 90% of the inputs the same and zero out a randomly selected subset of 10% of the inputs
+    # note : first argument of theano.rng.binomial is the shape(size) of 
+    #        random numbers that it should produce
+    #        second argument is the number of trials 
+    #        third argument is the probability of success of any trial
+    #
+    #        this will produce an array of 0s and 1s where 1 has a 
+    #        probability of 1 - ``corruption_level`` and 0 with
+    #        ``corruption_level``
+    self.tilde_x  = theano_rng.binomial( self.x.shape,  1,  1 - corruption_level) * self.x
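+    # illustrative example (not part of the original code): with
+    # corruption_level = 0.3 and an input row x = [0.2, 0.9, 0.5, 0.1],
+    # a sampled mask of [1, 0, 1, 1] gives tilde_x = [0.2, 0.0, 0.5, 0.1];
+    # each input is zeroed independently with probability 0.3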
+    # Equation (2)
+    # note  : y is stored as an attribute of the class so that it can be 
+    #         used later when stacking dAs. 
+    self.y   = T.nnet.sigmoid(T.dot(self.tilde_x, self.W      ) + self.b)
+    # Equation (3)
+    self.z   = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
+    # Equation (4)
+    # note : we sum over the size of a datapoint; if we are using minibatches,
+    #        L will  be a vector, with one entry per example in minibatch
+    #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) 
+    #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1)
+
+    # bypassing z to avoid running into log(0)
+    #self.z_a = T.dot(self.y, self.W_prime) + self.b_prime)
+    #self.L = -T.sum( self.x * (T.log(1)-T.log(1+T.exp(-self.z_a))) \
+    #                + (1.0-self.x) * (T.log(1)-T.log(1+T.exp(-self.z_a))), axis=1 )
+
+    # I added this epsilon to avoid getting log(0) and 1/0 in the gradient.
+    # Conceptually this means no probability can be exactly 0, but that
+    # doesn't seem very important to me (maybe I'm wrong?).
+    eps = 0.00000001
+    eps_1 = 1-eps
+    self.L = - T.sum( self.x * T.log(eps + eps_1*self.z) \
+                    + (1-self.x)*T.log(eps + eps_1*(1-self.z)), axis=1 )
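+    # since z lies in (0,1), both arguments to T.log above are at least eps,
+    # so neither log(0) nor the matching 1/0 in the gradient can occur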
+    # note : L is now a vector, where each element is the cross-entropy cost 
+    #        of the reconstruction of the corresponding example of the 
+    #        minibatch. We need to compute the average of all these to get 
+    #        the cost of the minibatch
+    self.cost = T.mean(self.L)
+
+    self.params = [ self.W, self.b, self.b_prime ]
+
+
+class SdA(object):
+    def __init__(self, train_set_x, train_set_y, batch_size, n_ins, 
+                 hidden_layers_sizes, n_outs, 
+                 corruption_levels, rng, pretrain_lr, finetune_lr, input_divider=1.0):
+        # Just to make sure those are not modified somewhere else afterwards
+        hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes)
+        corruption_levels = copy.deepcopy(corruption_levels)
+
+        update_locals(self, locals())      
+ 
+        self.layers             = []
+        self.pretrain_functions = []
+        self.params             = []
+        # MODIF: added this so we also get the b_primes
+        # (not used for finetuning... still using ".params")
+        self.all_params         = []
+        self.n_layers           = len(hidden_layers_sizes)
+
+        print "Creating SdA with params:"
+        print "batch_size", batch_size
+        print "hidden_layers_sizes", hidden_layers_sizes
+        print "corruption_levels", corruption_levels
+        print "n_ins", n_ins
+        print "n_outs", n_outs
+        print "pretrain_lr", pretrain_lr
+        print "finetune_lr", finetune_lr
+        print "input_divider", input_divider
+        print "----"
+
+        self.shared_divider = theano.shared(numpy.asarray(input_divider, dtype=theano.config.floatX))
+
+        if len(hidden_layers_sizes) < 1 :
+            raise Exception('You must have at least one hidden layer')
+
+
+        # allocate symbolic variables for the data
+        index   = T.lscalar()    # index to a [mini]batch 
+        self.x  = T.matrix('x')  # the data is presented as rasterized images
+        self.y  = T.ivector('y') # the labels are presented as 1D vector of 
+                                 # [int] labels
+
+        for i in xrange( self.n_layers ):
+            # construct the sigmoidal layer
+
+            # the size of the input is either the number of hidden units of 
+            # the layer below or the input size if we are on the first layer
+            if i == 0 :
+                input_size = n_ins
+            else:
+                input_size = hidden_layers_sizes[i-1]
+
+            # the input to this layer is either the activation of the hidden
+            # layer below or the input of the SdA if you are on the first
+            # layer
+            if i == 0 : 
+                layer_input = self.x
+            else:
+                layer_input = self.layers[-1].output
+
+            layer = SigmoidalLayer(rng, layer_input, input_size, 
+                                   hidden_layers_sizes[i] )
+            # add the layer to the list of layers
+            self.layers += [layer]
+            self.params += layer.params
+        
+            # Construct a denoising autoencoder that shares weights with this
+            # layer
+            dA_layer = dA(input_size, hidden_layers_sizes[i], \
+                          corruption_level = corruption_levels[i],\
+                          input = layer_input, \
+                          shared_W = layer.W, shared_b = layer.b)
+
+            self.all_params += dA_layer.params
+        
+            # Construct a function that trains this dA
+            # compute gradients of layer parameters
+            gparams = T.grad(dA_layer.cost, dA_layer.params)
+            # compute the list of updates
+            updates = {}
+            for param, gparam in zip(dA_layer.params, gparams):
+                updates[param] = param - gparam * pretrain_lr
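+            # (this dictionary implements plain SGD: each parameter moves
+            # one step of size pretrain_lr against its gradient)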
+            
+            # create a function that trains the dA
+            update_fn = theano.function([index], dA_layer.cost, \
+                  updates = updates,
+                  givens = { 
+                     self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider})
+            # collect this function into a list
+            self.pretrain_functions += [update_fn]
+
+        
+        # We now need to add a logistic layer on top of the MLP
+        self.logLayer = LogisticRegression(\
+                         input = self.layers[-1].output,\
+                         n_in = hidden_layers_sizes[-1], n_out = n_outs)
+
+        self.params += self.logLayer.params
+        self.all_params += self.logLayer.params
+        # construct a function that implements one step of finetuning
+
+        # compute the cost, defined as the negative log likelihood 
+        cost = self.logLayer.negative_log_likelihood(self.y)
+        # compute the gradients with respect to the model parameters
+        gparams = T.grad(cost, self.params)
+        # compute list of updates
+        updates = {}
+        for param,gparam in zip(self.params, gparams):
+            updates[param] = param - gparam*finetune_lr
+            
+        self.finetune = theano.function([index], cost, 
+                updates = updates,
+                givens = {
+                  self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider,
+                  self.y : train_set_y[index*batch_size:(index+1)*batch_size]} )
+
+        # symbolic variable that points to the number of errors made on the
+        # minibatch given by self.x and self.y
+
+        self.errors = self.logLayer.errors(self.y)
+
+if __name__ == '__main__':
+    import sys
+    args = sys.argv[1:]
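+    # minimal usage sketch (illustrative only; assumes shared Theano
+    # variables train_set_x/train_set_y prepared elsewhere):
+    #
+    # rng = numpy.random.RandomState(1234)
+    # sda = SdA(train_set_x, train_set_y, batch_size=20, n_ins=32*32,
+    #           hidden_layers_sizes=[500, 500], n_outs=62,
+    #           corruption_levels=[0.1, 0.1], rng=rng,
+    #           pretrain_lr=0.01, finetune_lr=0.1)
+    # cost = sda.pretrain_functions[0](0)  # one step on minibatch 0, layer 0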
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/stacked_dae/old/utils.py	Mon Mar 22 10:20:10 2010 -0400
@@ -0,0 +1,69 @@
+#!/usr/bin/python
+# coding: utf-8
+
+from __future__ import with_statement
+
+from jobman import DD
+
+# from pylearn codebase
+# useful in __init__(param1, param2, etc.) to save
+# values in self.param1, self.param2... just call
+# update_locals(self, locals())
+def update_locals(obj, dct):
+    if 'self' in dct:
+        del dct['self']
+    obj.__dict__.update(dct)
+
+# from a dictionary of possible values for hyperparameters, e.g.
+# hp_values = {'learning_rate':[0.1, 0.01], 'num_layers': [1,2]}
+# create a list of dictionaries representing all the possible
+# combinations, thus in this example creating:
+# [{'learning_rate': 0.1, 'num_layers': 1}, ...]
+# (and similarly for the combinations (0.1, 2), (0.01, 1), (0.01, 2))
+def produit_cartesien_jobs(val_dict):
+    job_list = [DD()]
+    all_keys = val_dict.keys()
+
+    for key in all_keys:
+        possible_values = val_dict[key]
+        new_job_list = []
+        for val in possible_values:
+            for job in job_list:
+                to_insert = job.copy()
+                to_insert.update({key: val})
+                new_job_list.append(to_insert)
+        job_list = new_job_list
+
+    return job_list
+
+def test_produit_cartesien_jobs():
+    vals = {'a': [1,2], 'b': [3,4,5]}
+    print produit_cartesien_jobs(vals)
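+    # expected: 2 * 3 = 6 DD objects, e.g. {'a': 1, 'b': 3}, {'a': 2, 'b': 3},
+    # and so on (ordering depends on dict iteration order)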
+
+
+# taken from http://stackoverflow.com/questions/276052/how-to-get-current-cpu-and-ram-usage-in-python
+"""Simple module for getting amount of memory used by a specified user's
+processes on a UNIX system.
+It uses UNIX ps utility to get the memory usage for a specified username and
+pipe it to awk for summing up per application memory usage and return the total.
+Python's Popen() from subprocess module is used for spawning ps and awk.
+
+"""
+
+import subprocess
+
+class MemoryMonitor(object):
+
+    def __init__(self, username):
+        """Create new MemoryMonitor instance."""
+        self.username = username
+
+    def usage(self):
+        """Return int containing memory used by user's processes."""
+        self.process = subprocess.Popen("ps -u %s -o rss | awk '{sum+=$1} END {print sum}'" % self.username,
+                                        shell=True,
+                                        stdout=subprocess.PIPE,
+                                        )
+        self.stdout_list = self.process.communicate()[0].split('\n')
+        return int(self.stdout_list[0])
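+
+# example usage (illustrative; requires ps and awk, hence UNIX only):
+#   monitor = MemoryMonitor('someuser')
+#   print "total RSS (kB):", monitor.usage()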
+
--- a/deep/stacked_dae/sgd_optimization.py	Mon Mar 22 10:19:45 2010 -0400
+++ b/deep/stacked_dae/sgd_optimization.py	Mon Mar 22 10:20:10 2010 -0400
@@ -15,53 +15,43 @@
 
 from stacked_dae import SdA
 
-def shared_dataset(data_xy):
-    data_x, data_y = data_xy
-    #shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX))
-    #shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
-    #shared_y = T.cast(shared_y, 'int32')
-    shared_x = theano.shared(data_x)
-    shared_y = theano.shared(data_y)
-    return shared_x, shared_y
+from ift6266.utils.seriestables import *
 
-class DummyMux():
-    def append(self, param1, param2):
-        pass
+default_series = { \
+        'reconstruction_error' : DummySeries(),
+        'training_error' : DummySeries(),
+        'validation_error' : DummySeries(),
+        'test_error' : DummySeries(),
+        'params' : DummySeries()
+        }
+
+def itermax(iterable, max_count):
+    for i, it in enumerate(iterable):
+        if i >= max_count:
+            break
+        yield it
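+# e.g. itermax(dataset.train(mb_size), 100) yields at most 100 minibatches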
 
 class SdaSgdOptimizer:
-    def __init__(self, dataset, hyperparameters, n_ins, n_outs, input_divider=1.0, series_mux=None):
+    def __init__(self, dataset, hyperparameters, n_ins, n_outs,
+                    examples_per_epoch, series=default_series, max_minibatches=None):
         self.dataset = dataset
         self.hp = hyperparameters
         self.n_ins = n_ins
         self.n_outs = n_outs
-        self.input_divider = input_divider
    
-        if not series_mux:
-            series_mux = DummyMux()
-            print "No series multiplexer set"
-        self.series_mux = series_mux
+        self.max_minibatches = max_minibatches
+        print "SdaSgdOptimizer, max_minibatches =", max_minibatches
+
+        self.ex_per_epoch = examples_per_epoch
+        self.mb_per_epoch = examples_per_epoch / self.hp.minibatch_size
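+        # (integer division in Python 2: a final partial minibatch
+        # is not counted)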
+
+        self.series = series
 
         self.rng = numpy.random.RandomState(1234)
 
-        self.init_datasets()
         self.init_classifier()
 
         sys.stdout.flush()
-     
-    def init_datasets(self):
-        print "init_datasets"
-        sys.stdout.flush()
-
-        train_set, valid_set, test_set = self.dataset
-        self.test_set_x, self.test_set_y = shared_dataset(test_set)
-        self.valid_set_x, self.valid_set_y = shared_dataset(valid_set)
-        self.train_set_x, self.train_set_y = shared_dataset(train_set)
-
-        # compute number of minibatches for training, validation and testing
-        self.n_train_batches = self.train_set_x.value.shape[0] / self.hp.minibatch_size
-        self.n_valid_batches = self.valid_set_x.value.shape[0] / self.hp.minibatch_size
-        # remove last batch in case it's incomplete
-        self.n_test_batches  = (self.test_set_x.value.shape[0]  / self.hp.minibatch_size) - 1
 
     def init_classifier(self):
         print "Constructing classifier"
@@ -74,8 +64,6 @@
 
         # construct the stacked denoising autoencoder class
         self.classifier = SdA( \
-                          train_set_x= self.train_set_x, \
-                          train_set_y = self.train_set_y,\
                           batch_size = self.hp.minibatch_size, \
                           n_ins= self.n_ins, \
                           hidden_layers_sizes = layers_sizes, \
@@ -83,46 +71,44 @@
                           corruption_levels = corruption_levels,\
                           rng = self.rng,\
                           pretrain_lr = self.hp.pretraining_lr, \
-                          finetune_lr = self.hp.finetuning_lr,\
-                          input_divider = self.input_divider )
+                          finetune_lr = self.hp.finetuning_lr)
 
         #theano.printing.pydotprint(self.classifier.pretrain_functions[0], "function.graph")
 
         sys.stdout.flush()
 
     def train(self):
-        self.pretrain()
-        self.finetune()
+        self.pretrain(self.dataset)
+        self.finetune(self.dataset)
 
-    def pretrain(self):
+    def pretrain(self,dataset):
         print "STARTING PRETRAINING, time = ", datetime.datetime.now()
         sys.stdout.flush()
 
-        #time_acc_func = 0.0
-        #time_acc_total = 0.0
-
         start_time = time.clock()  
         ## Pre-train layer-wise 
         for i in xrange(self.classifier.n_layers):
             # go through pretraining epochs 
             for epoch in xrange(self.hp.pretraining_epochs_per_layer):
                 # go through the training set
-                for batch_index in xrange(self.n_train_batches):
-                    #t1 = time.clock()
-                    c = self.classifier.pretrain_functions[i](batch_index)
-                    #t2 = time.clock()
+                batch_index=0
+                for x,y in dataset.train(self.hp.minibatch_size):
+                    c = self.classifier.pretrain_functions[i](x)
+
+                    self.series["reconstruction_error"].append((epoch, batch_index), c)
+                    batch_index+=1
 
-                    #time_acc_func += t2 - t1
+                    #if batch_index % 100 == 0:
+                    #    print "100 batches"
 
-                    #if batch_index % 500 == 0:
-                    #    print "acc / total", time_acc_func / (t2 - start_time), time_acc_func
-
-                    self.series_mux.append("reconstruction_error", c)
+                    # useful when doing tests
+                    if self.max_minibatches and batch_index >= self.max_minibatches:
+                        break
                         
                 print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c
                 sys.stdout.flush()
 
-                self.series_mux.append("params", self.classifier.all_params)
+                self.series['params'].append((epoch,), self.classifier.all_params)
      
         end_time = time.clock()
 
@@ -131,24 +117,26 @@
 
         sys.stdout.flush()
 
-    def finetune(self):
+    def finetune(self,dataset):
         print "STARTING FINETUNING, time = ", datetime.datetime.now()
 
-        index   = T.lscalar()    # index to a [mini]batch 
         minibatch_size = self.hp.minibatch_size
 
         # create a function to compute the mistakes that are made by the model
         # on the validation set, or testing set
-        shared_divider = theano.shared(numpy.asarray(self.input_divider, dtype=theano.config.floatX))
-        test_model = theano.function([index], self.classifier.errors,
-                 givens = {
-                   self.classifier.x: self.test_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider,
-                   self.classifier.y: self.test_set_y[index*minibatch_size:(index+1)*minibatch_size]})
+        test_model = \
+            theano.function(
+                [self.classifier.x,self.classifier.y], self.classifier.errors)
+        #         givens = {
+        #           self.classifier.x: ensemble_x,
+        #           self.classifier.y: ensemble_y})
 
-        validate_model = theano.function([index], self.classifier.errors,
-                givens = {
-                   self.classifier.x: self.valid_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider,
-                   self.classifier.y: self.valid_set_y[index*minibatch_size:(index+1)*minibatch_size]})
+        validate_model = \
+            theano.function(
+                [self.classifier.x,self.classifier.y], self.classifier.errors)
+        #        givens = {
+        #           self.classifier.x: ,
+        #           self.classifier.y: ]})
 
 
         # early-stopping parameters
@@ -157,11 +145,13 @@
                                       # found
         improvement_threshold = 0.995 # a relative improvement of this much is 
                                       # considered significant
-        validation_frequency  = min(self.n_train_batches, patience/2)
+        validation_frequency  = min(self.mb_per_epoch, patience/2)
                                       # go through this many 
                                       # minibatche before checking the network 
                                       # on the validation set; in this case we 
                                       # check every epoch 
+        if self.max_minibatches and validation_frequency > self.max_minibatches:
+            validation_frequency = self.max_minibatches / 2
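+            # (halved so even a truncated test run still validates a couple
+            # of times)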
 
         best_params          = None
         best_validation_loss = float('inf')
@@ -171,22 +161,31 @@
         done_looping = False
         epoch = 0
 
+        total_mb_index = 0
+
         while (epoch < self.hp.max_finetuning_epochs) and (not done_looping):
             epoch = epoch + 1
-            for minibatch_index in xrange(self.n_train_batches):
+            minibatch_index = -1
+            for x,y in dataset.train(minibatch_size):
+                minibatch_index += 1
+                cost_ij = self.classifier.finetune(x,y)
+                total_mb_index += 1
 
-                cost_ij = self.classifier.finetune(minibatch_index)
-                iter    = epoch * self.n_train_batches + minibatch_index
-
-                self.series_mux.append("training_error", cost_ij)
+                self.series["training_error"].append((epoch, minibatch_index), cost_ij)
 
-                if (iter+1) % validation_frequency == 0: 
+                if (total_mb_index+1) % validation_frequency == 0: 
                     
-                    validation_losses = [validate_model(i) for i in xrange(self.n_valid_batches)]
+                    valid_iter = dataset.valid(minibatch_size)
+                    if self.max_minibatches:
+                        valid_iter = itermax(valid_iter, self.max_minibatches)
+                    validation_losses = [validate_model(x,y) for x,y in valid_iter]
                     this_validation_loss = numpy.mean(validation_losses)
-                    self.series_mux.append("validation_error", this_validation_loss)
+
+                    self.series["validation_error"].\
+                        append((epoch, minibatch_index), this_validation_loss*100.)
+
                     print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                           (epoch, minibatch_index+1, self.n_train_batches, \
+                           (epoch, minibatch_index+1, self.mb_per_epoch, \
                             this_validation_loss*100.))
 
 
@@ -196,26 +195,36 @@
                         #improve patience if loss improvement is good enough
                         if this_validation_loss < best_validation_loss *  \
                                improvement_threshold :
-                            patience = max(patience, iter * patience_increase)
+                            patience = max(patience, total_mb_index * patience_increase)
 
                         # save best validation score and iteration number
                         best_validation_loss = this_validation_loss
-                        best_iter = iter
+                        best_iter = total_mb_index
 
                         # test it on the test set
-                        test_losses = [test_model(i) for i in xrange(self.n_test_batches)]
+                        test_iter = dataset.test(minibatch_size)
+                        if self.max_minibatches:
+                            test_iter = itermax(test_iter, self.max_minibatches)
+                        test_losses = [test_model(x,y) for x,y in test_iter]
                         test_score = numpy.mean(test_losses)
-                        self.series_mux.append("test_error", test_score)
+
+                        self.series["test_error"].\
+                            append((epoch, minibatch_index), test_score*100.)
+
                         print(('     epoch %i, minibatch %i/%i, test error of best '
                               'model %f %%') % 
-                                     (epoch, minibatch_index+1, self.n_train_batches,
+                                     (epoch, minibatch_index+1, self.mb_per_epoch,
                                       test_score*100.))
 
                     sys.stdout.flush()
 
-            self.series_mux.append("params", self.classifier.all_params)
+                # useful when doing tests
+                if self.max_minibatches and minibatch_index >= self.max_minibatches:
+                    break
 
-            if patience <= iter :
+            self.series['params'].append((epoch,), self.classifier.all_params)
+
+            if patience <= total_mb_index:
                 done_looping = True
                 break
 
--- a/deep/stacked_dae/stacked_dae.py	Mon Mar 22 10:19:45 2010 -0400
+++ b/deep/stacked_dae/stacked_dae.py	Mon Mar 22 10:20:10 2010 -0400
@@ -127,13 +127,13 @@
     #        this will produce an array of 0s and 1s where 1 has a 
     #        probability of 1 - ``corruption_level`` and 0 with
     #        ``corruption_level``
-    self.tilde_x  = theano_rng.binomial( self.x.shape,  1,  1 - corruption_level) * self.x
+    self.tilde_x  = theano_rng.binomial( self.x.shape,  1,  1 - corruption_level, dtype=theano.config.floatX) * self.x
     # Equation (2)
     # note  : y is stored as an attribute of the class so that it can be 
     #         used later when stacking dAs. 
     self.y   = T.nnet.sigmoid(T.dot(self.tilde_x, self.W      ) + self.b)
     # Equation (3)
-    self.z   = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
+    #self.z   = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
     # Equation (4)
     # note : we sum over the size of a datapoint; if we are using minibatches,
     #        L will  be a vector, with one entry per example in minibatch
@@ -141,17 +141,20 @@
     #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1)
 
     # bypassing z to avoid running to log(0)
-    #self.z_a = T.dot(self.y, self.W_prime) + self.b_prime)
-    #self.L = -T.sum( self.x * (T.log(1)-T.log(1+T.exp(-self.z_a))) \
-    #                + (1.0-self.x) * (T.log(1)-T.log(1+T.exp(-self.z_a))), axis=1 )
+    z_a = T.dot(self.y, self.W_prime) + self.b_prime
+    log_sigmoid = T.log(1.) - T.log(1.+T.exp(-z_a))
+    # log(1-sigmoid(z_a))
+    log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a))
+    self.L = -T.sum( self.x * (log_sigmoid) \
+                    + (1.0-self.x) * (log_1_sigmoid), axis=1 )
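+    # standard identities used above, for reference:
+    #   log(sigmoid(a))     = -log(1 + exp(-a))
+    #   log(1 - sigmoid(a)) = -a - log(1 + exp(-a))
+    # so the cross-entropy is computed without ever evaluating log(0)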
 
     # I added this epsilon to avoid getting log(0) and 1/0 in grad
     # This means conceptually that there'd be no probability of 0, but that
     # doesn't seem to me as important (maybe I'm wrong?).
-    eps = 0.00000001
-    eps_1 = 1-eps
-    self.L = - T.sum( self.x * T.log(eps + eps_1*self.z) \
-                    + (1-self.x)*T.log(eps + eps_1*(1-self.z)), axis=1 )
+    #eps = 0.00000001
+    #eps_1 = 1-eps
+    #self.L = - T.sum( self.x * T.log(eps + eps_1*self.z) \
+    #                + (1-self.x)*T.log(eps + eps_1*(1-self.z)), axis=1 )
     # note : L is now a vector, where each element is the cross-entropy cost 
     #        of the reconstruction of the corresponding example of the 
     #        minibatch. We need to compute the average of all these to get 
@@ -162,9 +165,9 @@
 
 
 class SdA(object):
-    def __init__(self, train_set_x, train_set_y, batch_size, n_ins, 
+    def __init__(self, batch_size, n_ins, 
                  hidden_layers_sizes, n_outs, 
-                 corruption_levels, rng, pretrain_lr, finetune_lr, input_divider=1.0):
+                 corruption_levels, rng, pretrain_lr, finetune_lr):
         # Just to make sure those are not modified somewhere else afterwards
         hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes)
         corruption_levels = copy.deepcopy(corruption_levels)
@@ -187,17 +190,14 @@
         print "n_outs", n_outs
         print "pretrain_lr", pretrain_lr
         print "finetune_lr", finetune_lr
-        print "input_divider", input_divider
         print "----"
 
-        self.shared_divider = theano.shared(numpy.asarray(input_divider, dtype=theano.config.floatX))
-
         if len(hidden_layers_sizes) < 1 :
             raiseException (' You must have at least one hidden layer ')
 
 
         # allocate symbolic variables for the data
-        index   = T.lscalar()    # index to a [mini]batch 
+        #index   = T.lscalar()    # index to a [mini]batch 
         self.x  = T.matrix('x')  # the data is presented as rasterized images
         self.y  = T.ivector('y') # the labels are presented as 1D vector of 
                                  # [int] labels
@@ -244,10 +244,15 @@
                 updates[param] = param - gparam * pretrain_lr
             
             # create a function that trains the dA
-            update_fn = theano.function([index], dA_layer.cost, \
-                  updates = updates,
-                  givens = { 
-                     self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider})
+            update_fn = theano.function([self.x], dA_layer.cost, \
+                  updates = updates)
+            #     givens = { 
+            #         self.x : ensemble})
+            # previous version, using an index into a shared dataset:
+            #update_fn = theano.function([index], dA_layer.cost, \
+            #      updates = updates,
+            #      givens = { 
+            #         self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider})
             # collect this function into a list
             self.pretrain_functions += [update_fn]
 
@@ -270,11 +275,11 @@
         for param,gparam in zip(self.params, gparams):
             updates[param] = param - gparam*finetune_lr
             
-        self.finetune = theano.function([index], cost, 
-                updates = updates,
-                givens = {
-                  self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider,
-                  self.y : train_set_y[index*batch_size:(index+1)*batch_size]} )
+        self.finetune = theano.function([self.x,self.y], cost, 
+                updates = updates)
+        # previous version, using an index into a shared dataset:
+        #        givens = {
+        #          self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider,
+        #          self.y : train_set_y[index*batch_size:(index+1)*batch_size]} )
 
         # symbolic variable that points to the number of errors made on the
         # minibatch given by self.x and self.y
--- a/deep/stacked_dae/v2/config.py.example	Mon Mar 22 10:19:45 2010 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,64 +0,0 @@
-'''
-These are parameters used by nist_sda.py. They'll end up as globals in there.
-
-Rename this file to config.py and configure as needed.
-DON'T add the renamed file to the repository, as others might use it
-without realizing it, with dire consequences.
-'''
-
-# Set this to True when you want to run cluster tests, ie. you want
-# to run on the cluster, many jobs, but want to reduce the training
-# set size and the number of epochs, so you know everything runs
-# fine on the cluster.
-# Set this PRIOR to inserting your test jobs in the DB.
-TEST_CONFIG = False
-
-NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
-NIST_ALL_TRAIN_SIZE = 649081
-# valid et test =82587 82587 
-
-# change "sandbox" when you're ready
-JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/yourtablenamehere'
-EXPERIMENT_PATH = "ift6266.deep.stacked_dae.v2.nist_sda.jobman_entrypoint"
-
-# reduce training set to that many examples
-REDUCE_TRAIN_TO = None
-# that's a max, it usually doesn't get to that point
-MAX_FINETUNING_EPOCHS = 1000
-# number of minibatches before taking means for valid error etc.
-REDUCE_EVERY = 100
-
-if TEST_CONFIG:
-    REDUCE_TRAIN_TO = 1000
-    MAX_FINETUNING_EPOCHS = 2
-    REDUCE_EVERY = 10
-
-
-# This is to configure insertion of jobs on the cluster.
-# Possible values the hyperparameters can take. These are then
-# combined with produit_cartesien_jobs so we get a list of all
-# possible combinations, each one resulting in a job inserted
-# in the jobman DB.
-JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001],
-        'pretraining_epochs_per_layer': [10,20],
-        'hidden_layers_sizes': [300,800],
-        'corruption_levels': [0.1,0.2,0.3],
-        'minibatch_size': [20],
-        'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS],
-        'finetuning_lr':[0.1, 0.01], #0.001 was very bad, so we leave it out
-        'num_hidden_layers':[2,3]}
-
-# Just useful for tests... minimal number of epochs
-# (This is used when running a single job, locally, when
-# calling ./nist_sda.py test_jobman_entrypoint
-DEFAULT_HP_NIST = DD({'finetuning_lr':0.1,
-                       'pretraining_lr':0.1,
-                       'pretraining_epochs_per_layer':2,
-                       'max_finetuning_epochs':2,
-                       'hidden_layers_sizes':800,
-                       'corruption_levels':0.2,
-                       'minibatch_size':20,
-                       'reduce_train_to':10000,
-                       'num_hidden_layers':1})
-
-
--- a/deep/stacked_dae/v2/nist_sda.py	Mon Mar 22 10:19:45 2010 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,169 +0,0 @@
-#!/usr/bin/python
-# coding: utf-8
-
-import ift6266
-import pylearn
-
-import numpy 
-import theano
-import time
-
-import pylearn.version
-import theano.tensor as T
-from theano.tensor.shared_randomstreams import RandomStreams
-
-import copy
-import sys
-import os
-import os.path
-
-from jobman import DD
-import jobman, jobman.sql
-from pylearn.io import filetensor
-
-from utils import produit_cartesien_jobs
-
-from sgd_optimization import SdaSgdOptimizer
-
-#from ift6266.utils.scalar_series import *
-from ift6266.utils.seriestables import *
-import tables
-
-from ift6266 import datasets
-from config import *
-
-'''
-Function called by jobman upon launching each job
-Its path is the one given when inserting jobs: see EXPERIMENT_PATH
-'''
-def jobman_entrypoint(state, channel):
-    # record mercurial versions of each package
-    pylearn.version.record_versions(state,[theano,ift6266,pylearn])
-    # TODO: remove this, bad for number of simultaneous requests on DB
-    channel.save()
-
-    # For test runs, we don't want to use the whole dataset so
-    # reduce it to fewer elements if asked to.
-    rtt = None
-    if state.has_key('reduce_train_to'):
-        rtt = state['reduce_train_to']
-    elif REDUCE_TRAIN_TO:
-        rtt = REDUCE_TRAIN_TO
- 
-    n_ins = 32*32
-    n_outs = 62 # 10 digits, 26*2 (lower, capitals)
-     
-    examples_per_epoch = NIST_ALL_TRAIN_SIZE
-
-    series = create_series(state.num_hidden_layers)
-
-    print "Creating optimizer with state, ", state
-
-    optimizer = SdaSgdOptimizer(dataset=datasets.nist_all, 
-                                    hyperparameters=state, \
-                                    n_ins=n_ins, n_outs=n_outs,\
-                                    examples_per_epoch=examples_per_epoch, \
-                                    series=series,
-                                    max_minibatches=rtt)
-
-    optimizer.pretrain(datasets.nist_all)
-    channel.save()
-
-    optimizer.finetune(datasets.nist_all)
-    channel.save()
-
-    return channel.COMPLETE
-
-# These Series objects are used to save various statistics
-# during the training.
-def create_series(num_hidden_layers):
-
-    # Replace series we don't want to save with DummySeries, e.g.
-    # series['training_error'] = DummySeries()
-
-    series = {}
-
-    basedir = os.getcwd()
-
-    h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w")
-
-    # reconstruction
-    reconstruction_base = \
-                ErrorSeries(error_name="reconstruction_error",
-                    table_name="reconstruction_error",
-                    hdf5_file=h5f,
-                    index_names=('epoch','minibatch'),
-                    title="Reconstruction error (mean over "+str(REDUCE_EVERY)+" minibatches)")
-    series['reconstruction_error'] = \
-                AccumulatorSeriesWrapper(base_series=reconstruction_base,
-                    reduce_every=REDUCE_EVERY)
-
-    # train
-    training_base = \
-                ErrorSeries(error_name="training_error",
-                    table_name="training_error",
-                    hdf5_file=h5f,
-                    index_names=('epoch','minibatch'),
-                    title="Training error (mean over "+str(REDUCE_EVERY)+" minibatches)")
-    series['training_error'] = \
-                AccumulatorSeriesWrapper(base_series=training_base,
-                    reduce_every=REDUCE_EVERY)
-
-    # valid and test are not accumulated/mean, saved directly
-    series['validation_error'] = \
-                ErrorSeries(error_name="validation_error",
-                    table_name="validation_error",
-                    hdf5_file=h5f,
-                    index_names=('epoch','minibatch'))
-
-    series['test_error'] = \
-                ErrorSeries(error_name="test_error",
-                    table_name="test_error",
-                    hdf5_file=h5f,
-                    index_names=('epoch','minibatch'))
-
-    param_names = []
-    for i in range(num_hidden_layers):
-        param_names += ['layer%d_W'%i, 'layer%d_b'%i, 'layer%d_bprime'%i]
-    param_names += ['logreg_layer_W', 'logreg_layer_b']
-
-    # comment out series we don't want to save
-    series['params'] = SharedParamsStatisticsWrapper(
-                        new_group_name="params",
-                        base_group="/",
-                        arrays_names=param_names,
-                        hdf5_file=h5f,
-                        index_names=('epoch',))
-
-    return series
-
-# Perform insertion into the Postgre DB based on combination
-# of hyperparameter values above
-# (see comment for produit_cartesien_jobs() to know how it works)
-def jobman_insert_nist():
-    jobs = produit_cartesien_jobs(JOB_VALS)
-
-    db = jobman.sql.db(JOBDB)
-    for job in jobs:
-        job.update({jobman.sql.EXPERIMENT: EXPERIMENT_PATH})
-        jobman.sql.insert_dict(job, db)
-
-    print "inserted"
-
-if __name__ == '__main__':
-
-    args = sys.argv[1:]
-
-    #if len(args) > 0 and args[0] == 'load_nist':
-    #    test_load_nist()
-
-    if len(args) > 0 and args[0] == 'jobman_insert':
-        jobman_insert_nist()
-
-    elif len(args) > 0 and args[0] == 'test_jobman_entrypoint':
-        chanmock = DD({'COMPLETE':0,'save':(lambda:None)})
-        jobman_entrypoint(DEFAULT_HP_NIST, chanmock)
-
-    else:
-        print "Bad arguments"
-
--- a/deep/stacked_dae/v2/sgd_optimization.py	Mon Mar 22 10:19:45 2010 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,243 +0,0 @@
-#!/usr/bin/python
-# coding: utf-8
-
-# Generic SdA optimization loop, adapted from the deeplearning.net tutorial
-
-import numpy 
-import theano
-import time
-import datetime
-import theano.tensor as T
-import sys
-
-from jobman import DD
-import jobman, jobman.sql
-
-from stacked_dae import SdA
-
-from ift6266.utils.seriestables import *
-
-default_series = { \
-        'reconstruction_error' : DummySeries(),
-        'training_error' : DummySeries(),
-        'validation_error' : DummySeries(),
-        'test_error' : DummySeries(),
-        'params' : DummySeries()
-        }
-
-def itermax(iter, max):
-    for i,it in enumerate(iter):
-        if i >= max:
-            break
-        yield it
-
-class SdaSgdOptimizer:
-    def __init__(self, dataset, hyperparameters, n_ins, n_outs,
-                    examples_per_epoch, series=default_series, max_minibatches=None):
-        self.dataset = dataset
-        self.hp = hyperparameters
-        self.n_ins = n_ins
-        self.n_outs = n_outs
-   
-        self.max_minibatches = max_minibatches
-        print "SdaSgdOptimizer, max_minibatches =", max_minibatches
-
-        self.ex_per_epoch = examples_per_epoch
-        self.mb_per_epoch = examples_per_epoch / self.hp.minibatch_size
-
-        self.series = series
-
-        self.rng = numpy.random.RandomState(1234)
-
-        self.init_classifier()
-
-        sys.stdout.flush()
-
-    def init_classifier(self):
-        print "Constructing classifier"
-
-        # we don't want to save arrays in DD objects, so
-        # we recreate those arrays here
-        nhl = self.hp.num_hidden_layers
-        layers_sizes = [self.hp.hidden_layers_sizes] * nhl
-        corruption_levels = [self.hp.corruption_levels] * nhl
-
-        # construct the stacked denoising autoencoder class
-        self.classifier = SdA( \
-                          batch_size = self.hp.minibatch_size, \
-                          n_ins= self.n_ins, \
-                          hidden_layers_sizes = layers_sizes, \
-                          n_outs = self.n_outs, \
-                          corruption_levels = corruption_levels,\
-                          rng = self.rng,\
-                          pretrain_lr = self.hp.pretraining_lr, \
-                          finetune_lr = self.hp.finetuning_lr)
-
-        #theano.printing.pydotprint(self.classifier.pretrain_functions[0], "function.graph")
-
-        sys.stdout.flush()
-
-    def train(self):
-        self.pretrain(self.dataset)
-        self.finetune(self.dataset)
-
-    def pretrain(self,dataset):
-        print "STARTING PRETRAINING, time = ", datetime.datetime.now()
-        sys.stdout.flush()
-
-        start_time = time.clock()  
-        ## Pre-train layer-wise 
-        for i in xrange(self.classifier.n_layers):
-            # go through pretraining epochs 
-            for epoch in xrange(self.hp.pretraining_epochs_per_layer):
-                # go through the training set
-                batch_index=0
-                for x,y in dataset.train(self.hp.minibatch_size):
-                    c = self.classifier.pretrain_functions[i](x)
-
-                    self.series["reconstruction_error"].append((epoch, batch_index), c)
-                    batch_index+=1
-
-                    #if batch_index % 100 == 0:
-                    #    print "100 batches"
-
-                    # useful when doing tests
-                    if self.max_minibatches and batch_index >= self.max_minibatches:
-                        break
-                        
-                print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c
-                sys.stdout.flush()
-
-                self.series['params'].append((epoch,), self.classifier.all_params)
-     
-        end_time = time.clock()
-
-        print ('Pretraining took %f minutes' %((end_time-start_time)/60.))
-        self.hp.update({'pretraining_time': end_time-start_time})
-
-        sys.stdout.flush()
-
-    def finetune(self,dataset):
-        print "STARTING FINETUNING, time = ", datetime.datetime.now()
-
-        minibatch_size = self.hp.minibatch_size
-
-        # create a function to compute the mistakes that are made by the model
-        # on the validation set, or testing set
-        test_model = \
-            theano.function(
-                [self.classifier.x,self.classifier.y], self.classifier.errors)
-        #         givens = {
-        #           self.classifier.x: ensemble_x,
-        #           self.classifier.y: ensemble_y]})
-
-        validate_model = \
-            theano.function(
-                [self.classifier.x,self.classifier.y], self.classifier.errors)
-        #        givens = {
-        #           self.classifier.x: ,
-        #           self.classifier.y: ]})
-
-
-        # early-stopping parameters
-        patience              = 10000 # look as this many examples regardless
-        patience_increase     = 2.    # wait this much longer when a new best is 
-                                      # found
-        improvement_threshold = 0.995 # a relative improvement of this much is 
-                                      # considered significant
-        validation_frequency  = min(self.mb_per_epoch, patience/2)
-                                      # go through this many 
-                                      # minibatche before checking the network 
-                                      # on the validation set; in this case we 
-                                      # check every epoch 
-        if self.max_minibatches and validation_frequency > self.max_minibatches:
-            validation_frequency = self.max_minibatches / 2
-
-        best_params          = None
-        best_validation_loss = float('inf')
-        test_score           = 0.
-        start_time = time.clock()
-
-        done_looping = False
-        epoch = 0
-
-        total_mb_index = 0
-
-        while (epoch < self.hp.max_finetuning_epochs) and (not done_looping):
-            epoch = epoch + 1
-            minibatch_index = -1
-            for x,y in dataset.train(minibatch_size):
-                minibatch_index += 1
-                cost_ij = self.classifier.finetune(x,y)
-                total_mb_index += 1
-
-                self.series["training_error"].append((epoch, minibatch_index), cost_ij)
-
-                if (total_mb_index+1) % validation_frequency == 0: 
-                    
-                    iter = dataset.valid(minibatch_size)
-                    if self.max_minibatches:
-                        iter = itermax(iter, self.max_minibatches)
-                    validation_losses = [validate_model(x,y) for x,y in iter]
-                    this_validation_loss = numpy.mean(validation_losses)
-
-                    self.series["validation_error"].\
-                        append((epoch, minibatch_index), this_validation_loss*100.)
-
-                    print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                           (epoch, minibatch_index+1, self.mb_per_epoch, \
-                            this_validation_loss*100.))
-
-
-                    # if we got the best validation score until now
-                    if this_validation_loss < best_validation_loss:
-
-                        #improve patience if loss improvement is good enough
-                        if this_validation_loss < best_validation_loss *  \
-                               improvement_threshold :
-                            patience = max(patience, total_mb_index * patience_increase)
-
-                        # save best validation score and iteration number
-                        best_validation_loss = this_validation_loss
-                        best_iter = total_mb_index
-
-                        # test it on the test set
-                        iter = dataset.test(minibatch_size)
-                        if self.max_minibatches:
-                            iter = itermax(iter, self.max_minibatches)
-                        test_losses = [test_model(x,y) for x,y in iter]
-                        test_score = numpy.mean(test_losses)
-
-                        self.series["test_error"].\
-                            append((epoch, minibatch_index), test_score*100.)
-
-                        print(('     epoch %i, minibatch %i/%i, test error of best '
-                              'model %f %%') % 
-                                     (epoch, minibatch_index+1, self.mb_per_epoch,
-                                      test_score*100.))
-
-                    sys.stdout.flush()
-
-                # useful when doing tests
-                if self.max_minibatches and minibatch_index >= self.max_minibatches:
-                    break
-
-            self.series['params'].append((epoch,), self.classifier.all_params)
-
-            if patience <= total_mb_index:
-                done_looping = True
-                break
-
-        end_time = time.clock()
-        self.hp.update({'finetuning_time':end_time-start_time,\
-                    'best_validation_error':best_validation_loss,\
-                    'test_score':test_score,
-                    'num_finetuning_epochs':epoch})
-
-        print(('Optimization complete with best validation score of %f %%,'
-               'with test performance %f %%') %  
-                     (best_validation_loss * 100., test_score*100.))
-        print ('The finetuning ran for %f minutes' % ((end_time-start_time)/60.))
-
-
-
--- a/deep/stacked_dae/v2/stacked_dae.py	Mon Mar 22 10:19:45 2010 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,292 +0,0 @@
-#!/usr/bin/python
-# coding: utf-8
-
-import numpy 
-import theano
-import time
-import theano.tensor as T
-from theano.tensor.shared_randomstreams import RandomStreams
-import copy
-
-from utils import update_locals
-
-# taken from LeDeepNet/daa.py
-# has a special case when taking log(0) (defined =0)
-# modified to not take the mean anymore
-from theano.tensor.xlogx import xlogx, xlogy0
-# it's target*log(output)
-def binary_cross_entropy(target, output, sum_axis=1):
-    XE = xlogy0(target, output) + xlogy0((1 - target), (1 - output))
-    return -T.sum(XE, axis=sum_axis)
-
-class LogisticRegression(object):
-    def __init__(self, input, n_in, n_out):
-        # initialize with 0 the weights W as a matrix of shape (n_in, n_out) 
-        self.W = theano.shared( value=numpy.zeros((n_in,n_out),
-                                            dtype = theano.config.floatX) )
-        # initialize the baises b as a vector of n_out 0s
-        self.b = theano.shared( value=numpy.zeros((n_out,), 
-                                            dtype = theano.config.floatX) )
-        # compute vector of class-membership probabilities in symbolic form
-        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b)
-        
-        # compute prediction as class whose probability is maximal in 
-        # symbolic form
-        self.y_pred=T.argmax(self.p_y_given_x, axis=1)
-
-        # list of parameters for this layer
-        self.params = [self.W, self.b]
-
-    def negative_log_likelihood(self, y):
-       return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
-
-    def errors(self, y):
-        # check if y has same dimension of y_pred 
-        if y.ndim != self.y_pred.ndim:
-            raise TypeError('y should have the same shape as self.y_pred', 
-                ('y', target.type, 'y_pred', self.y_pred.type))
-
-        # check if y is of the correct datatype        
-        if y.dtype.startswith('int'):
-            # the T.neq operator returns a vector of 0s and 1s, where 1
-            # represents a mistake in prediction
-            return T.mean(T.neq(self.y_pred, y))
-        else:
-            raise NotImplementedError()
-
-
-class SigmoidalLayer(object):
-    def __init__(self, rng, input, n_in, n_out):
-        self.input = input
-
-        W_values = numpy.asarray( rng.uniform( \
-              low = -numpy.sqrt(6./(n_in+n_out)), \
-              high = numpy.sqrt(6./(n_in+n_out)), \
-              size = (n_in, n_out)), dtype = theano.config.floatX)
-        self.W = theano.shared(value = W_values)
-
-        b_values = numpy.zeros((n_out,), dtype= theano.config.floatX)
-        self.b = theano.shared(value= b_values)
-
-        self.output = T.nnet.sigmoid(T.dot(input, self.W) + self.b)
-        self.params = [self.W, self.b]
-
-
-
-class dA(object):
-  def __init__(self, n_visible= 784, n_hidden= 500, corruption_level = 0.1,\
-               input = None, shared_W = None, shared_b = None):
-    self.n_visible = n_visible
-    self.n_hidden  = n_hidden
-    
-    # create a Theano random generator that gives symbolic random values
-    theano_rng = RandomStreams()
-    
-    if shared_W != None and shared_b != None : 
-        self.W = shared_W
-        self.b = shared_b
-    else:
-        # initial values for weights and biases
-        # note : W' was written as `W_prime` and b' as `b_prime`
-
-        # W is initialized with `initial_W` which is uniformely sampled
-        # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible)
-        # the output of uniform if converted using asarray to dtype 
-        # theano.config.floatX so that the code is runable on GPU
-        initial_W = numpy.asarray( numpy.random.uniform( \
-              low = -numpy.sqrt(6./(n_hidden+n_visible)), \
-              high = numpy.sqrt(6./(n_hidden+n_visible)), \
-              size = (n_visible, n_hidden)), dtype = theano.config.floatX)
-        initial_b       = numpy.zeros(n_hidden, dtype = theano.config.floatX)
-    
-    
-        # theano shared variables for weights and biases
-        self.W       = theano.shared(value = initial_W,       name = "W")
-        self.b       = theano.shared(value = initial_b,       name = "b")
-    
- 
-    initial_b_prime= numpy.zeros(n_visible)
-    # tied weights, therefore W_prime is W transpose
-    self.W_prime = self.W.T 
-    self.b_prime = theano.shared(value = initial_b_prime, name = "b'")
-
-    # if no input is given, generate a variable representing the input
-    if input == None : 
-        # we use a matrix because we expect a minibatch of several examples,
-        # each example being a row
-        self.x = T.dmatrix(name = 'input') 
-    else:
-        self.x = input
-    # Equation (1)
-    # keep 90% of the inputs the same and zero-out randomly selected subset of 10% of the inputs
-    # note : first argument of theano.rng.binomial is the shape(size) of 
-    #        random numbers that it should produce
-    #        second argument is the number of trials 
-    #        third argument is the probability of success of any trial
-    #
-    #        this will produce an array of 0s and 1s where 1 has a 
-    #        probability of 1 - ``corruption_level`` and 0 with
-    #        ``corruption_level``
-    self.tilde_x  = theano_rng.binomial( self.x.shape,  1,  1 - corruption_level, dtype=theano.config.floatX) * self.x
-    # Equation (2)
-    # note  : y is stored as an attribute of the class so that it can be 
-    #         used later when stacking dAs. 
-    self.y   = T.nnet.sigmoid(T.dot(self.tilde_x, self.W      ) + self.b)
-    # Equation (3)
-    #self.z   = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
-    # Equation (4)
-    # note : we sum over the size of a datapoint; if we are using minibatches,
-    #        L will  be a vector, with one entry per example in minibatch
-    #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) 
-    #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1)
-
-    # bypassing z to avoid running to log(0)
-    z_a = T.dot(self.y, self.W_prime) + self.b_prime
-    log_sigmoid = T.log(1.) - T.log(1.+T.exp(-z_a))
-    # log(1-sigmoid(z_a))
-    log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a))
-    self.L = -T.sum( self.x * (log_sigmoid) \
-                    + (1.0-self.x) * (log_1_sigmoid), axis=1 )
-
-    # I added this epsilon to avoid getting log(0) and 1/0 in grad
-    # This means conceptually that there'd be no probability of 0, but that
-    # doesn't seem to me as important (maybe I'm wrong?).
-    #eps = 0.00000001
-    #eps_1 = 1-eps
-    #self.L = - T.sum( self.x * T.log(eps + eps_1*self.z) \
-    #                + (1-self.x)*T.log(eps + eps_1*(1-self.z)), axis=1 )
-    # note : L is now a vector, where each element is the cross-entropy cost 
-    #        of the reconstruction of the corresponding example of the 
-    #        minibatch. We need to compute the average of all these to get 
-    #        the cost of the minibatch
-    self.cost = T.mean(self.L)
-
-    self.params = [ self.W, self.b, self.b_prime ]
-
-
-class SdA(object):
-    def __init__(self, batch_size, n_ins, 
-                 hidden_layers_sizes, n_outs, 
-                 corruption_levels, rng, pretrain_lr, finetune_lr):
-        # Just to make sure those are not modified somewhere else afterwards
-        hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes)
-        corruption_levels = copy.deepcopy(corruption_levels)
-
-        update_locals(self, locals())      
- 
-        self.layers             = []
-        self.pretrain_functions = []
-        self.params             = []
-        # MODIF: added this so we also get the b_primes
-        # (not used for finetuning... still using ".params")
-        self.all_params         = []
-        self.n_layers           = len(hidden_layers_sizes)
-
-        print "Creating SdA with params:"
-        print "batch_size", batch_size
-        print "hidden_layers_sizes", hidden_layers_sizes
-        print "corruption_levels", corruption_levels
-        print "n_ins", n_ins
-        print "n_outs", n_outs
-        print "pretrain_lr", pretrain_lr
-        print "finetune_lr", finetune_lr
-        print "----"
-
-        if len(hidden_layers_sizes) < 1 :
-            raiseException (' You must have at least one hidden layer ')
-
-
-        # allocate symbolic variables for the data
-        #index   = T.lscalar()    # index to a [mini]batch 
-        self.x  = T.matrix('x')  # the data is presented as rasterized images
-        self.y  = T.ivector('y') # the labels are presented as 1D vector of 
-                                 # [int] labels
-
-        for i in xrange( self.n_layers ):
-            # construct the sigmoidal layer
-
-            # the size of the input is either the number of hidden units of 
-            # the layer below or the input size if we are on the first layer
-            if i == 0 :
-                input_size = n_ins
-            else:
-                input_size = hidden_layers_sizes[i-1]
-
-            # the input to this layer is either the activation of the hidden
-            # layer below or the input of the SdA if you are on the first
-            # layer
-            if i == 0 : 
-                layer_input = self.x
-            else:
-                layer_input = self.layers[-1].output
-
-            layer = SigmoidalLayer(rng, layer_input, input_size, 
-                                   hidden_layers_sizes[i] )
-            # add the layer to the 
-            self.layers += [layer]
-            self.params += layer.params
-        
-            # Construct a denoising autoencoder that shared weights with this
-            # layer
-            dA_layer = dA(input_size, hidden_layers_sizes[i], \
-                          corruption_level = corruption_levels[0],\
-                          input = layer_input, \
-                          shared_W = layer.W, shared_b = layer.b)
-
-            self.all_params += dA_layer.params
-        
-            # Construct a function that trains this dA
-            # compute gradients of layer parameters
-            gparams = T.grad(dA_layer.cost, dA_layer.params)
-            # compute the list of updates
-            updates = {}
-            for param, gparam in zip(dA_layer.params, gparams):
-                updates[param] = param - gparam * pretrain_lr
-            
-            # create a function that trains the dA
-            update_fn = theano.function([self.x], dA_layer.cost, \
-                  updates = updates)#,
-            #     givens = { 
-            #         self.x : ensemble})
-            # collect this function into a list
-            #update_fn = theano.function([index], dA_layer.cost, \
-            #      updates = updates,
-            #      givens = { 
-            #         self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider})
-            # collect this function into a list
-            self.pretrain_functions += [update_fn]
-
-        
-        # We now need to add a logistic layer on top of the MLP
-        self.logLayer = LogisticRegression(\
-                         input = self.layers[-1].output,\
-                         n_in = hidden_layers_sizes[-1], n_out = n_outs)
-
-        self.params += self.logLayer.params
-        self.all_params += self.logLayer.params
-        # construct a function that implements one step of finetuning
-
-        # compute the cost, defined as the negative log likelihood 
-        cost = self.logLayer.negative_log_likelihood(self.y)
-        # compute the gradients with respect to the model parameters
-        gparams = T.grad(cost, self.params)
-        # compute list of updates
-        updates = {}
-        for param,gparam in zip(self.params, gparams):
-            updates[param] = param - gparam*finetune_lr
-            
-        self.finetune = theano.function([self.x,self.y], cost, 
-                updates = updates)#,
-        #        givens = {
-        #          self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider,
-        #          self.y : train_set_y[index*batch_size:(index+1)*batch_size]} )
-
-        # symbolic variable that points to the number of errors made on the
-        # minibatch given by self.x and self.y
-
-        self.errors = self.logLayer.errors(self.y)
-
-if __name__ == '__main__':
-    import sys
-    args = sys.argv[1:]
-
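
The constructor deleted above builds one pretraining function per layer by hand: take the gradient of the layer-local dA cost, turn it into an SGD update dictionary, and compile it with theano.function. A minimal self-contained sketch of that pattern, assuming the 2010-era Theano API used in this repository (the quadratic cost, the sizes and the learning rate are placeholders, not the repo's dA cost):

    import numpy, theano
    import theano.tensor as T

    x = T.matrix('x')
    W = theano.shared(numpy.zeros((784, 500), dtype=theano.config.floatX), name='W')
    b = theano.shared(numpy.zeros(500, dtype=theano.config.floatX), name='b')
    # stand-in for dA_layer.cost: any scalar expression of the parameters
    cost = T.mean((T.dot(x, W) + b) ** 2)

    # same pattern as the deleted code: gradients -> update dict -> function
    gparams = T.grad(cost, [W, b])
    updates = {}
    for param, gparam in zip([W, b], gparams):
        updates[param] = param - 0.01 * gparam   # pretrain_lr
    pretrain_fn = theano.function([x], cost, updates=updates)
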
--- a/deep/stacked_dae/v2/utils.py	Mon Mar 22 10:19:45 2010 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,69 +0,0 @@
-#!/usr/bin/python
-# coding: utf-8
-
-from __future__ import with_statement
-
-from jobman import DD
-
-# from pylearn codebase
-# useful in __init__(param1, param2, etc.) to save
-# values in self.param1, self.param2... just call
-# update_locals(self, locals())
-def update_locals(obj, dct):
-    if 'self' in dct:
-        del dct['self']
-    obj.__dict__.update(dct)
-
-# from a dictionary of possible values for hyperparameters, e.g.
-# hp_values = {'learning_rate':[0.1, 0.01], 'num_layers': [1,2]}
-# create a list of other dictionaries representing all the possible
-# combinations, thus in this example creating:
-# [{'learning_rate': 0.1, 'num_layers': 1}, ...]
-# (similarly for combinations (0.1, 2), (0.01, 1), (0.01, 2))
-def produit_cartesien_jobs(val_dict):
-    job_list = [DD()]
-    all_keys = val_dict.keys()
-
-    for key in all_keys:
-        possible_values = val_dict[key]
-        new_job_list = []
-        for val in possible_values:
-            for job in job_list:
-                to_insert = job.copy()
-                to_insert.update({key: val})
-                new_job_list.append(to_insert)
-        job_list = new_job_list
-
-    return job_list
-
-def test_produit_cartesien_jobs():
-    vals = {'a': [1,2], 'b': [3,4,5]}
-    print produit_cartesien_jobs(vals)
-
-
-# taken from http://stackoverflow.com/questions/276052/how-to-get-current-cpu-and-ram-usage-in-python
-"""Simple module for getting amount of memory used by a specified user's
-processes on a UNIX system.
-It uses UNIX ps utility to get the memory usage for a specified username and
-pipe it to awk for summing up per application memory usage and return the total.
-Python's Popen() from subprocess module is used for spawning ps and awk.
-
-"""
-
-import subprocess
-
-class MemoryMonitor(object):
-
-    def __init__(self, username):
-        """Create new MemoryMonitor instance."""
-        self.username = username
-
-    def usage(self):
-        """Return int containing memory used by user's processes."""
-        self.process = subprocess.Popen("ps -u %s -o rss | awk '{sum+=$1} END {print sum}'" % self.username,
-                                        shell=True,
-                                        stdout=subprocess.PIPE,
-                                        )
-        self.stdout_list = self.process.communicate()[0].split('\n')
-        return int(self.stdout_list[0])
-
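
produit_cartesien_jobs, deleted above, is a hand-rolled cartesian product over hyperparameter values. For reference, a sketch of the same expansion using only the standard library (plain dicts standing in for jobman's DD objects; illustrative, not a drop-in replacement):

    import itertools

    def cartesian_jobs(val_dict):
        # {'a': [1, 2], 'b': [3, 4, 5]} -> one dict per combination (6 here)
        keys = val_dict.keys()
        return [dict(zip(keys, combo))
                for combo in itertools.product(*[val_dict[k] for k in keys])]

    print cartesian_jobs({'a': [1, 2], 'b': [3, 4, 5]})
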
--- a/deep/stacked_dae/v_sylvain/nist_sda.py	Mon Mar 22 10:19:45 2010 -0400
+++ b/deep/stacked_dae/v_sylvain/nist_sda.py	Mon Mar 22 10:20:10 2010 -0400
@@ -60,7 +60,7 @@
 
     print "Creating optimizer with state, ", state
 
-    optimizer = SdaSgdOptimizer(dataset=datasets.nist_all, 
+    optimizer = SdaSgdOptimizer(dataset=datasets.nist_all(), 
                                     hyperparameters=state, \
                                     n_ins=n_ins, n_outs=n_outs,\
                                     examples_per_epoch=examples_per_epoch, \
@@ -68,7 +68,7 @@
                                     max_minibatches=rtt)
 
     parameters=[]
-    optimizer.pretrain(datasets.nist_all)
+    optimizer.pretrain(datasets.nist_P07())
     channel.save()
     
     #Set some of the parameters used for the finetuning
@@ -92,31 +92,31 @@
     if finetune_choice==0:
         print('\n\n\tfinetune with nist\n\n')
         optimizer.reload_parameters()
-        optimizer.finetune(datasets.nist_all,max_finetune_epoch_NIST)
+        optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1)
     if finetune_choice==1:
         print('\n\n\tfinetune with P07\n\n')
         optimizer.reload_parameters()
-        optimizer.finetune(datasets.nist_P07,max_finetune_epoch_P07)
+        optimizer.finetune(datasets.nist_P07(),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0)
     if finetune_choice==2:
         print('\n\n\tfinetune with nist followed by P07\n\n')
         optimizer.reload_parameters()
-        optimizer.finetune(datasets.nist_all,max_finetune_epoch_NIST)
-        optimizer.finetune(datasets.nist_P07,max_finetune_epoch_P07)
+        optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1)
+        optimizer.finetune(datasets.nist_P07(),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0)
 
     if finetune_choice==-1:
         print('\nSeries of 3 fine-tuning runs')
         print('\n\n\tfinetune with nist\n\n')
         optimizer.reload_parameters()
-        optimizer.finetune(datasets.nist_all,max_finetune_epoch_NIST)
+        optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1)
         channel.save()
         print('\n\n\tfinetune with P07\n\n')
         optimizer.reload_parameters()
-        optimizer.finetune(datasets.nist_P07,max_finetune_epoch_P07)
+        optimizer.finetune(datasets.nist_P07(),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0)
         channel.save()
         print('\n\n\tfinetune with nist followed by P07\n\n')
         optimizer.reload_parameters()
-        optimizer.finetune(datasets.nist_all,max_finetune_epoch_NIST)
-        optimizer.finetune(datasets.nist_P07,max_finetune_epoch_P07)
+        optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1)
+        optimizer.finetune(datasets.nist_P07(),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0)
         channel.save()
     
     channel.save()
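
The convention behind the edited calls above: the first argument is the dataset finetuned on, the second is an extra test set, and ind_test only selects the label printed for that second set (0 for "NIST", anything else for "P07"). Schematically:

    # finetune on NIST; also report the error on P07 after each improvement
    optimizer.finetune(datasets.nist_all(), datasets.nist_P07(),
                       max_finetune_epoch_NIST, ind_test=1)
    # finetune on P07; also report the error on NIST
    optimizer.finetune(datasets.nist_P07(), datasets.nist_all(),
                       max_finetune_epoch_P07, ind_test=0)
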
--- a/deep/stacked_dae/v_sylvain/sgd_optimization.py	Mon Mar 22 10:19:45 2010 -0400
+++ b/deep/stacked_dae/v_sylvain/sgd_optimization.py	Mon Mar 22 10:20:10 2010 -0400
@@ -9,6 +9,7 @@
 import datetime
 import theano.tensor as T
 import sys
+import pickle
 
 from jobman import DD
 import jobman, jobman.sql
@@ -121,12 +122,20 @@
         
         #To be able to load them later for tests on finetune
         self.parameters_pre=[copy(x.value) for x in self.classifier.params]
+        f = open('params_pretrain.txt', 'w')
+        pickle.dump(self.parameters_pre,f)
+        f.close()
 
 
-    def finetune(self,dataset,num_finetune):
+    def finetune(self,dataset,dataset_test,num_finetune,ind_test):
         print "STARTING FINETUNING, time = ", datetime.datetime.now()
 
         minibatch_size = self.hp.minibatch_size
+        if ind_test == 0:
+            nom_test = "NIST"
+        else:
+            nom_test = "P07"
+
 
         # create a function to compute the mistakes that are made by the model
         # on the validation set, or testing set
@@ -213,6 +222,13 @@
                             iter = itermax(iter, self.max_minibatches)
                         test_losses = [test_model(x,y) for x,y in iter]
                         test_score = numpy.mean(test_losses)
+                        
+                        #test it on the second test set
+                        iter2 = dataset_test.test(minibatch_size)
+                        if self.max_minibatches:
+                            iter2 = itermax(iter2, self.max_minibatches)
+                        test_losses2 = [test_model(x,y) for x,y in iter2]
+                        test_score2 = numpy.mean(test_losses2)
 
                         self.series["test_error"].\
                             append((epoch, minibatch_index), test_score*100.)
@@ -221,6 +237,11 @@
                               'model %f %%') % 
                                      (epoch, minibatch_index+1,
                                       test_score*100.))
+                                    
+                        print(('     epoch %i, minibatch %i, test error on dataset %s of best '
+                              'model %f %%') % 
+                                     (epoch, minibatch_index+1,nom_test,
+                                      test_score2*100.))
 
                     sys.stdout.flush()
 
@@ -243,12 +264,18 @@
         print(('Optimization complete with best validation score of %f %%,'
                'with test performance %f %%') %  
                      (best_validation_loss * 100., test_score*100.))
+        print(('The test score on the %s dataset is %f %%')%(nom_test,test_score2*100.))
+        
         print ('The finetuning ran for %f minutes' % ((end_time-start_time)/60.))
         
 
     #Set parameters like they were right after pre-training
     def reload_parameters(self):
-
+        
+        f = open('params_pretrain.txt')
+        self.parameters_pre=pickle.load(f)
+        f.close()
         for idx,x in enumerate(self.parameters_pre):
             self.classifier.params[idx].value=copy(x)
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/setup_batches.py	Mon Mar 22 10:20:10 2010 -0400
@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+
+import random
+from pylearn.io import filetensor as ft
+
+class Batches():
+  def __init__(self):
+    data_path = '/data/lisa/data/nist/by_class/'
+
+    digits_train_data = 'digits/digits_train_data.ft'
+    digits_train_labels = 'digits/digits_train_labels.ft'
+    digits_test_data = 'digits/digits_test_data.ft'
+    digits_test_labels = 'digits/digits_test_labels.ft'
+
+    lower_train_data = 'lower/lower_train_data.ft'
+    lower_train_labels = 'lower/lower_train_labels.ft'
+    #upper_train_data = 'upper/upper_train_data.ft'
+    #upper_train_labels = 'upper/upper_train_labels.ft'
+
+    f_digits_train_data = open(data_path + digits_train_data)
+    f_digits_train_labels = open(data_path + digits_train_labels)
+    f_digits_test_data = open(data_path + digits_test_data)
+    f_digits_test_labels = open(data_path + digits_test_labels)
+
+    f_lower_train_data = open(data_path + lower_train_data)
+    f_lower_train_labels = open(data_path + lower_train_labels)
+    #f_upper_train_data = open(data_path + upper_train_data)
+    #f_upper_train_labels = open(data_path + upper_train_labels)
+
+    self.raw_digits_train_data = ft.read(f_digits_train_data)
+    self.raw_digits_train_labels = ft.read(f_digits_train_labels)
+    self.raw_digits_test_data = ft.read(f_digits_test_data)
+    self.raw_digits_test_labels = ft.read(f_digits_test_labels)
+
+    self.raw_lower_train_data = ft.read(f_lower_train_data)
+    self.raw_lower_train_labels = ft.read(f_lower_train_labels)
+    #self.raw_upper_train_data = ft.read(f_upper_train_data)
+    #self.raw_upper_train_labels = ft.read(f_upper_train_labels)
+
+    f_digits_train_data.close()
+    f_digits_train_labels.close()
+    f_digits_test_data.close()
+    f_digits_test_labels.close()
+
+    f_lower_train_data.close()
+    f_lower_train_labels.close()
+    #f_upper_train_data.close()
+    #f_upper_train_labels.close()
+
+  def set_batches(self, start_ratio = -1, end_ratio = -1, batch_size = 20, verbose = False):
+    self.batch_size = batch_size
+
+    digits_train_size = len(self.raw_digits_train_labels)
+    digits_test_size = len(self.raw_digits_test_labels)
+
+    lower_train_size = len(self.raw_lower_train_labels)
+    #upper_train_size = len(self.raw_upper_train_labels)
+
+    if verbose == True:
+      print 'digits_train_size = %d' %digits_train_size
+      print 'digits_test_size = %d' %digits_test_size
+      print 'lower_train_size = %d' %lower_train_size
+      #print 'upper_train_size = %d' %upper_train_size
+
+    # define main and other datasets
+    raw_main_train_data = self.raw_digits_train_data
+    raw_other_train_data = self.raw_lower_train_data
+    raw_test_data = self.raw_digits_test_data
+
+    raw_main_train_labels = self.raw_digits_train_labels
+    raw_other_train_labels = self.raw_lower_train_labels
+    raw_test_labels = self.raw_digits_test_labels
+
+    main_train_size = len(raw_main_train_data)
+    other_train_size = len(raw_other_train_data)
+    test_size = len(raw_test_data)
+    test_size = int(test_size/batch_size)
+    test_size *= batch_size
+    validation_size = test_size 
+
+    # default ratio is actual ratio
+    if start_ratio == -1:
+      self.start_ratio = float(main_train_size) / float(main_train_size + other_train_size)
+    else:
+      self.start_ratio = start_ratio
+
+    if end_ratio == -1:
+      self.end_ratio = float(main_train_size) / float(main_train_size + other_train_size)
+    else:
+      self.end_ratio = end_ratio
+
+    if verbose == True:
+      print 'start_ratio = %f' %self.start_ratio
+      print 'end_ratio = %f' %self.end_ratio
+
+    i_main = 0
+    i_other = 0
+    i_batch = 0
+
+    # compute the number of batches given start and end ratios
+    n_main_batch = (main_train_size - batch_size * (self.end_ratio - self.start_ratio) / 2 ) / (batch_size * (self.start_ratio + (self.end_ratio - self.start_ratio) / 2))
+    n_other_batch = (other_train_size - batch_size * (self.end_ratio - self.start_ratio) / 2 ) / (batch_size - batch_size * (self.start_ratio + (self.end_ratio - self.start_ratio) / 2))
+    n_batches = min([n_main_batch, n_other_batch])
+
+    # train batches
+    self.train_batches = []
+
+    # as long as we have data left in main and other, we create batches
+    while i_main < main_train_size - batch_size - test_size  and i_other < other_train_size - batch_size:
+
+      ratio = self.start_ratio + i_batch * (self.end_ratio - self.start_ratio) / n_batches
+      batch_data = []
+      batch_labels = []
+
+      for i in xrange(0, self.batch_size): # randomly choose between main and other, given the current ratio
+        rnd = random.randint(0, 100)
+
+        if rnd < 100 * ratio:
+          batch_data = batch_data + \
+              [raw_main_train_data[i_main]]
+          batch_labels = batch_labels + \
+              [raw_main_train_labels[i_main]]
+          i_main += 1
+        else:
+          batch_data = batch_data + \
+              [raw_other_train_data[i_other]]
+          batch_labels = batch_labels + \
+              [raw_other_train_labels[i_other]]
+          i_other += 1
+
+      self.train_batches = self.train_batches + \
+          [(batch_data,batch_labels)]
+      i_batch += 1
+
+    offset = i_main
+
+    if verbose == True:
+      print 'n_main = %d' %i_main
+      print 'n_other = %d' %i_other
+      print 'nb_train_batches = %d / %d' %(i_batch,n_batches)
+      print 'offset = %d' %offset
+
+    # test batches
+    self.test_batches = []
+    for i in xrange(0, test_size, batch_size):
+        self.test_batches = self.test_batches + \
+            [(raw_test_data[i:i+batch_size], raw_test_labels[i:i+batch_size])]
+
+    # validation batches
+    self.validation_batches = []
+    for i in xrange(0, test_size, batch_size):
+        self.validation_batches = self.validation_batches + \
+            [(raw_main_train_data[offset+i:offset+i+batch_size], raw_main_train_labels[offset+i:offset+i+batch_size])]
+
+  def get_train_batches(self):
+    return self.train_batches
+
+  def get_test_batches(self):
+    return self.test_batches
+
+  def get_validation_batches(self):
+    return self.validation_batches
+
+  def test_set_batches(self, interval = 1000):
+    for i in xrange(0, len(self.train_batches) - self.batch_size, interval):
+      n_main = 0
+
+      for j in xrange(0, self.batch_size):
+        if self.train_batches[i][1][j] < 10:
+          n_main += 1
+      print 'ratio batch %d : %f' % (i, float(n_main) / float(self.batch_size))
+
+if __name__ == '__main__':
+    batches = Batches()
+    batches.set_batches(0.5,1, 20, True)
+    batches.test_set_batches()
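
The interesting piece of set_batches is the linearly interpolated class mix: batch i draws each example from the main set with probability start_ratio + i * (end_ratio - start_ratio) / n_batches. A standalone sketch of just that schedule, reusing the same inclusive randint(0, 100) draw, so it can be eyeballed without loading NIST:

    import random

    def mix_schedule(start_ratio, end_ratio, n_batches, batch_size=20):
        for i_batch in xrange(n_batches):
            ratio = start_ratio + i_batch * (end_ratio - start_ratio) / n_batches
            draws = [random.randint(0, 100) < 100 * ratio
                     for _ in xrange(batch_size)]
            print 'batch %d: target %.2f, drawn %.2f' % (
                i_batch, ratio, float(sum(draws)) / batch_size)

    mix_schedule(0.5, 1.0, 5)   # same 0.5 -> 1.0 ramp as the __main__ block above
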