# HG changeset patch
# User fsavard
# Date 1267638700 18000
# Node ID e656edaedb486c452835e3c1fbe10e38b578858a
# Parent 3632e6258642499640868362bfe9bd294429b5c2
Commented a few things, renamed the produit_croise_jobs function, replaced the cost function (NOT TESTED YET).

diff -r 3632e6258642 -r e656edaedb48 deep/stacked_dae/nist_sda.py
--- a/deep/stacked_dae/nist_sda.py	Tue Mar 02 14:47:18 2010 -0500
+++ b/deep/stacked_dae/nist_sda.py	Wed Mar 03 12:51:40 2010 -0500
@@ -21,7 +21,7 @@
 import jobman, jobman.sql
 from pylearn.io import filetensor
 
-from utils import produit_croise_jobs
+from utils import produit_cartesien_jobs
 
 from sgd_optimization import SdaSgdOptimizer
 
@@ -31,7 +31,7 @@
 
 NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
 
-JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/fsavard_sda2'
+JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_db/fsavard_sda4'
 
 REDUCE_TRAIN_TO = None
 MAX_FINETUNING_EPOCHS = 1000
@@ -43,6 +43,10 @@
 
 EXPERIMENT_PATH = "ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint"
 
+# Possible values the hyperparameters can take. These are then
+# combined with produit_cartesien_jobs so we get a list of all
+# possible combinations, each one resulting in a job inserted
+# in the jobman DB.
 JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001],
         'pretraining_epochs_per_layer': [10,20],
         'hidden_layers_sizes': [300,800],
@@ -63,7 +67,11 @@
         #'reduce_train_to':300,
         'num_hidden_layers':2})
 
+# Function called by jobman upon launching each job
+# Its path is the one given when inserting jobs:
+# ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint
 def jobman_entrypoint(state, channel):
+    # record mercurial versions of each package
     pylearn.version.record_versions(state,[theano,ift6266,pylearn])
     channel.save()
 
@@ -71,10 +79,12 @@
 
     print "Will load NIST"
 
-    nist = NIST(20)
+    nist = NIST(minibatch_size=20)
 
     print "NIST loaded"
 
+    # For test runs, we don't want to use the whole dataset so
+    # reduce it to fewer elements if asked to.
     rtt = None
    if state.has_key('reduce_train_to'):
         rtt = state['reduce_train_to']
@@ -82,7 +92,7 @@
         rtt = REDUCE_TRAIN_TO
 
     if rtt:
-        print "Reducing training set to "+str( rtt)+ " examples"
+        print "Reducing training set to "+str(rtt)+ " examples"
         nist.reduce_train_set(rtt)
 
     train,valid,test = nist.get_tvt()
@@ -91,14 +101,9 @@
     n_ins = 32*32
     n_outs = 62 # 10 digits, 26*2 (lower, capitals)
 
-    hls = state.hidden_layers_sizes
-    cl = state.corruption_levels
-    nhl = state.num_hidden_layers
-    state.hidden_layers_sizes = [hls] * nhl
-    state.corruption_levels = [cl] * nhl
-
-    # b,b',W for each hidden layer + b,W of last layer (logreg)
-    numparams = nhl * 3 + 2
+    # b,b',W for each hidden layer
+    # + b,W of last layer (logreg)
+    numparams = state.num_hidden_layers * 3 + 2
 
     series_mux = None
     series_mux = create_series(workingdir, numparams)
@@ -114,11 +119,10 @@
     optimizer.finetune()
     channel.save()
 
-    pylearn.version.record_versions(state,[theano,ift6266,pylearn])
-    channel.save()
-
     return channel.COMPLETE
 
+# These Series objects are used to save various statistics
+# during the training.
 def create_series(basedir, numparams):
 
     mux = SeriesMultiplexer()
@@ -140,8 +144,11 @@
 
     return mux
 
+# Perform insertion into the Postgre DB based on combination
+# of hyperparameter values above
+# (see comment for produit_cartesien_jobs() to know how it works)
 def jobman_insert_nist():
-    jobs = produit_croise_jobs(JOB_VALS)
+    jobs = produit_cartesien_jobs(JOB_VALS)
     db = jobman.sql.db(JOBDB)
 
     for job in jobs:
@@ -227,35 +234,6 @@
 
     raw_input("Press any key")
 
-# hp for hyperparameters
-def sgd_optimization_nist(hp=None, dataset_dir='/data/lisa/data/nist'):
-    global DEFAULT_HP_NIST
-    hp = hp and hp or DEFAULT_HP_NIST
-
-    print "Will load NIST"
-
-    import time
-    t1 = time.time()
-    nist = NIST(20, reduce_train_to=100)
-    t2 = time.time()
-
-    print "NIST loaded. time delta = ", t2-t1
-
-    train,valid,test = nist.get_tvt()
-    dataset = (train,valid,test)
-
-    print train[0][15]
-    print type(train[0][1])
-
-
-    print "Lengths train, valid, test: ", len(train[0]), len(valid[0]), len(test[0])
-
-    n_ins = 32*32
-    n_outs = 62 # 10 digits, 26*2 (lower, capitals)
-
-    optimizer = SdaSgdOptimizer(dataset, hp, n_ins, n_outs, input_divider=255.0)
-    optimizer.train()
-
 if __name__ == '__main__':
     import sys
@@ -277,5 +255,4 @@
     elif len(args) > 0 and args[0] == 'estimate':
         estimate_total_time()
     else:
-        sgd_optimization_nist()
-
+        print "Bad arguments"
diff -r 3632e6258642 -r e656edaedb48 deep/stacked_dae/sgd_optimization.py
--- a/deep/stacked_dae/sgd_optimization.py	Tue Mar 02 14:47:18 2010 -0500
+++ b/deep/stacked_dae/sgd_optimization.py	Wed Mar 03 12:51:40 2010 -0500
@@ -60,20 +60,27 @@
         # compute number of minibatches for training, validation and testing
         self.n_train_batches = self.train_set_x.value.shape[0] / self.hp.minibatch_size
         self.n_valid_batches = self.valid_set_x.value.shape[0] / self.hp.minibatch_size
-        self.n_test_batches = self.test_set_x.value.shape[0] / self.hp.minibatch_size
+        # remove last batch in case it's incomplete
+        self.n_test_batches = (self.test_set_x.value.shape[0] / self.hp.minibatch_size) - 1
 
     def init_classifier(self):
         print "Constructing classifier"
 
+        # we don't want to save arrays in DD objects, so
+        # we recreate those arrays here
+        nhl = self.hp.num_hidden_layers
+        layers_sizes = [self.hp.hidden_layers_sizes] * nhl
+        corruption_levels = [self.hp.corruption_levels] * nhl
+
         # construct the stacked denoising autoencoder class
         self.classifier = SdA( \
                           train_set_x= self.train_set_x, \
                           train_set_y = self.train_set_y,\
                           batch_size = self.hp.minibatch_size, \
                           n_ins= self.n_ins, \
-                          hidden_layers_sizes = self.hp.hidden_layers_sizes, \
+                          hidden_layers_sizes = layers_sizes, \
                           n_outs = self.n_outs, \
-                          corruption_levels = self.hp.corruption_levels,\
+                          corruption_levels = corruption_levels,\
                           rng = self.rng,\
                           pretrain_lr = self.hp.pretraining_lr, \
                           finetune_lr = self.hp.finetuning_lr,\
diff -r 3632e6258642 -r e656edaedb48 deep/stacked_dae/stacked_dae.py
--- a/deep/stacked_dae/stacked_dae.py	Tue Mar 02 14:47:18 2010 -0500
+++ b/deep/stacked_dae/stacked_dae.py	Wed Mar 03 12:51:40 2010 -0500
@@ -10,6 +10,15 @@
 
 from utils import update_locals
 
+# taken from LeDeepNet/daa.py
+# has a special case when taking log(0) (defined =0)
+# modified to not take the mean anymore
+from theano.tensor.xlogx import xlogx, xlogy0
+# it's target*log(output)
+def binary_cross_entropy(target, output, sum_axis=1):
+    XE = xlogy0(target, output) + xlogy0((1 - target), (1 - output))
+    return -T.sum(XE, axis=sum_axis)
+
 class LogisticRegression(object):
     def __init__(self, input, n_in, n_out):
         # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
@@ -128,7 +137,8 @@
         # Equation (4)
         # note : we sum over the size of a datapoint; if we are using minibatches,
         #        L will be a vector, with one entry per example in minibatch
-        self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 )
+        #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 )
+        self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1)
         # note : L is now a vector, where each element is the cross-entropy cost
         #        of the reconstruction of the corresponding example of the
         #        minibatch. We need to compute the average of all these to get
@@ -156,6 +166,17 @@
         self.all_params = []
         self.n_layers = len(hidden_layers_sizes)
 
+        print "Creating SdA with params:"
+        print "batch_size", batch_size
+        print "hidden_layers_sizes", hidden_layers_sizes
+        print "corruption_levels", corruption_levels
+        print "n_ins", n_ins
+        print "n_outs", n_outs
+        print "pretrain_lr", pretrain_lr
+        print "finetune_lr", finetune_lr
+        print "input_divider", input_divider
+        print "----"
+
        self.shared_divider = theano.shared(numpy.asarray(input_divider, dtype=theano.config.floatX))
 
        if len(hidden_layers_sizes) < 1 :
diff -r 3632e6258642 -r e656edaedb48 deep/stacked_dae/utils.py
--- a/deep/stacked_dae/utils.py	Tue Mar 02 14:47:18 2010 -0500
+++ b/deep/stacked_dae/utils.py	Wed Mar 03 12:51:40 2010 -0500
@@ -6,12 +6,21 @@
 
 from jobman import DD
 
 # from pylearn codebase
+# useful in __init__(param1, param2, etc.) to save
+# values in self.param1, self.param2... just call
+# update_locals(self, locals())
 def update_locals(obj, dct):
     if 'self' in dct:
         del dct['self']
     obj.__dict__.update(dct)
 
-def produit_croise_jobs(val_dict):
+# from a dictionary of possible values for hyperparameters, e.g.
+# hp_values = {'learning_rate':[0.1, 0.01], 'num_layers': [1,2]}
+# create a list of other dictionaries representing all the possible
+# combinations, thus in this example creating:
+# [{'learning_rate': 0.1, 'num_layers': 1}, ...]
+# (similarly for combinations (0.1, 2), (0.01, 1), (0.01, 2))
+def produit_cartesien_jobs(val_dict):
     job_list = [DD()]
     all_keys = val_dict.keys()
@@ -27,9 +36,9 @@
 
     return job_list
 
-def test_produit_croise_jobs():
+def test_produit_cartesien_jobs():
     vals = {'a': [1,2], 'b': [3,4,5]}
-    print produit_croise_jobs(vals)
+    print produit_cartesien_jobs(vals)
 
 # taken from http://stackoverflow.com/questions/276052/how-to-get-current-cpu-and-ram-usage-in-python
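
The comment block added above produit_cartesien_jobs in utils.py describes expanding a dictionary of per-hyperparameter value lists into one job per combination of values. As a rough illustration of that idea only (the body of the real function is mostly elided from this patch, and it builds jobman DD objects rather than plain dicts), a minimal sketch could look like the following; the name cartesian_jobs and the use of itertools.product are assumptions made for this example, not code from the repository.

from itertools import product

def cartesian_jobs(val_dict):
    # expand e.g. {'learning_rate': [0.1, 0.01], 'num_layers': [1, 2]}
    # into one dict per combination of values (4 dicts in that example)
    keys = sorted(val_dict.keys())
    return [dict(zip(keys, combo))
            for combo in product(*[val_dict[k] for k in keys])]

if __name__ == '__main__':
    for job in cartesian_jobs({'learning_rate': [0.1, 0.01], 'num_layers': [1, 2]}):
        print job

Applied to JOB_VALS in nist_sda.py, each resulting dict would correspond to one job inserted into the jobman DB by jobman_insert_nist().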
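
The replacement cost in stacked_dae.py (flagged NOT TESTED YET in the commit message) relies on Theano's xlogy0 so that terms with a zero factor contribute 0 instead of NaN when the reconstruction hits exactly 0 or 1. A small NumPy sketch of that convention, with helper names assumed for illustration and not taken from the patch:

import numpy

def xlogy0(x, y):
    # elementwise x*log(y), defined as 0 wherever x == 0,
    # so 0*log(0) does not poison the sum with NaN
    out = numpy.zeros(y.shape)
    mask = x != 0.
    out[mask] = x[mask] * numpy.log(y[mask])
    return out

def binary_cross_entropy_np(target, output, sum_axis=1):
    # XE = t*log(o) + (1-t)*log(1-o), summed per example (no mean taken)
    XE = xlogy0(target, output) + xlogy0(1. - target, 1. - output)
    return -numpy.sum(XE, axis=sum_axis)

if __name__ == '__main__':
    t = numpy.array([[0., 1., 1.]])
    o = numpy.array([[0., 0.8, 1.]])
    print binary_cross_entropy_np(t, o)   # ~[0.223], finite despite the exact 0s and 1s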