# HG changeset patch
# User fsavard
# Date 1267224352 18000
# Node ID b9ea8e2d071a515a3d4901510e26a75f1154b8ed
# Parent  4d3d3627df3e8064e570d2a950812815288f3d38
Removed the code for reusing pretraining results (too complicated for too little benefit: it's the finetuning that really takes long)

diff -r 4d3d3627df3e -r b9ea8e2d071a deep/stacked_dae/nist_sda.py
--- a/deep/stacked_dae/nist_sda.py	Fri Feb 26 15:25:44 2010 -0500
+++ b/deep/stacked_dae/nist_sda.py	Fri Feb 26 17:45:52 2010 -0500
@@ -1,71 +1,86 @@
 #!/usr/bin/python
 # coding: utf-8
 
+import ift6266
+import pylearn
+
 import numpy
 import theano
 import time
+
+import pylearn.version
 import theano.tensor as T
 from theano.tensor.shared_randomstreams import RandomStreams
+
 import copy
-
 import sys
+import os
 import os.path
 
-from sgd_optimization import SdaSgdOptimizer
-
 from jobman import DD
 import jobman, jobman.sql
 from pylearn.io import filetensor
 
 from utils import produit_croise_jobs
 
-TEST_CONFIG = False
+from sgd_optimization import SdaSgdOptimizer
+
+SERIES_AVAILABLE = False
+try:
+    from scalar_series import *
+    SERIES_AVAILABLE = True
+except ImportError:
+    print "Could not import Series"
+
+TEST_CONFIG = True
 
 NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
 
-JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_db/'
+JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_db/fsavard_sda2'
+
 REDUCE_TRAIN_TO = None
 MAX_FINETUNING_EPOCHS = 1000
+REDUCE_EVERY = 1000 # number of minibatches before taking means for valid error etc.
 if TEST_CONFIG:
-    JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/'
     REDUCE_TRAIN_TO = 1000
     MAX_FINETUNING_EPOCHS = 2
+    REDUCE_EVERY = 10
 
-JOBDB_JOBS = JOBDB + 'fsavard_sda1_jobs'
-JOBDB_RESULTS = JOBDB + 'fsavard_sda1_results'
 EXPERIMENT_PATH = "ift6266.scripts.stacked_dae.nist_sda.jobman_entrypoint"
 
-# There used to be
-# 'finetuning_lr': [0.00001, 0.0001, 0.001, 0.01, 0.1]
-# and
-# 'num_hidden_layers':[1,2,3]
-# but this is now handled by a special mechanism in SgdOptimizer
-# to reuse intermediate results (for the same training of lower layers,
-# we can test many finetuning_lr)
-JOB_VALS = {'pretraining_lr': [0.1, 0.01, 0.001],#, 0.0001],
+JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001],
         'pretraining_epochs_per_layer': [10,20],
         'hidden_layers_sizes': [300,800],
-        'corruption_levels': [0.1,0.2],
+        'corruption_levels': [0.1,0.2,0.3],
         'minibatch_size': [20],
-        'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS]}
-FINETUNING_LR_VALS = [0.1, 0.01, 0.001]#, 0.0001]
-NUM_HIDDEN_LAYERS_VALS = [1,2,3]
+        'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS],
+        'finetuning_lr':[0.1, 0.01], #0.001 was very bad, so we leave it out
+        'num_hidden_layers':[2,3]}
 
 # Just useful for tests... minimal number of epochs
 DEFAULT_HP_NIST = DD({'finetuning_lr':0.01,
                       'pretraining_lr':0.01,
                       'pretraining_epochs_per_layer':1,
                       'max_finetuning_epochs':1,
-                      'hidden_layers_sizes':[1000],
-                      'corruption_levels':[0.2],
-                      'minibatch_size':20})
+                      'hidden_layers_sizes':1000,
+                      'corruption_levels':0.2,
+                      'minibatch_size':20,
+                      'reduce_train_to':1000,
+                      'num_hidden_layers':1})
 
 def jobman_entrypoint(state, channel):
-    state = copy.copy(state)
+    pylearn.version.record_versions(state,[theano,ift6266,pylearn])
+    channel.save()
+
+    workingdir = os.getcwd()
 
     print "Will load NIST"
+    sys.stdout.flush()
+
     nist = NIST(20)
+
     print "NIST loaded"
+    sys.stdout.flush()
 
     rtt = None
     if state.has_key('reduce_train_to'):
@@ -83,50 +98,58 @@
     n_ins = 32*32
     n_outs = 62 # 10 digits, 26*2 (lower, capitals)
 
-    db = jobman.sql.db(JOBDB_RESULTS)
-    optimizer = SdaSgdOptimizer(dataset, state, n_ins, n_outs,\
-                    input_divider=255.0, job_tree=True, results_db=db, \
-                    experiment=EXPERIMENT_PATH, \
-                    finetuning_lr_to_try=FINETUNING_LR_VALS, \
-                    num_hidden_layers_to_try=NUM_HIDDEN_LAYERS_VALS)
-    optimizer.train()
+    hls = state.hidden_layers_sizes
+    cl = state.corruption_levels
+    nhl = state.num_hidden_layers
+    state.hidden_layers_sizes = [hls] * nhl
+    state.corruption_levels = [cl] * nhl
+
+    # b,b',W for each hidden layer + b,W of last layer (logreg)
+    numparams = nhl * 3 + 2
+    series_mux = None
+    if SERIES_AVAILABLE:
+        series_mux = create_series(workingdir, numparams)
+
+    optimizer = SdaSgdOptimizer(dataset=dataset, hyperparameters=state, \
+                    n_ins=n_ins, n_outs=n_outs,\
+                    input_divider=255.0, series_mux=series_mux)
+
+    optimizer.pretrain()
+    channel.save()
+
+    optimizer.finetune()
+    channel.save()
+
+    pylearn.version.record_versions(state,[theano,ift6266,pylearn])
+    channel.save()
 
     return channel.COMPLETE
 
 
-def estimate_pretraining_time(job):
-    job = DD(job)
-    # time spent on pretraining estimated as O(n^2) where n=num hidens
-    # no need to multiply by num_hidden_layers, as results from num=1
-    # is reused for num=2, or 3, so in the end we get the same time
-    # as if we were training 3 times a single layer
-    # constants:
-    # - 20 mins to pretrain a layer with 1000 units (per 1 epoch)
-    # - 12 mins to finetune (per 1 epoch)
-    # basically the job_tree trick gives us a 5 times speedup on the
-    # pretraining time due to reusing for finetuning_lr
-    # and gives us a second x2 speedup for reusing previous layers
-    # to explore num_hidden_layers
-    return (job.pretraining_epochs_per_layer * 20 / (1000.0*1000) \
-            * job.hidden_layer_sizes * job.hidden_layer_sizes)
+def create_series(basedir, numparams):
+    mux = SeriesMultiplexer()
+
+    # comment out series we don't want to save
+    mux.add_series(AccumulatorSeries(name="reconstruction_error",
+                    reduce_every=REDUCE_EVERY, # every 1000 batches, we take the mean and save
+                    mean=True,
+                    directory=basedir, flush_every=1))
 
-def estimate_total_time():
-    jobs = produit_croise_jobs(JOB_VALS)
-    sumtime = 0.0
-    sum_without = 0.0
-    for job in jobs:
-        sumtime += estimate_pretraining_time(job)
-        # 12 mins per epoch * 30 epochs
-        # 5 finetuning_lr per pretraining combination
-        sum_without = (12*20*len(jobs) + sumtime*2) * len(FINETUNING_LR_VALS)
-    sumtime += len(FINETUNING_LR_VALS) * len(jobs) * 12 * 20
-    print "num jobs=", len(jobs)
-    print "estimate", sumtime/60, " hours"
-    print "estimate without tree optimization", sum_without/60, "ratio", sumtime / sum_without
+    mux.add_series(AccumulatorSeries(name="training_error",
+                    reduce_every=REDUCE_EVERY, # every 1000 batches, we take the mean and save
+                    mean=True,
+                    directory=basedir, flush_every=1))
+
+    mux.add_series(BaseSeries(name="validation_error", directory=basedir, flush_every=1))
+    mux.add_series(BaseSeries(name="test_error", directory=basedir, flush_every=1))
+
+    mux.add_series(ParamsArrayStats(numparams,name="params",directory=basedir))
+
+    return mux
 
 def jobman_insert_nist():
     jobs = produit_croise_jobs(JOB_VALS)
-    db = jobman.sql.db(JOBDB_JOBS)
+    db = jobman.sql.db(JOBDB)
     for job in jobs:
         job.update({jobman.sql.EXPERIMENT: EXPERIMENT_PATH})
         jobman.sql.insert_dict(job, db)
@@ -250,13 +273,11 @@
     elif len(args) > 0 and args[0] == 'jobman_insert':
         jobman_insert_nist()
 
-    elif len(args) > 0 and args[0] == 'test_job_tree':
-        # dont forget to comment out sql.inserts and make reduce_train_to=100
-        print "TESTING JOB TREE"
-        chanmock = {'COMPLETE':0}
-        hp = copy.copy(DEFAULT_HP_NIST)
-        hp.update({'reduce_train_to':100})
-        jobman_entrypoint(hp, chanmock)
+
+    elif len(args) > 0 and args[0] == 'test_jobman_entrypoint':
+        chanmock = DD({'COMPLETE':0})
+        jobman_entrypoint(DEFAULT_HP_NIST, chanmock)
+
     elif len(args) > 0 and args[0] == 'estimate':
         estimate_total_time()
     else:
diff -r 4d3d3627df3e -r b9ea8e2d071a deep/stacked_dae/sgd_optimization.py
--- a/deep/stacked_dae/sgd_optimization.py	Fri Feb 26 15:25:44 2010 -0500
+++ b/deep/stacked_dae/sgd_optimization.py	Fri Feb 26 17:45:52 2010 -0500
@@ -7,7 +7,6 @@
 import theano
 import time
 import theano.tensor as T
-import copy
 import sys
 
 from jobman import DD
@@ -24,44 +23,34 @@
     shared_y = theano.shared(data_y)
     return shared_x, shared_y
 
+class DummyMux():
+    def append(self, param1, param2):
+        pass
+
 class SdaSgdOptimizer:
-    def __init__(self, dataset, hyperparameters, n_ins, n_outs, input_divider=1.0,\
-                    job_tree=False, results_db=None,\
-                    experiment="",\
-                    num_hidden_layers_to_try=[1,2,3], \
-                    finetuning_lr_to_try=[0.1, 0.01, 0.001, 0.0001, 0.00001]):
-
+    def __init__(self, dataset, hyperparameters, n_ins, n_outs, input_divider=1.0, series_mux=None):
         self.dataset = dataset
-        self.hp = copy.copy(hyperparameters)
+        self.hp = hyperparameters
         self.n_ins = n_ins
         self.n_outs = n_outs
         self.input_divider = numpy.asarray(input_divider, dtype=theano.config.floatX)
-
-        self.job_tree = job_tree
-        self.results_db = results_db
-        self.experiment = experiment
-        if self.job_tree:
-            assert(not results_db is None)
-            # these hp should not be there, so we insert default values
-            # we use 3 hidden layers as we'll iterate through 1,2,3
-            self.hp.finetuning_lr = 0.1 # dummy value, will be replaced anyway
-            cl = self.hp.corruption_levels
-            nh = self.hp.hidden_layers_sizes
-            self.hp.corruption_levels = [cl,cl,cl]
-            self.hp.hidden_layers_sizes = [nh,nh,nh]
-
-        self.num_hidden_layers_to_try = num_hidden_layers_to_try
-        self.finetuning_lr_to_try = finetuning_lr_to_try
-
-        self.printout_frequency = 1000
+
+        if not series_mux:
+            series_mux = DummyMux()
+            print "No series multiplexer set"
+        self.series_mux = series_mux
 
         self.rng = numpy.random.RandomState(1234)
 
         self.init_datasets()
         self.init_classifier()
+
+        sys.stdout.flush()
 
     def init_datasets(self):
         print "init_datasets"
+        sys.stdout.flush()
+
         train_set, valid_set, test_set = self.dataset
         self.test_set_x, self.test_set_y = shared_dataset(test_set)
         self.valid_set_x, self.valid_set_y = shared_dataset(valid_set)
@@ -74,6 +63,7 @@
 
     def init_classifier(self):
         print "Constructing classifier"
+
        # construct the stacked denoising autoencoder class
        self.classifier = SdA( \
                       train_set_x= self.train_set_x, \
@@ -88,17 +78,15 @@
                       finetune_lr = self.hp.finetuning_lr,\
                       input_divider = self.input_divider )
 
+        sys.stdout.flush()
+
     def train(self):
         self.pretrain()
-        if not self.job_tree:
-            # if job_tree is True, finetuning was already performed
-            self.finetune()
+        self.finetune()
 
     def pretrain(self):
         print "STARTING PRETRAINING"
-
-        printout_acc = 0.0
-        last_error = 0.0
+        sys.stdout.flush()
 
         start_time = time.clock()
         ## Pre-train layer-wise
@@ -109,62 +97,17 @@
             for batch_index in xrange(self.n_train_batches):
                 c = self.classifier.pretrain_functions[i](batch_index)
 
-                printout_acc += c / self.printout_frequency
-                if (batch_index+1) % self.printout_frequency == 0:
-                    print batch_index, "reconstruction cost avg=", printout_acc
-                    last_error = printout_acc
-                    printout_acc = 0.0
+                self.series_mux.append("reconstruction_error", c)
 
             print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c
-
-            self.job_splitter(i+1, time.clock()-start_time, last_error)
+            sys.stdout.flush()
 
         end_time = time.clock()
 
         print ('Pretraining took %f minutes' %((end_time-start_time)/60.))
-
-    # Save time by reusing intermediate results
-    def job_splitter(self, current_pretraining_layer, pretraining_time, last_error):
-
-        state_copy = None
-        original_classifier = None
-
-        if self.job_tree and current_pretraining_layer in self.num_hidden_layers_to_try:
-            for lr in self.finetuning_lr_to_try:
-                sys.stdout.flush()
-                sys.stderr.flush()
-
-                state_copy = copy.copy(self.hp)
-
-                self.hp.update({'num_hidden_layers':current_pretraining_layer, \
-                            'finetuning_lr':lr,\
-                            'pretraining_time':pretraining_time,\
-                            'last_reconstruction_error':last_error})
+        self.hp.update({'pretraining_time': end_time-start_time})
 
-                original_classifier = self.classifier
-                print "ORIGINAL CLASSIFIER MEANS",original_classifier.get_params_means()
-                self.classifier = SdA.copy_reusing_lower_layers(original_classifier, current_pretraining_layer, new_finetuning_lr=lr)
-
-                self.finetune()
-
-                self.insert_finished_job()
-
-                print "NEW CLASSIFIER MEANS AFTERWARDS",self.classifier.get_params_means()
-                print "ORIGINAL CLASSIFIER MEANS AFTERWARDS",original_classifier.get_params_means()
-                self.classifier = original_classifier
-                self.hp = state_copy
-
-    def insert_finished_job(self):
-        job = copy.copy(self.hp)
-        job[jobman.sql.STATUS] = jobman.sql.DONE
-        job[jobman.sql.EXPERIMENT] = self.experiment
-
-        # don,t try to store arrays in db
-        job['hidden_layers_sizes'] = job.hidden_layers_sizes[0]
-        job['corruption_levels'] = job.corruption_levels[0]
-
-        print "Will insert finished job", job
-        jobman.sql.insert_dict(jobman.flatten(job), self.results_db)
+        sys.stdout.flush()
 
     def finetune(self):
         print "STARTING FINETUNING"
@@ -205,11 +148,6 @@
         done_looping = False
         epoch = 0
 
-        printout_acc = 0.0
-
-        if not self.hp.has_key('max_finetuning_epochs'):
-            self.hp.max_finetuning_epochs = 1000
-
         while (epoch < self.hp.max_finetuning_epochs) and (not done_looping):
             epoch = epoch + 1
             for minibatch_index in xrange(self.n_train_batches):
@@ -217,10 +155,7 @@
                 cost_ij = self.classifier.finetune(minibatch_index)
                 iter = epoch * self.n_train_batches + minibatch_index
 
-                printout_acc += cost_ij / float(self.printout_frequency * minibatch_size)
-                if (iter+1) % self.printout_frequency == 0:
-                    print iter, "cost avg=", printout_acc
-                    printout_acc = 0.0
+                self.series_mux.append("training_error", cost_ij)
 
 
                 if (iter+1) % validation_frequency == 0:
@@ -251,6 +186,9 @@
                            (epoch, minibatch_index+1, self.n_train_batches,
                             test_score*100.))
 
+                    sys.stdout.flush()
+
+                self.series_mux.append("params", self.classifier.params)
 
                 if patience <= iter :
                     done_looping = True
@@ -261,6 +199,7 @@
                        'best_validation_error':best_validation_loss,\
                        'test_score':test_score,
                        'num_finetuning_epochs':epoch})
+
         print(('Optimization complete with best validation score of %f %%,'
                'with test performance %f %%') %
                      (best_validation_loss * 100., test_score*100.))
diff -r 4d3d3627df3e -r b9ea8e2d071a deep/stacked_dae/stacked_dae.py
--- a/deep/stacked_dae/stacked_dae.py	Fri Feb 26 15:25:44 2010 -0500
+++ b/deep/stacked_dae/stacked_dae.py	Fri Feb 26 17:45:52 2010 -0500
@@ -144,6 +144,9 @@
     def __init__(self, train_set_x, train_set_y, batch_size, n_ins,
                  hidden_layers_sizes, n_outs,
                  corruption_levels, rng, pretrain_lr, finetune_lr, input_divider=1.0):
+        # Just to make sure those are not modified somewhere else afterwards
+        hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes)
+        corruption_levels = copy.deepcopy(corruption_levels)
         update_locals(self, locals())
 
         self.layers = []
@@ -239,48 +242,6 @@
 
         self.errors = self.logLayer.errors(self.y)
 
-    @classmethod
-    def copy_reusing_lower_layers(cls, obj, num_hidden_layers, new_finetuning_lr=None):
-        assert(num_hidden_layers <= obj.n_layers)
-
-        if not new_finetuning_lr:
-            new_finetuning_lr = obj.finetune_lr
-
-        new_sda = cls(train_set_x= obj.train_set_x, \
-                      train_set_y = obj.train_set_y,\
-                      batch_size = obj.batch_size, \
-                      n_ins= obj.n_ins, \
-                      hidden_layers_sizes = obj.hidden_layers_sizes[:num_hidden_layers], \
-                      n_outs = obj.n_outs, \
-                      corruption_levels = obj.corruption_levels[:num_hidden_layers],\
-                      rng = obj.rng,\
-                      pretrain_lr = obj.pretrain_lr, \
-                      finetune_lr = new_finetuning_lr, \
-                      input_divider = obj.input_divider )
-
-        # new_sda.layers contains only the hidden layers actually
-        for i, layer in enumerate(new_sda.layers):
-            original_layer = obj.layers[i]
-            for p1,p2 in zip(layer.params, original_layer.params):
-                p1.value = p2.value.copy()
-
-        return new_sda
-
-    def get_params_copy(self):
-        return copy.deepcopy(self.params)
-
-    def set_params_from_copy(self, copy):
-        # We don't want to replace the var, as the functions have pointers in there
-        # We only want to replace values.
-        for i, p in enumerate(self.params):
-            p.value = copy[i].value
-
-    def get_params_means(self):
-        s = []
-        for p in self.params:
-            s.append(numpy.mean(p.value))
-        return s
-
-
 if __name__ == '__main__':
     import sys
     args = sys.argv[1:]
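A note on the hyperparameter handling introduced in jobman_entrypoint above: the jobman database stores hidden_layers_sizes and corruption_levels as scalars, so they are replicated into per-layer lists, and the number of parameter arrays tracked by the "params" series follows from the architecture (W, b and b' for each denoising autoencoder layer, plus W and b for the final logistic regression layer). A minimal sketch of that expansion, using illustrative values in place of a real jobman state:

    # Illustrative values only -- a real job would come from the jobman database.
    nhl = 2      # state.num_hidden_layers
    hls = 800    # state.hidden_layers_sizes, stored as a scalar in the DB
    cl = 0.2     # state.corruption_levels, stored as a scalar in the DB

    hidden_layers_sizes = [hls] * nhl   # -> [800, 800], one size per hidden layer
    corruption_levels = [cl] * nhl      # -> [0.2, 0.2]

    # W, b, b' per hidden layer, plus W, b for the logistic regression layer
    numparams = nhl * 3 + 2             # -> 8 parameter arrays for ParamsArrayStats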
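The optimizer only relies on a narrow interface from the scalar_series module: a multiplexer that registers named series (add_series) and receives raw values by name (append), with DummyMux standing in as a no-op when the import fails. The sketch below is a hypothetical rendering of that assumed interface, not the actual SeriesMultiplexer/AccumulatorSeries implementation:

    # Hypothetical sketch of the series interface this patch assumes.
    class SketchSeries(object):
        def __init__(self, name, directory=None, flush_every=1):
            self.name = name
            self.directory = directory
            self.flush_every = flush_every
            self.values = []

        def append(self, value):
            # a real series would periodically reduce/write values to directory
            self.values.append(value)

    class SketchSeriesMultiplexer(object):
        def __init__(self):
            self.series = {}

        def add_series(self, series):
            # series are looked up by their name attribute
            self.series[series.name] = series

        def append(self, name, value):
            # values pushed under an unregistered name are silently dropped,
            # mirroring the DummyMux fallback in sgd_optimization.py
            if name in self.series:
                self.series[name].append(value)

With this shape, the three calls made by SdaSgdOptimizer (append("reconstruction_error", c), append("training_error", cost_ij) and append("params", self.classifier.params)) all go through the same two-argument append.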
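jobman_insert_nist inserts the full cross product of JOB_VALS into the jobman table. produit_croise_jobs itself is defined in the utils module and is not shown in this changeset; a hypothetical equivalent, shown only to make the size of the job grid explicit:

    # Hypothetical stand-in for utils.produit_croise_jobs (not the real helper).
    import itertools

    def cross_product_jobs(job_vals):
        keys = sorted(job_vals.keys())
        return [dict(zip(keys, combo))
                for combo in itertools.product(*[job_vals[k] for k in keys])]

With the JOB_VALS above this yields 2 pretraining_lr x 2 pretraining_epochs_per_layer x 2 hidden_layers_sizes x 3 corruption_levels x 1 minibatch_size x 1 max_finetuning_epochs x 2 finetuning_lr x 2 num_hidden_layers = 96 jobs.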