diff scripts/stacked_dae/nist_sda.py @ 139:7d8366fb90bf

Added __init__.py files throughout the directory tree so the scripts can be referenced as importable paths for jobman, and made quite a few changes in stacked_dae to make it possible to reuse work already done, for tests where the pretraining is the same.
author fsavard
date Mon, 22 Feb 2010 13:38:25 -0500
parents 5c79a2557f2f
children 3346fcd3818b
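A note on the __init__.py files mentioned in the commit message: jobman locates the experiment function from a dotted path such as the EXPERIMENT_PATH constant added below, so every directory along that path must be an importable Python package. A hypothetical sketch of the resolution (jobman's actual loader may differ):

def resolve_experiment(dotted_path):
    # e.g. "ift6266.scripts.stacked_dae.nist_sda.jobman_entrypoint"
    module_name, func_name = dotted_path.rsplit('.', 1)
    # the import fails unless ift6266/, scripts/ and stacked_dae/ each
    # contain an __init__.py, which is what this changeset adds
    module = __import__(module_name, fromlist=[func_name])
    return getattr(module, func_name)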
--- a/scripts/stacked_dae/nist_sda.py	Sun Feb 21 17:30:38 2010 -0600
+++ b/scripts/stacked_dae/nist_sda.py	Mon Feb 22 13:38:25 2010 -0500
@@ -6,47 +6,135 @@
 import time
 import theano.tensor as T
 from theano.tensor.shared_randomstreams import RandomStreams
+import copy
 
+import sys
 import os.path
 
-from sgd_optimization import sgd_optimization
+from sgd_optimization import SdaSgdOptimizer
 
 from jobman import DD
+import jobman, jobman.sql
 from pylearn.io import filetensor
 
 from utils import produit_croise_jobs
 
+TEST_CONFIG = True
+
 NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
 
+JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_db/'
+REDUCE_TRAIN_TO = None
+MAX_FINETUNING_EPOCHS = 1000
+if TEST_CONFIG:
+    JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/'
+    REDUCE_TRAIN_TO = 1000
+    MAX_FINETUNING_EPOCHS = 2
+
+JOBDB_JOBS = JOBDB + 'fsavard_sda1_jobs'
+JOBDB_RESULTS = JOBDB + 'fsavard_sda1_results'
+EXPERIMENT_PATH = "ift6266.scripts.stacked_dae.nist_sda.jobman_entrypoint"
+
+# There used to be
+# 'finetuning_lr': [0.00001, 0.0001, 0.001, 0.01, 0.1]
+# and
+#  'num_hidden_layers':[1,2,3]
+# but this is now handled by a special mechanism in SdaSgdOptimizer
+# to reuse intermediate results (for the same training of lower layers,
+# we can test many finetuning_lr)
+JOB_VALS = {'pretraining_lr': [0.1, 0.01, 0.001],#, 0.0001],
+        'pretraining_epochs_per_layer': [10,20],
+        'hidden_layers_sizes': [300,800],
+        'corruption_levels': [0.1,0.2],
+        'minibatch_size': [20],
+        'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS]}
+FINETUNING_LR_VALS = [0.1, 0.01, 0.001]#, 0.0001]
+NUM_HIDDEN_LAYERS_VALS = [1,2,3]
+
 # Just useful for tests... minimal number of epochs
-DEFAULT_HP_NIST = DD({'finetuning_lr':0.1,
-                       'pretraining_lr':0.1,
+DEFAULT_HP_NIST = DD({'finetuning_lr':0.01,
+                       'pretraining_lr':0.01,
                        'pretraining_epochs_per_layer':1,
                        'max_finetuning_epochs':1,
-                       'hidden_layers_sizes':[1000,1000],
-                       'corruption_levels':[0.2,0.2],
+                       'hidden_layers_sizes':[1000],
+                       'corruption_levels':[0.2],
                        'minibatch_size':20})
 
-def jobman_entrypoint_nist(state, channel):
-    sgd_optimization_nist(state)
+def jobman_entrypoint(state, channel):
+    state = copy.copy(state)
+
+    print "Will load NIST"
+    nist = NIST(20)
+    print "NIST loaded"
+
+    rtt = None
+    if state.has_key('reduce_train_to'):
+        rtt = state['reduce_train_to']
+    elif REDUCE_TRAIN_TO:
+        rtt = REDUCE_TRAIN_TO
+
+    if rtt:
+        print "Reducing training set to ", rtt, " examples"
+        nist.reduce_train_set(rtt)
+
+    train,valid,test = nist.get_tvt()
+    dataset = (train,valid,test)
+
+    n_ins = 32*32
+    n_outs = 62 # 10 digits, 26*2 (lower, capitals)
+
+    db = jobman.sql.db(JOBDB_RESULTS)
+    optimizer = SdaSgdOptimizer(dataset, state, n_ins, n_outs,\
+                    input_divider=255.0, job_tree=True, results_db=db, \
+                    experiment=EXPERIMENT_PATH, \
+                    finetuning_lr_to_try=FINETUNING_LR_VALS, \
+                    num_hidden_layers_to_try=NUM_HIDDEN_LAYERS_VALS)
+    optimizer.train()
+
+    return channel.COMPLETE
+
+def estimate_pretraining_time(job):
+    job = DD(job)
+    # time spent on pretraining estimated as O(n^2) where n = num hidden units
+    # no need to multiply by num_hidden_layers, as results from num=1
+    # are reused for num=2 or 3, so in the end we get the same time
+    # as if we had trained a single layer 3 times
+    # constants:
+    # - 20 mins to pretrain a layer with 1000 units (per 1 epoch)
+    # - 12 mins to finetune (per 1 epoch)
+    # basically the job_tree trick speeds up pretraining by a factor of
+    # len(FINETUNING_LR_VALS), since lower layers are reused across
+    # finetuning_lr, and gives a second x2 speedup by reusing previous
+    # layers when exploring num_hidden_layers
+    return (job.pretraining_epochs_per_layer * 20 / (1000.0*1000) \
+            * job.hidden_layers_sizes * job.hidden_layers_sizes)
+
+def estimate_total_time():
+    jobs = produit_croise_jobs(JOB_VALS)
+    sumtime = 0.0
+    sum_without = 0.0
+    for job in jobs:
+        sumtime += estimate_pretraining_time(job)
+        # 12 mins per epoch * 20 epochs of finetuning
+        # one full run per finetuning_lr for each pretraining combination
+    sum_without = (12*20*len(jobs) + sumtime*2) * len(FINETUNING_LR_VALS)
+    sumtime += len(FINETUNING_LR_VALS) * len(jobs) * 12 * 20
+    print "num jobs=", len(jobs)
+    print "estimate", sumtime/60, " hours"
+    print "estimate without tree optimization", sum_without/60, "ratio", sumtime / sum_without
 
 def jobman_insert_nist():
-    vals = {'finetuning_lr': [0.00001, 0.0001, 0.001, 0.01, 0.1],
-            'pretraining_lr': [0.00001, 0.0001, 0.001, 0.01, 0.1],
-            'pretraining_epochs_per_layer': [2,5,20],
-            'hidden_layer_sizes': [100,300,1000],
-            'num_hidden_layers':[1,2,3],
-            'corruption_levels': [0.1,0.2,0.4],
-            'minibatch_size': [5,20,100]}
+    jobs = produit_croise_jobs(JOB_VALS)
 
-    jobs = produit_croise_jobs(vals)
-
+    db = jobman.sql.db(JOBDB_JOBS)
     for job in jobs:
-        insert_job(job)
+        job.update({jobman.sql.EXPERIMENT: EXPERIMENT_PATH})
+        jobman.sql.insert_dict(job, db)
 
+    print "inserted"
 
 class NIST:
-    def __init__(self, minibatch_size, basepath=None):
+    def __init__(self, minibatch_size, basepath=None, reduce_train_to=None):
         global NIST_ALL_LOCATION
 
         self.minibatch_size = minibatch_size
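produit_croise_jobs is imported from utils and not shown in this patch; presumably it expands the value lists in JOB_VALS into one dict per grid point. A sketch of that assumed behaviour:

import itertools

def produit_croise_jobs(val_dict):
    # cross product of the hyperparameter value lists, one plain dict
    # per combination (assumed behaviour of utils.produit_croise_jobs)
    keys = sorted(val_dict.keys())
    return [dict(zip(keys, combo))
            for combo in itertools.product(*[val_dict[k] for k in keys])]

With the JOB_VALS above this yields 3*2*2*2*1*1 = 24 pretraining combinations, and plugging the largest grid point into estimate_pretraining_time gives 20 * 20/(1000.0*1000) * 800*800 = 256 minutes per job before the finetuning term.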
@@ -61,8 +149,9 @@
         self.load_train_test()
 
         self.valid = [[], []]
-        #self.split_train_valid()
-
+        self.split_train_valid()
+        if reduce_train_to:
+            self.reduce_train_set(reduce_train_to)
 
     def get_tvt(self):
         return self.train, self.valid, self.test
@@ -84,6 +173,15 @@
             pair[i] = filetensor.read(f)
             f.close()
 
+    def reduce_train_set(self, max):
+        self.train[0] = self.train[0][:max]
+        self.train[1] = self.train[1][:max]
+
+        if max < len(self.test[0]):
+            for ar in (self.test, self.valid):
+                ar[0] = ar[0][:max]
+                ar[1] = ar[1][:max]
+
     def split_train_valid(self):
         test_len = len(self.test[0])
         
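Only the first line of split_train_valid's body is visible in this hunk; judging from test_len, it presumably carves a validation set the size of the test set off the end of the training set. A sketch under that assumption (the real method may differ):

def split_train_valid(self):
    test_len = len(self.test[0])
    # assumed: the last test_len training examples become the validation set
    self.valid = [self.train[0][-test_len:], self.train[1][-test_len:]]
    self.train = [self.train[0][:-test_len], self.train[1][:-test_len]]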
@@ -121,7 +219,7 @@
 
     import time
     t1 = time.time()
-    nist = NIST(20)
+    nist = NIST(20, reduce_train_to=100)
     t2 = time.time()
 
     print "NIST loaded. time delta = ", t2-t1
@@ -129,12 +227,17 @@
     train,valid,test = nist.get_tvt()
     dataset = (train,valid,test)
 
-    print "Lenghts train, valid, test: ", len(train[0]), len(valid[0]), len(test[0])
+    print train[0][15]
+    print type(train[0][1])
+
+
+    print "Lengths train, valid, test: ", len(train[0]), len(valid[0]), len(test[0])
 
     n_ins = 32*32
     n_outs = 62 # 10 digits, 26*2 (lower, capitals)
 
-    sgd_optimization(dataset, hp, n_ins, n_outs)
+    optimizer = SdaSgdOptimizer(dataset, hp, n_ins, n_outs, input_divider=255.0)
+    optimizer.train()
 
 if __name__ == '__main__':
 
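input_divider=255.0 presumably tells SdaSgdOptimizer to rescale the raw NIST pixel intensities; a minimal sketch of the assumed preprocessing, with scale_input as a hypothetical name:

import numpy

def scale_input(x, input_divider=255.0):
    # pixels stored as values in [0, 255] are mapped to [0.0, 1.0] floats
    # (assumed use of input_divider inside the optimizer)
    return numpy.asarray(x, dtype='float32') / input_divider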
@@ -145,6 +248,17 @@
     if len(args) > 0 and args[0] == 'load_nist':
         test_load_nist()
 
+    elif len(args) > 0 and args[0] == 'jobman_insert':
+        jobman_insert_nist()
+    elif len(args) > 0 and args[0] == 'test_job_tree':
+        # don't forget to comment out the sql inserts and set reduce_train_to=100
+        print "TESTING JOB TREE"
+        chanmock = DD({'COMPLETE':0}) # DD, not a plain dict: the entrypoint reads channel.COMPLETE as an attribute
+        hp = copy.copy(DEFAULT_HP_NIST)
+        hp.update({'reduce_train_to':100})
+        jobman_entrypoint(hp, chanmock)
+    elif len(args) > 0 and args[0] == 'estimate':
+        estimate_total_time()
     else:
         sgd_optimization_nist()
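Since jobman_entrypoint only ever reads channel.COMPLETE, a local run outside jobman just needs a stand-in exposing that attribute, which is what the DD-based chanmock in the test_job_tree branch provides; an equivalent hypothetical class-based mock:

class MockChannel(object):
    # the only attribute jobman_entrypoint touches
    COMPLETE = 0

# e.g. jobman_entrypoint(copy.copy(DEFAULT_HP_NIST), MockChannel())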