diff deep/stacked_dae/v_sylvain/nist_sda.py @ 266:1e4e60ddadb1

Merge. Oh, and in the last commit I forgot to mention that I added code to handle isolating the different clones, so that experiments can run while the code is being modified at the same time.
author fsavard
date Fri, 19 Mar 2010 10:56:16 -0400
parents f14fb56b3f8d
children 698313f8f6e6
--- a/deep/stacked_dae/v_sylvain/nist_sda.py	Fri Mar 19 10:54:39 2010 -0400
+++ b/deep/stacked_dae/v_sylvain/nist_sda.py	Fri Mar 19 10:56:16 2010 -0400
@@ -21,9 +21,8 @@
 import jobman, jobman.sql
 from pylearn.io import filetensor
 
-from ift6266 import datasets
-
 from utils import produit_cartesien_jobs
+from copy import copy
 
 from sgd_optimization import SdaSgdOptimizer
 
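The produit_cartesien_jobs import kept above is what turns JOB_VALS (removed in the next hunk) into one jobman job per hyperparameter combination. A minimal sketch of that behavior, assuming it is a plain Cartesian product over a dict of value lists (the _sketch name is illustrative, not the real utils API):

    # Illustrative stand-in for utils.produit_cartesien_jobs: expand a dict
    # of hyperparameter lists into one flat dict per combination.
    from itertools import product

    def produit_cartesien_jobs_sketch(val_dict):
        keys = sorted(val_dict.keys())
        return [dict(zip(keys, combo))
                for combo in product(*[val_dict[k] for k in keys])]

    # {'pretraining_lr': [0.1, 0.01], 'minibatch_size': [20]} would yield
    # [{'pretraining_lr': 0.1, 'minibatch_size': 20},
    #  {'pretraining_lr': 0.01, 'minibatch_size': 20}]
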
@@ -31,49 +30,8 @@
 from ift6266.utils.seriestables import *
 import tables
 
-##############################################################################
-# GLOBALS
-
-TEST_CONFIG = False
-
-#NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
-JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/sylvainpl_sda_vsylvain'
-EXPERIMENT_PATH = "ift6266.deep.stacked_dae.v_sylvain.nist_sda.jobman_entrypoint"
-
-REDUCE_TRAIN_TO = None
-MAX_FINETUNING_EPOCHS = 1000
-# number of minibatches before taking means for valid error etc.
-REDUCE_EVERY = 100
-
-if TEST_CONFIG:
-    REDUCE_TRAIN_TO = 1000
-    MAX_FINETUNING_EPOCHS = 2
-    REDUCE_EVERY = 10
-    MINIBATCH_SIZE=20
-
-# Possible values the hyperparameters can take. These are then
-# combined with produit_cartesien_jobs so we get a list of all
-# possible combinations, each one resulting in a job inserted
-# in the jobman DB.
-JOB_VALS = {'pretraining_lr': [0.1],#, 0.01],#, 0.001],#, 0.0001],
-        'pretraining_epochs_per_layer': [10],
-        'hidden_layers_sizes': [500],
-        'corruption_levels': [0.1],
-        'minibatch_size': [20],
-        'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS],
-        'finetuning_lr':[0.1], #0.001 was very bad, so we leave it out
-        'num_hidden_layers':[1,1]}
-
-# Just useful for tests... minimal number of epochs
-DEFAULT_HP_NIST = DD({'finetuning_lr':0.1,
-                       'pretraining_lr':0.1,
-                       'pretraining_epochs_per_layer':2,
-                       'max_finetuning_epochs':2,
-                       'hidden_layers_sizes':500,
-                       'corruption_levels':0.2,
-                       'minibatch_size':20,
-                       'reduce_train_to':10000,
-                       'num_hidden_layers':1})
+from ift6266 import datasets
+from config import *
 
 '''
 Function called by jobman upon launching each job
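The globals deleted in the hunk above evidently move into the config module now star-imported at the top of the file. A rough sketch of what config.py presumably exports, reconstructed from the removed block plus the two names the new code references that the removed block never defined (NIST_ALL_TRAIN_SIZE and FINETUNE_SET, whose values below are placeholders, not taken from this changeset):

    # Hypothetical config.py, reconstructed from the globals removed above.
    TEST_CONFIG = False

    JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/sylvainpl_sda_vsylvain'
    EXPERIMENT_PATH = "ift6266.deep.stacked_dae.v_sylvain.nist_sda.jobman_entrypoint"

    REDUCE_TRAIN_TO = None
    MAX_FINETUNING_EPOCHS = 1000
    REDUCE_EVERY = 100  # minibatches between means for valid error etc.

    # Referenced by the new code but absent from the removed block; the
    # values here are placeholders, not recovered from this diff.
    NIST_ALL_TRAIN_SIZE = None  # size of the full NIST training set
    FINETUNE_SET = -1           # default finetuning scenario (see below)

    # JOB_VALS and DEFAULT_HP_NIST presumably move here unchanged as well.
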
@@ -85,48 +43,82 @@
     # TODO: remove this, bad for number of simultaneous requests on DB
     channel.save()
 
-    workingdir = os.getcwd()
-
-      ###########   Things should be arranged here to train on a smaller set
-
-##    print "Will load NIST"
-##
-##    nist = NIST(minibatch_size=20)
-##
-##    print "NIST loaded"
-##
     # For test runs, we don't want to use the whole dataset, so
     # reduce it to fewer elements if asked to.
     rtt = None
     if state.has_key('reduce_train_to'):
-        rtt = int(state['reduce_train_to']/state['minibatch_size'])
+        rtt = state['reduce_train_to']
     elif REDUCE_TRAIN_TO:
-        rtt = int(REDUCE_TRAIN_TO/MINIBATCH_SIZE)
-
-    if rtt:
-        print "Reducing training set to "+str(rtt*state['minibatch_size'])+ " examples"
-    else:
-        rtt=float('inf')    #No reduction
-##        nist.reduce_train_set(rtt)
-##
-##    train,valid,test = nist.get_tvt()
-##    dataset = (train,valid,test)
-
+        rtt = REDUCE_TRAIN_TO
+ 
     n_ins = 32*32
     n_outs = 62 # 10 digits, 26*2 (lower, capitals)
-    
+     
+    examples_per_epoch = NIST_ALL_TRAIN_SIZE
+
     series = create_series(state.num_hidden_layers)
 
     print "Creating optimizer with state, ", state
 
-    optimizer = SdaSgdOptimizer(dataset=datasets.nist_all, hyperparameters=state, \
+    optimizer = SdaSgdOptimizer(dataset=datasets.nist_all(), 
+                                    hyperparameters=state, \
                                     n_ins=n_ins, n_outs=n_outs,\
-                                    series=series)
+                                    examples_per_epoch=examples_per_epoch, \
+                                    series=series,
+                                    max_minibatches=rtt)
 
-    optimizer.pretrain(datasets.nist_all,rtt)
+    parameters=[]
+    optimizer.pretrain(datasets.nist_P07())
     channel.save()
+    
+    #Set some of the parameters used for the finetuning
+    if state.has_key('finetune_set'):
+        finetune_choice=state['finetune_set']
+    else:
+        finetune_choice=FINETUNE_SET
+    
+    if state.has_key('max_finetuning_epochs'):
+        max_finetune_epoch_NIST=state['max_finetuning_epochs']
+    else:
+        max_finetune_epoch_NIST=MAX_FINETUNING_EPOCHS
+    
+    if state.has_key('max_finetuning_epochs_P07'):
+        max_finetune_epoch_P07=state['max_finetuning_epochs_P07']
+    else:
+        max_finetune_epoch_P07=max_finetune_epoch_NIST
+    
+    #Decide how the finetuning is done
+    
+    if finetune_choice==0:
+        print('\n\n\tfinetune with nist\n\n')
+        optimizer.reload_parameters()
+        optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1)
+    if finetune_choice==1:
+        print('\n\n\tfinetune with P07\n\n')
+        optimizer.reload_parameters()
+        optimizer.finetune(datasets.nist_P07(),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0)
+    if finetune_choice==2:
+        print('\n\n\tfinetune with nist followed by P07\n\n')
+        optimizer.reload_parameters()
+        optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1)
+        optimizer.finetune(datasets.nist_P07(),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0)
 
-    optimizer.finetune(datasets.nist_all,rtt)
+    if finetune_choice==-1:
+        print('\nSeries of 3 fine-tuning runs')
+        print('\n\n\tfinetune with nist\n\n')
+        optimizer.reload_parameters()
+        optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1)
+        channel.save()
+        print('\n\n\tfinetune with P07\n\n')
+        optimizer.reload_parameters()
+        optimizer.finetune(datasets.nist_P07(),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0)
+        channel.save()
+        print('\n\n\tfinetune with nist followed by P07\n\n')
+        optimizer.reload_parameters()
+        optimizer.finetune(datasets.nist_all(),datasets.nist_P07(),max_finetune_epoch_NIST,ind_test=1)
+        optimizer.finetune(datasets.nist_P07(),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0)
+        channel.save()
+    
     channel.save()
 
     return channel.COMPLETE
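The finetune_choice branches above implement a small protocol: 0 finetunes on NIST (reporting P07 as the extra test set, ind_test=1), 1 finetunes on P07 (reporting NIST, ind_test=0), 2 chains NIST then P07 from a single parameter reload, and -1 runs all three scenarios in sequence, reloading the pretrained parameters before each run and saving the channel afterwards. A data-driven restatement of the same dispatch, sketched under the assumption that reload_parameters() restores the weights saved by pretrain() (channel.save() calls omitted):

    # Illustrative restatement of the finetune_choice dispatch; each run is a
    # list of (train_factory, other_factory, max_epochs, ind_test) steps, and
    # parameters are reloaded once per run, as in the branches above.
    def run_finetune_plan(optimizer, choice, epochs_nist, epochs_p07):
        nist = (datasets.nist_all, datasets.nist_P07, epochs_nist, 1)
        p07  = (datasets.nist_P07, datasets.nist_all, epochs_p07, 0)
        plans = {0: [[nist]],
                 1: [[p07]],
                 2: [[nist, p07]],
                 -1: [[nist], [p07], [nist, p07]]}
        for run in plans[choice]:
            optimizer.reload_parameters()   # restart from the pretrained weights
            for train, other, epochs, ind in run:
                optimizer.finetune(train(), other(), epochs, ind_test=ind)
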
@@ -207,98 +199,19 @@
 
     print "inserted"
 
-class NIST:
-    def __init__(self, minibatch_size, basepath=None, reduce_train_to=None):
-        global NIST_ALL_LOCATION
-
-        self.minibatch_size = minibatch_size
-        self.basepath = basepath and basepath or NIST_ALL_LOCATION
-
-        self.set_filenames()
-
-        # arrays of 2 elements: .x, .y
-        self.train = [None, None]
-        self.test = [None, None]
-
-        self.load_train_test()
-
-        self.valid = [[], []]
-        self.split_train_valid()
-        if reduce_train_to:
-            self.reduce_train_set(reduce_train_to)
-
-    def get_tvt(self):
-        return self.train, self.valid, self.test
-
-    def set_filenames(self):
-        self.train_files = ['all_train_data.ft',
-                                'all_train_labels.ft']
-
-        self.test_files = ['all_test_data.ft',
-                            'all_test_labels.ft']
-
-    def load_train_test(self):
-        self.load_data_labels(self.train_files, self.train)
-        self.load_data_labels(self.test_files, self.test)
-
-    def load_data_labels(self, filenames, pair):
-        for i, fn in enumerate(filenames):
-            f = open(os.path.join(self.basepath, fn))
-            pair[i] = filetensor.read(f)
-            f.close()
-
-    def reduce_train_set(self, max):
-        self.train[0] = self.train[0][:max]
-        self.train[1] = self.train[1][:max]
-
-        if max < len(self.test[0]):
-            for ar in (self.test, self.valid):
-                ar[0] = ar[0][:max]
-                ar[1] = ar[1][:max]
-
-    def split_train_valid(self):
-        test_len = len(self.test[0])
-        
-        new_train_x = self.train[0][:-test_len]
-        new_train_y = self.train[1][:-test_len]
-
-        self.valid[0] = self.train[0][-test_len:]
-        self.valid[1] = self.train[1][-test_len:]
-
-        self.train[0] = new_train_x
-        self.train[1] = new_train_y
-
-def test_load_nist():
-    print "Will load NIST"
-
-    import time
-    t1 = time.time()
-    nist = NIST(20)
-    t2 = time.time()
-
-    print "NIST loaded. time delta = ", t2-t1
-
-    tr,v,te = nist.get_tvt()
-
-    print "Lenghts: ", len(tr[0]), len(v[0]), len(te[0])
-
-    raw_input("Press any key")
-
 if __name__ == '__main__':
 
-    import sys
-
     args = sys.argv[1:]
 
-    if len(args) > 0 and args[0] == 'load_nist':
-        test_load_nist()
+    #if len(args) > 0 and args[0] == 'load_nist':
+    #    test_load_nist()
 
-    elif len(args) > 0 and args[0] == 'jobman_insert':
+    if len(args) > 0 and args[0] == 'jobman_insert':
         jobman_insert_nist()
 
     elif len(args) > 0 and args[0] == 'test_jobman_entrypoint':
         chanmock = DD({'COMPLETE':0,'save':(lambda:None)})
-        jobman_entrypoint(DEFAULT_HP_NIST, chanmock)
+        jobman_entrypoint(DD(DEFAULT_HP_NIST), chanmock)
 
     else:
         print "Bad arguments"