diff deep/stacked_dae/v2/nist_sda.py @ 239:42005ec87747

Mergé (manuellement) les changements de Sylvain pour utiliser le code de dataset d'Arnaud, à cette différence près que je n'utilse pas les givens. J'ai probablement une approche différente pour limiter la taille du dataset dans mon débuggage, aussi.
author fsavard
date Mon, 15 Mar 2010 18:30:21 -0400
parents 02eb98d051fe
children
line wrap: on
line diff
--- a/deep/stacked_dae/v2/nist_sda.py	Mon Mar 15 13:22:20 2010 -0400
+++ b/deep/stacked_dae/v2/nist_sda.py	Mon Mar 15 18:30:21 2010 -0400
@@ -29,48 +29,8 @@
 from ift6266.utils.seriestables import *
 import tables
 
-##############################################################################
-# GLOBALS
-
-TEST_CONFIG = False
-
-NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
-JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/fsavard_sda_v2'
-EXPERIMENT_PATH = "ift6266.deep.stacked_dae.v2.nist_sda.jobman_entrypoint"
-
-REDUCE_TRAIN_TO = None
-MAX_FINETUNING_EPOCHS = 1000
-# number of minibatches before taking means for valid error etc.
-REDUCE_EVERY = 100
-
-if TEST_CONFIG:
-    REDUCE_TRAIN_TO = 1000
-    MAX_FINETUNING_EPOCHS = 2
-    REDUCE_EVERY = 10
-
-# Possible values the hyperparameters can take. These are then
-# combined with produit_cartesien_jobs so we get a list of all
-# possible combinations, each one resulting in a job inserted
-# in the jobman DB.
-JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001],
-        'pretraining_epochs_per_layer': [10,20],
-        'hidden_layers_sizes': [300,800],
-        'corruption_levels': [0.1,0.2,0.3],
-        'minibatch_size': [20],
-        'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS],
-        'finetuning_lr':[0.1, 0.01], #0.001 was very bad, so we leave it out
-        'num_hidden_layers':[2,3]}
-
-# Just useful for tests... minimal number of epochs
-DEFAULT_HP_NIST = DD({'finetuning_lr':0.1,
-                       'pretraining_lr':0.1,
-                       'pretraining_epochs_per_layer':2,
-                       'max_finetuning_epochs':2,
-                       'hidden_layers_sizes':800,
-                       'corruption_levels':0.2,
-                       'minibatch_size':20,
-                       'reduce_train_to':10000,
-                       'num_hidden_layers':1})
+from ift6266 import datasets
+from config import *
 
 '''
 Function called by jobman upon launching each job
@@ -82,14 +42,6 @@
     # TODO: remove this, bad for number of simultaneous requests on DB
     channel.save()
 
-    workingdir = os.getcwd()
-
-    print "Will load NIST"
-
-    nist = NIST(minibatch_size=20)
-
-    print "NIST loaded"
-
     # For test runs, we don't want to use the whole dataset so
     # reduce it to fewer elements if asked to.
     rtt = None
@@ -97,29 +49,27 @@
         rtt = state['reduce_train_to']
     elif REDUCE_TRAIN_TO:
         rtt = REDUCE_TRAIN_TO
-
-    if rtt:
-        print "Reducing training set to "+str(rtt)+ " examples"
-        nist.reduce_train_set(rtt)
-
-    train,valid,test = nist.get_tvt()
-    dataset = (train,valid,test)
-
+ 
     n_ins = 32*32
     n_outs = 62 # 10 digits, 26*2 (lower, capitals)
+     
+    examples_per_epoch = NIST_ALL_TRAIN_SIZE
 
     series = create_series(state.num_hidden_layers)
 
     print "Creating optimizer with state, ", state
 
-    optimizer = SdaSgdOptimizer(dataset=dataset, hyperparameters=state, \
+    optimizer = SdaSgdOptimizer(dataset=datasets.nist_all, 
+                                    hyperparameters=state, \
                                     n_ins=n_ins, n_outs=n_outs,\
-                                    input_divider=255.0, series=series)
+                                    examples_per_epoch=examples_per_epoch, \
+                                    series=series,
+                                    max_minibatches=rtt)
 
-    optimizer.pretrain()
+    optimizer.pretrain(datasets.nist_all)
     channel.save()
 
-    optimizer.finetune()
+    optimizer.finetune(datasets.nist_all)
     channel.save()
 
     return channel.COMPLETE
@@ -200,93 +150,14 @@
 
     print "inserted"
 
-class NIST:
-    def __init__(self, minibatch_size, basepath=None, reduce_train_to=None):
-        global NIST_ALL_LOCATION
-
-        self.minibatch_size = minibatch_size
-        self.basepath = basepath and basepath or NIST_ALL_LOCATION
-
-        self.set_filenames()
-
-        # arrays of 2 elements: .x, .y
-        self.train = [None, None]
-        self.test = [None, None]
-
-        self.load_train_test()
-
-        self.valid = [[], []]
-        self.split_train_valid()
-        if reduce_train_to:
-            self.reduce_train_set(reduce_train_to)
-
-    def get_tvt(self):
-        return self.train, self.valid, self.test
-
-    def set_filenames(self):
-        self.train_files = ['all_train_data.ft',
-                                'all_train_labels.ft']
-
-        self.test_files = ['all_test_data.ft',
-                            'all_test_labels.ft']
-
-    def load_train_test(self):
-        self.load_data_labels(self.train_files, self.train)
-        self.load_data_labels(self.test_files, self.test)
-
-    def load_data_labels(self, filenames, pair):
-        for i, fn in enumerate(filenames):
-            f = open(os.path.join(self.basepath, fn))
-            pair[i] = filetensor.read(f)
-            f.close()
-
-    def reduce_train_set(self, max):
-        self.train[0] = self.train[0][:max]
-        self.train[1] = self.train[1][:max]
-
-        if max < len(self.test[0]):
-            for ar in (self.test, self.valid):
-                ar[0] = ar[0][:max]
-                ar[1] = ar[1][:max]
-
-    def split_train_valid(self):
-        test_len = len(self.test[0])
-        
-        new_train_x = self.train[0][:-test_len]
-        new_train_y = self.train[1][:-test_len]
-
-        self.valid[0] = self.train[0][-test_len:]
-        self.valid[1] = self.train[1][-test_len:]
-
-        self.train[0] = new_train_x
-        self.train[1] = new_train_y
-
-def test_load_nist():
-    print "Will load NIST"
-
-    import time
-    t1 = time.time()
-    nist = NIST(20)
-    t2 = time.time()
-
-    print "NIST loaded. time delta = ", t2-t1
-
-    tr,v,te = nist.get_tvt()
-
-    print "Lenghts: ", len(tr[0]), len(v[0]), len(te[0])
-
-    raw_input("Press any key")
-
 if __name__ == '__main__':
 
-    import sys
-
     args = sys.argv[1:]
 
-    if len(args) > 0 and args[0] == 'load_nist':
-        test_load_nist()
+    #if len(args) > 0 and args[0] == 'load_nist':
+    #    test_load_nist()
 
-    elif len(args) > 0 and args[0] == 'jobman_insert':
+    if len(args) > 0 and args[0] == 'jobman_insert':
         jobman_insert_nist()
 
     elif len(args) > 0 and args[0] == 'test_jobman_entrypoint':