comparison deep/stacked_dae/v2/nist_sda.py @ 239:42005ec87747

Merged (manually) Sylvain's changes to use Arnaud's dataset code, except that I don't use the givens. I probably also take a different approach to limiting the dataset size in my debugging.
author fsavard
date Mon, 15 Mar 2010 18:30:21 -0400
parents 02eb98d051fe
children
comparing 238:9fc641d7adda (old) with 239:42005ec87747 (new)
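The "givens" mentioned in the commit message refers to how minibatches reach a compiled Theano function. A minimal sketch of the contrast, using made-up toy data and a trivial cost (illustrative only, not code from this changeset):

import numpy, theano
import theano.tensor as T

# Toy stand-ins: random data and a trivial "cost", just to contrast the styles.
data = numpy.random.randn(1000, 32*32).astype(theano.config.floatX)
x = T.matrix('x')
cost = T.mean(x)
batch_size = 20

# With givens: the whole dataset lives in a shared variable (possibly on the
# GPU); the function takes a minibatch index, and givens substitutes the
# corresponding slice for x in the graph.
shared_data = theano.shared(data)
index = T.lscalar('index')
f_givens = theano.function([index], cost,
        givens={x: shared_data[index*batch_size:(index+1)*batch_size]})

# Without givens (the approach kept here): the minibatch itself is the
# function's input, so any iterator over numpy arrays -- such as the
# datasets module used below -- can feed it.
f_direct = theano.function([x], cost)

print f_givens(0)                    # minibatch chosen by index
print f_direct(data[0:batch_size])   # minibatch passed explicitly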
@@ -27,101 +27,51 @@
 
 #from ift6266.utils.scalar_series import *
 from ift6266.utils.seriestables import *
 import tables
 
-##############################################################################
-# GLOBALS
-
-TEST_CONFIG = False
-
-NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all'
-JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/fsavard_sda_v2'
-EXPERIMENT_PATH = "ift6266.deep.stacked_dae.v2.nist_sda.jobman_entrypoint"
-
-REDUCE_TRAIN_TO = None
-MAX_FINETUNING_EPOCHS = 1000
-# number of minibatches before taking means for valid error etc.
-REDUCE_EVERY = 100
-
-if TEST_CONFIG:
-    REDUCE_TRAIN_TO = 1000
-    MAX_FINETUNING_EPOCHS = 2
-    REDUCE_EVERY = 10
-
-# Possible values the hyperparameters can take. These are then
-# combined with produit_cartesien_jobs so we get a list of all
-# possible combinations, each one resulting in a job inserted
-# in the jobman DB.
-JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001],
-        'pretraining_epochs_per_layer': [10,20],
-        'hidden_layers_sizes': [300,800],
-        'corruption_levels': [0.1,0.2,0.3],
-        'minibatch_size': [20],
-        'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS],
-        'finetuning_lr':[0.1, 0.01], #0.001 was very bad, so we leave it out
-        'num_hidden_layers':[2,3]}
-
-# Just useful for tests... minimal number of epochs
-DEFAULT_HP_NIST = DD({'finetuning_lr':0.1,
-        'pretraining_lr':0.1,
-        'pretraining_epochs_per_layer':2,
-        'max_finetuning_epochs':2,
-        'hidden_layers_sizes':800,
-        'corruption_levels':0.2,
-        'minibatch_size':20,
-        'reduce_train_to':10000,
-        'num_hidden_layers':1})
+from ift6266 import datasets
+from config import *
 
 '''
 Function called by jobman upon launching each job
 Its path is the one given when inserting jobs: see EXPERIMENT_PATH
 '''
 def jobman_entrypoint(state, channel):
     # record mercurial versions of each package
     pylearn.version.record_versions(state,[theano,ift6266,pylearn])
     # TODO: remove this, bad for number of simultaneous requests on DB
     channel.save()
 
-    workingdir = os.getcwd()
-
-    print "Will load NIST"
-
-    nist = NIST(minibatch_size=20)
-
-    print "NIST loaded"
-
     # For test runs, we don't want to use the whole dataset so
     # reduce it to fewer elements if asked to.
     rtt = None
     if state.has_key('reduce_train_to'):
         rtt = state['reduce_train_to']
     elif REDUCE_TRAIN_TO:
         rtt = REDUCE_TRAIN_TO
 
-    if rtt:
-        print "Reducing training set to "+str(rtt)+ " examples"
-        nist.reduce_train_set(rtt)
-
-    train,valid,test = nist.get_tvt()
-    dataset = (train,valid,test)
-
     n_ins = 32*32
     n_outs = 62 # 10 digits, 26*2 (lower, capitals)
+
+    examples_per_epoch = NIST_ALL_TRAIN_SIZE
 
     series = create_series(state.num_hidden_layers)
 
     print "Creating optimizer with state, ", state
 
-    optimizer = SdaSgdOptimizer(dataset=dataset, hyperparameters=state, \
-                    n_ins=n_ins, n_outs=n_outs,\
-                    input_divider=255.0, series=series)
+    optimizer = SdaSgdOptimizer(dataset=datasets.nist_all,
+                    hyperparameters=state, \
+                    n_ins=n_ins, n_outs=n_outs,\
+                    examples_per_epoch=examples_per_epoch, \
+                    series=series,
+                    max_minibatches=rtt)
 
-    optimizer.pretrain()
+    optimizer.pretrain(datasets.nist_all)
     channel.save()
 
-    optimizer.finetune()
+    optimizer.finetune(datasets.nist_all)
     channel.save()
 
     return channel.COMPLETE
 
 # These Series objects are used to save various statistics
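Note on the hunk above: the deleted code shrank the training set by physically slicing the arrays (NIST.reduce_train_set, visible in the next hunk), while the merged code instead passes max_minibatches=rtt to the optimizer. How SdaSgdOptimizer uses it is not shown in this diff; presumably it just caps iteration over the dataset, in the spirit of this sketch (the limited helper is hypothetical, not this repo's API):

import itertools

def limited(minibatch_stream, max_minibatches=None):
    # Cap how many minibatches are drawn from an arbitrary iterator, leaving
    # the underlying arrays untouched (contrast with reduce_train_set, which
    # sliced the data itself).
    if max_minibatches is None:
        return minibatch_stream
    return itertools.islice(minibatch_stream, max_minibatches)

# e.g.: for x, y in limited(train_stream, max_minibatches=rtt): ...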
@@ -198,97 +148,18 @@
     job.update({jobman.sql.EXPERIMENT: EXPERIMENT_PATH})
     jobman.sql.insert_dict(job, db)
 
     print "inserted"
 
-class NIST:
-    def __init__(self, minibatch_size, basepath=None, reduce_train_to=None):
-        global NIST_ALL_LOCATION
-
-        self.minibatch_size = minibatch_size
-        self.basepath = basepath and basepath or NIST_ALL_LOCATION
-
-        self.set_filenames()
-
-        # arrays of 2 elements: .x, .y
-        self.train = [None, None]
-        self.test = [None, None]
-
-        self.load_train_test()
-
-        self.valid = [[], []]
-        self.split_train_valid()
-        if reduce_train_to:
-            self.reduce_train_set(reduce_train_to)
-
-    def get_tvt(self):
-        return self.train, self.valid, self.test
-
-    def set_filenames(self):
-        self.train_files = ['all_train_data.ft',
-                            'all_train_labels.ft']
-
-        self.test_files = ['all_test_data.ft',
-                           'all_test_labels.ft']
-
-    def load_train_test(self):
-        self.load_data_labels(self.train_files, self.train)
-        self.load_data_labels(self.test_files, self.test)
-
-    def load_data_labels(self, filenames, pair):
-        for i, fn in enumerate(filenames):
-            f = open(os.path.join(self.basepath, fn))
-            pair[i] = filetensor.read(f)
-            f.close()
-
-    def reduce_train_set(self, max):
-        self.train[0] = self.train[0][:max]
-        self.train[1] = self.train[1][:max]
-
-        if max < len(self.test[0]):
-            for ar in (self.test, self.valid):
-                ar[0] = ar[0][:max]
-                ar[1] = ar[1][:max]
-
-    def split_train_valid(self):
-        test_len = len(self.test[0])
-
-        new_train_x = self.train[0][:-test_len]
-        new_train_y = self.train[1][:-test_len]
-
-        self.valid[0] = self.train[0][-test_len:]
-        self.valid[1] = self.train[1][-test_len:]
-
-        self.train[0] = new_train_x
-        self.train[1] = new_train_y
-
-def test_load_nist():
-    print "Will load NIST"
-
-    import time
-    t1 = time.time()
-    nist = NIST(20)
-    t2 = time.time()
-
-    print "NIST loaded. time delta = ", t2-t1
-
-    tr,v,te = nist.get_tvt()
-
-    print "Lengths: ", len(tr[0]), len(v[0]), len(te[0])
-
-    raw_input("Press any key")
-
 if __name__ == '__main__':
-
-    import sys
 
     args = sys.argv[1:]
 
-    if len(args) > 0 and args[0] == 'load_nist':
-        test_load_nist()
+    #if len(args) > 0 and args[0] == 'load_nist':
+    #    test_load_nist()
 
-    elif len(args) > 0 and args[0] == 'jobman_insert':
+    if len(args) > 0 and args[0] == 'jobman_insert':
         jobman_insert_nist()
 
     elif len(args) > 0 and args[0] == 'test_jobman_entrypoint':
         chanmock = DD({'COMPLETE':0,'save':(lambda:None)})
         jobman_entrypoint(DEFAULT_HP_NIST, chanmock)
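For reference, the JOB_VALS grid deleted in the first hunk (now presumably living in config.py, pulled in by "from config import *") is expanded by produit_cartesien_jobs into one jobman job per hyperparameter combination. A rough sketch of that expansion (assuming jobs are plain dicts; produit_cartesien_jobs itself is not shown in this diff):

import itertools

def cartesian_jobs(job_vals):
    # One dict per element of the cartesian product of the per-hyperparameter
    # value lists.
    keys = sorted(job_vals)
    return [dict(zip(keys, combo))
            for combo in itertools.product(*[job_vals[k] for k in keys])]

# With the JOB_VALS above: 2*2*2*3*1*1*2*2 = 96 combinations, i.e. 96 jobs.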