deep/deep_mlp/job.py @ 626:75dbbe409578

Added code for the deep MLP, plus the experiment code to go along with it. Also added the code I used to filter the P07 / PNIST07 datasets to keep only digits.
author fsavard
date Wed, 16 Mar 2011 13:43:32 -0400
#!/usr/bin/env python
# coding: utf-8

'''
Launching

jobman sqlschedules postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/mlp_dumi mlp_jobman.experiment mlp_jobman.conf
'n_hidden={{500,1000,2000}}'
'n_hidden_layers={{2,3}}'
'train_on={{NIST,NISTP,P07}}'
'train_subset={{DIGITS_ONLY,ALL}}'
'learning_rate_log10={{-1.,-2.,-3.}}'

in mlp_jobman.conf:
rng_seed=1234
L1_reg=0.0
L2_reg=0.0
n_epochs=10
minibatch_size=20
'''
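# Each {{...}} above is a hyperparameter sweep: jobman schedules the full
# cross-product of the listed values, i.e. 3 (n_hidden) * 2 (n_hidden_layers)
# * 3 (train_on) * 2 (train_subset) * 3 (learning_rate_log10) = 108 jobs for
# the command shown, each receiving one combination in its `state`.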

import os, sys, copy, operator, time
import theano
import theano.tensor as T
import numpy
from mlp import MLP
from ift6266 import datasets
from pylearn.io.seriestables import *
import tables
from jobman.tools import DD

N_INPUTS = 32*32
REDUCE_EVERY = 250

TEST_RUN = False

TEST_HP = DD({'n_hidden':200,
              'n_hidden_layers': 2,
              'train_on':'NIST',
              'train_subset':'ALL',
              'learning_rate_log10':-2,
              'rng_seed':1234,
              'L1_reg':0.0,
              'L2_reg':0.0,
              'n_epochs':2,
              'minibatch_size':20})
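# TEST_HP stands in for the jobman state during a local run (see run_test()
# below): it mirrors the swept hyperparameters from the docstring plus the
# fixed values from mlp_jobman.conf, scaled down (n_hidden=200, n_epochs=2)
# so a test run finishes quickly.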

###########################################
# digits datasets
# nist_digits is already in NIST_PATH and in ift6266.datasets
# NOTE: for these datasets the test and valid sets are wrong
# (don't correspond to the training set... they're just placeholders)

# FTDataSet is used below but was never imported here; assuming it is
# re-exported by ift6266.datasets.defs (it is defined in ift6266.datasets.ftfile)
from ift6266.datasets.defs import NIST_PATH, DATA_PATH, FTDataSet
TRANSFORMED_DIGITS_PATH = '/data/lisatmp/ift6266h10/data/transformed_digits'

P07_digits = FTDataSet(
        train_data = [os.path.join(TRANSFORMED_DIGITS_PATH,
                          'data/P07_train'+str(i)+'_data.ft')
                      for i in range(0, 100)],
        train_lbl = [os.path.join(TRANSFORMED_DIGITS_PATH,
                          'data/P07_train'+str(i)+'_labels.ft')
                     for i in range(0, 100)],
        test_data = [os.path.join(DATA_PATH, 'data/P07_test_data.ft')],
        test_lbl = [os.path.join(DATA_PATH, 'data/P07_test_labels.ft')],
        valid_data = [os.path.join(DATA_PATH, 'data/P07_valid_data.ft')],
        valid_lbl = [os.path.join(DATA_PATH, 'data/P07_valid_labels.ft')],
        indtype=theano.config.floatX, inscale=255., maxsize=None)

# Added PNIST
PNIST07_digits = FTDataSet(
        train_data = [os.path.join(TRANSFORMED_DIGITS_PATH,
                          'PNIST07_train'+str(i)+'_data.ft')
                      for i in range(0, 100)],
        train_lbl = [os.path.join(TRANSFORMED_DIGITS_PATH,
                          'PNIST07_train'+str(i)+'_labels.ft')
                     for i in range(0, 100)],
        test_data = [os.path.join(DATA_PATH, 'data/PNIST07_test_data.ft')],
        test_lbl = [os.path.join(DATA_PATH, 'data/PNIST07_test_labels.ft')],
        valid_data = [os.path.join(DATA_PATH, 'data/PNIST07_valid_data.ft')],
        valid_lbl = [os.path.join(DATA_PATH, 'data/PNIST07_valid_labels.ft')],
        indtype=theano.config.floatX, inscale=255., maxsize=None)


# building valid_test_datasets
# - we want a dataset_obj for each of the 3 datasets
# - so just build FTDataset(train=whatever, test=..., valid=pNIST, etc.)
# - in the array we want pointers to either the test or the valid function,
#   so NOT dataset_obj, but dataset_obj.test (without the parentheses)
def build_test_valid_sets():
    nist_ds = datasets.nist_all()
    pnist_ds = datasets.PNIST07()
    p07_ds = datasets.nist_P07()

    test_valid_fns = [nist_ds.test, nist_ds.valid,
                      pnist_ds.test, pnist_ds.valid,
                      p07_ds.test, p07_ds.valid]

    test_valid_names = ["nist_all__test", "nist_all__valid",
                        "NISTP__test", "NISTP__valid",
                        "P07__test", "P07__valid"]

    return test_valid_fns, test_valid_names
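# Note that the functions themselves are stored, not their results: each entry
# in test_valid_fns is later called as fn(batch_size) inside
# compute_and_save_errors() to get a fresh minibatch iterator over that set.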

def add_error_series(series, error_name, hdf5_file,
                     index_names=('minibatch_idx',), use_accumulator=False,
                     reduce_every=250):
    # train
    series_base = ErrorSeries(error_name=error_name,
                              table_name=error_name,
                              hdf5_file=hdf5_file,
                              index_names=index_names)

    if use_accumulator:
        series[error_name] = \
            AccumulatorSeriesWrapper(base_series=series_base,
                                     reduce_every=reduce_every)
    else:
        series[error_name] = series_base
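# With use_accumulator=True, the AccumulatorSeriesWrapper (from
# pylearn.io.seriestables) buffers appended values and only writes one
# aggregated point to the underlying HDF5 table every reduce_every appends,
# which keeps series.h5 small over millions of training minibatches.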

TEST_VALID_FNS, TEST_VALID_NAMES = None, None
def compute_and_save_errors(state, mlp, series, hdf5_file, minibatch_idx):
    global TEST_VALID_FNS, TEST_VALID_NAMES

    # build the test/valid sets only once; cache them in the module globals
    if TEST_VALID_FNS is None:
        TEST_VALID_FNS, TEST_VALID_NAMES = build_test_valid_sets()

    # if training is on digits only, the letter classes in the valid/test
    # sets will sit at ~100% error (the model can only predict digits)...
    # just ignore those numbers

    test_fn = theano.function([mlp.input], mlp.logRegressionLayer.y_pred)

    test_batch_size = 100
    for test_ds_fn, test_ds_name in zip(TEST_VALID_FNS, TEST_VALID_NAMES):
        # reset error counts for every test/valid set
        # note: float
        total_errors = total_digit_errors = \
            total_uppercase_errors = total_lowercase_errors = 0.

        total_all = total_lowercase = total_uppercase = total_digit = 0

        for mb_x, mb_y in test_ds_fn(test_batch_size):
            digit_mask = mb_y < 10
            uppercase_mask = mb_y >= 36
            lowercase_mask = numpy.ones((len(mb_x),)) \
                             - digit_mask - uppercase_mask

            total_all += len(mb_x)
            total_digit += sum(digit_mask)
            total_uppercase += sum(uppercase_mask)
            total_lowercase += sum(lowercase_mask)

            predictions = test_fn(mb_x)

            all_errors = (mb_y != predictions)
            total_errors += sum(all_errors)

            if len(all_errors) != len(digit_mask):
                print "size all", all_errors.shape, " digit", digit_mask.shape
            total_digit_errors += sum(numpy.multiply(all_errors, digit_mask))
            total_uppercase_errors += sum(numpy.multiply(all_errors, uppercase_mask))
            total_lowercase_errors += sum(numpy.multiply(all_errors, lowercase_mask))

        four_errors = [float(total_errors) / total_all,
                       float(total_digit_errors) / total_digit,
                       float(total_lowercase_errors) / total_lowercase,
                       float(total_uppercase_errors) / total_uppercase]

        four_errors_names = ["all", "digits", "lower", "upper"]

        # record stats per set
        print "Errors on", test_ds_name, ",".join(four_errors_names), \
              ":", ",".join([str(e) for e in four_errors])

        # now in the state
        for err, errname in zip(four_errors, four_errors_names):
            error_full_name = 'error__'+test_ds_name+'_'+errname
            min_name = 'min_'+error_full_name
            minpos_name = 'minpos_'+error_full_name

            if state.has_key(min_name):
                if state[min_name] > err:
                    state[min_name] = err
                    state[minpos_name] = minibatch_idx
            else:
                # also create the series
                add_error_series(series, error_full_name, hdf5_file,
                                 index_names=('minibatch_idx',))
                state[min_name] = err
                state[minpos_name] = minibatch_idx

            series[error_full_name].append((minibatch_idx,), err)
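# For each test/valid set and granularity this leaves, in the jobman state,
# the keys min_error__<set>_<granularity> and minpos_error__<set>_<granularity>
# (best error seen so far and the minibatch index where it occurred), so the
# best scores can be queried straight from the jobman database.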

def jobman_entrypoint(state, channel):
    global TEST_RUN
    minibatch_size = state.minibatch_size

    print_every = 100000
    COMPUTE_ERROR_EVERY = 10**7 / minibatch_size # compute error every 10 million examples
    if TEST_RUN:
        print_every = 100
        COMPUTE_ERROR_EVERY = 1000 / minibatch_size

    print "entrypoint, state is"
    print state

206 ######################
207 # select dataset and dataset subset, plus adjust epoch num to make number
208 # of examples seen independent of dataset
209 # exemple: pour le cas DIGITS_ONLY, il faut changer le nombre d'époques
210 # et pour le cas NIST pur (pas de transformations), il faut multiplier par 100
211 # en partant car on a pas les variations
212
213 # compute this in terms of the P07 dataset size (=80M)
214 MINIBATCHES_TO_SEE = state.n_epochs * 8 * (10**6) / minibatch_size
215
    if state.train_on == 'NIST' and state.train_subset == 'ALL':
        dataset_obj = datasets.nist_all()
    elif state.train_on == 'NIST' and state.train_subset == 'DIGITS_ONLY':
        dataset_obj = datasets.nist_digits()
    elif state.train_on == 'NISTP' and state.train_subset == 'ALL':
        dataset_obj = datasets.PNIST07()
    elif state.train_on == 'NISTP' and state.train_subset == 'DIGITS_ONLY':
        dataset_obj = PNIST07_digits
    elif state.train_on == 'P07' and state.train_subset == 'ALL':
        dataset_obj = datasets.nist_P07()
    elif state.train_on == 'P07' and state.train_subset == 'DIGITS_ONLY':
        # P07_digits is defined at module level above, not in ift6266.datasets
        dataset_obj = P07_digits

    dataset = dataset_obj

    if state.train_subset == 'ALL':
        n_classes = 62
    elif state.train_subset == 'DIGITS_ONLY':
        n_classes = 10
    else:
        raise NotImplementedError()
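    # 62 classes = 10 digits + 26 uppercase + 26 lowercase letters, matching
    # the per-category masks used in compute_and_save_errors().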

    ###############################
    # construct model

    print "constructing model..."
    x = T.matrix('x')
    y = T.ivector('y')

    rng = numpy.random.RandomState(state.rng_seed)

    # construct the MLP class
    model = MLP(rng=rng, input=x, n_in=N_INPUTS,
                n_hidden_layers=state.n_hidden_layers,
                n_hidden=state.n_hidden, n_out=n_classes)


    # cost and training fn
    cost = T.mean(model.negative_log_likelihood(y)) \
           + state.L1_reg * model.L1 \
           + state.L2_reg * model.L2_sqr

    print "L1, L2: ", state.L1_reg, state.L2_reg

    gradient_nll_wrt_params = []
    for param in model.params:
        gparam = T.grad(cost, param)
        gradient_nll_wrt_params.append(gparam)

    learning_rate = 10**float(state.learning_rate_log10)
    print "Learning rate", learning_rate

    train_updates = {}
    for param, gparam in zip(model.params, gradient_nll_wrt_params):
        train_updates[param] = param - learning_rate * gparam

    train_fn = theano.function([x,y], cost, updates=train_updates)
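    # Each call train_fn(mb_x, mb_y) thus does one plain SGD step: it returns
    # the regularized mean NLL on the minibatch and updates every shared
    # parameter in-place as param := param - learning_rate * gradient.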

    #######################
    # create series
    basedir = os.getcwd()

    h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w")

    series = {}
    add_error_series(series, "training_error", h5f,
                     index_names=('minibatch_idx',), use_accumulator=True,
                     reduce_every=REDUCE_EVERY)
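    # The per-set error series are added lazily by compute_and_save_errors();
    # everything ends up as tables in the same series.h5 file, which can be
    # inspected afterwards with pytables.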

    ##########################
    # training loop

    start_time = time.clock()

    print "begin training..."
    print "will train for", MINIBATCHES_TO_SEE, "minibatches"

    mb_idx = 0

    while mb_idx < MINIBATCHES_TO_SEE:

        last_costs = []

        for mb_x, mb_y in dataset.train(minibatch_size):
            if TEST_RUN and mb_idx > 1000:
                break

            last_cost = train_fn(mb_x, mb_y)
            series["training_error"].append((mb_idx,), last_cost)

            last_costs.append(last_cost)
            if len(last_costs) == print_every:
                print "Mean over last", print_every, "minibatches: ", numpy.mean(last_costs)
                last_costs = []

            if (mb_idx+1) % COMPUTE_ERROR_EVERY == 0:
                # compute errors
                print "computing errors on all datasets..."
                print "Time since training began: ", (time.clock()-start_time)/60., "minutes"
                compute_and_save_errors(state, model, series, h5f, mb_idx)
                channel.save()
                sys.stdout.flush()

            mb_idx += 1

        # propagate the test-run cutoff out of the epoch loop as well
        if TEST_RUN and mb_idx > 1000:
            break

    end_time = time.clock()

    print "-"*80
    print "Finished. Training took", (end_time-start_time)/60., "minutes"
    print state

def run_test():
    global TEST_RUN
    from fsml.job_management import mock_channel
    TEST_RUN = True
    jobman_entrypoint(TEST_HP, mock_channel)

if __name__ == '__main__':
    run_test()
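# When scheduled through jobman (see the docstring at the top), jobman imports
# the configured experiment module and calls the entry point itself with
# (state, channel); running this file directly only launches the quick local
# test with TEST_HP and a mock channel.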