comparison baseline_algorithms/mlp/mlp_nist.py @ 159:e81241cfc2de
merge
author | Myriam Cote <cotemyri@iro.umontreal.ca>
date | Thu, 25 Feb 2010 09:05:48 -0500
parents | 8ceaaf812891
children |
158:d1bb6e06497a | 159:e81241cfc2de |
---|---|
28 import pylab | 28 import pylab |
29 import theano | 29 import theano |
30 import theano.tensor as T | 30 import theano.tensor as T |
31 import time | 31 import time |
32 import theano.tensor.nnet | 32 import theano.tensor.nnet |
33 import pylearn | |
33 from pylearn.io import filetensor as ft | 34 from pylearn.io import filetensor as ft |
34 | 35 |
35 data_path = '/data/lisa/data/nist/by_class/' | 36 data_path = '/data/lisa/data/nist/by_class/' |
36 | 37 |
37 class MLP(object): | 38 class MLP(object): |
43 sigmoid function while the top layer is a softamx layer. | 44 sigmoid function while the top layer is a softamx layer. |
44 """ | 45 """ |
45 | 46 |
46 | 47 |
47 | 48 |
48 def __init__(self, input, n_in, n_hidden, n_out): | 49 def __init__(self, input, n_in, n_hidden, n_out,learning_rate): |
49 """Initialize the parameters for the multilayer perceptron | 50 """Initialize the parameters for the multilayer perceptron |
50 | 51 |
51 :param input: symbolic variable that describes the input of the | 52 :param input: symbolic variable that describes the input of the |
52 architecture (one minibatch) | 53 architecture (one minibatch) |
53 | 54 |
92 dtype= theano.config.floatX)) | 93 dtype= theano.config.floatX)) |
93 self.W2 = theano.shared( value = W2_values ) | 94 self.W2 = theano.shared( value = W2_values ) |
94 self.b2 = theano.shared( value = numpy.zeros((n_out,), | 95 self.b2 = theano.shared( value = numpy.zeros((n_out,), |
95 dtype= theano.config.floatX)) | 96 dtype= theano.config.floatX)) |
96 | 97 |
98 #include the learning rate in the classifer so | |
99 #we can modify it on the fly when we want | |
100 lr_value=learning_rate | |
101 self.lr=theano.shared(value=lr_value) | |
97 # symbolic expression computing the values of the hidden layer | 102 # symbolic expression computing the values of the hidden layer |
98 self.hidden = T.tanh(T.dot(input, self.W1)+ self.b1) | 103 self.hidden = T.tanh(T.dot(input, self.W1)+ self.b1) |
104 | |
105 | |
99 | 106 |
100 # symbolic expression computing the values of the top layer | 107 # symbolic expression computing the values of the top layer |
101 self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden, self.W2)+self.b2) | 108 self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden, self.W2)+self.b2) |
102 | 109 |
103 # compute prediction as class whose probability is maximal in | 110 # compute prediction as class whose probability is maximal in |
104 # symbolic form | 111 # symbolic form |
105 self.y_pred = T.argmax( self.p_y_given_x, axis =1) | 112 self.y_pred = T.argmax( self.p_y_given_x, axis =1) |
113 self.y_pred_num = T.argmax( self.p_y_given_x[0:9], axis =1) | |
114 | |
115 | |
116 | |
106 | 117 |
107 # L1 norm ; one regularization option is to enforce L1 norm to | 118 # L1 norm ; one regularization option is to enforce L1 norm to |
108 # be small | 119 # be small |
109 self.L1 = abs(self.W1).sum() + abs(self.W2).sum() | 120 self.L1 = abs(self.W1).sum() + abs(self.W2).sum() |
110 | 121 |
148 # represents a mistake in prediction | 159 # represents a mistake in prediction |
149 return T.mean(T.neq(self.y_pred, y)) | 160 return T.mean(T.neq(self.y_pred, y)) |
150 else: | 161 else: |
151 raise NotImplementedError() | 162 raise NotImplementedError() |
152 | 163 |
153 #def jobman_mlp(state,channel): | |
154 # (validation_error,test_error,nb_exemples,time)=mlp_full_nist(state.learning_rate,\ | |
155 # state.n_iter,\ | |
156 # state.batch_size,\ | |
157 # state.nb_hidden_units) | |
158 # state.validation_error = validation_error | |
159 # state.test_error = test_error | |
160 # state.nb_exemples = nb_exemples | |
161 # state.time=time | |
162 # return channel.COMPLETE | |
163 | |
164 | |
165 | |
166 | 164 |
167 def mlp_full_nist( verbose = False,\ | 165 def mlp_full_nist( verbose = False,\ |
166 adaptive_lr = 0,\ | |
168 train_data = 'all/all_train_data.ft',\ | 167 train_data = 'all/all_train_data.ft',\ |
169 train_labels = 'all/all_train_labels.ft',\ | 168 train_labels = 'all/all_train_labels.ft',\ |
170 test_data = 'all/all_test_data.ft',\ | 169 test_data = 'all/all_test_data.ft',\ |
171 test_labels = 'all/all_test_labels.ft',\ | 170 test_labels = 'all/all_test_labels.ft',\ |
172 learning_rate=0.01,\ | 171 learning_rate=0.01,\ |
175 nb_max_exemples=1000000,\ | 174 nb_max_exemples=1000000,\ |
176 batch_size=20,\ | 175 batch_size=20,\ |
177 nb_hidden = 500,\ | 176 nb_hidden = 500,\ |
178 nb_targets = 62): | 177 nb_targets = 62): |
179 | 178 |
179 | |
180 configuration = [learning_rate,nb_max_exemples,nb_hidden,adaptive_lr] | |
181 | |
182 total_validation_error_list = [] | |
183 total_train_error_list = [] | |
184 learning_rate_list=[] | |
185 best_training_error=float('inf'); | |
186 | |
180 | 187 |
181 | 188 |
182 f = open(data_path+train_data) | 189 f = open(data_path+train_data) |
183 g= open(data_path+train_labels) | 190 g= open(data_path+train_labels) |
184 h = open(data_path+test_data) | 191 h = open(data_path+test_data) |
233 # allocate symbolic variables for the data | 240 # allocate symbolic variables for the data |
234 x = T.fmatrix() # the data is presented as rasterized images | 241 x = T.fmatrix() # the data is presented as rasterized images |
235 y = T.lvector() # the labels are presented as 1D vector of | 242 y = T.lvector() # the labels are presented as 1D vector of |
236 # [long int] labels | 243 # [long int] labels |
237 | 244 |
245 if verbose==True: | |
246 print 'finished parsing the data' | |
238 # construct the logistic regression class | 247 # construct the logistic regression class |
239 classifier = MLP( input=x.reshape((batch_size,32*32)),\ | 248 classifier = MLP( input=x.reshape((batch_size,32*32)),\ |
240 n_in=32*32,\ | 249 n_in=32*32,\ |
241 n_hidden=nb_hidden,\ | 250 n_hidden=nb_hidden,\ |
242 n_out=nb_targets) | 251 n_out=nb_targets, |
252 learning_rate=learning_rate) | |
253 | |
254 | |
255 | |
243 | 256 |
244 # the cost we minimize during training is the negative log likelihood of | 257 # the cost we minimize during training is the negative log likelihood of |
245 # the model plus the regularization terms (L1 and L2); cost is expressed | 258 # the model plus the regularization terms (L1 and L2); cost is expressed |
246 # here symbolically | 259 # here symbolically |
247 cost = classifier.negative_log_likelihood(y) \ | 260 cost = classifier.negative_log_likelihood(y) \ |
258 g_W2 = T.grad(cost, classifier.W2) | 271 g_W2 = T.grad(cost, classifier.W2) |
259 g_b2 = T.grad(cost, classifier.b2) | 272 g_b2 = T.grad(cost, classifier.b2) |
260 | 273 |
261 # specify how to update the parameters of the model as a dictionary | 274 # specify how to update the parameters of the model as a dictionary |
262 updates = \ | 275 updates = \ |
263 { classifier.W1: classifier.W1 - learning_rate*g_W1 \ | 276 { classifier.W1: classifier.W1 - classifier.lr*g_W1 \ |
264 , classifier.b1: classifier.b1 - learning_rate*g_b1 \ | 277 , classifier.b1: classifier.b1 - classifier.lr*g_b1 \ |
265 , classifier.W2: classifier.W2 - learning_rate*g_W2 \ | 278 , classifier.W2: classifier.W2 - classifier.lr*g_W2 \ |
266 , classifier.b2: classifier.b2 - learning_rate*g_b2 } | 279 , classifier.b2: classifier.b2 - classifier.lr*g_b2 } |
267 | 280 |
268 # compiling a theano function `train_model` that returns the cost, but in | 281 # compiling a theano function `train_model` that returns the cost, but in |
269 # the same time updates the parameter of the model based on the rules | 282 # the same time updates the parameter of the model based on the rules |
270 # defined in `updates` | 283 # defined in `updates` |
271 train_model = theano.function([x, y], cost, updates = updates ) | 284 train_model = theano.function([x, y], cost, updates = updates ) |
272 n_minibatches = len(train_batches) | 285 n_minibatches = len(train_batches) |
273 | 286 |
274 | 287 |
275 | 288 |
289 | |
290 | |
276 | 291 |
277 #conditions for stopping the adaptation: | 292 #conditions for stopping the adaptation: |
278 #1) we have reached nb_max_exemples (this is rounded up to be a multiple of the train size) | 293 #1) we have reached nb_max_exemples (this is rounded up to be a multiple of the train size) |
279 #2) validation error is going up (probable overfitting) | 294 #2) validation error is going up twice in a row(probable overfitting) |
280 | 295 |
281 # This means we no longer stop on slow convergence as low learning rates stopped | 296 # This means we no longer stop on slow convergence as low learning rates stopped |
282 # too fast. | 297 # too fast. |
298 | |
299 # no longer relevant | |
283 patience =nb_max_exemples/batch_size | 300 patience =nb_max_exemples/batch_size |
284 patience_increase = 2 # wait this much longer when a new best is | 301 patience_increase = 2 # wait this much longer when a new best is |
285 # found | 302 # found |
286 improvement_threshold = 0.995 # a relative improvement of this much is | 303 improvement_threshold = 0.995 # a relative improvement of this much is |
287 # considered significant | 304 # considered significant |
294 best_validation_loss = float('inf') | 311 best_validation_loss = float('inf') |
295 best_iter = 0 | 312 best_iter = 0 |
296 test_score = 0. | 313 test_score = 0. |
297 start_time = time.clock() | 314 start_time = time.clock() |
298 n_iter = nb_max_exemples/batch_size # nb of max times we are allowed to run through all exemples | 315 n_iter = nb_max_exemples/batch_size # nb of max times we are allowed to run through all exemples |
299 n_iter = n_iter/n_minibatches + 1 | 316 n_iter = n_iter/n_minibatches + 1 #round up |
300 n_iter=max(1,n_iter) # run at least once on short debug call | 317 n_iter=max(1,n_iter) # run at least once on short debug call |
301 # have a maximum of `n_iter` iterations through the entire dataset | 318 |
302 | 319 |
303 if verbose == True: | 320 if verbose == True: |
304 print 'looping at most %d times through the data set' %n_iter | 321 print 'looping at most %d times through the data set' %n_iter |
305 for iter in xrange(n_iter* n_minibatches): | 322 for iter in xrange(n_iter* n_minibatches): |
306 | 323 |
307 # get epoch and minibatch index | 324 # get epoch and minibatch index |
308 epoch = iter / n_minibatches | 325 epoch = iter / n_minibatches |
309 minibatch_index = iter % n_minibatches | 326 minibatch_index = iter % n_minibatches |
310 | 327 |
328 | |
329 | |
311 # get the minibatches corresponding to `iter` modulo | 330 # get the minibatches corresponding to `iter` modulo |
312 # `len(train_batches)` | 331 # `len(train_batches)` |
313 x,y = train_batches[ minibatch_index ] | 332 x,y = train_batches[ minibatch_index ] |
314 # convert to float | 333 # convert to float |
315 x_float = x/255.0 | 334 x_float = x/255.0 |
316 cost_ij = train_model(x_float,y) | 335 cost_ij = train_model(x_float,y) |
317 | 336 |
318 if (iter+1) % validation_frequency == 0: | 337 if (iter+1) % validation_frequency == 0: |
319 # compute zero-one loss on validation set | 338 # compute zero-one loss on validation set |
320 | 339 |
321 this_validation_loss = 0. | 340 this_validation_loss = 0. |
322 for x,y in validation_batches: | 341 for x,y in validation_batches: |
323 # sum up the errors for each minibatch | 342 # sum up the errors for each minibatch |
324 x_float = x/255.0 | 343 x_float = x/255.0 |
325 this_validation_loss += test_model(x_float,y) | 344 this_validation_loss += test_model(x_float,y) |
326 # get the average by dividing with the number of minibatches | 345 # get the average by dividing with the number of minibatches |
327 this_validation_loss /= len(validation_batches) | 346 this_validation_loss /= len(validation_batches) |
347 #save the validation loss | |
348 total_validation_error_list.append(this_validation_loss) | |
349 | |
350 #get the training error rate | |
351 this_train_loss=0 | |
352 for x,y in train_batches: | |
353 # sum up the errors for each minibatch | |
354 x_float = x/255.0 | |
355 this_train_loss += test_model(x_float,y) | |
356 # get the average by dividing with the number of minibatches | |
357 this_train_loss /= len(train_batches) | |
358 #save the validation loss | |
359 total_train_error_list.append(this_train_loss) | |
360 if(this_train_loss<best_training_error): | |
361 best_training_error=this_train_loss | |
362 | |
328 if verbose == True: | 363 if verbose == True: |
329 print('epoch %i, minibatch %i/%i, validation error %f %%' % \ | 364 print('epoch %i, minibatch %i/%i, validation error %f, training error %f %%' % \ |
330 (epoch, minibatch_index+1, n_minibatches, \ | 365 (epoch, minibatch_index+1, n_minibatches, \ |
331 this_validation_loss*100.)) | 366 this_validation_loss*100.,this_train_loss*100)) |
367 | |
368 | |
369 #save the learning rate | |
370 learning_rate_list.append(classifier.lr.value) | |
332 | 371 |
333 | 372 |
334 # if we got the best validation score until now | 373 # if we got the best validation score until now |
335 if this_validation_loss < best_validation_loss: | 374 if this_validation_loss < best_validation_loss: |
336 | |
337 #improve patience if loss improvement is good enough | |
338 if this_validation_loss < best_validation_loss * \ | |
339 improvement_threshold : | |
340 patience = max(patience, iter * patience_increase) | |
341 elif verbose == True: | |
342 print 'slow convergence stop' | |
343 | |
344 # save best validation score and iteration number | 375 # save best validation score and iteration number |
345 best_validation_loss = this_validation_loss | 376 best_validation_loss = this_validation_loss |
346 best_iter = iter | 377 best_iter = iter |
347 | 378 # reset patience if we are going down again |
379 # so we continue exploring | |
380 patience=nb_max_exemples/batch_size | |
348 # test it on the test set | 381 # test it on the test set |
349 test_score = 0. | 382 test_score = 0. |
350 for x,y in test_batches: | 383 for x,y in test_batches: |
351 x_float=x/255.0 | 384 x_float=x/255.0 |
352 test_score += test_model(x_float,y) | 385 test_score += test_model(x_float,y) |
355 print((' epoch %i, minibatch %i/%i, test error of best ' | 388 print((' epoch %i, minibatch %i/%i, test error of best ' |
356 'model %f %%') % | 389 'model %f %%') % |
357 (epoch, minibatch_index+1, n_minibatches, | 390 (epoch, minibatch_index+1, n_minibatches, |
358 test_score*100.)) | 391 test_score*100.)) |
359 | 392 |
360 #if the validation error is going up, we are overfitting | 393 # if the validation error is going up, we are overfitting (or oscillating) |
361 #stop converging | 394 # stop converging but run at least to next validation |
362 elif this_validation_loss > best_validation_loss: | 395 # to check overfitting or ocsillation |
396 # the saved weights of the model will be a bit off in that case | |
397 elif this_validation_loss >= best_validation_loss: | |
363 #calculate the test error at this point and exit | 398 #calculate the test error at this point and exit |
364 # test it on the test set | 399 # test it on the test set |
365 if verbose==True: | 400 # however, if adaptive_lr is true, try reducing the lr to |
366 print ' We are diverging' | 401 # get us out of an oscilliation |
367 best_iter = iter | 402 if adaptive_lr==1: |
403 classifier.lr.value=classifier.lr.value/2.0 | |
404 | |
368 test_score = 0. | 405 test_score = 0. |
406 #cap the patience so we are allowed one more validation error | |
407 #calculation before aborting | |
408 patience = iter+validation_frequency+1 | |
369 for x,y in test_batches: | 409 for x,y in test_batches: |
370 x_float=x/255.0 | 410 x_float=x/255.0 |
371 test_score += test_model(x_float,y) | 411 test_score += test_model(x_float,y) |
372 test_score /= len(test_batches) | 412 test_score /= len(test_batches) |
373 if verbose == True: | 413 if verbose == True: |
374 print ' validation error is going up, stopping now' | 414 print ' validation error is going up, possibly stopping soon' |
375 print((' epoch %i, minibatch %i/%i, test error of best ' | 415 print((' epoch %i, minibatch %i/%i, test error of best ' |
376 'model %f %%') % | 416 'model %f %%') % |
377 (epoch, minibatch_index+1, n_minibatches, | 417 (epoch, minibatch_index+1, n_minibatches, |
378 test_score*100.)) | 418 test_score*100.)) |
379 | 419 |
380 break | 420 |
381 | 421 |
382 | 422 |
383 | 423 if iter>patience: |
384 if patience <= iter : | 424 print 'we have diverged' |
385 break | 425 break |
386 | 426 |
387 | 427 |
388 end_time = time.clock() | 428 end_time = time.clock() |
389 if verbose == True: | 429 if verbose == True: |
390 print(('Optimization complete. Best validation score of %f %% ' | 430 print(('Optimization complete. Best validation score of %f %% ' |
391 'obtained at iteration %i, with test performance %f %%') % | 431 'obtained at iteration %i, with test performance %f %%') % |
392 (best_validation_loss * 100., best_iter, test_score*100.)) | 432 (best_validation_loss * 100., best_iter, test_score*100.)) |
393 print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) | 433 print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) |
394 print iter | 434 print iter |
395 return (best_validation_loss * 100.,test_score*100.,best_iter*batch_size,(end_time-start_time)/60) | 435 |
436 #save the model and the weights | |
437 numpy.savez('model.npy', config=configuration, W1=classifier.W1.value,W2=classifier.W2.value, b1=classifier.b1.value,b2=classifier.b2.value) | |
438 numpy.savez('results.npy',config=configuration,total_train_error_list=total_train_error_list,total_validation_error_list=total_validation_error_list,\ | |
439 learning_rate_list=learning_rate_list) | |
440 | |
441 return (best_training_error*100.0,best_validation_loss * 100.,test_score*100.,best_iter*batch_size,(end_time-start_time)/60) | |
396 | 442 |
397 | 443 |
398 if __name__ == '__main__': | 444 if __name__ == '__main__': |
399 mlp_full_mnist() | 445 mlp_full_mnist() |
400 | 446 |
401 def jobman_mlp_full_nist(state,channel): | 447 def jobman_mlp_full_nist(state,channel): |
402 (validation_error,test_error,nb_exemples,time)=mlp_full_nist(learning_rate=state.learning_rate,\ | 448 (train_error,validation_error,test_error,nb_exemples,time)=mlp_full_nist(learning_rate=state.learning_rate,\ |
403 nb_max_exemples=state.nb_max_exemples,\ | 449 nb_max_exemples=state.nb_max_exemples,\ |
404 nb_hidden=state.nb_hidden) | 450 nb_hidden=state.nb_hidden,\ |
451 adaptive_lr=state.adaptive_lr) | |
452 state.train_error=train_error | |
405 state.validation_error=validation_error | 453 state.validation_error=validation_error |
406 state.test_error=test_error | 454 state.test_error=test_error |
407 state.nb_exemples=nb_exemples | 455 state.nb_exemples=nb_exemples |
408 state.time=time | 456 state.time=time |
409 return channel.COMPLETE | 457 return channel.COMPLETE |
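
For readers skimming the diff, the unchanged parts of the MLP class compute a tanh hidden layer followed by a softmax output and an argmax prediction. Below is a numpy-only sketch of that forward pass with toy shapes matching the NIST setup (32*32 inputs, 500 hidden units, 62 classes); the helper names are illustrative and this is not code from the repository.

    import numpy

    def softmax(z):
        # subtract the row-wise max for numerical stability
        e = numpy.exp(z - z.max(axis=1, keepdims=True))
        return e / e.sum(axis=1, keepdims=True)

    def mlp_forward(x, W1, b1, W2, b2):
        hidden = numpy.tanh(numpy.dot(x, W1) + b1)          # self.hidden
        p_y_given_x = softmax(numpy.dot(hidden, W2) + b2)   # self.p_y_given_x
        y_pred = numpy.argmax(p_y_given_x, axis=1)          # self.y_pred
        return p_y_given_x, y_pred

    # toy batch: 20 rasterized 32x32 images, random small weights
    rng = numpy.random.RandomState(0)
    x = rng.rand(20, 32 * 32)
    W1 = rng.uniform(-0.1, 0.1, (32 * 32, 500)); b1 = numpy.zeros(500)
    W2 = rng.uniform(-0.1, 0.1, (500, 62));      b2 = numpy.zeros(62)
    probs, preds = mlp_forward(x, W1, b1, W2, b2)
    print(probs.shape, preds[:5])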
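
The substantive change in this revision is the adaptive learning rate: the rate is stored on the classifier as a Theano shared variable (classifier.lr) so the gradient-descent updates reference it instead of the Python constant, and when the validation error stops improving it is halved rather than aborting training outright, with the patience counter capped so at most one further validation happens before the loop gives up. The following is a framework-free sketch of that control flow; train_one_minibatch and validate are hypothetical stand-ins for the compiled Theano functions.

    def train_with_adaptive_lr(train_batches, train_one_minibatch, validate,
                               learning_rate=0.01, adaptive_lr=1,
                               nb_max_exemples=1000000, batch_size=20):
        n_minibatches = len(train_batches)
        validation_frequency = n_minibatches          # validate once per pass over the data
        patience = nb_max_exemples // batch_size      # budget, counted in minibatches
        best_validation_loss = float('inf')
        lr = learning_rate                            # mutable, like classifier.lr

        n_iter = max(1, nb_max_exemples // (batch_size * n_minibatches) + 1)
        for it in range(n_iter * n_minibatches):
            x, y = train_batches[it % n_minibatches]
            train_one_minibatch(x / 255.0, y, lr)     # pixels rescaled to [0, 1]

            if (it + 1) % validation_frequency == 0:
                this_validation_loss = validate()
                if this_validation_loss < best_validation_loss:
                    best_validation_loss = this_validation_loss
                    patience = nb_max_exemples // batch_size   # keep exploring
                else:
                    # validation error went up: overfitting or oscillation
                    if adaptive_lr == 1:
                        lr = lr / 2.0                 # damp the oscillation first
                    # allow exactly one more validation pass before aborting
                    patience = it + validation_frequency + 1

            if it > patience:
                print('we have diverged')
                break
        return best_validation_loss, lr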
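
The revision also persists the final weights and the recorded error and learning-rate curves with numpy.savez instead of only returning summary numbers. A hedged sketch of reading those archives back, assuming numpy's default behaviour of appending '.npz' to the given names (so 'model.npy' ends up on disk as 'model.npy.npz'):

    import numpy

    # load the archives written at the end of mlp_full_nist (names as in the diff)
    results = numpy.load('results.npy.npz')
    learning_rate, nb_max_exemples, nb_hidden, adaptive_lr = results['config']
    validation_curve = results['total_validation_error_list']
    train_curve = results['total_train_error_list']
    lr_curve = results['learning_rate_list']

    model = numpy.load('model.npy.npz')
    W1, b1 = model['W1'], model['b1']
    W2, b2 = model['W2'], model['b2']
    print(W1.shape, len(validation_curve), lr_curve[-1])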