comparison baseline_algorithms/mlp/mlp_nist.py @ 143:f341a4efb44a

added adaptive lr, weight file save, training error and error curves
author XavierMuller
date Tue, 23 Feb 2010 18:08:11 -0500
parents 93b4b84d86cf
children 8ceaaf812891
110:93b4b84d86cf 143:f341a4efb44a
28 import pylab 28 import pylab
29 import theano 29 import theano
30 import theano.tensor as T 30 import theano.tensor as T
31 import time 31 import time
32 import theano.tensor.nnet 32 import theano.tensor.nnet
33 import pylearn
33 from pylearn.io import filetensor as ft 34 from pylearn.io import filetensor as ft
34 35
35 data_path = '/data/lisa/data/nist/by_class/' 36 data_path = '/data/lisa/data/nist/by_class/'
36 37
37 class MLP(object): 38 class MLP(object):
43 sigmoid function while the top layer is a softmax layer. 44 sigmoid function while the top layer is a softmax layer.
44 """ 45 """
45 46
46 47
47 48
48 def __init__(self, input, n_in, n_hidden, n_out): 49 def __init__(self, input, n_in, n_hidden, n_out,learning_rate):
49 """Initialize the parameters for the multilayer perceptron 50 """Initialize the parameters for the multilayer perceptron
50 51
51 :param input: symbolic variable that describes the input of the 52 :param input: symbolic variable that describes the input of the
52 architecture (one minibatch) 53 architecture (one minibatch)
53 54
92 dtype= theano.config.floatX)) 93 dtype= theano.config.floatX))
93 self.W2 = theano.shared( value = W2_values ) 94 self.W2 = theano.shared( value = W2_values )
94 self.b2 = theano.shared( value = numpy.zeros((n_out,), 95 self.b2 = theano.shared( value = numpy.zeros((n_out,),
95 dtype= theano.config.floatX)) 96 dtype= theano.config.floatX))
96 97
98 #include the learning rate in the classifier so
99 #we can modify it on the fly when we want
100 lr_value=learning_rate
101 self.lr=theano.shared(value=lr_value)
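The self.lr shared variable introduced here is what makes the adaptive learning rate possible: a Theano shared variable lives inside the compiled graph, so its value can be changed between calls to the compiled training function without recompiling. A minimal sketch of that pattern, separate from this file (the code above uses the older .value property; the sketch assumes a Theano version with get_value/set_value):

import theano
import theano.tensor as T

x = T.dscalar('x')
lr = theano.shared(value=0.01)            # shared learning rate, baked into the graph
step = theano.function([x], x * lr)

print step(1.0)                           # 0.01
lr.set_value(lr.get_value() / 2.0)        # halve it on the fly, no recompilation
print step(1.0)                           # 0.005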
97 # symbolic expression computing the values of the hidden layer 102 # symbolic expression computing the values of the hidden layer
98 self.hidden = T.tanh(T.dot(input, self.W1)+ self.b1) 103 self.hidden = T.tanh(T.dot(input, self.W1)+ self.b1)
104
105
99 106
100 # symbolic expression computing the values of the top layer 107 # symbolic expression computing the values of the top layer
101 self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden, self.W2)+self.b2) 108 self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden, self.W2)+self.b2)
102 109
103 # compute prediction as class whose probability is maximal in 110 # compute prediction as class whose probability is maximal in
104 # symbolic form 111 # symbolic form
105 self.y_pred = T.argmax( self.p_y_given_x, axis =1) 112 self.y_pred = T.argmax( self.p_y_given_x, axis =1)
113 self.y_pred_num = T.argmax( self.p_y_given_x[0:9], axis =1)
114
115
116
106 117
107 # L1 norm ; one regularization option is to enforce L1 norm to 118 # L1 norm ; one regularization option is to enforce L1 norm to
108 # be small 119 # be small
109 self.L1 = abs(self.W1).sum() + abs(self.W2).sum() 120 self.L1 = abs(self.W1).sum() + abs(self.W2).sum()
110 121
148 # represents a mistake in prediction 159 # represents a mistake in prediction
149 return T.mean(T.neq(self.y_pred, y)) 160 return T.mean(T.neq(self.y_pred, y))
150 else: 161 else:
151 raise NotImplementedError() 162 raise NotImplementedError()
152 163
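For reference, the errors() method whose tail is shown just above returns the zero-one loss, i.e. the fraction of minibatch examples whose argmax prediction disagrees with the label. The same quantity in plain numpy, on made-up toy values (a sketch, not part of the original file):

import numpy

p_y_given_x = numpy.array([[0.1, 0.7, 0.2],    # toy class probabilities
                           [0.8, 0.1, 0.1],
                           [0.3, 0.3, 0.4]])
y = numpy.array([1, 2, 2])                     # toy labels

y_pred = numpy.argmax(p_y_given_x, axis=1)     # [1, 0, 2]
zero_one_loss = numpy.mean(y_pred != y)        # one mistake out of three, ~0.333
print zero_one_loss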
153 #def jobman_mlp(state,channel):
154 # (validation_error,test_error,nb_exemples,time)=mlp_full_nist(state.learning_rate,\
155 # state.n_iter,\
156 # state.batch_size,\
157 # state.nb_hidden_units)
158 # state.validation_error = validation_error
159 # state.test_error = test_error
160 # state.nb_exemples = nb_exemples
161 # state.time=time
162 # return channel.COMPLETE
163
164
165
166 164
167 def mlp_full_nist( verbose = False,\ 165 def mlp_full_nist( verbose = False,\
166 adaptive_lr = False,\
168 train_data = 'all/all_train_data.ft',\ 167 train_data = 'all/all_train_data.ft',\
169 train_labels = 'all/all_train_labels.ft',\ 168 train_labels = 'all/all_train_labels.ft',\
170 test_data = 'all/all_test_data.ft',\ 169 test_data = 'all/all_test_data.ft',\
171 test_labels = 'all/all_test_labels.ft',\ 170 test_labels = 'all/all_test_labels.ft',\
172 learning_rate=0.01,\ 171 learning_rate=0.01,\
175 nb_max_exemples=1000000,\ 174 nb_max_exemples=1000000,\
176 batch_size=20,\ 175 batch_size=20,\
177 nb_hidden = 500,\ 176 nb_hidden = 500,\
178 nb_targets = 62): 177 nb_targets = 62):
179 178
179
180 configuration = [learning_rate,nb_max_exemples,nb_hidden,adaptive_lr]
181
182 total_validation_error_list = []
183 total_train_error_list = []
184 learning_rate_list=[]
185 best_training_error=float('inf');
186
180 187
181 188
182 f = open(data_path+train_data) 189 f = open(data_path+train_data)
183 g= open(data_path+train_labels) 190 g= open(data_path+train_labels)
184 h = open(data_path+test_data) 191 h = open(data_path+test_data)
233 # allocate symbolic variables for the data 240 # allocate symbolic variables for the data
234 x = T.fmatrix() # the data is presented as rasterized images 241 x = T.fmatrix() # the data is presented as rasterized images
235 y = T.lvector() # the labels are presented as 1D vector of 242 y = T.lvector() # the labels are presented as 1D vector of
236 # [long int] labels 243 # [long int] labels
237 244
245 if verbose==True:
246 print 'finished parsing the data'
238 # construct the logistic regression class 247 # construct the logistic regression class
239 classifier = MLP( input=x.reshape((batch_size,32*32)),\ 248 classifier = MLP( input=x.reshape((batch_size,32*32)),\
240 n_in=32*32,\ 249 n_in=32*32,\
241 n_hidden=nb_hidden,\ 250 n_hidden=nb_hidden,\
242 n_out=nb_targets) 251 n_out=nb_targets,
252 learning_rate=learning_rate)
253
254
255
243 256
244 # the cost we minimize during training is the negative log likelihood of 257 # the cost we minimize during training is the negative log likelihood of
245 # the model plus the regularization terms (L1 and L2); cost is expressed 258 # the model plus the regularization terms (L1 and L2); cost is expressed
246 # here symbolically 259 # here symbolically
247 cost = classifier.negative_log_likelihood(y) \ 260 cost = classifier.negative_log_likelihood(y) \
258 g_W2 = T.grad(cost, classifier.W2) 271 g_W2 = T.grad(cost, classifier.W2)
259 g_b2 = T.grad(cost, classifier.b2) 272 g_b2 = T.grad(cost, classifier.b2)
260 273
261 # specify how to update the parameters of the model as a dictionary 274 # specify how to update the parameters of the model as a dictionary
262 updates = \ 275 updates = \
263 { classifier.W1: classifier.W1 - learning_rate*g_W1 \ 276 { classifier.W1: classifier.W1 - classifier.lr*g_W1 \
264 , classifier.b1: classifier.b1 - learning_rate*g_b1 \ 277 , classifier.b1: classifier.b1 - classifier.lr*g_b1 \
265 , classifier.W2: classifier.W2 - learning_rate*g_W2 \ 278 , classifier.W2: classifier.W2 - classifier.lr*g_W2 \
266 , classifier.b2: classifier.b2 - learning_rate*g_b2 } 279 , classifier.b2: classifier.b2 - classifier.lr*g_b2 }
267 280
268 # compiling a theano function `train_model` that returns the cost, but in 281 # compiling a theano function `train_model` that returns the cost, but in
269 # the same time updates the parameter of the model based on the rules 282 # the same time updates the parameter of the model based on the rules
270 # defined in `updates` 283 # defined in `updates`
271 train_model = theano.function([x, y], cost, updates = updates ) 284 train_model = theano.function([x, y], cost, updates = updates )
272 n_minibatches = len(train_batches) 285 n_minibatches = len(train_batches)
273 286
274 287
275 288
289
290
276 291
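The updates dictionary compiled into train_model above is plain stochastic gradient descent, with the fixed learning_rate replaced by the shared classifier.lr so the adaptive schedule can act on it; each call effectively does W <- W - lr * dcost/dW for every parameter. A numpy-only sketch of one such step on stand-in arrays (W1 and g_W1 mirror the names above, the values are made up):

import numpy

rng = numpy.random.RandomState(0)
lr = 0.01
W1   = rng.uniform(-0.1, 0.1, size=(1024, 500))   # stand-in parameter matrix
g_W1 = rng.uniform(-1.0, 1.0, size=(1024, 500))   # stand-in gradient dcost/dW1

W1 = W1 - lr * g_W1                               # one SGD step, as in the updates dict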
277 #conditions for stopping the adaptation: 292 #conditions for stopping the adaptation:
278 #1) we have reached nb_max_exemples (this is rounded up to be a multiple of the train size) 293 #1) we have reached nb_max_exemples (this is rounded up to be a multiple of the train size)
279 #2) validation error is going up (probable overfitting) 294 #2) validation error is going up twice in a row (probable overfitting)
280 295
281 # This means we no longer stop on slow convergence as low learning rates stopped 296 # This means we no longer stop on slow convergence as low learning rates stopped
282 # too fast. 297 # too fast.
298
299 # no longer relevant
283 patience =nb_max_exemples/batch_size 300 patience =nb_max_exemples/batch_size
284 patience_increase = 2 # wait this much longer when a new best is 301 patience_increase = 2 # wait this much longer when a new best is
285 # found 302 # found
286 improvement_threshold = 0.995 # a relative improvement of this much is 303 improvement_threshold = 0.995 # a relative improvement of this much is
287 # considered significant 304 # considered significant
294 best_validation_loss = float('inf') 311 best_validation_loss = float('inf')
295 best_iter = 0 312 best_iter = 0
296 test_score = 0. 313 test_score = 0.
297 start_time = time.clock() 314 start_time = time.clock()
298 n_iter = nb_max_exemples/batch_size # nb of max times we are allowed to run through all exemples 315 n_iter = nb_max_exemples/batch_size # nb of max times we are allowed to run through all exemples
299 n_iter = n_iter/n_minibatches + 1 316 n_iter = n_iter/n_minibatches + 1 #round up
300 n_iter=max(1,n_iter) # run at least once on short debug call 317 n_iter=max(1,n_iter) # run at least once on short debug call
301 # have a maximum of `n_iter` iterations through the entire dataset 318
302 319
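The n_iter arithmetic just converts the example budget into a number of passes over the training set, using integer division and flooring at one pass. A worked example with a hypothetical minibatch count:

nb_max_exemples, batch_size, n_minibatches = 1000000, 20, 4000   # n_minibatches is hypothetical
n_iter = nb_max_exemples / batch_size        # 50000 allowed minibatch updates
n_iter = n_iter / n_minibatches + 1          # 12 + 1 = 13 passes over the data
n_iter = max(1, n_iter)                      # run at least once on short debug calls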
303 if verbose == True: 320 if verbose == True:
304 print 'looping at most %d times through the data set' %n_iter 321 print 'looping at most %d times through the data set' %n_iter
305 for iter in xrange(n_iter* n_minibatches): 322 for iter in xrange(n_iter* n_minibatches):
306 323
307 # get epoch and minibatch index 324 # get epoch and minibatch index
308 epoch = iter / n_minibatches 325 epoch = iter / n_minibatches
309 minibatch_index = iter % n_minibatches 326 minibatch_index = iter % n_minibatches
310 327
328
329
311 # get the minibatches corresponding to `iter` modulo 330 # get the minibatches corresponding to `iter` modulo
312 # `len(train_batches)` 331 # `len(train_batches)`
313 x,y = train_batches[ minibatch_index ] 332 x,y = train_batches[ minibatch_index ]
314 # convert to float 333 # convert to float
315 x_float = x/255.0 334 x_float = x/255.0
316 cost_ij = train_model(x_float,y) 335 cost_ij = train_model(x_float,y)
317 336
318 if (iter+1) % validation_frequency == 0: 337 if (iter+1) % validation_frequency == 0:
319 # compute zero-one loss on validation set 338 # compute zero-one loss on validation set
320 339
321 this_validation_loss = 0. 340 this_validation_loss = 0.
322 for x,y in validation_batches: 341 for x,y in validation_batches:
323 # sum up the errors for each minibatch 342 # sum up the errors for each minibatch
324 x_float = x/255.0 343 x_float = x/255.0
325 this_validation_loss += test_model(x_float,y) 344 this_validation_loss += test_model(x_float,y)
326 # get the average by dividing with the number of minibatches 345 # get the average by dividing with the number of minibatches
327 this_validation_loss /= len(validation_batches) 346 this_validation_loss /= len(validation_batches)
347 #save the validation loss
348 total_validation_error_list.append(this_validation_loss)
349
350 #get the training error rate
351 this_train_loss=0
352 for x,y in train_batches:
353 # sum up the errors for each minibatch
354 x_float = x/255.0
355 this_train_loss += test_model(x_float,y)
356 # get the average by dividing with the number of minibatches
357 this_train_loss /= len(train_batches)
358 #save the training loss
359 total_train_error_list.append(this_train_loss)
360 if(this_train_loss<best_training_error):
361 best_training_error=this_train_loss
362
328 if verbose == True: 363 if verbose == True:
329 print('epoch %i, minibatch %i/%i, validation error %f %%' % \ 364 print('epoch %i, minibatch %i/%i, validation error %f %%, training error %f %%' % \
330 (epoch, minibatch_index+1, n_minibatches, \ 365 (epoch, minibatch_index+1, n_minibatches, \
331 this_validation_loss*100.)) 366 this_validation_loss*100.,this_train_loss*100))
367
368
369 #save the learning rate
370 learning_rate_list.append(classifier.lr.value)
332 371
333 372
334 # if we got the best validation score until now 373 # if we got the best validation score until now
335 if this_validation_loss < best_validation_loss: 374 if this_validation_loss < best_validation_loss:
336
337 #improve patience if loss improvement is good enough
338 if this_validation_loss < best_validation_loss * \
339 improvement_threshold :
340 patience = max(patience, iter * patience_increase)
341 elif verbose == True:
342 print 'slow convergence stop'
343
344 # save best validation score and iteration number 375 # save best validation score and iteration number
345 best_validation_loss = this_validation_loss 376 best_validation_loss = this_validation_loss
346 best_iter = iter 377 best_iter = iter
347 378 # reset patience if we are going down again
379 # so we continue exploring
380 patience=nb_max_exemples/batch_size
348 # test it on the test set 381 # test it on the test set
349 test_score = 0. 382 test_score = 0.
350 for x,y in test_batches: 383 for x,y in test_batches:
351 x_float=x/255.0 384 x_float=x/255.0
352 test_score += test_model(x_float,y) 385 test_score += test_model(x_float,y)
355 print((' epoch %i, minibatch %i/%i, test error of best ' 388 print((' epoch %i, minibatch %i/%i, test error of best '
356 'model %f %%') % 389 'model %f %%') %
357 (epoch, minibatch_index+1, n_minibatches, 390 (epoch, minibatch_index+1, n_minibatches,
358 test_score*100.)) 391 test_score*100.))
359 392
360 #if the validation error is going up, we are overfitting 393 # if the validation error is going up, we are overfitting (or oscillating)
361 #stop converging 394 # stop converging but run at least to next validation
362 elif this_validation_loss > best_validation_loss: 395 # to check overfitting or oscillation
396 # the saved weights of the model will be a bit off in that case
397 elif this_validation_loss >= best_validation_loss:
363 #calculate the test error at this point and exit 398 #calculate the test error at this point and exit
364 # test it on the test set 399 # test it on the test set
365 if verbose==True: 400 # however, if adaptive_lr is true, try reducing the lr to
366 print ' We are diverging' 401 # get us out of an oscillation
367 best_iter = iter 402 if adaptive_lr==True:
403 classifier.lr.value=classifier.lr.value/2.0
404
368 test_score = 0. 405 test_score = 0.
406 #cap the patience so we are allowed one more validation error
407 #calculation before aborting
408 patience = iter+validation_frequency+1
369 for x,y in test_batches: 409 for x,y in test_batches:
370 x_float=x/255.0 410 x_float=x/255.0
371 test_score += test_model(x_float,y) 411 test_score += test_model(x_float,y)
372 test_score /= len(test_batches) 412 test_score /= len(test_batches)
373 if verbose == True: 413 if verbose == True:
374 print ' validation error is going up, stopping now' 414 print ' validation error is going up, possibly stopping soon'
375 print((' epoch %i, minibatch %i/%i, test error of best ' 415 print((' epoch %i, minibatch %i/%i, test error of best '
376 'model %f %%') % 416 'model %f %%') %
377 (epoch, minibatch_index+1, n_minibatches, 417 (epoch, minibatch_index+1, n_minibatches,
378 test_score*100.)) 418 test_score*100.))
379 419
380 break 420
381 421
382 422
383 423 if iter>patience:
384 if patience <= iter : 424 print 'we have diverged'
385 break 425 break
386 426
387 427
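Pulled out of the loop, the policy added in this revision is: a new best validation error resets patience to the full budget; an equal-or-worse validation error halves the shared learning rate (when adaptive_lr is on) and caps patience so at most one more validation round runs before the loop prints 'we have diverged' and exits. A condensed, self-contained sketch of that control flow on fake validation errors (not the original loop, which also re-runs the test set at each validation):

lr = 0.01
validation_frequency = 1000
patience = 50000
best = float('inf')
errors = [0.30, 0.25, 0.22, 0.24, 0.26]          # pretend validation curve

for i, err in enumerate(errors):
    it = (i + 1) * validation_frequency          # pretend minibatch counter
    if err < best:
        best = err
        patience = 50000                          # new best: keep exploring
    else:
        lr = lr / 2.0                             # adaptive_lr: damp oscillation
        patience = it + validation_frequency + 1  # allow one more validation
    if it > patience:
        print 'we have diverged'
        break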
388 end_time = time.clock() 428 end_time = time.clock()
389 if verbose == True: 429 if verbose == True:
390 print(('Optimization complete. Best validation score of %f %% ' 430 print(('Optimization complete. Best validation score of %f %% '
391 'obtained at iteration %i, with test performance %f %%') % 431 'obtained at iteration %i, with test performance %f %%') %
392 (best_validation_loss * 100., best_iter, test_score*100.)) 432 (best_validation_loss * 100., best_iter, test_score*100.))
393 print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) 433 print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
394 print iter 434 print iter
395 return (best_validation_loss * 100.,test_score*100.,best_iter*batch_size,(end_time-start_time)/60) 435
436 #save the model and the weights
437 numpy.savez('model.npy', config=configuration, W1=classifier.W1.value,W2=classifier.W2.value, b1=classifier.b1.value,b2=classifier.b2.value)
438 numpy.savez('results.npy',config=configuration,total_train_error_list=total_train_error_list,total_validation_error_list=total_validation_error_list,\
439 learning_rate_list=learning_rate_list)
440
441 return (best_training_error*100.0,best_validation_loss * 100.,test_score*100.,best_iter*batch_size,(end_time-start_time)/60)
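numpy.savez writes the named arrays into a single zipped archive that numpy.load reopens as a dict-like object, which is how the saved weights and error curves can be inspected later. Note that savez appends a .npz extension when it is missing, so the calls above actually produce model.npy.npz and results.npy.npz on disk. A small sketch of the save/load round trip on dummy arrays:

import numpy

W1    = numpy.zeros((1024, 500))                 # dummy weight matrix
curve = numpy.array([0.30, 0.25, 0.22])          # dummy validation error curve

numpy.savez('model.npz', W1=W1, total_validation_error_list=curve)

saved = numpy.load('model.npz')                  # dict-like archive
print saved.files                                # names of the stored arrays
print saved['total_validation_error_list']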
396 442
397 443
398 if __name__ == '__main__': 444 if __name__ == '__main__':
399 mlp_full_nist() 445 mlp_full_nist()
400 446
401 def jobman_mlp_full_nist(state,channel): 447 def jobman_mlp_full_nist(state,channel):
402 (validation_error,test_error,nb_exemples,time)=mlp_full_nist(learning_rate=state.learning_rate,\ 448 (train_error,validation_error,test_error,nb_exemples,time)=mlp_full_nist(learning_rate=state.learning_rate,\
403 nb_max_exemples=state.nb_max_exemples,\ 449 nb_max_exemples=state.nb_max_exemples,\
404 nb_hidden=state.nb_hidden) 450 nb_hidden=state.nb_hidden,\
451 adaptive_lr=state.adaptive_lr)
452 state.train_error=train_error
405 state.validation_error=validation_error 453 state.validation_error=validation_error
406 state.test_error=test_error 454 state.test_error=test_error
407 state.nb_exemples=nb_exemples 455 state.nb_exemples=nb_exemples
408 state.time=time 456 state.time=time
409 return channel.COMPLETE 457 return channel.COMPLETE
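jobman_mlp_full_nist expects a jobman-style state object whose attributes carry hyperparameters in and results out, plus a channel with a COMPLETE constant. For a quick run outside jobman, any attribute bag will do; a hypothetical stand-in (DummyState and DummyChannel are illustrative names, not part of jobman or of this file, and the call needs the NIST data under data_path to actually run):

class DummyState(object):
    """Attribute bag standing in for a jobman state."""
    learning_rate   = 0.01
    nb_max_exemples = 1000000
    nb_hidden       = 500
    adaptive_lr     = True

class DummyChannel(object):
    COMPLETE = 'COMPLETE'

# jobman_mlp_full_nist(DummyState(), DummyChannel())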