Mercurial > ift6266

baseline/mlp/v_youssouf/mlp_nist.py @ 413:f2dd75248483
initial commit of mlp with options for detection and 36 classes

author: youssouf
date:   Thu, 29 Apr 2010 16:51:03 -0400
comparison: 412:6478eef4f8aa -> 413:f2dd75248483
"""
This tutorial introduces the multilayer perceptron using Theano.

A multilayer perceptron is a logistic regressor where
instead of feeding the input to the logistic regression you insert an
intermediate layer, called the hidden layer, that has a nonlinear
activation function (usually tanh or sigmoid). One can use many such
hidden layers, making the architecture deep. The tutorial will also tackle
the problem of MNIST digit classification.

.. math::

    f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))),

References:

    - textbooks: "Pattern Recognition and Machine Learning" -
                 Christopher M. Bishop, section 5

TODO: recommended preprocessing, lr ranges, regularization ranges (explain
      to do lr first, then add regularization)

"""
__docformat__ = 'restructuredtext en'

import pdb
import numpy
import pylab
import theano
import theano.tensor as T
import time
import theano.tensor.nnet
import pylearn
import theano, pylearn.version, ift6266
from pylearn.io import filetensor as ft
from ift6266 import datasets

data_path = '/data/lisa/data/nist/by_class/'

class MLP(object):
    """Multi-Layer Perceptron Class

    A multilayer perceptron is a feedforward artificial neural network model
    that has one layer or more of hidden units and nonlinear activations.
    Intermediate layers usually have tanh or the sigmoid function as their
    activation function, while the top layer is a softmax layer.
    """

    def __init__(self, input, n_in, n_hidden, n_out, learning_rate, detection_mode=0):
        """Initialize the parameters for the multilayer perceptron

        :param input: symbolic variable that describes the input of the
        architecture (one minibatch)

        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :param n_hidden: number of hidden units

        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        """
66 | |
67 # initialize the parameters theta = (W1,b1,W2,b2) ; note that this | |
68 # example contains only one hidden layer, but one can have as many | |
69 # layers as he/she wishes, making the network deeper. The only | |
70 # problem making the network deep this way is during learning, | |
71 # backpropagation being unable to move the network from the starting | |
72 # point towards; this is where pre-training helps, giving a good | |
73 # starting point for backpropagation, but more about this in the | |
74 # other tutorials | |
75 | |
76 # `W1` is initialized with `W1_values` which is uniformely sampled | |
77 # from -6./sqrt(n_in+n_hidden) and 6./sqrt(n_in+n_hidden) | |
78 # the output of uniform if converted using asarray to dtype | |
79 # theano.config.floatX so that the code is runable on GPU | |
        W1_values = numpy.asarray( numpy.random.uniform( \
              low = -numpy.sqrt(6./(n_in+n_hidden)), \
              high = numpy.sqrt(6./(n_in+n_hidden)), \
              size = (n_in, n_hidden)), dtype = theano.config.floatX)
        # `W2` is initialized with `W2_values` which is uniformly sampled
        # from -6./sqrt(n_hidden+n_out) and 6./sqrt(n_hidden+n_out)
        # the output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU
        W2_values = numpy.asarray( numpy.random.uniform(
              low = -numpy.sqrt(6./(n_hidden+n_out)), \
              high = numpy.sqrt(6./(n_hidden+n_out)), \
              size = (n_hidden, n_out)), dtype = theano.config.floatX)

        self.W1 = theano.shared( value = W1_values )
        self.b1 = theano.shared( value = numpy.zeros((n_hidden,),
                                                     dtype = theano.config.floatX))
        self.W2 = theano.shared( value = W2_values )
        self.b2 = theano.shared( value = numpy.zeros((n_out,),
                                                     dtype = theano.config.floatX))

        # include the learning rate in the classifier so
        # we can modify it on the fly when we want
        lr_value = learning_rate
        self.lr = theano.shared(value=lr_value)
        # symbolic expression computing the values of the hidden layer
        self.hidden = T.tanh(T.dot(input, self.W1) + self.b1)

        # symbolic expression computing the values of the top layer
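        # in detection mode (the option added in this commit) each output unit
        # is an independent sigmoid, so the class scores need not sum to one;
        # otherwise a softmax gives a probability distribution over the classes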
        if(detection_mode):
            self.p_y_given_x = T.nnet.sigmoid(T.dot(self.hidden, self.W2) + self.b2)
        else:
            self.p_y_given_x = T.nnet.softmax(T.dot(self.hidden, self.W2) + self.b2)

        # compute prediction as class whose probability is maximal in
        # symbolic form
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
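        # y_pred_num presumably restricts the prediction to the digit classes;
        # note that the slice below takes the first rows of the minibatch
        # rather than the first columns of p_y_given_x, and it is not used
        # elsewhere in this file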
        self.y_pred_num = T.argmax(self.p_y_given_x[0:9], axis=1)

        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self.L1 = abs(self.W1).sum() + abs(self.W2).sum()

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = (self.W1**2).sum() + (self.W2**2).sum()

    def negative_log_likelihood(self, y):
        """Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        .. math::

            \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
            \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
            \ell (\theta=\{W,b\}, \mathcal{D})

        :param y: corresponds to a vector that gives for each example the
        correct label
        """
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])


    def cross_entropy(self, y):
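        # one-vs-all binary cross-entropy for the sigmoid outputs used in
        # detection mode: for each example the target class y contributes
        # log p_y and every other class j contributes log(1 - p_j), so the
        # expression below is -mean( log p_y + sum_{j != y} log(1 - p_j) )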
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y] + T.sum(T.log(1-self.p_y_given_x), axis=1) - T.log(1-self.p_y_given_x)[T.arange(y.shape[0]),y])

    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch
        """

        # check if y has same dimension of y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                            ('y', y.type, 'y_pred', self.y_pred.type))
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()

def mlp_full_nist( verbose = 1,\
                   adaptive_lr = 0,\
                   data_set=0,\
                   learning_rate=0.01,\
                   L1_reg = 0.00,\
                   L2_reg = 0.0001,\
                   nb_max_exemples=1000000,\
                   batch_size=20,\
                   nb_hidden = 30,\
                   nb_targets = 62,
                   tau=1e6,\
                   lr_t2_factor=0.5,\
                   detection_mode = 0,\
                   reduce_label = 0):
    configuration = [learning_rate, nb_max_exemples, nb_hidden, adaptive_lr, detection_mode, reduce_label]

    if(verbose):
        print(('verbose: %i') % (verbose))
        print(('adaptive_lr: %i') % (adaptive_lr))
        print(('data_set: %i') % (data_set))
        print(('learning_rate: %f') % (learning_rate))
        print(('L1_reg: %f') % (L1_reg))
        print(('L2_reg: %f') % (L2_reg))
        print(('nb_max_exemples: %i') % (nb_max_exemples))
        print(('batch_size: %i') % (batch_size))
        print(('nb_hidden: %i') % (nb_hidden))
        print(('nb_targets: %i') % (nb_targets))
        print(('tau: %f') % (tau))
        print(('lr_t2_factor: %f') % (lr_t2_factor))
        print(('detection_mode: %i') % (detection_mode))
        print(('reduce_label: %i') % (reduce_label))

    # define the number of outputs - reduce_label merges the lower and upper
    # case classes, i.e. 'a' and 'A' will both have label 10
    if(reduce_label):
        nb_targets = 36
    else:
        nb_targets = 62

    # save initial learning rate if classical adaptive lr is used
    initial_lr = learning_rate

    total_validation_error_list = []
    total_train_error_list = []
    learning_rate_list = []
    best_training_error = float('inf')

    if data_set == 0:
        dataset = datasets.nist_all()

    ishape = (32,32)  # this is the size of NIST images

    # allocate symbolic variables for the data
    x = T.fmatrix()  # the data is presented as rasterized images
    y = T.lvector()  # the labels are presented as 1D vector of
                     # [long int] labels

    # construct the MLP classifier
    classifier = MLP( input=x,\
                      n_in=32*32,\
                      n_hidden=nb_hidden,\
                      n_out=nb_targets,
                      learning_rate=learning_rate,
                      detection_mode = detection_mode)

    # the cost we minimize during training is the negative log likelihood
    # (or, in detection mode, the cross-entropy) of the model plus the
    # regularization terms (L1 and L2); cost is expressed here symbolically
    if(detection_mode):
        cost = classifier.cross_entropy(y) \
             + L1_reg * classifier.L1 \
             + L2_reg * classifier.L2_sqr
    else:
        cost = classifier.negative_log_likelihood(y) \
             + L1_reg * classifier.L1 \
             + L2_reg * classifier.L2_sqr

    # compiling a theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function([x,y], classifier.errors(y))

    # compute the gradient of cost with respect to theta = (W1, b1, W2, b2)
    g_W1 = T.grad(cost, classifier.W1)
    g_b1 = T.grad(cost, classifier.b1)
    g_W2 = T.grad(cost, classifier.W2)
    g_b2 = T.grad(cost, classifier.b2)

    # specify how to update the parameters of the model as a dictionary
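    # standard stochastic gradient descent step: each parameter moves against
    # its gradient, scaled by the shared learning rate `classifier.lr`
    # (which the adaptive schemes below may change between updates)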
    updates = \
        { classifier.W1: classifier.W1 - classifier.lr*g_W1 \
        , classifier.b1: classifier.b1 - classifier.lr*g_b1 \
        , classifier.W2: classifier.W2 - classifier.lr*g_W2 \
        , classifier.b2: classifier.b2 - classifier.lr*g_b2 }

    # compiling a theano function `train_model` that returns the cost, but at
    # the same time updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function([x, y], cost, updates = updates )

    # conditions for stopping the adaptation:
    # 1) we have reached nb_max_exemples (this is rounded up to be a multiple of the train size)
    # 2) validation error is going up twice in a row (probable overfitting)

    # This means we no longer stop on slow convergence, as low learning rates stopped
    # too fast.

    # approximate number of samples in the training set
    # this is just to have a validation frequency
    # roughly proportional to the training set
    n_minibatches = 650000/batch_size

    patience = nb_max_exemples/batch_size  # in units of minibatch
    patience_increase = 2          # wait this much longer when a new best is
                                   # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = n_minibatches/4
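    # with the default batch_size of 20 this gives 650000/20 = 32500
    # minibatches per (approximate) pass over the training set, so validation
    # runs every 8125 minibatches, i.e. roughly every 162500 examples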

    best_validation_loss = float('inf')
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    time_n = 0  # in units of examples
    minibatch_index = 0
    epoch = 0
    temp = 0

    if verbose == 1:
        print 'looking at most at %i exemples' %nb_max_exemples
    while(minibatch_index*batch_size < nb_max_exemples):

        for x, y in dataset.train(batch_size):

            if reduce_label:
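                # labels 36-61 (presumably the lowercase letters) are folded
                # onto 10-35 so that e.g. 'a' gets the same label as 'A'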
                y[y > 35] = y[y > 35]-26
            minibatch_index = minibatch_index + 1
            if adaptive_lr==2:
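                # adaptive_lr==2: classical 1/t decay; the rate stays close to
                # initial_lr for roughly the first tau examples (time_n counts
                # examples seen) and then decays as tau/(tau+time_n)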
                classifier.lr.value = tau*initial_lr/(tau+time_n)

            # train model
            cost_ij = train_model(x,y)

            if (minibatch_index+1) % validation_frequency == 0:

                # save the current learning rate
                learning_rate_list.append(classifier.lr.value)

                # compute the validation error
                this_validation_loss = 0.
                temp = 0
                for xv,yv in dataset.valid(1):
                    if reduce_label:
                        yv[yv > 35] = yv[yv > 35]-26
                    # sum up the errors for each minibatch
                    axxa = test_model(xv,yv)
                    this_validation_loss += axxa
                    temp = temp+1
                # get the average by dividing with the number of minibatches
                this_validation_loss /= temp
                # save the validation loss
                total_validation_error_list.append(this_validation_loss)
                if verbose == 1:
                    print(('epoch %i, minibatch %i, learning rate %f current validation error %f ') %
                          (epoch, minibatch_index+1, classifier.lr.value,
                           this_validation_loss*100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = minibatch_index
                    # reset patience if we are going down again
                    # so we continue exploring
                    patience = nb_max_exemples/batch_size
                    # test it on the test set
                    test_score = 0.
                    temp = 0
                    for xt,yt in dataset.test(batch_size):
                        if reduce_label:
                            yt[yt > 35] = yt[yt > 35]-26
                        test_score += test_model(xt,yt)
                        temp = temp+1
                    test_score /= temp
                    if verbose == 1:
                        print(('epoch %i, minibatch %i, test error of best '
                               'model %f %%') %
                              (epoch, minibatch_index+1,
                               test_score*100.))

                # if the validation error is going up, we are overfitting (or oscillating);
                # stop converging but run at least to the next validation
                # to check for overfitting or oscillation
                # (the saved weights of the model will be a bit off in that case)
                elif this_validation_loss >= best_validation_loss:
                    # calculate the test error at this point and exit
                    # test it on the test set
                    # however, if adaptive_lr is true, try reducing the lr to
                    # get us out of an oscillation
                    if adaptive_lr==1:
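                        # adaptive_lr==1: each time validation error goes up,
                        # multiply the learning rate by lr_t2_factor (0.5 by
                        # default), i.e. halve it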
                        classifier.lr.value = classifier.lr.value*lr_t2_factor

                    test_score = 0.
                    # cap the patience so we are allowed one more validation error
                    # calculation before aborting
                    patience = minibatch_index+validation_frequency+1
                    temp = 0
                    for xt,yt in dataset.test(batch_size):
                        if reduce_label:
                            yt[yt > 35] = yt[yt > 35]-26

                        test_score += test_model(xt,yt)
                        temp = temp+1
                    test_score /= temp
                    if verbose == 1:
                        print ' validation error is going up, possibly stopping soon'
                        print(('     epoch %i, minibatch %i, test error of best '
                               'model %f %%') %
                              (epoch, minibatch_index+1,
                               test_score*100.))

            if minibatch_index > patience:
                print 'we have diverged'
                break

            time_n = time_n + batch_size
        epoch = epoch+1
    end_time = time.clock()
    if verbose == 1:
        print(('Optimization complete. Best validation score of %f %% '
               'obtained at iteration %i, with test performance %f %%') %
              (best_validation_loss * 100., best_iter, test_score*100.))
        print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
        print minibatch_index

    # save the model and the weights
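    # note: numpy.savez appends '.npz' to a name without that extension, so
    # these calls actually write 'model.npy.npz' and 'results.npy.npz'
    # (matching the filename loaded by test_error in __main__ below)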
    numpy.savez('model.npy', config=configuration, W1=classifier.W1.value, W2=classifier.W2.value, b1=classifier.b1.value, b2=classifier.b2.value)
    numpy.savez('results.npy', config=configuration, total_train_error_list=total_train_error_list, total_validation_error_list=total_validation_error_list,\
                learning_rate_list=learning_rate_list)

    return (best_training_error*100.0, best_validation_loss * 100., test_score*100., best_iter*batch_size, (end_time-start_time)/60)

def test_error(model_file):

    print((' test error on all NIST'))
    # load the model
    a = numpy.load(model_file)
    W1 = a['W1']
    W2 = a['W2']
    b1 = a['b1']
    b2 = a['b2']
    configuration = a['config']
    # configuration = [learning_rate, nb_max_exemples, nb_hidden, adaptive_lr]
    learning_rate = configuration[0]
    nb_max_exemples = configuration[1]
    nb_hidden = configuration[2]
    adaptive_lr = configuration[3]

    if(len(configuration) == 6):
        detection_mode = configuration[4]
        reduce_label = configuration[5]
    else:
        detection_mode = 0
        reduce_label = 0

    # define the batch size
    batch_size = 20
    # define the number of targets
    nb_targets = 62

    # create the mlp
    ishape = (32,32)  # this is the size of NIST images

    # allocate symbolic variables for the data
    x = T.fmatrix()  # the data is presented as rasterized images
    y = T.lvector()  # the labels are presented as 1D vector of
                     # [long int] labels

    # construct the MLP classifier
    classifier = MLP( input=x,\
                      n_in=32*32,\
                      n_hidden=nb_hidden,\
                      n_out=nb_targets,
                      learning_rate=learning_rate,\
                      detection_mode=detection_mode)

    # set the weights into the model
    classifier.W1.value = W1
    classifier.b1.value = b1
    classifier.W2.value = W2
    classifier.b2.value = b2

    # compiling a theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function([x,y], classifier.errors(y))

    # test it on the test set

    # load NIST ALL
    dataset = datasets.nist_all()
    test_score = 0.
    temp = 0
    for xt,yt in dataset.test(batch_size):
        if reduce_label:
            yt[yt > 35] = yt[yt > 35]-26
        test_score += test_model(xt,yt)
        temp = temp+1
    test_score /= temp

    print(( ' test error NIST ALL : %f %%') %(test_score*100.0))

    # load NIST DIGITS
    dataset = datasets.nist_digits()
    test_score = 0.
    temp = 0
    for xt,yt in dataset.test(batch_size):
        if reduce_label:
            yt[yt > 35] = yt[yt > 35]-26
        test_score += test_model(xt,yt)
        temp = temp+1
    test_score /= temp

    print(( ' test error NIST digits : %f %%') %(test_score*100.0))

    # load NIST lower
    dataset = datasets.nist_lower()
    test_score = 0.
    temp = 0
    for xt,yt in dataset.test(batch_size):
        if reduce_label:
            yt[yt > 35] = yt[yt > 35]-26
        test_score += test_model(xt,yt)
        temp = temp+1
    test_score /= temp

    print(( ' test error NIST lower : %f %%') %(test_score*100.0))

    # load NIST upper
    dataset = datasets.nist_upper()
    test_score = 0.
    temp = 0
    for xt,yt in dataset.test(batch_size):
        if reduce_label:
            yt[yt > 35] = yt[yt > 35]-26
        test_score += test_model(xt,yt)
        temp = temp+1
    test_score /= temp

    print(( ' test error NIST upper : %f %%') %(test_score*100.0))

if __name__ == '__main__':
    '''
    mlp_full_nist( verbose = 1,\
                   adaptive_lr = 1,\
                   data_set=0,\
                   learning_rate=0.5,\
                   L1_reg = 0.00,\
                   L2_reg = 0.0001,\
                   nb_max_exemples=10000000,\
                   batch_size=20,\
                   nb_hidden = 500,\
                   nb_targets = 62,
                   tau=100000,\
                   lr_t2_factor=0.5)
    '''

    test_error('model.npy.npz')

def jobman_mlp_full_nist(state, channel):
    (train_error, validation_error, test_error, nb_exemples, time) = mlp_full_nist(learning_rate=state.learning_rate,\
                                                                                   nb_max_exemples=state.nb_max_exemples,\
                                                                                   nb_hidden=state.nb_hidden,\
                                                                                   adaptive_lr=state.adaptive_lr,\
                                                                                   tau=state.tau,\
                                                                                   verbose = state.verbose,\
                                                                                   lr_t2_factor=state.lr_t2_factor,\
                                                                                   detection_mode = state.detection_mode,\
                                                                                   reduce_label = state.reduce_label)
    state.train_error = train_error
    state.validation_error = validation_error
    state.test_error = test_error
    state.nb_exemples = nb_exemples
    state.time = time
    pylearn.version.record_versions(state, [theano, ift6266, pylearn])
    return channel.COMPLETE