baseline/mlp/v_youssouf/mlp_nist.py @ 413:f2dd75248483

initial commit of mlp with options for detection and 36 classes
author youssouf
date Thu, 29 Apr 2010 16:51:03 -0400
1 """
2 This tutorial introduces the multilayer perceptron using Theano.
3
4 A multilayer perceptron is a logistic regressor where
5 instead of feeding the input to the logistic regression you insert a
6 intermidiate layer, called the hidden layer, that has a nonlinear
7 activation function (usually tanh or sigmoid) . One can use many such
8 hidden layers making the architecture deep. The tutorial will also tackle
9 the problem of MNIST digit classification.
10
11 .. math::
12
13 f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))),
14
15 References:
16
17 - textbooks: "Pattern Recognition and Machine Learning" -
18 Christopher M. Bishop, section 5
19
20 TODO: recommended preprocessing, lr ranges, regularization ranges (explain
21 to do lr first, then add regularization)
22
23 """
__docformat__ = 'restructuredtext en'

import pdb
import numpy
import pylab
import theano
import theano.tensor as T
import time
import theano.tensor.nnet
import pylearn
import theano, pylearn.version, ift6266
from pylearn.io import filetensor as ft
from ift6266 import datasets

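# For reference, a minimal NumPy sketch of the forward pass that the MLP class
# below builds symbolically:
#     f(x) = G( b2 + W2 . s( b1 + W1 . x ) )
# with s = tanh and G = softmax (detection_mode swaps G for a sigmoid).
# The helper name is illustrative only and is not called anywhere in this file;
# the weights would come from a trained MLP instance.
def _forward_pass_sketch(x, W1, b1, W2, b2):
    """Plain NumPy version of the one-hidden-layer forward pass (softmax output)."""
    hidden = numpy.tanh(numpy.dot(x, W1) + b1)        # hidden layer activations
    act = numpy.dot(hidden, W2) + b2                  # pre-softmax activations
    e = numpy.exp(act - act.max(axis=1)[:, None])     # numerically stable softmax
    return e / e.sum(axis=1)[:, None]                 # rows sum to one
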
data_path = '/data/lisa/data/nist/by_class/'

class MLP(object):
    """Multi-Layer Perceptron Class

    A multilayer perceptron is a feedforward artificial neural network model
    that has one or more layers of hidden units and nonlinear activations.
    Intermediate layers usually have tanh or the sigmoid function as their
    activation function, while the top layer is a softmax layer.
    """


    def __init__(self, input, n_in, n_hidden, n_out, learning_rate, detection_mode=0):
        """Initialize the parameters for the multilayer perceptron

        :param input: symbolic variable that describes the input of the
        architecture (one minibatch)

        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :param n_hidden: number of hidden units

        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        :param learning_rate: initial learning rate; it is stored in a shared
        variable (self.lr) so it can be adjusted during training

        :param detection_mode: if nonzero, the output layer is a sigmoid (one
        independent detector per class) instead of a softmax

        """

        # initialize the parameters theta = (W1,b1,W2,b2) ; note that this
        # example contains only one hidden layer, but one can have as many
        # layers as he/she wishes, making the network deeper. The only
        # problem with making the network deep this way is during learning:
        # backpropagation may be unable to move the network from the starting
        # point towards a good solution; this is where pre-training helps,
        # giving a good starting point for backpropagation, but more about
        # this in the other tutorials

        # `W1` is initialized with `W1_values` which is uniformly sampled
        # from -sqrt(6./(n_in+n_hidden)) to sqrt(6./(n_in+n_hidden));
        # the output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU
        W1_values = numpy.asarray( numpy.random.uniform( \
              low = -numpy.sqrt(6./(n_in+n_hidden)), \
              high = numpy.sqrt(6./(n_in+n_hidden)), \
              size = (n_in, n_hidden)), dtype = theano.config.floatX)
        # `W2` is initialized with `W2_values` which is uniformly sampled
        # from -sqrt(6./(n_hidden+n_out)) to sqrt(6./(n_hidden+n_out));
        # the output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU
        W2_values = numpy.asarray( numpy.random.uniform(
              low = -numpy.sqrt(6./(n_hidden+n_out)), \
              high= numpy.sqrt(6./(n_hidden+n_out)),\
              size= (n_hidden, n_out)), dtype = theano.config.floatX)
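        # the +/- sqrt(6/(fan_in+fan_out)) range is the uniform initialization
        # recommended for tanh units (Glorot & Bengio, 2010); it keeps the
        # hidden activations away from the saturated regime early in training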

        self.W1 = theano.shared( value = W1_values )
        self.b1 = theano.shared( value = numpy.zeros((n_hidden,),
                                                dtype= theano.config.floatX))
        self.W2 = theano.shared( value = W2_values )
        self.b2 = theano.shared( value = numpy.zeros((n_out,),
                                                dtype= theano.config.floatX))

        # include the learning rate in the classifier so
        # we can modify it on the fly when we want
        lr_value = learning_rate
        self.lr = theano.shared(value=lr_value)

        # symbolic expression computing the values of the hidden layer
        self.hidden = T.tanh(T.dot(input, self.W1) + self.b1)

        # symbolic expression computing the values of the top layer
        if(detection_mode):
            self.p_y_given_x = T.nnet.sigmoid(T.dot(self.hidden, self.W2) + self.b2)
        else:
            self.p_y_given_x = T.nnet.softmax(T.dot(self.hidden, self.W2) + self.b2)

        # compute prediction as class whose probability is maximal in
        # symbolic form
        self.y_pred = T.argmax( self.p_y_given_x, axis =1)
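        # note: the expression below takes the argmax over the first 9 rows
        # (examples) of the minibatch, not over the first 10 columns (digit
        # classes); a digit-only prediction would presumably need
        # self.p_y_given_x[:, 0:10]. y_pred_num is not used elsewhere in this file.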
        self.y_pred_num = T.argmax( self.p_y_given_x[0:9], axis =1)

        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self.L1     = abs(self.W1).sum() + abs(self.W2).sum()

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = (self.W1**2).sum() + (self.W2**2).sum()

    def negative_log_likelihood(self, y):
        r"""Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        .. math::

            \frac{1}{|\mathcal{D}|} \mathcal{L}(\theta=\{W,b\}, \mathcal{D}) =
                \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
                \log P(Y=y^{(i)} | x^{(i)}, W, b)

        (this method returns the negative of the mean log-likelihood above)

        :param y: corresponds to a vector that gives for each example the
                  correct label
        """
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])

    def cross_entropy(self, y):
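        # detection-mode cost: with a one-hot target for label y, the binary
        # cross-entropy summed over the sigmoid outputs is
        #     -log p_y - sum_{j != y} log(1 - p_j)
        # the expression below computes exactly that (log p_y, plus the sum of
        # log(1-p) over all classes, minus the double-counted log(1-p_y)),
        # averaged over the minibatch and negated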
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]
                       + T.sum(T.log(1 - self.p_y_given_x), axis=1)
                       - T.log(1 - self.p_y_given_x)[T.arange(y.shape[0]), y])

    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch
        """

        # check if y has same dimension of y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type))
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()

def mlp_full_nist( verbose = 1,\
                   adaptive_lr = 0,\
                   data_set=0,\
                   learning_rate=0.01,\
                   L1_reg = 0.00,\
                   L2_reg = 0.0001,\
                   nb_max_exemples=1000000,\
                   batch_size=20,\
                   nb_hidden = 30,\
                   nb_targets = 62,
                   tau=1e6,\
                   lr_t2_factor=0.5,\
                   detection_mode = 0,\
                   reduce_label = 0):


    configuration = [learning_rate,nb_max_exemples,nb_hidden,adaptive_lr, detection_mode, reduce_label]

    if(verbose):
        print(('verbose: %i') % (verbose))
        print(('adaptive_lr: %i') % (adaptive_lr))
        print(('data_set: %i') % (data_set))
        print(('learning_rate: %f') % (learning_rate))
        print(('L1_reg: %f') % (L1_reg))
        print(('L2_reg: %f') % (L2_reg))
        print(('nb_max_exemples: %i') % (nb_max_exemples))
        print(('batch_size: %i') % (batch_size))
        print(('nb_hidden: %i') % (nb_hidden))
        print(('nb_targets: %i') % (nb_targets))
        print(('tau: %f') % (tau))
        print(('lr_t2_factor: %f') % (lr_t2_factor))
        print(('detection_mode: %i') % (detection_mode))
        print(('reduce_label: %i') % (reduce_label))

    # define the number of outputs - reduce_label merges lower and upper case,
    # i.e. 'a' and 'A' will both have label 10
    if(reduce_label):
        nb_targets = 36
    else:
        nb_targets = 62
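    # label layout assumed here (consistent with the relabeling done in the
    # training loop below): 0-9 are digits, 10-35 uppercase letters, 36-61
    # lowercase letters; reduce_label maps each lowercase label l to l-26,
    # its uppercase counterpart, e.g. 'a' (36) -> 'A' (10)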

    #save initial learning rate if classical adaptive lr is used
    initial_lr = learning_rate

    total_validation_error_list = []
    total_train_error_list = []
    learning_rate_list = []
    best_training_error = float('inf')

    if data_set==0:
        dataset = datasets.nist_all()


    ishape = (32,32) # this is the size of NIST images

    # allocate symbolic variables for the data
    x = T.fmatrix()  # the data is presented as rasterized images
    y = T.lvector()  # the labels are presented as 1D vector of
                     # [long int] labels


    # construct the MLP classifier
    classifier = MLP( input=x,\
                      n_in=32*32,\
                      n_hidden=nb_hidden,\
                      n_out=nb_targets,
                      learning_rate=learning_rate,
                      detection_mode = detection_mode)

    # the cost we minimize during training is the negative log likelihood of
    # the model (or, in detection mode, the summed binary cross-entropy) plus
    # the regularization terms (L1 and L2); cost is expressed here symbolically
    if(detection_mode):
        cost = classifier.cross_entropy(y) \
             + L1_reg * classifier.L1 \
             + L2_reg * classifier.L2_sqr
    else:
        cost = classifier.negative_log_likelihood(y) \
             + L1_reg * classifier.L1 \
             + L2_reg * classifier.L2_sqr

    # compiling a theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function([x,y], classifier.errors(y))
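    # note: classifier.errors(y) is the mean 0/1 loss on a minibatch, so the
    # validation/test scores accumulated below are averages of per-minibatch
    # error rates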

    # compute the gradient of cost with respect to theta = (W1, b1, W2, b2)
    g_W1 = T.grad(cost, classifier.W1)
    g_b1 = T.grad(cost, classifier.b1)
    g_W2 = T.grad(cost, classifier.W2)
    g_b2 = T.grad(cost, classifier.b2)

    # specify how to update the parameters of the model as a dictionary
    updates = \
        { classifier.W1: classifier.W1 - classifier.lr*g_W1 \
        , classifier.b1: classifier.b1 - classifier.lr*g_b1 \
        , classifier.W2: classifier.W2 - classifier.lr*g_W2 \
        , classifier.b2: classifier.b2 - classifier.lr*g_b2 }
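    # this is plain minibatch SGD, theta <- theta - lr * grad(cost, theta);
    # because classifier.lr is a shared variable, the schedules below can
    # change the step size without recompiling train_model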

    # compiling a theano function `train_model` that returns the cost, but at
    # the same time updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function([x, y], cost, updates = updates )


    #conditions for stopping the adaptation:
    #1) we have reached nb_max_exemples (this is rounded up to be a multiple of the train size)
    #2) validation error is going up twice in a row (probable overfitting)

    # This means we no longer stop on slow convergence as low learning rates stopped
    # too fast.

    #approximate number of samples in the training set
    #this is just to have a validation frequency
    #roughly proportional to the training set
    n_minibatches = 650000/batch_size


    patience = nb_max_exemples/batch_size   #in units of minibatches
    patience_increase = 2         # wait this much longer when a new best is
                                  # found
    improvement_threshold = 0.995 # a relative improvement of this much is
                                  # considered significant
    validation_frequency = n_minibatches/4
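    # worked out with the defaults (batch_size=20): n_minibatches = 650000/20
    # = 32500, so validation runs every 8125 minibatches, i.e. roughly every
    # 162,500 training examples (integer division under Python 2)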

    best_validation_loss = float('inf')
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    time_n = 0          # in units of examples
    minibatch_index = 0
    epoch = 0
    temp = 0


    if verbose == 1:
        print 'looking at most at %i examples' % nb_max_exemples
    while(minibatch_index*batch_size < nb_max_exemples):

        for x, y in dataset.train(batch_size):

            if reduce_label:
                y[y > 35] = y[y > 35] - 26
            minibatch_index = minibatch_index + 1
            if adaptive_lr==2:
                classifier.lr.value = tau*initial_lr/(tau+time_n)
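            # 1/t-style decay: lr = tau*lr_0/(tau+t) where t counts training
            # examples seen; e.g. with tau=1e6 and learning_rate=0.01 the step
            # size is 0.01 at t=0, 0.005 after 1e6 examples, and keeps
            # shrinking roughly as 1/t afterwards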

            #train model
            cost_ij = train_model(x,y)

            if (minibatch_index+1) % validation_frequency == 0:

                #save the current learning rate
                learning_rate_list.append(classifier.lr.value)

                # compute the validation error
                this_validation_loss = 0.
                temp = 0
                for xv,yv in dataset.valid(1):
                    if reduce_label:
                        yv[yv > 35] = yv[yv > 35] - 26
                    # sum up the errors for each minibatch
                    axxa = test_model(xv,yv)
                    this_validation_loss += axxa
                    temp = temp+1
                # get the average by dividing by the number of minibatches
                this_validation_loss /= temp
                #save the validation loss
                total_validation_error_list.append(this_validation_loss)
                if verbose == 1:
                    print(('epoch %i, minibatch %i, learning rate %f, current validation error %f ') %
                          (epoch, minibatch_index+1, classifier.lr.value,
                           this_validation_loss*100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = minibatch_index
                    # reset patience if we are going down again
                    # so we continue exploring
                    patience = nb_max_exemples/batch_size
                    # test it on the test set
                    test_score = 0.
                    temp = 0
                    for xt,yt in dataset.test(batch_size):
                        if reduce_label:
                            yt[yt > 35] = yt[yt > 35] - 26
                        test_score += test_model(xt,yt)
                        temp = temp+1
                    test_score /= temp
                    if verbose == 1:
                        print(('epoch %i, minibatch %i, test error of best '
                               'model %f %%') %
                              (epoch, minibatch_index+1,
                               test_score*100.))

                # if the validation error is going up, we are overfitting (or
                # oscillating); stop converging but run at least to the next
                # validation to check for overfitting or oscillation;
                # the saved weights of the model will be a bit off in that case
                elif this_validation_loss >= best_validation_loss:
                    # calculate the test error at this point and exit
                    # test it on the test set
                    # however, if adaptive_lr is set, try reducing the lr to
                    # get us out of an oscillation
                    if adaptive_lr==1:
                        classifier.lr.value = classifier.lr.value*lr_t2_factor

                    test_score = 0.
                    #cap the patience so we are allowed one more validation error
                    #calculation before aborting
                    patience = minibatch_index+validation_frequency+1
                    temp = 0
                    for xt,yt in dataset.test(batch_size):
                        if reduce_label:
                            yt[yt > 35] = yt[yt > 35] - 26

                        test_score += test_model(xt,yt)
                        temp = temp+1
                    test_score /= temp
                    if verbose == 1:
                        print ' validation error is going up, possibly stopping soon'
                        print(('     epoch %i, minibatch %i, test error of best '
                               'model %f %%') %
                              (epoch, minibatch_index+1,
                               test_score*100.))


            if minibatch_index > patience:
                print 'we have diverged'
                break


            time_n = time_n + batch_size
        epoch = epoch+1
    end_time = time.clock()
    if verbose == 1:
        print(('Optimization complete. Best validation score of %f %% '
               'obtained at iteration %i, with test performance %f %%') %
              (best_validation_loss * 100., best_iter, test_score*100.))
        print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
        print minibatch_index

    #save the model and the weights
    numpy.savez('model.npy', config=configuration, W1=classifier.W1.value, W2=classifier.W2.value, b1=classifier.b1.value, b2=classifier.b2.value)
    numpy.savez('results.npy', config=configuration, total_train_error_list=total_train_error_list, total_validation_error_list=total_validation_error_list, \
                learning_rate_list=learning_rate_list)
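    # note: numpy.savez appends '.npz' when the filename does not already end
    # with it, so these calls actually write 'model.npy.npz' and
    # 'results.npy.npz'; this matches the filename passed to test_error() in
    # the __main__ block below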

    return (best_training_error*100.0, best_validation_loss*100., test_score*100., best_iter*batch_size, (end_time-start_time)/60)

def test_error(model_file):

    print((' test error on all NIST'))
    # load the model
    a = numpy.load(model_file)
    W1 = a['W1']
    W2 = a['W2']
    b1 = a['b1']
    b2 = a['b2']
    configuration = a['config']
    #configuration = [learning_rate, nb_max_exemples, nb_hidden, adaptive_lr]
    learning_rate = configuration[0]
    nb_max_exemples = configuration[1]
    nb_hidden = configuration[2]
    adaptive_lr = configuration[3]

    if(len(configuration) == 6):
        detection_mode = configuration[4]
        reduce_label = configuration[5]
    else:
        detection_mode = 0
        reduce_label = 0

    # define the batch size
    batch_size = 20
    # define the number of targets
    nb_targets = 62

    # create the mlp
    ishape = (32,32) # this is the size of NIST images

    # allocate symbolic variables for the data
    x = T.fmatrix()  # the data is presented as rasterized images
    y = T.lvector()  # the labels are presented as 1D vector of
                     # [long int] labels


    # construct the MLP classifier
    classifier = MLP( input=x,\
                      n_in=32*32,\
                      n_hidden=nb_hidden,\
                      n_out=nb_targets,
                      learning_rate=learning_rate,\
                      detection_mode=detection_mode)


    # set the weights in the model
    classifier.W1.value = W1
    classifier.b1.value = b1
    classifier.W2.value = W2
    classifier.b2.value = b2


    # compiling a theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function([x,y], classifier.errors(y))

    # test it on the test set

    # load NIST ALL
    dataset = datasets.nist_all()
    test_score = 0.
    temp = 0
    for xt,yt in dataset.test(batch_size):
        if reduce_label:
            yt[yt > 35] = yt[yt > 35] - 26
        test_score += test_model(xt,yt)
        temp = temp+1
    test_score /= temp

    print((' test error NIST ALL : %f %%') % (test_score*100.0))

    # load NIST DIGITS
    dataset = datasets.nist_digits()
    test_score = 0.
    temp = 0
    for xt,yt in dataset.test(batch_size):
        if reduce_label:
            yt[yt > 35] = yt[yt > 35] - 26
        test_score += test_model(xt,yt)
        temp = temp+1
    test_score /= temp

    print((' test error NIST digits : %f %%') % (test_score*100.0))

    # load NIST lower case
    dataset = datasets.nist_lower()
    test_score = 0.
    temp = 0
    for xt,yt in dataset.test(batch_size):
        if reduce_label:
            yt[yt > 35] = yt[yt > 35] - 26
        test_score += test_model(xt,yt)
        temp = temp+1
    test_score /= temp

    print((' test error NIST lower : %f %%') % (test_score*100.0))

    # load NIST upper case
    dataset = datasets.nist_upper()
    test_score = 0.
    temp = 0
    for xt,yt in dataset.test(batch_size):
        if reduce_label:
            yt[yt > 35] = yt[yt > 35] - 26
        test_score += test_model(xt,yt)
        temp = temp+1
    test_score /= temp

    print((' test error NIST upper : %f %%') % (test_score*100.0))

if __name__ == '__main__':
    '''
    mlp_full_nist( verbose = 1,\
                   adaptive_lr = 1,\
                   data_set=0,\
                   learning_rate=0.5,\
                   L1_reg = 0.00,\
                   L2_reg = 0.0001,\
                   nb_max_exemples=10000000,\
                   batch_size=20,\
                   nb_hidden = 500,\
                   nb_targets = 62,
                   tau=100000,\
                   lr_t2_factor=0.5)
    '''

    test_error('model.npy.npz')

def jobman_mlp_full_nist(state, channel):
    (train_error, validation_error, test_error, nb_exemples, time) = mlp_full_nist(learning_rate=state.learning_rate,\
                                                                                   nb_max_exemples=state.nb_max_exemples,\
                                                                                   nb_hidden=state.nb_hidden,\
                                                                                   adaptive_lr=state.adaptive_lr,\
                                                                                   tau=state.tau,\
                                                                                   verbose = state.verbose,\
                                                                                   lr_t2_factor=state.lr_t2_factor,\
                                                                                   detection_mode = state.detection_mode,\
                                                                                   reduce_label = state.reduce_label)
    state.train_error = train_error
    state.validation_error = validation_error
    state.test_error = test_error
    state.nb_exemples = nb_exemples
    state.time = time
    pylearn.version.record_versions(state, [theano,ift6266,pylearn])
    return channel.COMPLETE
