comparison baseline/mlp/ratio_classes/mlp_nist_ratio.py @ 357:9a7b74927f7d

modified MLP version for selecting the ratio of the main class
author Guillaume Sicard <guitch21@gmail.com>
date Thu, 22 Apr 2010 00:00:09 -0400
parents
children d8129a09ffb1
# -*- coding: utf-8 -*-
"""
This tutorial introduces the multilayer perceptron using Theano.

A multilayer perceptron is a logistic regressor where, instead of
feeding the input to the logistic regression, you insert an
intermediate layer, called the hidden layer, that has a nonlinear
activation function (usually tanh or sigmoid). One can use many such
hidden layers, making the architecture deep. The tutorial originally
tackles MNIST digit classification; this adapted version applies the
same model to NIST character data.

.. math::

    f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))),

References:

    - textbooks: "Pattern Recognition and Machine Learning" -
                 Christopher M. Bishop, section 5

TODO: recommended preprocessing, lr ranges, regularization ranges (explain
      to do lr first, then add regularization)

"""
__docformat__ = 'restructedtext en'

import ift6266
from scripts import setup_batches
import pdb
import numpy

import theano
import theano.tensor as T
import time
import theano.tensor.nnet
import pylearn
import theano,pylearn.version
from pylearn.io import filetensor as ft

data_path = '/data/lisa/data/nist/by_class/'

class MLP(object):
    """Multi-Layer Perceptron Class

    A multilayer perceptron is a feedforward artificial neural network model
    that has one or more layers of hidden units with nonlinear activations.
    Intermediate layers usually use tanh or the sigmoid function as their
    activation function, while the top layer is a softmax layer.
    """



    def __init__(self, input, n_in, n_hidden, n_out, learning_rate):
        """Initialize the parameters for the multilayer perceptron

        :param input: symbolic variable that describes the input of the
        architecture (one minibatch)

        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :param n_hidden: number of hidden units

        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        :param learning_rate: initial learning rate, kept in a shared
        variable so it can be adapted on the fly during training

        """

        # initialize the parameters theta = (W1,b1,W2,b2) ; note that this
        # example contains only one hidden layer, but one can have as many
        # layers as desired, making the network deeper. The only problem
        # with making the network deep this way shows up during learning:
        # backpropagation may be unable to move the network away from the
        # starting point towards a good solution; this is where pre-training
        # helps, giving a good starting point for backpropagation, but more
        # about this in the other tutorials

        # `W1` is initialized with `W1_values`, which is uniformly sampled
        # between -6./sqrt(n_in+n_hidden) and 6./sqrt(n_in+n_hidden);
        # the output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU
        W1_values = numpy.asarray( numpy.random.uniform( \
              low = -numpy.sqrt(6./(n_in+n_hidden)), \
              high = numpy.sqrt(6./(n_in+n_hidden)), \
              size = (n_in, n_hidden)), dtype = theano.config.floatX)
        # `W2` is initialized with `W2_values`, which is uniformly sampled
        # between -6./sqrt(n_hidden+n_out) and 6./sqrt(n_hidden+n_out);
        # the output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU
        W2_values = numpy.asarray( numpy.random.uniform(
              low = -numpy.sqrt(6./(n_hidden+n_out)), \
              high= numpy.sqrt(6./(n_hidden+n_out)),\
              size= (n_hidden, n_out)), dtype = theano.config.floatX)

        self.W1 = theano.shared( value = W1_values )
        self.b1 = theano.shared( value = numpy.zeros((n_hidden,),
                                                dtype= theano.config.floatX))
        self.W2 = theano.shared( value = W2_values )
        self.b2 = theano.shared( value = numpy.zeros((n_out,),
                                                dtype= theano.config.floatX))

        # include the learning rate in the classifier so
        # we can modify it on the fly when we want
        lr_value=learning_rate
        self.lr=theano.shared(value=lr_value)
        # symbolic expression computing the values of the hidden layer
        self.hidden = T.tanh(T.dot(input, self.W1)+ self.b1)



        # symbolic expression computing the values of the top layer
        self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden, self.W2)+self.b2)

        # compute prediction as class whose probability is maximal in
        # symbolic form
        self.y_pred = T.argmax( self.p_y_given_x, axis =1)
        self.y_pred_num = T.argmax( self.p_y_given_x[0:9], axis =1)
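        # note: the slice above takes the first rows of the minibatch rather
        # than the first class columns, and `y_pred_num` is not used anywhere
        # else in this file; it is kept as-is from the original code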




        # L1 norm ; one regularization option is to enforce the L1 norm to
        # be small
        self.L1 = abs(self.W1).sum() + abs(self.W2).sum()

        # square of L2 norm ; one regularization option is to enforce the
        # square of the L2 norm to be small
        self.L2_sqr = (self.W1**2).sum() + (self.W2**2).sum()



    def negative_log_likelihood(self, y):
        r"""Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        .. math::

            \ell (\theta=\{W,b\}, \mathcal{D}) =
            -\frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
                \log P(Y=y^{(i)} | x^{(i)}, W, b)

        :param y: corresponds to a vector that gives, for each example,
        the correct label
        """
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])




    def errors(self, y):
        """Return a float representing the fraction of errors in the
        minibatch, i.e. the number of errors over the total number of
        examples in the minibatch
        """

        # check if y has the same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type))
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()


def mlp_full_nist( verbose = False,\
                   adaptive_lr = 1,\
                   train_data = 'all/all_train_data.ft',\
                   train_labels = 'all/all_train_labels.ft',\
                   test_data = 'all/all_test_data.ft',\
                   test_labels = 'all/all_test_labels.ft',\
                   learning_rate=0.5,\
                   L1_reg = 0.00,\
                   L2_reg = 0.0001,\
                   nb_max_exemples=1000000,\
                   batch_size=20,\
                   nb_hidden = 500,\
                   nb_targets = 62,\
                   tau=1e6,\
                   main_class="d",\
                   start_ratio=1,\
                   end_ratio=1):
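    # `main_class`, `start_ratio` and `end_ratio` are forwarded to
    # setup_batches.Batches.set_batches() below and control the proportion
    # of the main class in the generated batches (the point of this modified
    # version); the exact semantics of the ratio values are defined in the
    # setup_batches script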


    configuration = [learning_rate,nb_max_exemples,nb_hidden,adaptive_lr]

    # save initial learning rate if classical adaptive lr is used
    initial_lr=learning_rate

    total_validation_error_list = []
    total_train_error_list = []
    learning_rate_list=[]
    best_training_error=float('inf')

    # set up batches
    batches = setup_batches.Batches()
    batches.set_batches(main_class, start_ratio,end_ratio,batch_size,verbose)

    train_batches = batches.get_train_batches()
    test_batches = batches.get_test_batches()
    validation_batches = batches.get_validation_batches()

    ishape = (32,32)  # this is the size of NIST images

    # allocate symbolic variables for the data
    x = T.fmatrix()   # the data is presented as rasterized images
    y = T.lvector()   # the labels are presented as a 1D vector of
                      # [long int] labels

    if verbose==True:
        print 'finished parsing the data'
    # construct the logistic regression class
    classifier = MLP( input=x.reshape((batch_size,32*32)),\
                        n_in=32*32,\
                        n_hidden=nb_hidden,\
                        n_out=nb_targets,
                        learning_rate=learning_rate)




    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = classifier.negative_log_likelihood(y) \
         + L1_reg * classifier.L1 \
         + L2_reg * classifier.L2_sqr

    # compiling a theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function([x,y], classifier.errors(y))

    # compute the gradient of cost with respect to theta = (W1, b1, W2, b2)
    g_W1 = T.grad(cost, classifier.W1)
    g_b1 = T.grad(cost, classifier.b1)
    g_W2 = T.grad(cost, classifier.W2)
    g_b2 = T.grad(cost, classifier.b2)

    # specify how to update the parameters of the model as a dictionary
    updates = \
        { classifier.W1: classifier.W1 - classifier.lr*g_W1 \
        , classifier.b1: classifier.b1 - classifier.lr*g_b1 \
        , classifier.W2: classifier.W2 - classifier.lr*g_W2 \
        , classifier.b2: classifier.b2 - classifier.lr*g_b2 }
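    # the dictionary above implements plain stochastic gradient descent:
    # each parameter p is updated as p <- p - lr * dcost/dp, with `lr` read
    # from the shared variable so the adaptive schemes below take effect
    # immediately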

    # compiling a theano function `train_model` that returns the cost, but
    # at the same time updates the parameters of the model based on the
    # rules defined in `updates`
    train_model = theano.function([x, y], cost, updates = updates )
    n_minibatches = len(train_batches)






    # conditions for stopping the adaptation:
    # 1) we have reached nb_max_exemples (this is rounded up to be a multiple of the train size)
    # 2) validation error is going up twice in a row (probable overfitting)

    # This means we no longer stop on slow convergence, since with low
    # learning rates that criterion triggered too early.

    # no longer relevant
    patience              = nb_max_exemples/batch_size
    patience_increase     = 2     # wait this much longer when a new best is
                                  # found
    improvement_threshold = 0.995 # a relative improvement of this much is
                                  # considered significant
    validation_frequency  = n_minibatches/4




    best_params          = None
    best_validation_loss = float('inf')
    best_iter            = 0
    test_score           = 0.
    start_time = time.clock()
    n_iter = nb_max_exemples/batch_size  # max number of times we are allowed
                                         # to run through all examples
    n_iter = n_iter/n_minibatches + 1    # round up
    n_iter = max(1,n_iter)               # run at least once on a short debug call
    time_n = 0  # in units of examples



    if verbose == True:
        print 'looping at most %d times through the data set' %n_iter
    for iter in xrange(n_iter* n_minibatches):

        # get epoch and minibatch index
        epoch           = iter / n_minibatches
        minibatch_index = iter % n_minibatches


        if adaptive_lr==2:
            classifier.lr.value = tau*initial_lr/(tau+time_n)
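            # this is a 1/t decay schedule: the learning rate equals
            # initial_lr when time_n == 0 and falls to initial_lr/2 once
            # time_n (counted in examples seen) reaches tau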

        # get the minibatch corresponding to `iter` modulo
        # `len(train_batches)`
        x,y = train_batches[ minibatch_index ]
        # scale pixel values from [0, 255] to [0.0, 1.0]
        x_float = x/255.0
        cost_ij = train_model(x_float,y)

        if (iter+1) % validation_frequency == 0:
            # compute zero-one loss on the validation set

            this_validation_loss = 0.
            for x,y in validation_batches:
                # sum up the errors for each minibatch
                x_float = x/255.0
                this_validation_loss += test_model(x_float,y)
            # get the average by dividing by the number of minibatches
            this_validation_loss /= len(validation_batches)
            # save the validation loss
            total_validation_error_list.append(this_validation_loss)

            # get the training error rate
            this_train_loss=0
            for x,y in train_batches:
                # sum up the errors for each minibatch
                x_float = x/255.0
                this_train_loss += test_model(x_float,y)
            # get the average by dividing by the number of minibatches
            this_train_loss /= len(train_batches)
            # save the training loss
            total_train_error_list.append(this_train_loss)
            if(this_train_loss<best_training_error):
                best_training_error=this_train_loss

            if verbose == True:
                print('epoch %i, minibatch %i/%i, validation error %f %%, training error %f %%' % \
                        (epoch, minibatch_index+1, n_minibatches, \
                            this_validation_loss*100.,this_train_loss*100.))
                print 'learning rate = %f' %classifier.lr.value
                print 'time = %i' %time_n


            # save the learning rate
            learning_rate_list.append(classifier.lr.value)


            # if we got the best validation score until now
            if this_validation_loss < best_validation_loss:
                # save best validation score and iteration number
                best_validation_loss = this_validation_loss
                best_iter = iter
                # reset patience if we are going down again
                # so we continue exploring
                patience=nb_max_exemples/batch_size
                # test it on the test set
                test_score = 0.
                for x,y in test_batches:
                    x_float=x/255.0
                    test_score += test_model(x_float,y)
                test_score /= len(test_batches)
                if verbose == True:
                    print(('     epoch %i, minibatch %i/%i, test error of best '
                        'model %f %%') %
                                (epoch, minibatch_index+1, n_minibatches,
                                test_score*100.))

            # if the validation error is going up, we are overfitting (or
            # oscillating); stop converging but run at least to the next
            # validation to check for overfitting or oscillation
            # (the saved weights of the model will be a bit off in that case)
            elif this_validation_loss >= best_validation_loss:
                # calculate the test error at this point and exit
                # test it on the test set
                # however, if adaptive_lr is on, try reducing the lr to
                # get us out of an oscillation
                if adaptive_lr==1:
                    classifier.lr.value=classifier.lr.value/2.0

                test_score = 0.
                # cap the patience so we are allowed one more validation error
                # calculation before aborting
                patience = iter+validation_frequency+1
                for x,y in test_batches:
                    x_float=x/255.0
                    test_score += test_model(x_float,y)
                test_score /= len(test_batches)
                if verbose == True:
                    print ' validation error is going up, possibly stopping soon'
                    print(('     epoch %i, minibatch %i/%i, test error of best '
                        'model %f %%') %
                                (epoch, minibatch_index+1, n_minibatches,
                                test_score*100.))




        if iter>patience:
            print 'we have diverged'
            break


        time_n= time_n + batch_size
    end_time = time.clock()
    if verbose == True:
        print(('Optimization complete. Best validation score of %f %% '
            'obtained at iteration %i, with test performance %f %%') %
                    (best_validation_loss * 100., best_iter, test_score*100.))
        print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
        print iter

    # save the model and the weights
    numpy.savez('model.npy', config=configuration, W1=classifier.W1.value, W2=classifier.W2.value,
                b1=classifier.b1.value, b2=classifier.b2.value)
    numpy.savez('results.npy', config=configuration,
                total_train_error_list=total_train_error_list,
                total_validation_error_list=total_validation_error_list,
                learning_rate_list=learning_rate_list)

    return (best_training_error*100.0, best_validation_loss*100., test_score*100.,
            best_iter*batch_size, (end_time-start_time)/60)


if __name__ == '__main__':
    mlp_full_nist(True)

def jobman_mlp_full_nist(state,channel):
    (train_error,validation_error,test_error,nb_exemples,time)=mlp_full_nist(learning_rate=state.learning_rate,\
                                                                             nb_max_exemples=state.nb_max_exemples,\
                                                                             nb_hidden=state.nb_hidden,\
                                                                             adaptive_lr=state.adaptive_lr,\
                                                                             tau=state.tau,\
                                                                             main_class=state.main_class,\
                                                                             start_ratio=state.start_ratio,\
                                                                             end_ratio=state.end_ratio)
    state.train_error=train_error
    state.validation_error=validation_error
    state.test_error=test_error
    state.nb_exemples=nb_exemples
    state.time=time
    return channel.COMPLETE
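

# A minimal usage sketch (not part of the original experiment scripts): the
# ratio values below are hypothetical, chosen only to illustrate how the
# main-class ratio selection is exposed; running it requires the NIST data
# and the setup_batches script to be available.
def example_ratio_run():
    return mlp_full_nist(verbose=True,
                         learning_rate=0.1,
                         nb_max_exemples=100000,
                         nb_hidden=500,
                         adaptive_lr=1,
                         main_class="d",
                         start_ratio=0.3,   # hypothetical starting ratio of the main class
                         end_ratio=0.7)     # hypothetical ending ratio of the main class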