comparison baseline/mlp/mlp_nist.py @ 169:d37c944133c3

directory name change
author Dumitru Erhan <dumitru.erhan@gmail.com>
date Fri, 26 Feb 2010 14:24:11 -0500
parents baseline_algorithms/mlp/mlp_nist.py@8ceaaf812891
children e390b0454515
1 """
2 This tutorial introduces the multilayer perceptron using Theano.
3
4 A multilayer perceptron is a logistic regressor where
5 instead of feeding the input to the logistic regression you insert a
6 intermidiate layer, called the hidden layer, that has a nonlinear
7 activation function (usually tanh or sigmoid) . One can use many such
8 hidden layers making the architecture deep. The tutorial will also tackle
9 the problem of MNIST digit classification.
10
11 .. math::
12
13 f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))),
14
15 References:
16
17 - textbooks: "Pattern Recognition and Machine Learning" -
18 Christopher M. Bishop, section 5
19
20 TODO: recommended preprocessing, lr ranges, regularization ranges (explain
21 to do lr first, then add regularization)
22
23 """
24 __docformat__ = 'restructedtext en'
25
26 import pdb
27 import numpy
28 import pylab
29 import theano
30 import theano.tensor as T
31 import time
32 import theano.tensor.nnet
33 import pylearn
34 from pylearn.io import filetensor as ft
35
36 data_path = '/data/lisa/data/nist/by_class/'
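
# Illustration only (a hedged sketch, not used by the training code below):
# a plain-numpy version of the forward pass f(x) = G(b2 + W2 s(b1 + W1 x))
# given in the module docstring, with s = tanh and G = softmax. The function
# name and argument layout are assumptions made for this example.
def _forward_pass_sketch(x, W1, b1, W2, b2):
    # hidden layer: s(b1 + W1 x), computed row-wise for a minibatch x
    hidden = numpy.tanh(numpy.dot(x, W1) + b1)
    # output layer: softmax(b2 + W2 h), with the usual max-subtraction
    # for numerical stability
    scores = numpy.dot(hidden, W2) + b2
    scores = scores - scores.max(axis=1)[:, numpy.newaxis]
    e = numpy.exp(scores)
    return e / e.sum(axis=1)[:, numpy.newaxis]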

class MLP(object):
    """Multi-Layer Perceptron Class

    A multilayer perceptron is a feedforward artificial neural network model
    that has one or more layers of hidden units with nonlinear activations.
    Intermediate layers usually use tanh or the sigmoid function as their
    activation, while the top layer is a softmax layer.
    """


    def __init__(self, input, n_in, n_hidden, n_out, learning_rate):
        """Initialize the parameters for the multilayer perceptron

        :param input: symbolic variable that describes the input of the
        architecture (one minibatch)

        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :param n_hidden: number of hidden units

        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        """

        # initialize the parameters theta = (W1,b1,W2,b2) ; note that this
        # example contains only one hidden layer, but one can have as many
        # layers as desired, making the network deeper. The only problem
        # with making the network deep this way shows up during learning:
        # backpropagation alone may be unable to move the network far from
        # its starting point towards a good solution; this is where
        # pre-training helps, giving a good starting point for
        # backpropagation, but more about this in the other tutorials

        # `W1` is initialized with `W1_values`, which is uniformly sampled
        # between -6./sqrt(n_in+n_hidden) and 6./sqrt(n_in+n_hidden);
        # the output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU
        W1_values = numpy.asarray( numpy.random.uniform( \
              low = -numpy.sqrt(6./(n_in+n_hidden)), \
              high = numpy.sqrt(6./(n_in+n_hidden)), \
              size = (n_in, n_hidden)), dtype = theano.config.floatX)
        # `W2` is initialized with `W2_values`, which is uniformly sampled
        # between -6./sqrt(n_hidden+n_out) and 6./sqrt(n_hidden+n_out);
        # the output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU
        W2_values = numpy.asarray( numpy.random.uniform(
              low = -numpy.sqrt(6./(n_hidden+n_out)), \
              high = numpy.sqrt(6./(n_hidden+n_out)), \
              size = (n_hidden, n_out)), dtype = theano.config.floatX)
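        # For example, with the defaults used later in this file
        # (n_in = 32*32 = 1024, n_hidden = 500, n_out = 62), the sampling
        # bounds are roughly +/- sqrt(6/1524) ~= 0.063 for W1 and
        # +/- sqrt(6/562) ~= 0.103 for W2.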

        self.W1 = theano.shared( value = W1_values )
        self.b1 = theano.shared( value = numpy.zeros((n_hidden,),
                                                dtype = theano.config.floatX))
        self.W2 = theano.shared( value = W2_values )
        self.b2 = theano.shared( value = numpy.zeros((n_out,),
                                                dtype = theano.config.floatX))

        # include the learning rate in the classifier so
        # we can modify it on the fly when we want
        lr_value = learning_rate
        self.lr = theano.shared(value = lr_value)
        # symbolic expression computing the values of the hidden layer
        self.hidden = T.tanh(T.dot(input, self.W1) + self.b1)


        # symbolic expression computing the values of the top layer
        self.p_y_given_x = T.nnet.softmax(T.dot(self.hidden, self.W2) + self.b2)

        # compute the prediction as the class whose probability is maximal,
        # in symbolic form
        self.y_pred = T.argmax(self.p_y_given_x, axis = 1)
        self.y_pred_num = T.argmax(self.p_y_given_x[0:9], axis = 1)
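        # note: `y_pred_num` takes the argmax over a slice of `p_y_given_x`
        # (its first rows only); it is not used anywhere else in this file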


        # L1 norm ; one regularization option is to enforce the L1 norm to
        # be small
        self.L1 = abs(self.W1).sum() + abs(self.W2).sum()

        # square of the L2 norm ; one regularization option is to enforce
        # the square of the L2 norm to be small
        self.L2_sqr = (self.W1**2).sum() + (self.W2**2).sum()



    def negative_log_likelihood(self, y):
        r"""Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        .. math::

            \ell(\theta=\{W,b\}, \mathcal{D}) =
            -\frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
                \log P(Y=y^{(i)} | x^{(i)}, W, b)

        :param y: corresponds to a vector that gives, for each example, the
                  correct label
        """
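        # `p_y_given_x` has one row per example and one column per class;
        # indexing with [T.arange(y.shape[0]), y] picks, for each row i, the
        # probability assigned to the correct class y[i], so the expression
        # below is the mean negative log-probability of the correct labels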
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])



    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch
        """

        # check if y has the same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type))
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()


def mlp_full_nist( verbose = False,\
                   adaptive_lr = 0,\
                   train_data = 'all/all_train_data.ft',\
                   train_labels = 'all/all_train_labels.ft',\
                   test_data = 'all/all_test_data.ft',\
                   test_labels = 'all/all_test_labels.ft',\
                   learning_rate=0.01,\
                   L1_reg = 0.00,\
                   L2_reg = 0.0001,\
                   nb_max_exemples=1000000,\
                   batch_size=20,\
                   nb_hidden = 500,\
                   nb_targets = 62):


    configuration = [learning_rate,nb_max_exemples,nb_hidden,adaptive_lr]

    total_validation_error_list = []
    total_train_error_list = []
    learning_rate_list = []
    best_training_error = float('inf')



    f = open(data_path+train_data)
    g = open(data_path+train_labels)
    h = open(data_path+test_data)
    i = open(data_path+test_labels)

    raw_train_data = ft.read(f)
    raw_train_labels = ft.read(g)
    raw_test_data = ft.read(h)
    raw_test_labels = ft.read(i)

    f.close()
    g.close()
    i.close()
    h.close()
    # create a validation set the same size as the test set, using the end
    # of the training array for this purpose; discard the last remaining
    # examples so that each set size is a multiple of batch_size
    test_size = len(raw_test_labels)
    test_size = int(test_size/batch_size)
    test_size *= batch_size
    train_size = len(raw_train_data)
    train_size = int(train_size/batch_size)
    train_size *= batch_size
    validation_size = test_size
    offset = train_size-test_size
    if verbose == True:
        print 'train size = %d' %train_size
        print 'test size = %d' %test_size
        print 'valid size = %d' %validation_size
        print 'offset = %d' %offset


    train_set = (raw_train_data,raw_train_labels)
    train_batches = []
    for i in xrange(0, train_size-test_size, batch_size):
        train_batches = train_batches + \
            [(raw_train_data[i:i+batch_size], raw_train_labels[i:i+batch_size])]

    test_batches = []
    for i in xrange(0, test_size, batch_size):
        test_batches = test_batches + \
            [(raw_test_data[i:i+batch_size], raw_test_labels[i:i+batch_size])]

    validation_batches = []
    for i in xrange(0, test_size, batch_size):
        validation_batches = validation_batches + \
            [(raw_train_data[offset+i:offset+i+batch_size], raw_train_labels[offset+i:offset+i+batch_size])]
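    # at this point: training minibatches come from the first
    # train_size-test_size training examples, validation minibatches from
    # the last test_size training examples, and test minibatches from the
    # test files; the raw pixel values are divided by 255.0 right before
    # each call into the compiled Theano functions below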


    ishape = (32,32)  # this is the size of NIST images

    # allocate symbolic variables for the data
    x = T.fmatrix()  # the data is presented as rasterized images
    y = T.lvector()  # the labels are presented as a 1D vector of
                     # [long int] labels

    if verbose == True:
        print 'finished parsing the data'
    # construct the MLP classifier
    classifier = MLP( input=x.reshape((batch_size,32*32)),\
                        n_in=32*32,\
                        n_hidden=nb_hidden,\
                        n_out=nb_targets,
                        learning_rate=learning_rate)




    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = classifier.negative_log_likelihood(y) \
         + L1_reg * classifier.L1 \
         + L2_reg * classifier.L2_sqr

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function([x,y], classifier.errors(y))

    # compute the gradient of cost with respect to theta = (W1, b1, W2, b2)
    g_W1 = T.grad(cost, classifier.W1)
    g_b1 = T.grad(cost, classifier.b1)
    g_W2 = T.grad(cost, classifier.W2)
    g_b2 = T.grad(cost, classifier.b2)

    # specify how to update the parameters of the model as a dictionary
    updates = \
        { classifier.W1: classifier.W1 - classifier.lr*g_W1 \
        , classifier.b1: classifier.b1 - classifier.lr*g_b1 \
        , classifier.W2: classifier.W2 - classifier.lr*g_W2 \
        , classifier.b2: classifier.b2 - classifier.lr*g_b2 }
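    # each shared parameter is mapped to its plain SGD update
    # p := p - lr * dcost/dp; because `lr` is itself a shared variable, the
    # adaptive schedule further down can shrink the step size at run time
    # without recompiling `train_model`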

    # compiling a Theano function `train_model` that returns the cost and at
    # the same time updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function([x, y], cost, updates = updates )
    n_minibatches = len(train_batches)




    # conditions for stopping the adaptation:
    # 1) we have reached nb_max_exemples (this is rounded up to be a
    #    multiple of the train size)
    # 2) the validation error goes up twice in a row (probable overfitting)

    # This means we no longer stop on slow convergence, as runs with low
    # learning rates were stopped too early by that criterion.

    # no longer relevant
    patience              = nb_max_exemples/batch_size
    patience_increase     = 2      # wait this much longer when a new best is
                                   # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency  = n_minibatches/4
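    # with the default arguments (nb_max_exemples=1000000, batch_size=20)
    # `patience` starts at 50000 minibatch updates, and the model is
    # evaluated on the validation set roughly four times per pass through
    # the training batches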



    best_params          = None
    best_validation_loss = float('inf')
    best_iter            = 0
    test_score           = 0.
    start_time = time.clock()
    n_iter = nb_max_exemples/batch_size  # max number of minibatch updates
    n_iter = n_iter/n_minibatches + 1    # round up to a whole number of passes
    n_iter = max(1,n_iter)               # run at least once on short debug call


    if verbose == True:
        print 'looping at most %d times through the data set' %n_iter
    for iter in xrange(n_iter* n_minibatches):

        # get epoch and minibatch index
        epoch           = iter / n_minibatches
        minibatch_index = iter % n_minibatches



        # get the minibatch corresponding to `iter` modulo
        # `len(train_batches)`
        x,y = train_batches[ minibatch_index ]
        # convert to float
        x_float = x/255.0
        cost_ij = train_model(x_float,y)

        if (iter+1) % validation_frequency == 0:
            # compute zero-one loss on the validation set

            this_validation_loss = 0.
            for x,y in validation_batches:
                # sum up the errors for each minibatch
                x_float = x/255.0
                this_validation_loss += test_model(x_float,y)
            # get the average by dividing by the number of minibatches
            this_validation_loss /= len(validation_batches)
            # save the validation loss
            total_validation_error_list.append(this_validation_loss)

            # get the training error rate
            this_train_loss = 0
            for x,y in train_batches:
                # sum up the errors for each minibatch
                x_float = x/255.0
                this_train_loss += test_model(x_float,y)
            # get the average by dividing by the number of minibatches
            this_train_loss /= len(train_batches)
            # save the training loss
            total_train_error_list.append(this_train_loss)
            if this_train_loss < best_training_error:
                best_training_error = this_train_loss

            if verbose == True:
                print('epoch %i, minibatch %i/%i, validation error %f %%, training error %f %%' % \
                        (epoch, minibatch_index+1, n_minibatches, \
                            this_validation_loss*100.,this_train_loss*100))


            # save the learning rate
            learning_rate_list.append(classifier.lr.value)


            # if we got the best validation score until now
            if this_validation_loss < best_validation_loss:
                # save the best validation score and iteration number
                best_validation_loss = this_validation_loss
                best_iter = iter
                # reset patience if we are going down again
                # so we continue exploring
                patience = nb_max_exemples/batch_size
                # test it on the test set
                test_score = 0.
                for x,y in test_batches:
                    x_float = x/255.0
                    test_score += test_model(x_float,y)
                test_score /= len(test_batches)
                if verbose == True:
                    print((' epoch %i, minibatch %i/%i, test error of best '
                        'model %f %%') %
                                (epoch, minibatch_index+1, n_minibatches,
                                    test_score*100.))

            # if the validation error is going up, we are overfitting (or
            # oscillating); stop converging, but run at least to the next
            # validation to check for overfitting or oscillation;
            # the saved weights of the model will be a bit off in that case
            elif this_validation_loss >= best_validation_loss:
                # calculate the test error at this point and exit;
                # however, if adaptive_lr is true, first try reducing the lr
                # to get us out of an oscillation
                if adaptive_lr == 1:
                    classifier.lr.value = classifier.lr.value/2.0
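                    # with the default learning_rate=0.01 this walks the
                    # shared rate through 0.005, 0.0025, ... on successive
                    # validation set-backs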

                test_score = 0.
                # cap the patience so we are allowed one more validation
                # error calculation before aborting
                patience = iter+validation_frequency+1
                for x,y in test_batches:
                    x_float = x/255.0
                    test_score += test_model(x_float,y)
                test_score /= len(test_batches)
                if verbose == True:
                    print ' validation error is going up, possibly stopping soon'
                    print((' epoch %i, minibatch %i/%i, test error of best '
                        'model %f %%') %
                                (epoch, minibatch_index+1, n_minibatches,
                                    test_score*100.))



        if iter > patience:
            print 'we have diverged'
            break


    end_time = time.clock()
    if verbose == True:
        print(('Optimization complete. Best validation score of %f %% '
            'obtained at iteration %i, with test performance %f %%') %
                    (best_validation_loss * 100., best_iter, test_score*100.))
        print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
        print iter

    # save the model and the weights
    numpy.savez('model.npy', config=configuration, W1=classifier.W1.value, W2=classifier.W2.value,
                b1=classifier.b1.value, b2=classifier.b2.value)
    numpy.savez('results.npy', config=configuration, total_train_error_list=total_train_error_list,
                total_validation_error_list=total_validation_error_list,
                learning_rate_list=learning_rate_list)

    return (best_training_error*100.0, best_validation_loss * 100., test_score*100.,
            best_iter*batch_size, (end_time-start_time)/60)


if __name__ == '__main__':
    mlp_full_nist()
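    # An example invocation with explicit hyperparameters (all keyword
    # arguments below already exist in mlp_full_nist's signature):
    #
    #   mlp_full_nist(verbose=True, adaptive_lr=1, learning_rate=0.01,
    #                 nb_max_exemples=1000000, batch_size=20, nb_hidden=500)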

def jobman_mlp_full_nist(state,channel):
    (train_error,validation_error,test_error,nb_exemples,time) = \
        mlp_full_nist(learning_rate=state.learning_rate,\
                      nb_max_exemples=state.nb_max_exemples,\
                      nb_hidden=state.nb_hidden,\
                      adaptive_lr=state.adaptive_lr)
    state.train_error = train_error
    state.validation_error = validation_error
    state.test_error = test_error
    state.nb_exemples = nb_exemples
    state.time = time
    return channel.COMPLETE