"""
This tutorial introduces the multilayer perceptron using Theano.

A multilayer perceptron is a logistic regression classifier in which,
instead of feeding the input directly to the logistic regression, you insert
an intermediate layer, called the hidden layer, that has a nonlinear
activation function (usually tanh or sigmoid). One can use many such
hidden layers, making the architecture deep. The tutorial also tackles
the problem of NIST character classification.

.. math::

    f(x) = G( b^{(2)} + W^{(2)} ( s( b^{(1)} + W^{(1)} x ) ) ),

References:

    - textbooks: "Pattern Recognition and Machine Learning" -
                 Christopher M. Bishop, section 5

TODO: recommended preprocessing, lr ranges, regularization ranges (explain
      to do lr first, then add regularization)

"""
__docformat__ = 'restructuredtext en'

import pdb
import numpy
import pylab
import theano
import theano.tensor as T
import time
import theano.tensor.nnet
from pylearn.io import filetensor as ft

data_path = '/data/lisa/data/nist/by_class/'

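# Illustrative sketch (not part of the original pipeline and not used by the
# code below): the forward pass described in the module docstring,
# f(x) = softmax(b^{(2)} + W^{(2)} tanh(b^{(1)} + W^{(1)} x)), written in
# plain numpy. The helper name and argument order are hypothetical.
def _numpy_mlp_forward(x, W1, b1, W2, b2):
    """Return class probabilities for a minibatch `x`, one row per example."""
    hidden = numpy.tanh(numpy.dot(x, W1) + b1)      # hidden layer activations
    act = numpy.dot(hidden, W2) + b2                # pre-softmax activations
    act = act - act.max(axis=1)[:, numpy.newaxis]   # stabilize the exponential
    e = numpy.exp(act)
    return e / e.sum(axis=1)[:, numpy.newaxis]      # rows sum to one
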

class MLP(object):
    """Multi-Layer Perceptron Class

    A multilayer perceptron is a feedforward artificial neural network model
    that has one or more layers of hidden units with nonlinear activations.
    Intermediate layers usually use the tanh or sigmoid activation function,
    while the top layer is a softmax layer.
    """

    def __init__(self, input, n_in, n_hidden, n_out):
        """Initialize the parameters for the multilayer perceptron

        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :param n_hidden: number of hidden units

        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # initialize the parameters theta = (W1, b1, W2, b2); note that this
        # example contains only one hidden layer, but one can add as many
        # layers as desired, making the network deeper. The only problem with
        # making the network deep this way is that learning becomes harder:
        # backpropagation may be unable to move the network away from its
        # starting point towards a good solution. This is where pre-training
        # helps, by providing a good starting point for backpropagation, but
        # more about this in the other tutorials.

        # `W1` is initialized with `W1_values`, which is uniformly sampled
        # from -6./sqrt(n_in+n_hidden) to 6./sqrt(n_in+n_hidden);
        # the output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU
        W1_values = numpy.asarray(numpy.random.uniform(
              low=-numpy.sqrt(6. / (n_in + n_hidden)),
              high=numpy.sqrt(6. / (n_in + n_hidden)),
              size=(n_in, n_hidden)), dtype=theano.config.floatX)
        # `W2` is initialized with `W2_values`, which is uniformly sampled
        # from -6./sqrt(n_hidden+n_out) to 6./sqrt(n_hidden+n_out);
        # the output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU
        W2_values = numpy.asarray(numpy.random.uniform(
              low=-numpy.sqrt(6. / (n_hidden + n_out)),
              high=numpy.sqrt(6. / (n_hidden + n_out)),
              size=(n_hidden, n_out)), dtype=theano.config.floatX)

        self.W1 = theano.shared(value=W1_values)
        self.b1 = theano.shared(value=numpy.zeros((n_hidden,),
                                                  dtype=theano.config.floatX))
        self.W2 = theano.shared(value=W2_values)
        self.b2 = theano.shared(value=numpy.zeros((n_out,),
                                                  dtype=theano.config.floatX))

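        # Note (added for context): the +/- sqrt(6./(fan_in+fan_out)) range
        # used above is the usual heuristic for tanh hidden units; for
        # sigmoid hidden units a range about four times larger is commonly
        # recommended.
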
        # symbolic expression computing the values of the hidden layer
        self.hidden = T.tanh(T.dot(input, self.W1) + self.b1)

        # symbolic expression computing the values of the top layer
        self.p_y_given_x = T.nnet.softmax(T.dot(self.hidden, self.W2) + self.b2)

        # compute the prediction, in symbolic form, as the class whose
        # probability is maximal
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

        # L1 norm; one regularization option is to enforce the L1 norm to
        # be small
        self.L1 = abs(self.W1).sum() + abs(self.W2).sum()

        # square of the L2 norm; one regularization option is to enforce the
        # square of the L2 norm to be small
        self.L2_sqr = (self.W1 ** 2).sum() + (self.W2 ** 2).sum()

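        # In symbols (a note added for exposition), the two terms above are
        #   L1     = sum_ij |W1_ij| + sum_ij |W2_ij|
        #   L2_sqr = sum_ij W1_ij^2 + sum_ij W2_ij^2
        # and they enter the training cost scaled by L1_reg and L2_reg in
        # mlp_full_nist() below.
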

    def negative_log_likelihood(self, y):
        r"""Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        .. math::

            \ell (\theta=\{W,b\}, \mathcal{D}) =
              - \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
                \log P(Y=y^{(i)} | x^{(i)}, W, b)

        :param y: a vector that gives, for each example, the correct label
        """
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

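    # Illustrative only (a hypothetical sketch, not used elsewhere): the same
    # mean negative log-likelihood written with plain numpy arrays, to make
    # the advanced-indexing trick above explicit:
    #   log_p = numpy.log(p)                               # p: (batch, n_out)
    #   nll = -numpy.mean(log_p[numpy.arange(len(y)), y])  # pick, in each row,
    #                                                      # the true label's entry
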

    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch.
        """

        # check if y has the same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                            ('y', y.type, 'y_pred', self.y_pred.type))
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()


#def jobman_mlp(state,channel):
#    (validation_error,test_error,nb_exemples,time)=mlp_full_nist(state.learning_rate,\
#                                                                 state.n_iter,\
#                                                                 state.batch_size,\
#                                                                 state.nb_hidden_units)
#    state.validation_error = validation_error
#    state.test_error = test_error
#    state.nb_exemples = nb_exemples
#    state.time=time
#    return channel.COMPLETE


def mlp_full_nist(verbose=False,
                  train_data='all/all_train_data.ft',
                  train_labels='all/all_train_labels.ft',
                  test_data='all/all_test_data.ft',
                  test_labels='all/all_test_labels.ft',
                  learning_rate=0.01,
                  L1_reg=0.00,
                  L2_reg=0.0001,
                  nb_max_exemples=1000000,
                  batch_size=20,
                  nb_hidden=500,
                  nb_targets=62):

    f = open(data_path + train_data)
    g = open(data_path + train_labels)
    h = open(data_path + test_data)
    i = open(data_path + test_labels)

    raw_train_data = ft.read(f)
    raw_train_labels = ft.read(g)
    raw_test_data = ft.read(h)
    raw_test_labels = ft.read(i)

    f.close()
    g.close()
    i.close()
    h.close()

    # create a validation set of the same size as the test set,
    # using the end of the training array for this purpose;
    # discard the remainder so every size is a multiple of batch_size
    test_size = len(raw_test_labels)
    test_size = int(test_size / batch_size)
    test_size *= batch_size
    train_size = len(raw_train_data)
    train_size = int(train_size / batch_size)
    train_size *= batch_size
    validation_size = test_size
    offset = train_size - test_size
    if verbose:
        print 'train size = %d' % train_size
        print 'test size = %d' % test_size
        print 'valid size = %d' % validation_size
        print 'offset = %d' % offset


    train_set = (raw_train_data, raw_train_labels)
    train_batches = []
    for i in xrange(0, train_size - test_size, batch_size):
        train_batches = train_batches + \
            [(raw_train_data[i:i + batch_size], raw_train_labels[i:i + batch_size])]

    test_batches = []
    for i in xrange(0, test_size, batch_size):
        test_batches = test_batches + \
            [(raw_test_data[i:i + batch_size], raw_test_labels[i:i + batch_size])]

    validation_batches = []
    for i in xrange(0, test_size, batch_size):
        validation_batches = validation_batches + \
            [(raw_train_data[offset + i:offset + i + batch_size],
              raw_train_labels[offset + i:offset + i + batch_size])]

    ishape = (32, 32)  # this is the size of NIST images

    # allocate symbolic variables for the data
    x = T.fmatrix()  # the data is presented as rasterized images
    y = T.lvector()  # the labels are presented as a 1D vector of
                     # [long int] labels

    # construct the MLP classifier
    classifier = MLP(input=x.reshape((batch_size, 32 * 32)),
                     n_in=32 * 32,
                     n_hidden=nb_hidden,
                     n_out=nb_targets)

    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); the cost is
    # expressed here symbolically
    cost = classifier.negative_log_likelihood(y) \
        + L1_reg * classifier.L1 \
        + L2_reg * classifier.L2_sqr

    # compiling a theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function([x, y], classifier.errors(y))

    # compute the gradient of cost with respect to theta = (W1, b1, W2, b2)
    g_W1 = T.grad(cost, classifier.W1)
    g_b1 = T.grad(cost, classifier.b1)
    g_W2 = T.grad(cost, classifier.W2)
    g_b2 = T.grad(cost, classifier.b2)

    # specify how to update the parameters of the model as a dictionary
    updates = \
        {classifier.W1: classifier.W1 - learning_rate * g_W1,
         classifier.b1: classifier.b1 - learning_rate * g_b1,
         classifier.W2: classifier.W2 - learning_rate * g_W2,
         classifier.b2: classifier.b2 - learning_rate * g_b2}

    # compiling a theano function `train_model` that returns the cost and, at
    # the same time, updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function([x, y], cost, updates=updates)
    n_minibatches = len(train_batches)
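
    # A note added for exposition: each call
    #   cost_value = train_model(x_batch, y_batch)
    # evaluates the regularized cost on the minibatch and, in the same call,
    # applies one step of stochastic gradient descent,
    #   W1 <- W1 - learning_rate * g_W1,   b1 <- b1 - learning_rate * g_b1,
    #   W2 <- W2 - learning_rate * g_W2,   b2 <- b2 - learning_rate * g_b2.
    # `cost_value` and the batch names here are illustrative.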

    # conditions for stopping the adaptation:
    # 1) we have reached nb_max_exemples (rounded up to a multiple of the
    #    train size)
    # 2) the validation error is going up (probable overfitting)

    # This means we no longer stop on slow convergence, since with low
    # learning rates that criterion stopped training too early.
    patience = nb_max_exemples / batch_size
    patience_increase = 2           # wait this much longer when a new best is
                                    # found
    improvement_threshold = 0.995   # a relative improvement of this much is
                                    # considered significant
    validation_frequency = n_minibatches / 4
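
    # Worked example of the bookkeeping above (illustrative numbers only):
    # with the defaults nb_max_exemples=1000000 and batch_size=20, patience
    # starts at 50000 minibatch updates; if a significantly better validation
    # score is found at update 30000, patience grows to
    # max(50000, 30000 * 2) = 60000, so training can continue past the
    # initial budget as long as validation keeps improving.
    # validation_frequency = n_minibatches/4 means the validation error is
    # checked four times per epoch.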

    best_params = None
    best_validation_loss = float('inf')
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    n_iter = nb_max_exemples / batch_size  # max number of minibatch updates
    n_iter = n_iter / n_minibatches + 1    # converted to a number of epochs
    n_iter = max(1, n_iter)                # run at least once on short debug call
    # have a maximum of `n_iter` iterations through the entire dataset

    if verbose:
        print 'looping at most %d times through the data set' % n_iter
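
    # For concreteness (hypothetical numbers): with nb_max_exemples=1000000
    # and batch_size=20, the budget is 50000 updates; if the training set
    # yields 40000 minibatches, integer division gives
    # n_iter = 50000/40000 + 1 = 2 passes through the data.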
    for iter in xrange(n_iter * n_minibatches):

        # get epoch and minibatch index
        epoch = iter / n_minibatches
        minibatch_index = iter % n_minibatches

        # get the minibatch corresponding to `iter` modulo
        # `len(train_batches)`
        x, y = train_batches[minibatch_index]
        # convert the pixel values to floats in [0, 1]
        x_float = x / 255.0
        cost_ij = train_model(x_float, y)

        if (iter + 1) % validation_frequency == 0:
            # compute the zero-one loss on the validation set

            this_validation_loss = 0.
            for x, y in validation_batches:
                # sum up the errors for each minibatch
                x_float = x / 255.0
                this_validation_loss += test_model(x_float, y)
            # get the average by dividing by the number of minibatches
            this_validation_loss /= len(validation_batches)
            if verbose:
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_minibatches,
                       this_validation_loss * 100.))

            # if we got the best validation score until now
            if this_validation_loss < best_validation_loss:

                # improve patience if the loss improvement is good enough
                if this_validation_loss < best_validation_loss * improvement_threshold:
                    patience = max(patience, iter * patience_increase)
                elif verbose:
                    print 'validation improved, but not significantly'

                # save the best validation score and iteration number
                best_validation_loss = this_validation_loss
                best_iter = iter

                # test it on the test set
                test_score = 0.
                for x, y in test_batches:
                    x_float = x / 255.0
                    test_score += test_model(x_float, y)
                test_score /= len(test_batches)
                if verbose:
                    print((' epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index + 1, n_minibatches,
                           test_score * 100.))

            # if the validation error is going up, we are probably
            # overfitting; stop training
            elif this_validation_loss > best_validation_loss:
                # compute the test error at this point and exit
                if verbose:
                    print ' We are diverging'
                best_iter = iter
                # test it on the test set
                test_score = 0.
                for x, y in test_batches:
                    x_float = x / 255.0
                    test_score += test_model(x_float, y)
                test_score /= len(test_batches)
                if verbose:
                    print ' validation error is going up, stopping now'
                    print((' epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index + 1, n_minibatches,
                           test_score * 100.))

                break

        if patience <= iter:
            break

    end_time = time.clock()
    if verbose:
        print(('Optimization complete. Best validation score of %f %% '
               'obtained at iteration %i, with test performance %f %%') %
              (best_validation_loss * 100., best_iter, test_score * 100.))
        print ('The code ran for %f minutes' % ((end_time - start_time) / 60.))
    print iter
    return (best_validation_loss * 100., test_score * 100.,
            best_iter * batch_size, (end_time - start_time) / 60)


if __name__ == '__main__':
    mlp_full_nist()


def jobman_mlp_full_nist(state, channel):
    (validation_error, test_error, nb_exemples, time) = \
        mlp_full_nist(learning_rate=state.learning_rate,
                      nb_max_exemples=state.nb_max_exemples,
                      nb_hidden=state.nb_hidden)
    state.validation_error = validation_error
    state.test_error = test_error
    state.nb_exemples = nb_exemples
    state.time = time
    return channel.COMPLETE