baseline_algorithms/mlp/mlp_nist.py @ 110:93b4b84d86cf

added simple mlp file

author:   XavierMuller
date:     Tue, 16 Feb 2010 17:12:35 -0500
children: f341a4efb44a
1 """
2 This tutorial introduces the multilayer perceptron using Theano.
3
4 A multilayer perceptron is a logistic regressor where
5 instead of feeding the input to the logistic regression you insert a
6 intermidiate layer, called the hidden layer, that has a nonlinear
7 activation function (usually tanh or sigmoid) . One can use many such
8 hidden layers making the architecture deep. The tutorial will also tackle
9 the problem of MNIST digit classification.
10
11 .. math::
12
13 f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))),
14
15 References:
16
17 - textbooks: "Pattern Recognition and Machine Learning" -
18 Christopher M. Bishop, section 5
19
20 TODO: recommended preprocessing, lr ranges, regularization ranges (explain
21 to do lr first, then add regularization)
22
23 """
__docformat__ = 'restructuredtext en'

import pdb
import numpy
import pylab
import theano
import theano.tensor as T
import time
import theano.tensor.nnet
from pylearn.io import filetensor as ft

data_path = '/data/lisa/data/nist/by_class/'

class MLP(object):
    """Multi-Layer Perceptron Class

    A multilayer perceptron is a feedforward artificial neural network model
    that has one or more layers of hidden units and nonlinear activations.
    Intermediate layers usually have tanh or the sigmoid function as their
    activation, while the top layer is a softmax layer.
    """


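    # A typical instantiation, mirroring mlp_full_nist below:
    #   x = T.fmatrix()
    #   classifier = MLP(input=x.reshape((batch_size, 32*32)),
    #                    n_in=32*32, n_hidden=500, n_out=62)
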
    def __init__(self, input, n_in, n_hidden, n_out):
        """Initialize the parameters for the multilayer perceptron

        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :param n_hidden: number of hidden units

        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # initialize the parameters theta = (W1,b1,W2,b2) ; note that this
        # example contains only one hidden layer, but one can have as many
        # layers as desired, making the network deeper. The only problem
        # with making the network deep this way is during learning:
        # backpropagation may be unable to move the network from the
        # starting point towards a good solution; this is where pre-training
        # helps, giving a good starting point for backpropagation, but more
        # about this in the other tutorials

        # `W1` is initialized with `W1_values`, which is uniformly sampled
        # from -6./sqrt(n_in+n_hidden) to 6./sqrt(n_in+n_hidden);
        # the output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU
        W1_values = numpy.asarray( numpy.random.uniform( \
              low = -numpy.sqrt(6./(n_in+n_hidden)), \
              high = numpy.sqrt(6./(n_in+n_hidden)), \
              size = (n_in, n_hidden)), dtype = theano.config.floatX)
        # `W2` is initialized with `W2_values`, which is uniformly sampled
        # from -6./sqrt(n_hidden+n_out) to 6./sqrt(n_hidden+n_out);
        # the output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU
        W2_values = numpy.asarray( numpy.random.uniform(
              low = -numpy.sqrt(6./(n_hidden+n_out)), \
              high = numpy.sqrt(6./(n_hidden+n_out)), \
              size = (n_hidden, n_out)), dtype = theano.config.floatX)

        self.W1 = theano.shared( value = W1_values )
        self.b1 = theano.shared( value = numpy.zeros((n_hidden,),
                                            dtype = theano.config.floatX))
        self.W2 = theano.shared( value = W2_values )
        self.b2 = theano.shared( value = numpy.zeros((n_out,),
                                            dtype = theano.config.floatX))

        # symbolic expression computing the values of the hidden layer
        self.hidden = T.tanh(T.dot(input, self.W1) + self.b1)

        # symbolic expression computing the values of the top layer
        self.p_y_given_x = T.nnet.softmax(T.dot(self.hidden, self.W2) + self.b2)
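        # softmax turns each row of linear scores s into class probabilities:
        # p(y=j|x) = exp(s_j) / sum_k exp(s_k), applied row-wise over the
        # minibatch, so each row of p_y_given_x sums to one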

        # compute prediction as class whose probability is maximal in
        # symbolic form
        self.y_pred = T.argmax( self.p_y_given_x, axis = 1)

        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self.L1 = abs(self.W1).sum() + abs(self.W2).sum()

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = (self.W1**2).sum() + (self.W2**2).sum()


    def negative_log_likelihood(self, y):
        """Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        .. math::

            \ell(\theta=\{W,b\}, \mathcal{D}) =
                -\frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
                    \log P(Y=y^{(i)} | x^{(i)}, W, b)

        :param y: corresponds to a vector that gives for each example the
                  correct label
        """
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])


    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch
        """

        # check if y has the same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type))
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()

#def jobman_mlp(state,channel):
#    (validation_error,test_error,nb_exemples,time)=mlp_full_nist(state.learning_rate,\
#                                                                 state.n_iter,\
#                                                                 state.batch_size,\
#                                                                 state.nb_hidden_units)
#    state.validation_error = validation_error
#    state.test_error = test_error
#    state.nb_exemples = nb_exemples
#    state.time = time
#    return channel.COMPLETE


def mlp_full_nist( verbose = False,\
                   train_data = 'all/all_train_data.ft',\
                   train_labels = 'all/all_train_labels.ft',\
                   test_data = 'all/all_test_data.ft',\
                   test_labels = 'all/all_test_labels.ft',\
                   learning_rate = 0.01,\
                   L1_reg = 0.00,\
                   L2_reg = 0.0001,\
                   nb_max_exemples = 1000000,\
                   batch_size = 20,\
                   nb_hidden = 500,\
                   nb_targets = 62):
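    """Train a one-hidden-layer MLP with minibatch SGD on the NIST data.

    Reads the filetensor (.ft) files under data_path, carves a validation
    set out of the end of the training set, and returns the tuple
    (best validation error in %, test error in %, number of training
    examples seen at the best iteration, training time in minutes).
    """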

    f = open(data_path + train_data)
    g = open(data_path + train_labels)
    h = open(data_path + test_data)
    i = open(data_path + test_labels)

    raw_train_data = ft.read(f)
    raw_train_labels = ft.read(g)
    raw_test_data = ft.read(h)
    raw_test_labels = ft.read(i)

    f.close()
    g.close()
    i.close()
    h.close()

    # create a validation set the same size as the test set,
    # using the end of the training array for this purpose;
    # discard the remainder so each size is a multiple of batch_size
    test_size = len(raw_test_labels)
    test_size = int(test_size/batch_size)
    test_size *= batch_size
    train_size = len(raw_train_data)
    train_size = int(train_size/batch_size)
    train_size *= batch_size
    validation_size = test_size
    offset = train_size - test_size
    if verbose == True:
        print 'train size = %d' %train_size
        print 'test size = %d' %test_size
        print 'valid size = %d' %validation_size
        print 'offset = %d' %offset

    train_set = (raw_train_data, raw_train_labels)
    train_batches = []
    for i in xrange(0, train_size-test_size, batch_size):
        train_batches = train_batches + \
            [(raw_train_data[i:i+batch_size], raw_train_labels[i:i+batch_size])]

    test_batches = []
    for i in xrange(0, test_size, batch_size):
        test_batches = test_batches + \
            [(raw_test_data[i:i+batch_size], raw_test_labels[i:i+batch_size])]

    validation_batches = []
    for i in xrange(0, test_size, batch_size):
        validation_batches = validation_batches + \
            [(raw_train_data[offset+i:offset+i+batch_size], raw_train_labels[offset+i:offset+i+batch_size])]


    ishape = (32,32)  # this is the size of NIST images

    # allocate symbolic variables for the data
    x = T.fmatrix()  # the data is presented as rasterized images
    y = T.lvector()  # the labels are presented as a 1D vector of
                     # [long int] labels

    # construct the MLP class
    classifier = MLP( input = x.reshape((batch_size, 32*32)),\
                      n_in = 32*32,\
                      n_hidden = nb_hidden,\
                      n_out = nb_targets)

    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = classifier.negative_log_likelihood(y) \
         + L1_reg * classifier.L1 \
         + L2_reg * classifier.L2_sqr

    # compiling a theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function([x,y], classifier.errors(y))

    # compute the gradient of cost with respect to theta = (W1, b1, W2, b2)
    g_W1 = T.grad(cost, classifier.W1)
    g_b1 = T.grad(cost, classifier.b1)
    g_W2 = T.grad(cost, classifier.W2)
    g_b2 = T.grad(cost, classifier.b2)

    # specify how to update the parameters of the model as a dictionary
    updates = \
        { classifier.W1: classifier.W1 - learning_rate*g_W1 \
        , classifier.b1: classifier.b1 - learning_rate*g_b1 \
        , classifier.W2: classifier.W2 - learning_rate*g_W2 \
        , classifier.b2: classifier.b2 - learning_rate*g_b2 }
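    # each entry implements one step of stochastic gradient descent,
    # theta <- theta - learning_rate * d(cost)/d(theta), applied to every
    # parameter in theta = (W1, b1, W2, b2)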

    # compiling a theano function `train_model` that returns the cost, but at
    # the same time updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function([x, y], cost, updates = updates )
    n_minibatches = len(train_batches)


    # conditions for stopping the adaptation:
    # 1) we have reached nb_max_exemples (this is rounded up to be a multiple
    #    of the train size)
    # 2) validation error is going up (probable overfitting)

    # This means we no longer stop on slow convergence, as low learning rates
    # stopped too fast.
    patience = nb_max_exemples/batch_size
    patience_increase = 2           # wait this much longer when a new best is
                                    # found
    improvement_threshold = 0.995   # a relative improvement of this much is
                                    # considered significant
    validation_frequency = n_minibatches/4
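    # worked example with the defaults above: nb_max_exemples = 1000000 and
    # batch_size = 20 give an initial patience of 50000 minibatches, and the
    # validation error is measured four times per pass over the training set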


    best_params = None
    best_validation_loss = float('inf')
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    n_iter = nb_max_exemples/batch_size  # max number of minibatch updates
    n_iter = n_iter/n_minibatches + 1    # converted to passes through the data
    n_iter = max(1, n_iter)              # run at least once on short debug call
    # have a maximum of `n_iter` iterations through the entire dataset

    if verbose == True:
        print 'looping at most %d times through the data set' %n_iter
    for iter in xrange(n_iter * n_minibatches):

        # get epoch and minibatch index
        epoch = iter / n_minibatches
        minibatch_index = iter % n_minibatches

        # get the minibatches corresponding to `iter` modulo
        # `len(train_batches)`
        x, y = train_batches[ minibatch_index ]
        # scale the raw pixel values from [0, 255] to floats in [0, 1]
        x_float = x/255.0
        cost_ij = train_model(x_float, y)

        if (iter+1) % validation_frequency == 0:
            # compute zero-one loss on validation set

            this_validation_loss = 0.
            for x, y in validation_batches:
                # sum up the errors for each minibatch
                x_float = x/255.0
                this_validation_loss += test_model(x_float, y)
            # get the average by dividing by the number of minibatches
            this_validation_loss /= len(validation_batches)
            if verbose == True:
                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index+1, n_minibatches, \
                       this_validation_loss*100.))


            # if we got the best validation score until now
            if this_validation_loss < best_validation_loss:

                # improve patience if the loss improvement is good enough
                if this_validation_loss < best_validation_loss * \
                       improvement_threshold :
                    patience = max(patience, iter * patience_increase)
                elif verbose == True:
                    print 'improvement too small to increase patience'

                # save best validation score and iteration number
                best_validation_loss = this_validation_loss
                best_iter = iter

                # test it on the test set
                test_score = 0.
                for x, y in test_batches:
                    x_float = x/255.0
                    test_score += test_model(x_float, y)
                test_score /= len(test_batches)
                if verbose == True:
                    print((' epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index+1, n_minibatches,
                           test_score*100.))

            # if the validation error is going up, we are probably
            # overfitting; compute the test error at this point and exit
            elif this_validation_loss > best_validation_loss:
                if verbose == True:
                    print ' validation error is going up, stopping now'
                best_iter = iter
                test_score = 0.
                for x, y in test_batches:
                    x_float = x/255.0
                    test_score += test_model(x_float, y)
                test_score /= len(test_batches)
                if verbose == True:
                    print((' epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index+1, n_minibatches,
                           test_score*100.))

                break


        if patience <= iter :
            break

    end_time = time.clock()
    if verbose == True:
        print(('Optimization complete. Best validation score of %f %% '
               'obtained at iteration %i, with test performance %f %%') %
              (best_validation_loss * 100., best_iter, test_score*100.))
        print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
        print iter
    return (best_validation_loss * 100., test_score*100., best_iter*batch_size, (end_time-start_time)/60)


if __name__ == '__main__':
    mlp_full_nist()

def jobman_mlp_full_nist(state, channel):
    (validation_error, test_error, nb_exemples, time) = mlp_full_nist(learning_rate=state.learning_rate,\
                                                                      nb_max_exemples=state.nb_max_exemples,\
                                                                      nb_hidden=state.nb_hidden)
    state.validation_error = validation_error
    state.test_error = test_error
    state.nb_exemples = nb_exemples
    state.time = time
    return channel.COMPLETE
