comparison code_tutoriel/mlp.py @ 0:fda5f787baa6

initial commit
author Dumitru Erhan <dumitru.erhan@gmail.com>
date Thu, 21 Jan 2010 11:26:43 -0500
parents
children bcc87d3e33a3
1 """
2 This tutorial introduces the multilayer perceptron using Theano.
3
4 A multilayer perceptron is a logistic regressor where
5 instead of feeding the input to the logistic regression you insert a
6 intermidiate layer, called the hidden layer, that has a nonlinear
7 activation function (usually tanh or sigmoid) . One can use many such
8 hidden layers making the architecture deep. The tutorial will also tackle
9 the problem of MNIST digit classification.
10
11 .. math::
12
13 f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))),
14
15 References:
16
17 - textbooks: "Pattern Recognition and Machine Learning" -
18 Christopher M. Bishop, section 5
19
20 TODO: recommended preprocessing, lr ranges, regularization ranges (explain
21 to do lr first, then add regularization)
22
23 """
__docformat__ = 'restructuredtext en'


import numpy, cPickle, gzip


import theano
import theano.tensor as T

import time

import theano.tensor.nnet
class MLP(object):
    """Multi-Layer Perceptron Class

    A multilayer perceptron is a feedforward artificial neural network model
    that has one or more layers of hidden units and nonlinear activations.
    Intermediate layers usually use tanh or the sigmoid function as their
    activation, while the top layer is a softmax layer.
    """



    def __init__(self, input, n_in, n_hidden, n_out):
        """Initialize the parameters for the multilayer perceptron

        :param input: symbolic variable that describes the input of the
        architecture (one minibatch)

        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :param n_hidden: number of hidden units

        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        """

        # initialize the parameters theta = (W1,b1,W2,b2) ; note that this
        # example contains only one hidden layer, but one can add as many
        # layers as desired, making the network deeper. The only problem
        # with making the network deep this way is learning: plain
        # backpropagation may be unable to move the network away from its
        # random starting point; this is where pre-training helps, by
        # giving a good starting point for backpropagation, but more about
        # this in the other tutorials

        # `W1` is initialized with `W1_values`, which is uniformly sampled
        # from -sqrt(6./(n_in+n_hidden)) to sqrt(6./(n_in+n_hidden));
        # the output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU
        W1_values = numpy.asarray( numpy.random.uniform( \
              low  = -numpy.sqrt(6./(n_in+n_hidden)), \
              high =  numpy.sqrt(6./(n_in+n_hidden)), \
              size = (n_in, n_hidden)), dtype = theano.config.floatX)
        # `W2` is initialized with `W2_values`, which is uniformly sampled
        # from -sqrt(6./(n_hidden+n_out)) to sqrt(6./(n_hidden+n_out));
        # the output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU
        W2_values = numpy.asarray( numpy.random.uniform( \
              low  = -numpy.sqrt(6./(n_hidden+n_out)), \
              high =  numpy.sqrt(6./(n_hidden+n_out)), \
              size = (n_hidden, n_out)), dtype = theano.config.floatX)
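        # the bound sqrt(6./(fan_in+fan_out)) keeps the initial weights
        # small; for the MNIST setup used below (n_in=28*28=784,
        # n_hidden=500) the hidden-layer weights are drawn from roughly
        # [-0.068, 0.068], since numpy.sqrt(6./(784+500)) is about 0.068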

        self.W1 = theano.shared( value = W1_values )
        self.b1 = theano.shared( value = numpy.zeros((n_hidden,),
                                                dtype= theano.config.floatX))
        self.W2 = theano.shared( value = W2_values )
        self.b2 = theano.shared( value = numpy.zeros((n_out,),
                                                dtype= theano.config.floatX))

        # symbolic expression computing the values of the hidden layer
        self.hidden = T.tanh(T.dot(input, self.W1) + self.b1)

        # symbolic expression computing the values of the top layer
        self.p_y_given_x = T.nnet.softmax(T.dot(self.hidden, self.W2) + self.b2)
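        # p_y_given_x has one row per example in the minibatch and one
        # column per class; softmax makes each row a probability
        # distribution over the n_out classes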

        # compute the prediction as the class whose probability is maximal,
        # in symbolic form
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

        # L1 norm ; one regularization option is to enforce the L1 norm to
        # be small
        self.L1 = abs(self.W1).sum() + abs(self.W2).sum()

        # square of L2 norm ; one regularization option is to enforce the
        # square of the L2 norm to be small
        self.L2_sqr = (self.W1**2).sum() + (self.W2**2).sum()
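        # these two terms are not used directly here; the caller is expected
        # to scale them and add them to the training cost, as done below in
        # sgd_optimization_mnist:
        #     cost = negative_log_likelihood(y) + L1_reg*L1 + L2_reg*L2_sqr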



    def negative_log_likelihood(self, y):
        """Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        .. math::

            \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
            \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
                \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
            \ell (\theta=\{W,b\}, \mathcal{D}) =
            - \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D})

        :param y: corresponds to a vector that gives for each example the
                  correct label
        """
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])




    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch
        """

        # check if y has the same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type))
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()



def sgd_optimization_mnist( learning_rate=0.01, L1_reg = 0.00, \
                            L2_reg = 0.0001, n_iter=100):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    This is demonstrated on MNIST.

    :param learning_rate: learning rate used (factor for the stochastic
    gradient)

    :param n_iter: number of iterations to run the optimizer

    :param L1_reg: L1-norm's weight when added to the cost (see
    regularization)

    :param L2_reg: L2-norm's weight when added to the cost (see
    regularization)
    """

    # Load the dataset
    f = gzip.open('mnist.pkl.gz','rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    # make minibatches of size 20
    batch_size = 20    # size of the minibatch

    # Dealing with the training set
    # get the list of training images (x) and their labels (y)
    (train_set_x, train_set_y) = train_set
    # initialize the list of training minibatches with an empty list
    train_batches = []
    for i in xrange(0, len(train_set_x), batch_size):
        # add to the list of minibatches the minibatch starting at
        # position i and ending at position i+batch_size;
        # a minibatch is a pair: the first element of the pair is a list
        # of datapoints, the second element is the list of corresponding
        # labels
        train_batches = train_batches + \
               [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]

    # Dealing with the validation set
    (valid_set_x, valid_set_y) = valid_set
    # initialize the list of validation minibatches
    valid_batches = []
    for i in xrange(0, len(valid_set_x), batch_size):
        valid_batches = valid_batches + \
               [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]

    # Dealing with the testing set
    (test_set_x, test_set_y) = test_set
    # initialize the list of testing minibatches
    test_batches = []
    for i in xrange(0, len(test_set_x), batch_size):
        test_batches = test_batches + \
               [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
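    # each *_batches list now holds (inputs, labels) pairs; with the MNIST
    # pickle used here, train_batches[0][0] has shape (batch_size, 28*28)
    # and train_batches[0][1] has shape (batch_size,)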


    ishape = (28,28)  # this is the size of MNIST images

    # allocate symbolic variables for the data
    x = T.fmatrix()   # the data is presented as rasterized images
    y = T.lvector()   # the labels are presented as a 1D vector of
                      # [long int] labels

    # construct the MLP class
    classifier = MLP( input=x.reshape((batch_size,28*28)),\
                      n_in=28*28, n_hidden = 500, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = classifier.negative_log_likelihood(y) \
         + L1_reg * classifier.L1 \
         + L2_reg * classifier.L2_sqr

    # compiling a theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function([x,y], classifier.errors(y))

    # compute the gradient of cost with respect to theta = (W1, b1, W2, b2)
    g_W1 = T.grad(cost, classifier.W1)
    g_b1 = T.grad(cost, classifier.b1)
    g_W2 = T.grad(cost, classifier.W2)
    g_b2 = T.grad(cost, classifier.b2)
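    # T.grad performs symbolic differentiation, so each g_* above is itself
    # a symbolic expression that can be plugged into the update rules below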

    # specify how to update the parameters of the model as a dictionary
    updates = \
        { classifier.W1: classifier.W1 - learning_rate*g_W1 \
        , classifier.b1: classifier.b1 - learning_rate*g_b1 \
        , classifier.W2: classifier.W2 - learning_rate*g_W2 \
        , classifier.b2: classifier.b2 - learning_rate*g_b2 }
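    # each entry implements one step of gradient descent,
    #     param := param - learning_rate * d(cost)/d(param),
    # applied to the corresponding shared variable when `train_model` runs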

    # compiling a theano function `train_model` that returns the cost and, at
    # the same time, updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function([x, y], cost, updates = updates )
    n_minibatches = len(train_batches)

    # early-stopping parameters
    patience = 10000               # look at this many minibatch updates
                                   # regardless
    patience_increase = 2          # wait this much longer when a new best is
                                   # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = n_minibatches  # go through this many
                                   # minibatches before checking the network
                                   # on the validation set; in this case we
                                   # check every epoch
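    # for example, with patience = 10000 and patience_increase = 2, a
    # sufficiently good validation score at iteration 6000 raises patience
    # to max(10000, 6000*2) = 12000, so at least 12000 minibatch updates
    # are performed before early stopping can trigger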


    best_params = None
    best_validation_loss = float('inf')
    test_score = 0.
    start_time = time.clock()
    # have a maximum of `n_iter` iterations through the entire dataset
    for iter in xrange(n_iter * n_minibatches):

        # get epoch and minibatch index
        epoch = iter / n_minibatches
        minibatch_index = iter % n_minibatches

        # get the minibatch corresponding to `iter` modulo
        # `len(train_batches)`
        x, y = train_batches[minibatch_index]
        cost_ij = train_model(x, y)

        if (iter+1) % validation_frequency == 0:
            # compute zero-one loss on validation set
            this_validation_loss = 0.
            for x, y in valid_batches:
                # sum up the errors for each minibatch
                this_validation_loss += test_model(x, y)
            # get the average by dividing by the number of minibatches
            this_validation_loss /= len(valid_batches)

            print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                  (epoch, minibatch_index+1, n_minibatches, \
                   this_validation_loss*100.))


            # if we got the best validation score until now
            if this_validation_loss < best_validation_loss:

                # improve patience if the loss improvement is good enough
                if this_validation_loss < best_validation_loss * \
                                          improvement_threshold:
                    patience = max(patience, iter * patience_increase)

                best_validation_loss = this_validation_loss
                # test it on the test set

                test_score = 0.
                for x, y in test_batches:
                    test_score += test_model(x, y)
                test_score /= len(test_batches)
                print(('     epoch %i, minibatch %i/%i, test error of best '
                       'model %f %%') %
                      (epoch, minibatch_index+1, n_minibatches,
                       test_score*100.))

        if patience <= iter:
            break

    end_time = time.clock()
    print(('Optimization complete with best validation score of %f %%, '
           'with test performance %f %%') %
          (best_validation_loss * 100., test_score * 100.))
    print ('The code ran for %f minutes' % ((end_time - start_time) / 60.))






if __name__ == '__main__':
    sgd_optimization_mnist()
