comparison code_tutoriel/mlp.py @ 165:4bc5eeec6394

Updating the tutorial code to the latest revisions.
author Dumitru Erhan <dumitru.erhan@gmail.com>
date Fri, 26 Feb 2010 13:55:27 -0500
parents 827de2cc34f8
children
comparing 164:e3de934a98b6 with 165:4bc5eeec6394

@@ -15,319 +15,322 @@
 References:
 
     - textbooks: "Pattern Recognition and Machine Learning" -
                  Christopher M. Bishop, section 5
 
-TODO: recommended preprocessing, lr ranges, regularization ranges (explain
-      to do lr first, then add regularization)
-
 """
 __docformat__ = 'restructedtext en'
 
 
-import numpy, cPickle, gzip
-
+import numpy, time, cPickle, gzip
 
 import theano
 import theano.tensor as T
 
-import time
-
-import theano.tensor.nnet
+
+from logistic_sgd import LogisticRegression, load_data
+
+
+class HiddenLayer(object):
+    def __init__(self, rng, input, n_in, n_out, activation = T.tanh):
+        """
+        Typical hidden layer of a MLP: units are fully-connected and have
+        sigmoidal activation function. Weight matrix W is of shape (n_in,n_out)
+        and the bias vector b is of shape (n_out,).
+
+        NOTE : The nonlinearity used here is tanh
+
+        Hidden unit activation is given by: tanh(dot(input,W) + b)
+
+        :type rng: numpy.random.RandomState
+        :param rng: a random number generator used to initialize weights
+
+        :type input: theano.tensor.dmatrix
+        :param input: a symbolic tensor of shape (n_examples, n_in)
+
+        :type n_in: int
+        :param n_in: dimensionality of input
+
+        :type n_out: int
+        :param n_out: number of hidden units
+
+        :type activation: theano.Op or function
+        :param activation: Non linearity to be applied in the hidden
+                           layer
+        """
+        self.input = input
+
+        # `W` is initialized with `W_values` which is uniformely sampled
+        # from -6./sqrt(n_in+n_hidden) and 6./sqrt(n_in+n_hidden)
+        # the output of uniform if converted using asarray to dtype
+        # theano.config.floatX so that the code is runable on GPU
+        W_values = numpy.asarray( rng.uniform( \
+              low  = -numpy.sqrt(6./(n_in+n_out)), \
+              high =  numpy.sqrt(6./(n_in+n_out)), \
+              size = (n_in, n_out)), dtype = theano.config.floatX)
+        self.W = theano.shared(value = W_values)
+
+        b_values = numpy.zeros((n_out,), dtype= theano.config.floatX)
+        self.b = theano.shared(value= b_values)
+
+        self.output = activation(T.dot(input, self.W) + self.b)
+        # parameters of the model
+        self.params = [self.W, self.b]
+
 
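
Aside (illustration only, not part of the changeset): the new ``HiddenLayer`` computes ``activation(T.dot(input, W) + b)`` with ``W`` drawn uniformly from the interval +/- sqrt(6./(n_in+n_out)). A minimal NumPy-only sketch of the same computation, with hypothetical names, can be used to check shapes without Theano::

    import numpy

    def hidden_forward(x, rng, n_in, n_out):
        # same initialization interval as HiddenLayer:
        # U(-sqrt(6/(n_in+n_out)), +sqrt(6/(n_in+n_out)))
        bound = numpy.sqrt(6. / (n_in + n_out))
        W = rng.uniform(low=-bound, high=bound, size=(n_in, n_out))
        b = numpy.zeros(n_out)
        # hidden unit activation: tanh(dot(x, W) + b)
        return numpy.tanh(numpy.dot(x, W) + b)

    rng = numpy.random.RandomState(1234)
    x = rng.uniform(size=(20, 28 * 28))                  # one minibatch of 20 flattened images
    print(hidden_forward(x, rng, 28 * 28, 500).shape)    # (20, 500)
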
 class MLP(object):
     """Multi-Layer Perceptron Class
 
     A multilayer perceptron is a feedforward artificial neural network model
     that has one layer or more of hidden units and nonlinear activations.
     Intermidiate layers usually have as activation function thanh or the
-    sigmoid function while the top layer is a softamx layer.
+    sigmoid function (defined here by a ``SigmoidalLayer`` class) while the
+    top layer is a softamx layer (defined here by a ``LogisticRegression``
+    class).
     """
 
 
 
-    def __init__(self, input, n_in, n_hidden, n_out):
+    def __init__(self, rng, input, n_in, n_hidden, n_out):
         """Initialize the parameters for the multilayer perceptron
 
+        :type rng: numpy.random.RandomState
+        :param rng: a random number generator used to initialize weights
+
+        :type input: theano.tensor.TensorType
         :param input: symbolic variable that describes the input of the
                       architecture (one minibatch)
 
+        :type n_in: int
         :param n_in: number of input units, the dimension of the space in
                      which the datapoints lie
 
+        :type n_hidden: int
         :param n_hidden: number of hidden units
 
+        :type n_out: int
         :param n_out: number of output units, the dimension of the space in
                       which the labels lie
 
         """
 
-        # initialize the parameters theta = (W1,b1,W2,b2) ; note that this
-        # example contains only one hidden layer, but one can have as many
-        # layers as he/she wishes, making the network deeper. The only
-        # problem making the network deep this way is during learning,
-        # backpropagation being unable to move the network from the starting
-        # point towards; this is where pre-training helps, giving a good
-        # starting point for backpropagation, but more about this in the
-        # other tutorials
-
-        # `W1` is initialized with `W1_values` which is uniformely sampled
-        # from -6./sqrt(n_in+n_hidden) and 6./sqrt(n_in+n_hidden)
-        # the output of uniform if converted using asarray to dtype
-        # theano.config.floatX so that the code is runable on GPU
-        W1_values = numpy.asarray( numpy.random.uniform( \
-              low = -numpy.sqrt(6./(n_in+n_hidden)), \
-              high = numpy.sqrt(6./(n_in+n_hidden)), \
-              size = (n_in, n_hidden)), dtype = theano.config.floatX)
-        # `W2` is initialized with `W2_values` which is uniformely sampled
-        # from -6./sqrt(n_hidden+n_out) and 6./sqrt(n_hidden+n_out)
-        # the output of uniform if converted using asarray to dtype
-        # theano.config.floatX so that the code is runable on GPU
-        W2_values = numpy.asarray( numpy.random.uniform(
-              low = -numpy.sqrt(6./(n_hidden+n_out)), \
-              high= numpy.sqrt(6./(n_hidden+n_out)),\
-              size= (n_hidden, n_out)), dtype = theano.config.floatX)
-
-        self.W1 = theano.shared( value = W1_values )
-        self.b1 = theano.shared( value = numpy.zeros((n_hidden,),
-                                                dtype= theano.config.floatX))
-        self.W2 = theano.shared( value = W2_values )
-        self.b2 = theano.shared( value = numpy.zeros((n_out,),
-                                                dtype= theano.config.floatX))
-
-        # symbolic expression computing the values of the hidden layer
-        self.hidden = T.tanh(T.dot(input, self.W1)+ self.b1)
-
-        # symbolic expression computing the values of the top layer
-        self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden, self.W2)+self.b2)
-
-        # compute prediction as class whose probability is maximal in
-        # symbolic form
-        self.y_pred = T.argmax( self.p_y_given_x, axis =1)
-
+        # Since we are dealing with a one hidden layer MLP, this will
+        # translate into a TanhLayer connected to the LogisticRegression
+        # layer; this can be replaced by a SigmoidalLayer, or a layer
+        # implementing any other nonlinearity
+        self.hiddenLayer = HiddenLayer(rng = rng, input = input,
+                                       n_in = n_in, n_out = n_hidden,
+                                       activation = T.tanh)
+
+        # The logistic regression layer gets as input the hidden units
+        # of the hidden layer
+        self.logRegressionLayer = LogisticRegression(
+                                    input = self.hiddenLayer.output,
+                                    n_in  = n_hidden,
+                                    n_out = n_out)
+
         # L1 norm ; one regularization option is to enforce L1 norm to
         # be small
-        self.L1 = abs(self.W1).sum() + abs(self.W2).sum()
+        self.L1 = abs(self.hiddenLayer.W).sum() \
+                + abs(self.logRegressionLayer.W).sum()
 
         # square of L2 norm ; one regularization option is to enforce
         # square of L2 norm to be small
-        self.L2_sqr = (self.W1**2).sum() + (self.W2**2).sum()
-
+        self.L2_sqr = (self.hiddenLayer.W**2).sum() \
+                    + (self.logRegressionLayer.W**2).sum()
 
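
Aside (illustration only, not part of the changeset): the cost minimized below is the mean negative log-likelihood of the softmax output plus ``L1_reg`` times the L1 term and ``L2_reg`` times the squared-L2 term defined above. A NumPy sketch of that cost, using a hypothetical ``mlp_cost`` helper and random stand-in parameters::

    import numpy

    def mlp_cost(x, y, W_h, b_h, W_out, b_out, L1_reg=0.00, L2_reg=0.0001):
        # forward pass: tanh hidden layer followed by a softmax output layer
        hidden = numpy.tanh(numpy.dot(x, W_h) + b_h)
        scores = numpy.dot(hidden, W_out) + b_out
        scores = scores - scores.max(axis=1, keepdims=True)      # numerical stability
        p_y_given_x = numpy.exp(scores)
        p_y_given_x = p_y_given_x / p_y_given_x.sum(axis=1, keepdims=True)
        # mean negative log-likelihood of the correct labels
        nll = -numpy.mean(numpy.log(p_y_given_x[numpy.arange(y.shape[0]), y]))
        # the two regularization terms defined on the weight matrices
        L1 = abs(W_h).sum() + abs(W_out).sum()
        L2_sqr = (W_h ** 2).sum() + (W_out ** 2).sum()
        return nll + L1_reg * L1 + L2_reg * L2_sqr

    rng = numpy.random.RandomState(1234)
    x = rng.uniform(size=(20, 784))
    y = rng.randint(0, 10, size=20)
    W_h = rng.uniform(-0.1, 0.1, size=(784, 500))
    W_out = rng.uniform(-0.1, 0.1, size=(500, 10))
    print(mlp_cost(x, y, W_h, numpy.zeros(500), W_out, numpy.zeros(10)))
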
-
-    def negative_log_likelihood(self, y):
-        """Return the mean of the negative log-likelihood of the prediction
-        of this model under a given target distribution.
-
-        .. math::
-
-            \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
-            \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
-                \ell (\theta=\{W,b\}, \mathcal{D})
-
+        # negative log likelihood of the MLP is given by the negative
+        # log likelihood of the output of the model, computed in the
+        # logistic regression layer
+        self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood
+        # same holds for the function computing the number of errors
+        self.errors = self.logRegressionLayer.errors
+
+        # the parameters of the model are the parameters of the two layer it is
+        # made out of
+        self.params = self.hiddenLayer.params + self.logRegressionLayer.params
+
 
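
Aside (illustration only, not part of the changeset): the ``errors`` attribute that the MLP now takes from ``LogisticRegression`` computes the same zero-one loss as the method removed just below, i.e. the mean of ``y_pred != y``. In plain NumPy::

    import numpy

    def zero_one_loss(y_pred, y):
        # fraction of minibatch examples whose predicted label differs from
        # the true label; T.neq followed by T.mean does the same symbolically
        return numpy.mean(y_pred != y)

    print(zero_one_loss(numpy.array([1, 2, 3, 4]), numpy.array([1, 0, 3, 0])))   # 0.5
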
-        :param y: corresponds to a vector that gives for each example the
-        :correct label
-        """
-        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
-
-
-
-
-    def errors(self, y):
-        """Return a float representing the number of errors in the minibatch
-        over the total number of examples of the minibatch
-        """
-
-        # check if y has same dimension of y_pred
-        if y.ndim != self.y_pred.ndim:
-            raise TypeError('y should have the same shape as self.y_pred',
-                ('y', target.type, 'y_pred', self.y_pred.type))
-        # check if y is of the correct datatype
-        if y.dtype.startswith('int'):
-            # the T.neq operator returns a vector of 0s and 1s, where 1
-            # represents a mistake in prediction
-            return T.mean(T.neq(self.y_pred, y))
-        else:
-            raise NotImplementedError()
-
-
-
-def sgd_optimization_mnist( learning_rate=0.01, L1_reg = 0.00, \
-                            L2_reg = 0.0001, n_iter=100):
+def test_mlp( learning_rate=0.01, L1_reg = 0.00, L2_reg = 0.0001, n_epochs=1000,
+              dataset = 'mnist.pkl.gz'):
     """
     Demonstrate stochastic gradient descent optimization for a multilayer
     perceptron
 
     This is demonstrated on MNIST.
 
+    :type learning_rate: float
     :param learning_rate: learning rate used (factor for the stochastic
                           gradient
 
+    :type L1_reg: float
     :param L1_reg: L1-norm's weight when added to the cost (see
                    regularization)
 
+    :type L2_reg: float
     :param L2_reg: L2-norm's weight when added to the cost (see
                    regularization)
 
-    :param n_iter: maximal number of iterations ot run the optimizer
+    :type n_epochs: int
+    :param n_epochs: maximal number of epochs to run the optimizer
+
+    :type dataset: string
+    :param dataset: the path of the MNIST dataset file from
+                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
+
 
     """
-
-    # Load the dataset
-    f = gzip.open('mnist.pkl.gz','rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    f.close()
+    datasets = load_data(dataset)
+
+    train_set_x, train_set_y = datasets[0]
+    valid_set_x, valid_set_y = datasets[1]
+    test_set_x , test_set_y  = datasets[2]
 
-    # make minibatches of size 20
-    batch_size = 20    # sized of the minibatch
-
-    # Dealing with the training set
-    # get the list of training images (x) and their labels (y)
-    (train_set_x, train_set_y) = train_set
-    # initialize the list of training minibatches with empty list
-    train_batches = []
-    for i in xrange(0, len(train_set_x), batch_size):
-        # add to the list of minibatches the minibatch starting at
-        # position i, ending at position i+batch_size
-        # a minibatch is a pair ; the first element of the pair is a list
-        # of datapoints, the second element is the list of corresponding
-        # labels
-        train_batches = train_batches + \
-               [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
-
-    # Dealing with the validation set
-    (valid_set_x, valid_set_y) = valid_set
-    # initialize the list of validation minibatches
-    valid_batches = []
-    for i in xrange(0, len(valid_set_x), batch_size):
-        valid_batches = valid_batches + \
-               [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
-
-    # Dealing with the testing set
-    (test_set_x, test_set_y) = test_set
-    # initialize the list of testing minibatches
-    test_batches = []
-    for i in xrange(0, len(test_set_x), batch_size):
-        test_batches = test_batches + \
-               [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
-
-
-    ishape = (28,28) # this is the size of MNIST images
+
+
+    batch_size = 20    # size of the minibatch
+
+    # compute number of minibatches for training, validation and testing
+    n_train_batches = train_set_x.value.shape[0] / batch_size
+    n_valid_batches = valid_set_x.value.shape[0] / batch_size
+    n_test_batches  = test_set_x.value.shape[0]  / batch_size
+
+    ######################
+    # BUILD ACTUAL MODEL #
+    ######################
+    print '... building the model'
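
Aside (illustration only, not part of the changeset): the explicit lists of minibatches built above are replaced by an integer count of minibatches plus index arithmetic; minibatch ``index`` is just the slice ``[index * batch_size : (index + 1) * batch_size]``, which is what the ``givens`` clauses below substitute on the shared datasets. A NumPy sketch with stand-in arrays::

    import numpy

    batch_size = 20
    # stand-ins for the MNIST training arrays (50000 images of 28*28 pixels)
    train_x = numpy.zeros((50000, 28 * 28))
    train_y = numpy.zeros(50000, dtype='int32')

    n_train_batches = train_x.shape[0] // batch_size    # 2500 minibatches

    def get_minibatch(index):
        # the same slice that the `givens` substitutions perform
        lo, hi = index * batch_size, (index + 1) * batch_size
        return train_x[lo:hi], train_y[lo:hi]

    x_batch, y_batch = get_minibatch(3)     # examples 60..79
    print(x_batch.shape, y_batch.shape)     # (20, 784) (20,)
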
 
     # allocate symbolic variables for the data
-    x = T.fmatrix()  # the data is presented as rasterized images
-    y = T.lvector()  # the labels are presented as 1D vector of
-                     # [long int] labels
-
-    # construct the logistic regression class
-    classifier = MLP( input=x.reshape((batch_size,28*28)),\
-                      n_in=28*28, n_hidden = 500, n_out=10)
+    index = T.lscalar()    # index to a [mini]batch
+    x     = T.matrix('x')  # the data is presented as rasterized images
+    y     = T.ivector('y') # the labels are presented as 1D vector of
+                           # [int] labels
+
+    rng = numpy.random.RandomState(1234)
+
+    # construct the MLP class
+    classifier = MLP( rng = rng, input=x, n_in=28*28, n_hidden = 500, n_out=10)
 
     # the cost we minimize during training is the negative log likelihood of
     # the model plus the regularization terms (L1 and L2); cost is expressed
     # here symbolically
     cost = classifier.negative_log_likelihood(y) \
          + L1_reg * classifier.L1 \
          + L2_reg * classifier.L2_sqr
 
-    # compiling a theano function that computes the mistakes that are made by
-    # the model on a minibatch
-    test_model = theano.function([x,y], classifier.errors(y))
-
-    # compute the gradient of cost with respect to theta = (W1, b1, W2, b2)
-    g_W1 = T.grad(cost, classifier.W1)
-    g_b1 = T.grad(cost, classifier.b1)
-    g_W2 = T.grad(cost, classifier.W2)
-    g_b2 = T.grad(cost, classifier.b2)
+    # compiling a Theano function that computes the mistakes that are made
+    # by the model on a minibatch
+    test_model = theano.function(inputs = [index],
+            outputs = classifier.errors(y),
+            givens={
+                x:test_set_x[index*batch_size:(index+1)*batch_size],
+                y:test_set_y[index*batch_size:(index+1)*batch_size]})
+
+    validate_model = theano.function(inputs = [index],
+            outputs = classifier.errors(y),
+            givens={
+                x:valid_set_x[index*batch_size:(index+1)*batch_size],
+                y:valid_set_y[index*batch_size:(index+1)*batch_size]})
+
+    # compute the gradient of cost with respect to theta (sotred in params)
+    # the resulting gradients will be stored in a list gparams
+    gparams = []
+    for param in classifier.params:
+        gparam = T.grad(cost, param)
+        gparams.append(gparam)
+
 
     # specify how to update the parameters of the model as a dictionary
-    updates = \
-        { classifier.W1: classifier.W1 - learning_rate*g_W1 \
-        , classifier.b1: classifier.b1 - learning_rate*g_b1 \
-        , classifier.W2: classifier.W2 - learning_rate*g_W2 \
-        , classifier.b2: classifier.b2 - learning_rate*g_b2 }
-
-    # compiling a theano function `train_model` that returns the cost, but in
-    # the same time updates the parameter of the model based on the rules
+    updates = {}
+    # given two list the zip A = [ a1,a2,a3,a4] and B = [b1,b2,b3,b4] of
+    # same length, zip generates a list C of same size, where each element
+    # is a pair formed from the two lists :
+    #    C = [ (a1,b1), (a2,b2), (a3,b3) , (a4,b4) ]
+    for param, gparam in zip(classifier.params, gparams):
+        updates[param] = param - learning_rate*gparam
+
+    # compiling a Theano function `train_model` that returns the cost, but
+    # in the same time updates the parameter of the model based on the rules
     # defined in `updates`
-    train_model = theano.function([x, y], cost, updates = updates )
-    n_minibatches = len(train_batches)
-
+    train_model =theano.function( inputs = [index], outputs = cost,
+            updates = updates,
+            givens={
+                x:train_set_x[index*batch_size:(index+1)*batch_size],
+                y:train_set_y[index*batch_size:(index+1)*batch_size]})
+
+    ###############
+    # TRAIN MODEL #
+    ###############
+    print '... training'
+
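
Aside (illustration only, not part of the changeset): the ``updates`` dictionary built from ``zip(classifier.params, gparams)`` expresses plain gradient descent, param := param - learning_rate * gradient, once per parameter. The same loop over hypothetical NumPy stand-ins::

    import numpy

    learning_rate = 0.01
    # stand-ins for the model parameters and their gradients (tiny shapes
    # for readability; the real W and b are much larger)
    params  = [numpy.ones((3, 2)), numpy.zeros(2)]
    gparams = [numpy.full((3, 2), 0.1), numpy.full(2, 0.1)]

    # pair each parameter with its gradient and apply the SGD rule
    for param, gparam in zip(params, gparams):
        param -= learning_rate * gparam

    print(params[0][0, 0], params[1][0])    # 0.999 and -0.001
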
     # early-stopping parameters
     patience = 10000  # look as this many examples regardless
     patience_increase = 2  # wait this much longer when a new best is
                            # found
     improvement_threshold = 0.995  # a relative improvement of this much is
                                    # considered significant
-    validation_frequency = n_minibatches  # go through this many
+    validation_frequency = min(n_train_batches,patience/2)
+                                  # go through this many
                                   # minibatche before checking the network
                                   # on the validation set; in this case we
                                   # check every epoch
 
 
     best_params = None
     best_validation_loss = float('inf')
     best_iter = 0
     test_score = 0.
     start_time = time.clock()
-    # have a maximum of `n_iter` iterations through the entire dataset
-    for iter in xrange(n_iter* n_minibatches):
-
-        # get epoch and minibatch index
-        epoch = iter / n_minibatches
-        minibatch_index = iter % n_minibatches
-
-        # get the minibatches corresponding to `iter` modulo
-        # `len(train_batches)`
-        x,y = train_batches[ minibatch_index ]
-        cost_ij = train_model(x,y)
+
+    epoch = 0
+    done_looping = False
+
+    while (epoch < n_epochs) and (not done_looping):
+        epoch = epoch + 1
+        for minibatch_index in xrange(n_train_batches):
+
+            minibatch_avg_cost = train_model(minibatch_index)
+            # iteration number
+            iter = epoch * n_train_batches + minibatch_index
 
             if (iter+1) % validation_frequency == 0:
                 # compute zero-one loss on validation set
-                this_validation_loss = 0.
-                for x,y in valid_batches:
-                    # sum up the errors for each minibatch
-                    this_validation_loss += test_model(x,y)
-                # get the average by dividing with the number of minibatches
-                this_validation_loss /= len(valid_batches)
+                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
+                this_validation_loss = numpy.mean(validation_losses)
 
                 print('epoch %i, minibatch %i/%i, validation error %f %%' % \
-                      (epoch, minibatch_index+1, n_minibatches, \
+                      (epoch, minibatch_index+1,n_train_batches, \
                        this_validation_loss*100.))
 
 
                 # if we got the best validation score until now
                 if this_validation_loss < best_validation_loss:
-
                     #improve patience if loss improvement is good enough
                     if this_validation_loss < best_validation_loss * \
                            improvement_threshold :
                         patience = max(patience, iter * patience_increase)
 
-                    # save best validation score and iteration number
                     best_validation_loss = this_validation_loss
-                    best_iter = iter
-
                     # test it on the test set
-                    test_score = 0.
-                    for x,y in test_batches:
-                        test_score += test_model(x,y)
-                    test_score /= len(test_batches)
+
+                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
+                    test_score = numpy.mean(test_losses)
+
                     print(('     epoch %i, minibatch %i/%i, test error of best '
-                           'model %f %%') %
-                          (epoch, minibatch_index+1, n_minibatches,
-                           test_score*100.))
+                           'model %f %%') % \
+                          (epoch, minibatch_index+1, n_train_batches,test_score*100.))
 
             if patience <= iter :
-                break
+                done_looping = True
+                break
+
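
Aside (illustration only, not part of the changeset): the loop above implements patience-based early stopping. Training runs for at least ``patience`` minibatch updates, ``patience`` is raised to ``iter * patience_increase`` whenever the validation error improves by more than the relative ``improvement_threshold``, and the loop stops once ``iter`` exceeds the current patience. With the MNIST sizes used here, ``validation_frequency`` equals ``n_train_batches``, so validation happens once per epoch. A stripped-down sketch of just that control flow, with a hypothetical validation curve standing in for ``validate_model``::

    def early_stopping_demo(n_epochs=1000, n_train_batches=2500):
        patience = 10000                  # minimum number of minibatch updates to run
        patience_increase = 2
        improvement_threshold = 0.995
        validation_frequency = min(n_train_batches, patience / 2)
        best_validation_loss = float('inf')

        # hypothetical validation curve: decays, then flattens out
        validation_error = lambda t: 0.2 + 0.8 * (0.99 ** t)

        epoch = 0
        done_looping = False
        while (epoch < n_epochs) and (not done_looping):
            epoch += 1
            for minibatch_index in range(n_train_batches):
                iter = epoch * n_train_batches + minibatch_index
                if (iter + 1) % validation_frequency == 0:
                    this_loss = validation_error(iter // validation_frequency)
                    if this_loss < best_validation_loss * improvement_threshold:
                        # significant improvement: allow training to run longer
                        patience = max(patience, iter * patience_increase)
                    best_validation_loss = min(best_validation_loss, this_loss)
                if patience <= iter:
                    done_looping = True
                    break
        return epoch, best_validation_loss

    print(early_stopping_demo())
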
 
     end_time = time.clock()
     print(('Optimization complete. Best validation score of %f %% '
            'obtained at iteration %i, with test performance %f %%') %
           (best_validation_loss * 100., best_iter, test_score*100.))
     print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
 
 
 if __name__ == '__main__':
-    sgd_optimization_mnist()
+    test_mlp()
 