comparison: code_tutoriel/SdA.py @ 165:4bc5eeec6394

Updating the tutorial code to the latest revisions.

author: Dumitru Erhan <dumitru.erhan@gmail.com>
date:   Fri, 26 Feb 2010 13:55:27 -0500
changesets compared: 164:e3de934a98b6 -> 165:4bc5eeec6394

1 """ | |
2 This tutorial introduces stacked denoising auto-encoders (SdA) using Theano. | |
3 | |
4 Denoising autoencoders are the building blocks for SdA. | |
5 They are based on auto-encoders as the ones used in Bengio et al. 2007. | |
6 An autoencoder takes an input x and first maps it to a hidden representation | |
7 y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting | |
8 latent representation y is then mapped back to a "reconstructed" vector | |
9 z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight | |
10 matrix W' can optionally be constrained such that W' = W^T, in which case | |
11 the autoencoder is said to have tied weights. The network is trained such | |
12 that to minimize the reconstruction error (the error between x and z). | |
13 | |
14 For the denosing autoencoder, during training, first x is corrupted into | |
15 \tilde{x}, where \tilde{x} is a partially destroyed version of x by means | |
16 of a stochastic mapping. Afterwards y is computed as before (using | |
17 \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction | |
18 error is now measured between z and the uncorrupted input x, which is | |
19 computed as the cross-entropy : | |
20 - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)] | |
21 | |
22 | |
23 References : | |
24 - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and | |
25 Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103, | |
26 2008 | |
27 - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise | |
28 Training of Deep Networks, Advances in Neural Information Processing | |
29 Systems 19, 2007 | |
30 | |
31 """ | |

import numpy, time, cPickle, gzip

import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

from logistic_sgd import LogisticRegression, load_data
from mlp import HiddenLayer
from dA import dA

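
# NOTE: the helper below is not part of the original tutorial code; it is a
# minimal illustrative sketch of the denoising reconstruction cost described
# in the module docstring, assuming tied weights (W' = W^T) and sigmoid
# activations. The tutorial's actual implementation lives in dA.get_cost_updates
# (see the dA module imported above); all names used here are hypothetical.
def _example_reconstruction_cost(x, x_tilde, W, b_hid, b_vis):
    """Symbolic cross-entropy reconstruction cost for one denoising step."""
    # hidden code computed from the *corrupted* input: y = s(W \tilde{x} + b)
    y = T.nnet.sigmoid(T.dot(x_tilde, W) + b_hid)
    # reconstruction with tied weights: z = s(W' y + b') with W' = W^T
    z = T.nnet.sigmoid(T.dot(y, W.T) + b_vis)
    # cross-entropy measured against the *uncorrupted* input x:
    #   - \sum_k [ x_k log z_k + (1 - x_k) log(1 - z_k) ]
    L = - T.sum(x * T.log(z) + (1 - x) * T.log(1 - z), axis=1)
    # average over the minibatch
    return T.mean(L)

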
class SdA(object):
    """Stacked denoising auto-encoder class (SdA)

    A stacked denoising autoencoder model is obtained by stacking several
    dAs. The hidden layer of the dA at layer `i` becomes the input of
    the dA at layer `i+1`. The first layer dA gets as input the input of
    the SdA, and the hidden layer of the last dA represents the output.
    Note that after pretraining, the SdA is dealt with as a normal MLP;
    the dAs are only used to initialize the weights.
    """

    def __init__(self, numpy_rng, theano_rng = None, n_ins = 784,
                 hidden_layers_sizes = [500, 500], n_outs = 10,
                 corruption_levels = [0.1, 0.1]):
        """ This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                          weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `numpy_rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the SdA

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: sizes of the intermediate layers; must
                                    contain at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :type corruption_levels: list of float
        :param corruption_levels: amount of corruption to use for each
                                  layer
        """

        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))
        # allocate symbolic variables for the data
        self.x = T.matrix('x')   # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as a 1D vector
                                 # of [int] labels

        # The SdA is an MLP for which all weights of the intermediate layers
        # are shared with a different denoising autoencoder.
        # We will first construct the SdA as a deep multilayer perceptron,
        # and when constructing each sigmoidal layer we also construct a
        # denoising autoencoder that shares weights with that layer.
        # During pretraining we will train these autoencoders (which will
        # lead to changing the weights of the MLP as well).
        # During finetuning we will finish training the SdA by doing
        # stochastic gradient descent on the MLP.

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of
            # the layer below or the input size if we are on the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i-1]

            # the input to this layer is either the activation of the hidden
            # layer below or the input of the SdA if you are on the first
            # layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng = numpy_rng,
                                        input = layer_input,
                                        n_in = input_size,
                                        n_out = hidden_layers_sizes[i],
                                        activation = T.nnet.sigmoid)
            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            # it is arguably a philosophical question...
            # but we are going to only declare that the parameters of the
            # sigmoid_layers are parameters of the SdA; the visible biases
            # in the dA are parameters of that dA, but not of the SdA
            self.params.extend(sigmoid_layer.params)

            # Construct a denoising autoencoder that shares weights with this
            # layer
            dA_layer = dA(numpy_rng = numpy_rng, theano_rng = theano_rng,
                          input = layer_input,
                          n_visible = input_size,
                          n_hidden = hidden_layers_sizes[i],
                          W = sigmoid_layer.W, bhid = sigmoid_layer.b)
            self.dA_layers.append(dA_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
                            input = self.sigmoid_layers[-1].output,
                            n_in = hidden_layers_sizes[-1], n_out = n_outs)

        self.params.extend(self.logLayer.params)
        # construct a function that implements one step of finetuning

        # compute the cost for the second phase of training,
        # defined as the negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, train_set_x, batch_size):
        ''' Generates a list of functions, each of them implementing one
        step in training the dA corresponding to the layer with the same
        index. The functions require the minibatch index as input, so to
        train a dA you just need to iterate, calling the corresponding
        function on all minibatch indexes.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: Shared variable that contains all datapoints used
                            for training the dA

        :type batch_size: int
        :param batch_size: size of a [mini]batch

        :type learning_rate: float
        :param learning_rate: learning rate used during training for any of
                              the dA layers (an argument of the compiled
                              functions, not of this method)
        '''

        # index to a [mini]batch
        index = T.lscalar('index')                 # index to a minibatch
        corruption_level = T.scalar('corruption')  # amount of corruption to use
        learning_rate = T.scalar('lr')             # learning rate to use
        # number of batches
        n_batches = train_set_x.value.shape[0] / batch_size
        # beginning of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch, given `index`
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for dA in self.dA_layers:
            # get the cost and the updates list
            cost, updates = dA.get_cost_updates(corruption_level, learning_rate)
            # compile the theano function
            fn = theano.function(inputs = [index,
                                     theano.Param(corruption_level, default = 0.2),
                                     theano.Param(learning_rate, default = 0.1)],
                                 outputs = cost,
                                 updates = updates,
                                 givens = {self.x: train_set_x[batch_begin:batch_end]})
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns
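
    # Illustrative usage note (added; not in the original tutorial text): the
    # compiled functions returned above are meant to be called layer by layer,
    # e.g. (using hypothetical loop variables):
    #
    #     pretraining_fns = sda.pretraining_functions(train_set_x, batch_size)
    #     for i in xrange(sda.n_layers):
    #         for epoch in xrange(pretraining_epochs):
    #             for batch_index in xrange(n_train_batches):
    #                 pretraining_fns[i](index=batch_index,
    #                                    corruption=0.2, lr=pretrain_lr)
    #
    # `corruption` and `lr` are the theano.Param inputs declared above, so they
    # can be overridden on each call; test_SdA below follows this pattern.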

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        '''Generates a function `train` that implements one step of
        finetuning, a function `validate` that computes the error on
        a batch from the validation set, and a function `test` that
        computes the error on a batch from the testing set

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: a list that contains all the datasets;
                         it has to contain three pairs, `train`,
                         `valid`, `test`, in this order, where each pair
                         is formed of two Theano variables, one for the
                         datapoints, the other for the labels

        :type batch_size: int
        :param batch_size: size of a minibatch

        :type learning_rate: float
        :param learning_rate: learning rate used during the finetune stage
        '''

        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # compute number of minibatches for training, validation and testing
        n_valid_batches = valid_set_x.value.shape[0] / batch_size
        n_test_batches = test_set_x.value.shape[0] / batch_size

        index = T.lscalar('index')    # index to a [mini]batch

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = {}
        for param, gparam in zip(self.params, gparams):
            updates[param] = param - gparam * learning_rate

        train_fn = theano.function(inputs = [index],
              outputs = self.finetune_cost,
              updates = updates,
              givens = {
                self.x: train_set_x[index*batch_size:(index+1)*batch_size],
                self.y: train_set_y[index*batch_size:(index+1)*batch_size]})

        test_score_i = theano.function([index], self.errors,
              givens = {
                self.x: test_set_x[index*batch_size:(index+1)*batch_size],
                self.y: test_set_y[index*batch_size:(index+1)*batch_size]})

        valid_score_i = theano.function([index], self.errors,
              givens = {
                self.x: valid_set_x[index*batch_size:(index+1)*batch_size],
                self.y: valid_set_y[index*batch_size:(index+1)*batch_size]})

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        return train_fn, valid_score, test_score


def test_SdA(finetune_lr = 0.1, pretraining_epochs = 15,
             pretrain_lr = 0.1, training_epochs = 1000,
             dataset = 'mnist.pkl.gz'):
    """
    Demonstrates how to train and test a stacked denoising autoencoder.

    This is demonstrated on MNIST.

    :type finetune_lr: float
    :param finetune_lr: learning rate used in the finetune stage
                        (factor for the stochastic gradient)

    :type pretraining_epochs: int
    :param pretraining_epochs: number of epochs to do pretraining

    :type pretrain_lr: float
    :param pretrain_lr: learning rate to be used during pre-training

    :type training_epochs: int
    :param training_epochs: maximal number of epochs to run the optimizer
                            during finetuning

    :type dataset: string
    :param dataset: path to the pickled dataset

    """

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]


    batch_size = 20    # size of the minibatch

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.value.shape[0] / batch_size

    # numpy random generator
    numpy_rng = numpy.random.RandomState(123)
    print '... building the model'
    # construct the stacked denoising autoencoder class
    sda = SdA(numpy_rng = numpy_rng, n_ins = 28*28,
              hidden_layers_sizes = [1000, 1000, 1000],
              n_outs = 10)


    #########################
    # PRETRAINING THE MODEL #
    #########################
    print '... getting the pretraining functions'
    pretraining_fns = sda.pretraining_functions(
                            train_set_x = train_set_x,
                            batch_size = batch_size)

    print '... pre-training the model'
    start_time = time.clock()
    ## Pre-train layer-wise
    for i in xrange(sda.n_layers):
        # go through pretraining epochs
        for epoch in xrange(pretraining_epochs):
            # go through the training set
            c = []
            for batch_index in xrange(n_train_batches):
                c.append(pretraining_fns[i](index = batch_index,
                                            corruption = 0.2, lr = pretrain_lr))
            print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch), numpy.mean(c)

    end_time = time.clock()

    print ('Pretraining took %f minutes' % ((end_time - start_time) / 60.))

    ########################
    # FINETUNING THE MODEL #
    ########################

    # get the training, validation and testing functions for the model
    print '... getting the finetuning functions'
    train_fn, validate_model, test_model = sda.build_finetune_functions(
                datasets = datasets, batch_size = batch_size,
                learning_rate = finetune_lr)

    print '... finetuning the model'
    # early-stopping parameters
    patience = 10000               # look at this many examples regardless
    patience_increase = 2.         # wait this much longer when a new best is
                                   # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience/2)
                                   # go through this many
                                   # minibatches before checking the network
                                   # on the validation set; in this case we
                                   # check every epoch


    best_params = None
    best_validation_loss = float('inf')
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0

    while (epoch < training_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_fn(minibatch_index)
            iter = epoch * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:

                validation_losses = validate_model()
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))


                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                                              improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = test_model()
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))


            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete with best validation score of %f %%, '
           'with test performance %f %%') %
          (best_validation_loss * 100., test_score * 100.))
    print ('The code ran for %f minutes' % ((end_time - start_time) / 60.))




if __name__ == '__main__':
    test_SdA()