comparison code_tutoriel/SdA.py @ 165:4bc5eeec6394

Updating the tutorial code to the latest revisions.
author Dumitru Erhan <dumitru.erhan@gmail.com>
date Fri, 26 Feb 2010 13:55:27 -0500
1 """
2 This tutorial introduces stacked denoising auto-encoders (SdA) using Theano.
3
4 Denoising autoencoders are the building blocks for SdA.
5 They are based on auto-encoders such as the ones used in Bengio et al. 2007.
6 An autoencoder takes an input x and first maps it to a hidden representation
7 y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting
8 latent representation y is then mapped back to a "reconstructed" vector
9 z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight
10 matrix W' can optionally be constrained such that W' = W^T, in which case
11 the autoencoder is said to have tied weights. The network is trained
12 to minimize the reconstruction error (the error between x and z).
13
14 For the denoising autoencoder, during training, x is first corrupted into
15 \tilde{x}, a partially destroyed version of x obtained by means of a
16 stochastic mapping. Afterwards y is computed as before (using
17 \tilde{x}), y = s(W\tilde{x} + b), and z as s(W'y + b'). The reconstruction
18 error is now measured between z and the uncorrupted input x, and is
19 computed as the cross-entropy:
20 - \sum_{k=1}^{d} [ x_k \log z_k + (1 - x_k) \log(1 - z_k) ]
21
22
23 References :
24 - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and
25 Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103,
26 2008
27 - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise
28 Training of Deep Networks, Advances in Neural Information Processing
29 Systems 19, 2007
30
31 """
32
33 import numpy, time, cPickle, gzip
34
35 import theano
36 import theano.tensor as T
37 from theano.tensor.shared_randomstreams import RandomStreams
38
39 from logistic_sgd import LogisticRegression, load_data
40 from mlp import HiddenLayer
41 from dA import dA
42
43
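# A minimal NumPy sketch of the encoder/decoder mapping and the cross-entropy
# reconstruction cost described in the module docstring, for the tied-weights
# case W' = W^T. The helper below is purely illustrative (it is not called
# anywhere in this file) and assumes W has shape (n_visible, n_hidden), as in
# the dA class.
def _dA_numpy_sketch(x, W, b, b_prime):
    """Return (y, z, cost) for one example `x` with entries in [0, 1]."""
    sigmoid = lambda a: 1. / (1. + numpy.exp(-a))
    y = sigmoid(numpy.dot(x, W) + b)          # hidden code, y = s(Wx + b)
    z = sigmoid(numpy.dot(y, W.T) + b_prime)  # reconstruction, z = s(W'y + b')
    cost = - numpy.sum(x * numpy.log(z) + (1. - x) * numpy.log(1. - z))
    return y, z, cost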
44
45 class SdA(object):
46 """Stacked denoising auto-encoder class (SdA)
47
48 A stacked denoising autoencoder model is obtained by stacking several
49 dAs. The hidden layer of the dA at layer `i` becomes the input of
50 the dA at layer `i+1`. The first layer dA gets as input the input of
51 the SdA, and the hidden layer of the last dA represents the output.
52 Note that after pretraining, the SdA is treated as a normal MLP;
53 the dAs are only used to initialize the weights.
54 """
55
56 def __init__(self, numpy_rng, theano_rng = None, n_ins = 784,
57 hidden_layers_sizes = [500,500], n_outs = 10,
58 corruption_levels = [0.1, 0.1]):
59 """ This class is made to support a variable number of layers.
60
61 :type numpy_rng: numpy.random.RandomState
62 :param numpy_rng: numpy random number generator used to draw initial
63 weights
64
65 :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
66 :param theano_rng: Theano random generator; if None is given one is
67 generated based on a seed drawn from `numpy_rng`
68
69 :type n_ins: int
70 :param n_ins: dimension of the input to the SdA
71
72 :type hidden_layers_sizes: list of ints
73 :param hidden_layers_sizes: intermediate layer sizes; must contain
74 at least one value
75
76 :type n_outs: int
77 :param n_outs: dimension of the output of the network
78
79 :type corruption_levels: list of float
80 :param corruption_levels: amount of corruption to use for each
81 layer
82 """
83
84 self.sigmoid_layers = []
85 self.dA_layers = []
86 self.params = []
87 self.n_layers = len(hidden_layers_sizes)
88
89 assert self.n_layers > 0
90
91 if not theano_rng:
92 theano_rng = RandomStreams(numpy_rng.randint(2**30))
93 # allocate symbolic variables for the data
94 self.x = T.matrix('x') # the data is presented as rasterized images
95 self.y = T.ivector('y') # the labels are presented as 1D vector of
96 # [int] labels
97
98 # The SdA is an MLP, for which all weights of intermediate layers
99 # are shared with a different denoising autoencoder.
100 # We will first construct the SdA as a deep multilayer perceptron,
101 # and when constructing each sigmoidal layer we also construct a
102 # denoising autoencoder that shares weights with that layer.
103 # During pretraining we will train these autoencoders (which will
104 # lead to changing the weights of the MLP as well).
105 # During finetuning we will finish training the SdA by doing
106 # stochastic gradient descent on the MLP.
107
108 for i in xrange( self.n_layers ):
109 # construct the sigmoidal layer
110
111 # the size of the input is either the number of hidden units of
112 # the layer below or the input size if we are on the first layer
113 if i == 0 :
114 input_size = n_ins
115 else:
116 input_size = hidden_layers_sizes[i-1]
117
118 # the input to this layer is either the activation of the hidden
119 # layer below or the input of the SdA if you are on the first
120 # layer
121 if i == 0 :
122 layer_input = self.x
123 else:
124 layer_input = self.sigmoid_layers[-1].output
125
126 sigmoid_layer = HiddenLayer(rng = numpy_rng,
127 input = layer_input,
128 n_in = input_size,
129 n_out = hidden_layers_sizes[i],
130 activation = T.nnet.sigmoid)
131 # add the layer to our list of layers
132 self.sigmoid_layers.append(sigmoid_layer)
133 # it's arguably a philosophical question...
134 # but we are going to only declare that the parameters of the
135 # sigmoid_layers are parameters of the SdA;
136 # the visible biases in the dA are parameters of those
137 # dA, but not of the SdA
138 self.params.extend(sigmoid_layer.params)
139
140 # Construct a denoising autoencoder that shares weights with this
141 # layer
142 dA_layer = dA(numpy_rng = numpy_rng, theano_rng = theano_rng, input = layer_input,
143 n_visible = input_size,
144 n_hidden = hidden_layers_sizes[i],
145 W = sigmoid_layer.W, bhid = sigmoid_layer.b)
146 self.dA_layers.append(dA_layer)
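# Note that W and the hidden bias passed above are shared Theano
# variables, so pretraining this dA updates the corresponding sigmoid
# layer in place; only the dA's visible bias remains private to the dA.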
147
148
149 # We now need to add a logistic layer on top of the MLP
150 self.logLayer = LogisticRegression(\
151 input = self.sigmoid_layers[-1].output,\
152 n_in = hidden_layers_sizes[-1], n_out = n_outs)
153
154 self.params.extend(self.logLayer.params)
155 # construct a function that implements one step of finetuning
156
157 # compute the cost for second phase of training,
158 # defined as the negative log likelihood
159 self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
160 # compute the gradients with respect to the model parameters
161 # symbolic variable that points to the number of errors made on the
162 # minibatch given by self.x and self.y
163 self.errors = self.logLayer.errors(self.y)
164
165 def pretraining_functions(self, train_set_x, batch_size):
166 ''' Generates a list of functions, each of them implementing one
167 step in training the dA corresponding to the layer with the same index.
168 Each function requires as input the minibatch index, and to train
169 a dA you just need to iterate, calling the corresponding function on
170 all minibatch indexes.
171
172 :type train_set_x: theano.tensor.TensorType
173 :param train_set_x: Shared variable that contains all datapoints used
174 for training the dA
175
176 :type batch_size: int
177 :param batch_size: size of a [mini]batch
178
179 Note : the learning rate and the corruption level are not arguments
180 of this method; they are inputs (with default values) of each
181 compiled Theano function in the returned list.
182 '''
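# Each compiled function in the returned list is then called as, e.g.
# (mirroring the pretraining loop in test_SdA() below):
#   pretrain_fns[i](index=batch_index, corruption=0.2, lr=pretrain_lr)
# where `corruption` and `lr` default to 0.2 and 0.1 if omitted.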
183
184 # index to a [mini]batch
185 index = T.lscalar('index') # index to a minibatch
186 corruption_level = T.scalar('corruption') # amount of corruption to use
187 learning_rate = T.scalar('lr') # learning rate to use
188 # number of batches
189 n_batches = train_set_x.value.shape[0] / batch_size
190 # beginning of a batch, given `index`
191 batch_begin = index * batch_size
192 # ending of a batch given `index`
193 batch_end = batch_begin+batch_size
194
195 pretrain_fns = []
196 for dA_layer in self.dA_layers:
197 # get the cost and the updates list
198 cost, updates = dA_layer.get_cost_updates(corruption_level, learning_rate)
199 # compile the theano function
200 fn = theano.function( inputs = [index,
201 theano.Param(corruption_level, default = 0.2),
202 theano.Param(learning_rate, default = 0.1)],
203 outputs = cost,
204 updates = updates,
205 givens = {self.x :train_set_x[batch_begin:batch_end]})
206 # append `fn` to the list of functions
207 pretrain_fns.append(fn)
208
209 return pretrain_fns
210
211
212 def build_finetune_functions(self, datasets, batch_size, learning_rate):
213 '''Generates a function `train` that implements one step of
214 finetuning, a function `validate` that computes the error on
215 a batch from the validation set, and a function `test` that
216 computes the error on a batch from the testing set
217
218 :type datasets: list of pairs of theano.tensor.TensorType
219 :param datasets: It is a list that contains all the datasets;
220 it has to contain three pairs, `train`,
221 `valid`, `test` in this order, where each pair
222 is formed of two Theano variables, one for the
223 datapoints, the other for the labels
224
225 :type batch_size: int
226 :param batch_size: size of a minibatch
227
228 :type learning_rate: float
229 :param learning_rate: learning rate used during finetune stage
230 '''
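# The three returned callables are used as follows (mirroring test_SdA()):
#   minibatch_avg_cost = train_fn(minibatch_index)  # one finetuning step
#   validation_losses  = valid_score()              # errors on all valid batches
#   test_losses        = test_score()               # errors on all test batches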
231
232 (train_set_x, train_set_y) = datasets[0]
233 (valid_set_x, valid_set_y) = datasets[1]
234 (test_set_x , test_set_y ) = datasets[2]
235
236 # compute number of minibatches for training, validation and testing
237 n_valid_batches = valid_set_x.value.shape[0] / batch_size
238 n_test_batches = test_set_x.value.shape[0] / batch_size
239
240 index = T.lscalar('index') # index to a [mini]batch
241
242 # compute the gradients with respect to the model parameters
243 gparams = T.grad(self.finetune_cost, self.params)
244
245 # compute the dictionary of fine-tuning updates
246 updates = {}
247 for param, gparam in zip(self.params, gparams):
248 updates[param] = param - gparam*learning_rate
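# i.e. one step of minibatch stochastic gradient descent:
#   param_new = param_old - learning_rate * d(finetune_cost)/d(param)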
249
250 train_fn = theano.function(inputs = [index],
251 outputs = self.finetune_cost,
252 updates = updates,
253 givens = {
254 self.x : train_set_x[index*batch_size:(index+1)*batch_size],
255 self.y : train_set_y[index*batch_size:(index+1)*batch_size]})
256
257 test_score_i = theano.function([index], self.errors,
258 givens = {
259 self.x: test_set_x[index*batch_size:(index+1)*batch_size],
260 self.y: test_set_y[index*batch_size:(index+1)*batch_size]})
261
262 valid_score_i = theano.function([index], self.errors,
263 givens = {
264 self.x: valid_set_x[index*batch_size:(index+1)*batch_size],
265 self.y: valid_set_y[index*batch_size:(index+1)*batch_size]})
266
267 # Create a function that scans the entire validation set
268 def valid_score():
269 return [valid_score_i(i) for i in xrange(n_valid_batches)]
270
271 # Create a function that scans the entire test set
272 def test_score():
273 return [test_score_i(i) for i in xrange(n_test_batches)]
274
275 return train_fn, valid_score, test_score
276
277
278
279
280
281
282 def test_SdA( finetune_lr = 0.1, pretraining_epochs = 15, \
283 pretrain_lr = 0.1, training_epochs = 1000, \
284 dataset='mnist.pkl.gz'):
285 """
286 Demonstrates how to train and test a stacked denoising autoencoder.
287
288 This is demonstrated on MNIST.
289
290 :type finetune_lr: float
291 :param finetune_lr: learning rate used in the finetune stage
292 (factor for the stochastic gradient)
293
294 :type pretraining_epochs: int
295 :param pretraining_epochs: number of epochs to do pretraining
296
297 :type pretrain_lr: float
298 :param pretrain_lr: learning rate to be used during pre-training
299
300 :type training_epochs: int
301 :param training_epochs: maximal number of epochs to run the optimizer
302
303 :type dataset: string
304 :param dataset: path to the pickled dataset
305
306 """
307
308 datasets = load_data(dataset)
309
310 train_set_x, train_set_y = datasets[0]
311 valid_set_x, valid_set_y = datasets[1]
312 test_set_x , test_set_y = datasets[2]
313
314
315 batch_size = 20 # size of the minibatch
316
317 # compute number of minibatches for training, validation and testing
318 n_train_batches = train_set_x.value.shape[0] / batch_size
319
320 # numpy random generator
321 numpy_rng = numpy.random.RandomState(123)
322 print '... building the model'
323 # construct the stacked denoising autoencoder class
324 sda = SdA( numpy_rng = numpy_rng, n_ins = 28*28,
325 hidden_layers_sizes = [1000,1000,1000],
326 n_outs = 10)
327
328
329 #########################
330 # PRETRAINING THE MODEL #
331 #########################
332 print '... getting the pretraining functions'
333 pretraining_fns = sda.pretraining_functions(
334 train_set_x = train_set_x,
335 batch_size = batch_size )
336
337 print '... pre-training the model'
338 start_time = time.clock()
339 ## Pre-train layer-wise
340 for i in xrange(sda.n_layers):
341 # go through pretraining epochs
342 for epoch in xrange(pretraining_epochs):
343 # go through the training set
344 c = []
345 for batch_index in xrange(n_train_batches):
346 c.append( pretraining_fns[i](index = batch_index,
347 corruption = 0.2, lr = pretrain_lr ) )
348 print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),numpy.mean(c)
349
350 end_time = time.clock()
351
352 print ('Pretraining took %f minutes' %((end_time-start_time)/60.))
353
354 ########################
355 # FINETUNING THE MODEL #
356 ########################
357
358 # get the training, validation and testing function for the model
359 print '... getting the finetuning functions'
360 train_fn, validate_model, test_model = sda.build_finetune_functions (
361 datasets = datasets, batch_size = batch_size,
362 learning_rate = finetune_lr)
363
364 print '... finetuning the model'
365 # early-stopping parameters
366 patience = 10000 # look at this many examples regardless
367 patience_increase = 2. # wait this much longer when a new best is
368 # found
369 improvement_threshold = 0.995 # a relative improvement of this much is
370 # considered significant
371 validation_frequency = min(n_train_batches, patience/2)
372 # go through this many
373 # minibatches before checking the network
374 # on the validation set; in this case we
375 # check every epoch
376
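# For example, assuming the usual mnist.pkl.gz split (50,000 training
# examples) and batch_size = 20, n_train_batches is 2500, so
# validation_frequency = min(2500, 10000/2) = 2500 and the model is
# indeed validated once per epoch.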
377
378 best_params = None
379 best_validation_loss = float('inf')
380 test_score = 0.
381 start_time = time.clock()
382
383 done_looping = False
384 epoch = 0
385
386 while (epoch < training_epochs) and (not done_looping):
387 epoch = epoch + 1
388 for minibatch_index in xrange(n_train_batches):
389
390 minibatch_avg_cost = train_fn(minibatch_index)
391 iter = (epoch - 1) * n_train_batches + minibatch_index # iteration number
392
393 if (iter+1) % validation_frequency == 0:
394
395 validation_losses = validate_model()
396 this_validation_loss = numpy.mean(validation_losses)
397 print('epoch %i, minibatch %i/%i, validation error %f %%' % \
398 (epoch, minibatch_index+1, n_train_batches, \
399 this_validation_loss*100.))
400
401
402 # if we got the best validation score until now
403 if this_validation_loss < best_validation_loss:
404
405 #improve patience if loss improvement is good enough
406 if this_validation_loss < best_validation_loss * \
407 improvement_threshold :
408 patience = max(patience, iter * patience_increase)
409
410 # save best validation score and iteration number
411 best_validation_loss = this_validation_loss
412 best_iter = iter
413
414 # test it on the test set
415 test_losses = test_model()
416 test_score = numpy.mean(test_losses)
417 print((' epoch %i, minibatch %i/%i, test error of best '
418 'model %f %%') %
419 (epoch, minibatch_index+1, n_train_batches,
420 test_score*100.))
421
422
423 if patience <= iter :
424 done_looping = True
425 break
426
427 end_time = time.clock()
428 print(('Optimization complete with best validation score of %f %%,'
429 ' with test performance %f %%') %
430 (best_validation_loss * 100., test_score*100.))
431 print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
432
433
434
435
436
437
438 if __name__ == '__main__':
439 test_SdA()
440
441