comparison deep/autoencoder/DA_training.py @ 190:70a9df1cd20e

initial commit for autoencoder training
author youssouf
date Tue, 02 Mar 2010 09:52:27 -0500
parents
children e12702b88a2d
comparison
equal deleted inserted replaced
189:0d0677773533 190:70a9df1cd20e
1 """
2 This tutorial introduces stacked denoising auto-encoders (SdA) using Theano.
3
4 Denoising autoencoders are the building blocks for SDAE.
5 They are based on auto-encoders as the ones used in Bengio et al. 2007.
6 An autoencoder takes an input x and first maps it to a hidden representation
7 y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting
8 latent representation y is then mapped back to a "reconstructed" vector
9 z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight
10 matrix W' can optionally be constrained such that W' = W^T, in which case
11 the autoencoder is said to have tied weights. The network is trained such
12 that to minimize the reconstruction error (the error between x and z).
13
14 For the denosing autoencoder, during training, first x is corrupted into
15 \tilde{x}, where \tilde{x} is a partially destroyed version of x by means
16 of a stochastic mapping. Afterwards y is computed as before (using
17 \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction
18 error is now measured between z and the uncorrupted input x, which is
19 computed as the cross-entropy :
20 - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)]
21
22 For X iteration of the main program loop it takes *** minutes on an
23 Intel Core i7 and *** minutes on GPU (NVIDIA GTX 285 graphics processor).
24
25
26 References :
27 - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and
28 Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103,
29 2008
30 - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise
31 Training of Deep Networks, Advances in Neural Information Processing
32 Systems 19, 2007
33
34 """
35
36 import numpy
37 import theano
38 import time
39 import theano.tensor as T
40 from theano.tensor.shared_randomstreams import RandomStreams
41
42 import gzip
43 import cPickle
44
45 from pylearn.io import filetensor as ft
46
47 class dA():
48 """Denoising Auto-Encoder class (dA)
49
50 A denoising autoencoders tries to reconstruct the input from a corrupted
51 version of it by projecting it first in a latent space and reprojecting
52 it afterwards back in the input space. Please refer to Vincent et al.,2008
53 for more details. If x is the input then equation (1) computes a partially
54 destroyed version of x by means of a stochastic mapping q_D. Equation (2)
55 computes the projection of the input into the latent space. Equation (3)
56 computes the reconstruction of the input, while equation (4) computes the
57 reconstruction error.
58
59 .. math::
60
61 \tilde{x} ~ q_D(\tilde{x}|x) (1)
62
63 y = s(W \tilde{x} + b) (2)
64
65 z = s(W' y + b') (3)
66
67 L(x,z) = -sum_{k=1}^d [x_k \log z_k + (1-x_k) \log( 1-z_k)] (4)
68
69 """
70
71 def __init__(self, n_visible= 784, n_hidden= 500, complexity = 0.1, input= None):
72 """
73 Initialize the DAE class by specifying the number of visible units (the
74 dimension d of the input ), the number of hidden units ( the dimension
75 d' of the latent or hidden space ) and by giving a symbolic variable
76 for the input. Such a symbolic variable is useful when the input is
77 the result of some computations. For example when dealing with SDAEs,
78 the dA on layer 2 gets as input the output of the DAE on layer 1.
79 This output can be written as a function of the input to the entire
80 model, and as such can be computed by theano whenever needed.
81
82 :param n_visible: number of visible units
83
84 :param n_hidden: number of hidden units
85
86 :param input: a symbolic description of the input or None
87
88 """
89 self.n_visible = n_visible
90 self.n_hidden = n_hidden
91
92 # create a Theano random generator that gives symbolic random values
93 theano_rng = RandomStreams()
94 # create a numpy random generator
95 numpy_rng = numpy.random.RandomState()
96
97
98 # initial values for weights and biases
99 # note : W' was written as `W_prime` and b' as `b_prime`
100
101 # W is initialized with `initial_W` which is uniformely sampled
102 # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible)
103 # the output of uniform if converted using asarray to dtype
104 # theano.config.floatX so that the code is runable on GPU
105 initial_W = numpy.asarray( numpy.random.uniform( \
106 low = -numpy.sqrt(6./(n_visible+n_hidden)), \
107 high = numpy.sqrt(6./(n_visible+n_hidden)), \
108 size = (n_visible, n_hidden)), dtype = theano.config.floatX)
109 initial_b = numpy.zeros(n_hidden)
110
111 # W' is initialized with `initial_W_prime` which is uniformely sampled
112 # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible)
113 # the output of uniform if converted using asarray to dtype
114 # theano.config.floatX so that the code is runable on GPU
115 initial_b_prime= numpy.zeros(n_visible)
116
117
118 # theano shared variables for weights and biases
119 self.W = theano.shared(value = initial_W, name = "W")
120 self.b = theano.shared(value = initial_b, name = "b")
121 # tied weights, therefore W_prime is W transpose
122 self.W_prime = self.W.T
123 self.b_prime = theano.shared(value = initial_b_prime, name = "b'")
124
125 # if no input is given, generate a variable representing the input
126 if input == None :
127 # we use a matrix because we expect a minibatch of several examples,
128 # each example being a row
129 x = T.dmatrix(name = 'input')
130 else:
131 x = input
132 # Equation (1)
133 # note : first argument of theano.rng.binomial is the shape(size) of
134 # random numbers that it should produce
135 # second argument is the number of trials
136 # third argument is the probability of success of any trial
137 #
138 # this will produce an array of 0s and 1s where 1 has a
139 # probability of 0.9 and 0 of 0.1
140
141 tilde_x = theano_rng.binomial( x.shape, 1, 1-complexity) * x
142 # Equation (2)
143 # note : y is stored as an attribute of the class so that it can be
144 # used later when stacking dAs.
145 self.y = T.nnet.sigmoid(T.dot(tilde_x, self.W ) + self.b)
146 # Equation (3)
147 z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
148 # Equation (4)
149 self.L = - T.sum( x*T.log(z) + (1-x)*T.log(1-z), axis=1 )
150 # note : L is now a vector, where each element is the cross-entropy cost
151 # of the reconstruction of the corresponding example of the
152 # minibatch. We need to compute the average of all these to get
153 # the cost of the minibatch
154 self.cost = T.mean(self.L)
155 # note : y is computed from the corrupted `tilde_x`. Later on,
156 # we will need the hidden layer obtained from the uncorrupted
157 # input when for example we will pass this as input to the layer
158 # above
159 self.hidden_values = T.nnet.sigmoid( T.dot(x, self.W) + self.b)
160
161
162
163 def sgd_optimization_nist( learning_rate=0.01, \
164 n_iter = 300, n_code_layer = 400, \
165 complexity = 0.1):
166 """
167 Demonstrate stochastic gradient descent optimization for a denoising autoencoder
168
169 This is demonstrated on MNIST.
170
171 :param learning_rate: learning rate used (factor for the stochastic
172 gradient
173
174 :param pretraining_epochs: number of epoch to do pretraining
175
176 :param pretrain_lr: learning rate to be used during pre-training
177
178 :param n_iter: maximal number of iterations ot run the optimizer
179
180 """
181 #open file to save the validation and test curve
182 filename = 'lr_' + str(learning_rate) + 'ni_' + str(n_iter) + 'nc_' + str(n_code_layer) + \
183 'c_' + str(complexity) + '.txt'
184
185 result_file = open(filename, 'w')
186
187
188
189 data_path = '/data/lisa/data/nist/by_class/'
190 f = open(data_path+'all/all_train_data.ft')
191 g = open(data_path+'all/all_train_labels.ft')
192 h = open(data_path+'all/all_test_data.ft')
193 i = open(data_path+'all/all_test_labels.ft')
194
195 train_set_x = ft.read(f)
196 train_set_y = ft.read(g)
197 test_set_x = ft.read(h)
198 test_set_y = ft.read(i)
199
200 f.close()
201 g.close()
202 i.close()
203 h.close()
204
205 # make minibatches of size 20
206 batch_size = 20 # sized of the minibatch
207
208 #create a validation set the same size as the test size
209 #use the end of the training array for this purpose
210 #discard the last remaining so we get a %batch_size number
211 test_size=len(test_set_y)
212 test_size = int(test_size/batch_size)
213 test_size*=batch_size
214 train_size = len(train_set_x)
215 train_size = int(train_size/batch_size)
216 train_size*=batch_size
217 validation_size =test_size
218 offset = train_size-test_size
219 if True:
220 print 'train size = %d' %train_size
221 print 'test size = %d' %test_size
222 print 'valid size = %d' %validation_size
223 print 'offset = %d' %offset
224
225
226 #train_set = (train_set_x,train_set_y)
227 train_batches = []
228 for i in xrange(0, train_size-test_size, batch_size):
229 train_batches = train_batches + \
230 [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
231
232 test_batches = []
233 for i in xrange(0, test_size, batch_size):
234 test_batches = test_batches + \
235 [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
236
237 valid_batches = []
238 for i in xrange(0, test_size, batch_size):
239 valid_batches = valid_batches + \
240 [(train_set_x[offset+i:offset+i+batch_size], \
241 train_set_y[offset+i:offset+i+batch_size])]
242
243
244 ishape = (32,32) # this is the size of NIST images
245
246 # allocate symbolic variables for the data
247 x = T.fmatrix() # the data is presented as rasterized images
248 y = T.lvector() # the labels are presented as 1D vector of
249 # [long int] labels
250
251 # construct the denoising autoencoder class
252 n_ins = 32*32
253 encoder = dA(n_ins, n_code_layer, input = x.reshape((batch_size,n_ins)))
254
255 # Train autoencoder
256
257 # compute gradients of the layer parameters
258 gW = T.grad(encoder.cost, encoder.W)
259 gb = T.grad(encoder.cost, encoder.b)
260 gb_prime = T.grad(encoder.cost, encoder.b_prime)
261 # compute the updated value of the parameters after one step
262 updated_W = encoder.W - gW * learning_rate
263 updated_b = encoder.b - gb * learning_rate
264 updated_b_prime = encoder.b_prime - gb_prime * learning_rate
265
266 # defining the function that evaluate the symbolic description of
267 # one update step
268 train_model = theano.function([x], encoder.cost, updates=\
269 { encoder.W : updated_W, \
270 encoder.b : updated_b, \
271 encoder.b_prime : updated_b_prime } )
272
273
274
275
276 # compiling a theano function that computes the mistakes that are made
277 # by the model on a minibatch
278 test_model = theano.function([x], encoder.cost)
279
280 normalize = numpy.asarray(255, dtype=theano.config.floatX)
281
282
283 n_minibatches = len(train_batches)
284
285 # early-stopping parameters
286 patience = 10000000 / batch_size # look as this many examples regardless
287 patience_increase = 2 # wait this much longer when a new best is
288 # found
289 improvement_threshold = 0.995 # a relative improvement of this much is
290 # considered significant
291 validation_frequency = n_minibatches # go through this many
292 # minibatche before checking the network
293 # on the validation set; in this case we
294 # check every epoch
295
296
297 best_params = None
298 best_validation_loss = float('inf')
299 best_iter = 0
300 test_score = 0.
301 start_time = time.clock()
302 # have a maximum of `n_iter` iterations through the entire dataset
303 for iter in xrange(n_iter* n_minibatches):
304
305 # get epoch and minibatch index
306 epoch = iter / n_minibatches
307 minibatch_index = iter % n_minibatches
308
309 # get the minibatches corresponding to `iter` modulo
310 # `len(train_batches)`
311 x,y = train_batches[ minibatch_index ]
312 '''
313 if iter == 0:
314 b = numpy.asarray(255, dtype=theano.config.floatX)
315 x = x / b
316 print x
317 print y
318 print x.__class__
319 print x.shape
320 print x.dtype.name
321 print y.dtype.name
322 print x.min(), x.max()
323 '''
324
325 cost_ij = train_model(x/normalize)
326
327 if (iter+1) % validation_frequency == 0:
328 # compute zero-one loss on validation set
329 this_validation_loss = 0.
330 for x,y in valid_batches:
331 # sum up the errors for each minibatch
332 this_validation_loss += test_model(x/normalize)
333 # get the average by dividing with the number of minibatches
334 this_validation_loss /= len(valid_batches)
335
336 print('epoch %i, minibatch %i/%i, validation error %f ' % \
337 (epoch, minibatch_index+1, n_minibatches, \
338 this_validation_loss))
339
340 # save value in file
341 result_file.write(str(epoch) + ' ' + str(this_validation_loss)+ '\n')
342
343
344 # if we got the best validation score until now
345 if this_validation_loss < best_validation_loss:
346
347 #improve patience if loss improvement is good enough
348 if this_validation_loss < best_validation_loss * \
349 improvement_threshold :
350 patience = max(patience, iter * patience_increase)
351
352 best_validation_loss = this_validation_loss
353 best_iter = iter
354 # test it on the test set
355
356 test_score = 0.
357 for x,y in test_batches:
358 test_score += test_model(x/normalize)
359 test_score /= len(test_batches)
360 print((' epoch %i, minibatch %i/%i, test error of best '
361 'model %f ') %
362 (epoch, minibatch_index+1, n_minibatches,
363 test_score))
364
365 if patience <= iter :
366 print('iter (%i) is superior than patience(%i). break', iter, patience)
367 break
368
369
370
371 end_time = time.clock()
372 print(('Optimization complete with best validation score of %f ,'
373 'with test performance %f ') %
374 (best_validation_loss, test_score))
375 print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
376
377
378 result_file.close()
379
380 return (best_validation_loss, test_score, (end_time-start_time)/60, best_iter)
381
382 def sgd_optimization_mnist( learning_rate=0.01, \
383 n_iter = 1, n_code_layer = 400, \
384 complexity = 0.1):
385 """
386 Demonstrate stochastic gradient descent optimization for a denoising autoencoder
387
388 This is demonstrated on MNIST.
389
390 :param learning_rate: learning rate used (factor for the stochastic
391 gradient
392
393 :param pretraining_epochs: number of epoch to do pretraining
394
395 :param pretrain_lr: learning rate to be used during pre-training
396
397 :param n_iter: maximal number of iterations ot run the optimizer
398
399 """
400 #open file to save the validation and test curve
401 filename = 'lr_' + str(learning_rate) + 'ni_' + str(n_iter) + 'nc_' + str(n_code_layer) + \
402 'c_' + str(complexity) + '.txt'
403
404 result_file = open(filename, 'w')
405
406 # Load the dataset
407 f = gzip.open('/u/lisa/HTML/deep/data/mnist/mnist.pkl.gz','rb')
408 train_set, valid_set, test_set = cPickle.load(f)
409 f.close()
410
411 # make minibatches of size 20
412 batch_size = 20 # sized of the minibatch
413
414 # Dealing with the training set
415 # get the list of training images (x) and their labels (y)
416 (train_set_x, train_set_y) = train_set
417 # initialize the list of training minibatches with empty list
418 train_batches = []
419 for i in xrange(0, len(train_set_x), batch_size):
420 # add to the list of minibatches the minibatch starting at
421 # position i, ending at position i+batch_size
422 # a minibatch is a pair ; the first element of the pair is a list
423 # of datapoints, the second element is the list of corresponding
424 # labels
425 train_batches = train_batches + \
426 [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
427
428 # Dealing with the validation set
429 (valid_set_x, valid_set_y) = valid_set
430 # initialize the list of validation minibatches
431 valid_batches = []
432 for i in xrange(0, len(valid_set_x), batch_size):
433 valid_batches = valid_batches + \
434 [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
435
436 # Dealing with the testing set
437 (test_set_x, test_set_y) = test_set
438 # initialize the list of testing minibatches
439 test_batches = []
440 for i in xrange(0, len(test_set_x), batch_size):
441 test_batches = test_batches + \
442 [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
443
444
445 ishape = (28,28) # this is the size of MNIST images
446
447 # allocate symbolic variables for the data
448 x = T.fmatrix() # the data is presented as rasterized images
449 y = T.lvector() # the labels are presented as 1D vector of
450 # [long int] labels
451
452 # construct the denoising autoencoder class
453 n_ins = 28*28
454 encoder = dA(n_ins, n_code_layer, input = x.reshape((batch_size,n_ins)))
455
456 # Train autoencoder
457
458 # compute gradients of the layer parameters
459 gW = T.grad(encoder.cost, encoder.W)
460 gb = T.grad(encoder.cost, encoder.b)
461 gb_prime = T.grad(encoder.cost, encoder.b_prime)
462 # compute the updated value of the parameters after one step
463 updated_W = encoder.W - gW * learning_rate
464 updated_b = encoder.b - gb * learning_rate
465 updated_b_prime = encoder.b_prime - gb_prime * learning_rate
466
467 # defining the function that evaluate the symbolic description of
468 # one update step
469 train_model = theano.function([x], encoder.cost, updates=\
470 { encoder.W : updated_W, \
471 encoder.b : updated_b, \
472 encoder.b_prime : updated_b_prime } )
473
474
475
476
477 # compiling a theano function that computes the mistakes that are made
478 # by the model on a minibatch
479 test_model = theano.function([x], encoder.cost)
480
481
482
483
484 n_minibatches = len(train_batches)
485
486 # early-stopping parameters
487 patience = 10000# look as this many examples regardless
488 patience_increase = 2 # wait this much longer when a new best is
489 # found
490 improvement_threshold = 0.995 # a relative improvement of this much is
491 # considered significant
492 validation_frequency = n_minibatches # go through this many
493 # minibatche before checking the network
494 # on the validation set; in this case we
495 # check every epoch
496
497
498 best_params = None
499 best_validation_loss = float('inf')
500 best_iter = 0
501 test_score = 0.
502 start_time = time.clock()
503 # have a maximum of `n_iter` iterations through the entire dataset
504 for iter in xrange(n_iter* n_minibatches):
505
506 # get epoch and minibatch index
507 epoch = iter / n_minibatches
508 minibatch_index = iter % n_minibatches
509
510 # get the minibatches corresponding to `iter` modulo
511 # `len(train_batches)`
512 x,y = train_batches[ minibatch_index ]
513 cost_ij = train_model(x)
514
515 if (iter+1) % validation_frequency == 0:
516 # compute zero-one loss on validation set
517 this_validation_loss = 0.
518 for x,y in valid_batches:
519 # sum up the errors for each minibatch
520 this_validation_loss += test_model(x)
521 # get the average by dividing with the number of minibatches
522 this_validation_loss /= len(valid_batches)
523
524 print('epoch %i, minibatch %i/%i, validation error %f ' % \
525 (epoch, minibatch_index+1, n_minibatches, \
526 this_validation_loss))
527
528 # save value in file
529 result_file.write(str(epoch) + ' ' + str(this_validation_loss)+ '\n')
530
531
532 # if we got the best validation score until now
533 if this_validation_loss < best_validation_loss:
534
535 #improve patience if loss improvement is good enough
536 if this_validation_loss < best_validation_loss * \
537 improvement_threshold :
538 patience = max(patience, iter * patience_increase)
539
540 best_validation_loss = this_validation_loss
541 best_iter = iter
542 # test it on the test set
543
544 test_score = 0.
545 for x,y in test_batches:
546 test_score += test_model(x)
547 test_score /= len(test_batches)
548 print((' epoch %i, minibatch %i/%i, test error of best '
549 'model %f ') %
550 (epoch, minibatch_index+1, n_minibatches,
551 test_score))
552
553 if patience <= iter :
554 print('iter (%i) is superior than patience(%i). break', iter, patience)
555 break
556
557
558 end_time = time.clock()
559 print(('Optimization complete with best validation score of %f ,'
560 'with test performance %f ') %
561 (best_validation_loss, test_score))
562 print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
563
564
565 result_file.close()
566
567 return (best_validation_loss, test_score, (end_time-start_time)/60, best_iter)
568
569
570 def experiment(state,channel):
571
572 (best_validation_loss, test_score, minutes_trained, iter) = \
573 sgd_optimization_mnist(state.learning_rate, state.n_iter, state.n_code_layer,
574 state.complexity)
575
576 state.best_validation_loss = best_validation_loss
577 state.test_score = test_score
578 state.minutes_trained = minutes_trained
579 state.iter = iter
580
581 return channel.COMPLETE
582
583 def experiment_nist(state,channel):
584
585 (best_validation_loss, test_score, minutes_trained, iter) = \
586 sgd_optimization_nist(state.learning_rate, state.n_iter, state.n_code_layer,
587 state.complexity)
588
589 state.best_validation_loss = best_validation_loss
590 state.test_score = test_score
591 state.minutes_trained = minutes_trained
592 state.iter = iter
593
594 return channel.COMPLETE
595
596
597 if __name__ == '__main__':
598
599 sgd_optimization_nist()
600
601