190
|
1 """
|
|
2 This tutorial introduces stacked denoising auto-encoders (SdA) using Theano.
|
|
3
|
|
4 Denoising autoencoders are the building blocks for SDAE.
|
|
5 They are based on auto-encoders as the ones used in Bengio et al. 2007.
|
|
6 An autoencoder takes an input x and first maps it to a hidden representation
|
|
7 y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting
|
|
8 latent representation y is then mapped back to a "reconstructed" vector
|
|
9 z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight
|
|
10 matrix W' can optionally be constrained such that W' = W^T, in which case
|
|
11 the autoencoder is said to have tied weights. The network is trained such
|
|
12 that to minimize the reconstruction error (the error between x and z).
|
|
13
|
|
14 For the denosing autoencoder, during training, first x is corrupted into
|
|
15 \tilde{x}, where \tilde{x} is a partially destroyed version of x by means
|
|
16 of a stochastic mapping. Afterwards y is computed as before (using
|
|
17 \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction
|
|
18 error is now measured between z and the uncorrupted input x, which is
|
|
19 computed as the cross-entropy :
|
|
20 - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)]
|
|
21
|
|
22 For X iteration of the main program loop it takes *** minutes on an
|
|
23 Intel Core i7 and *** minutes on GPU (NVIDIA GTX 285 graphics processor).
|
|
24
|
|
25
|
|
26 References :
|
|
27 - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and
|
|
28 Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103,
|
|
29 2008
|
|
30 - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise
|
|
31 Training of Deep Networks, Advances in Neural Information Processing
|
|
32 Systems 19, 2007
|
|
33
|
|
34 """
|
|
35
|
|
36 import numpy
|
|
37 import theano
|
|
38 import time
|
|
39 import theano.tensor as T
|
|
40 from theano.tensor.shared_randomstreams import RandomStreams
|
|
41
|
|
42 import gzip
|
|
43 import cPickle
|
|
44
|
|
45 from pylearn.io import filetensor as ft
|
|
46
|
|
47 class dA():
|
|
48 """Denoising Auto-Encoder class (dA)
|
|
49
|
|
50 A denoising autoencoders tries to reconstruct the input from a corrupted
|
|
51 version of it by projecting it first in a latent space and reprojecting
|
|
52 it afterwards back in the input space. Please refer to Vincent et al.,2008
|
|
53 for more details. If x is the input then equation (1) computes a partially
|
|
54 destroyed version of x by means of a stochastic mapping q_D. Equation (2)
|
|
55 computes the projection of the input into the latent space. Equation (3)
|
|
56 computes the reconstruction of the input, while equation (4) computes the
|
|
57 reconstruction error.
|
|
58
|
|
59 .. math::
|
|
60
|
|
61 \tilde{x} ~ q_D(\tilde{x}|x) (1)
|
|
62
|
|
63 y = s(W \tilde{x} + b) (2)
|
|
64
|
|
65 z = s(W' y + b') (3)
|
|
66
|
|
67 L(x,z) = -sum_{k=1}^d [x_k \log z_k + (1-x_k) \log( 1-z_k)] (4)
|
|
68
|
|
69 """
|
|
70
|
|
71 def __init__(self, n_visible= 784, n_hidden= 500, complexity = 0.1, input= None):
|
|
72 """
|
|
73 Initialize the DAE class by specifying the number of visible units (the
|
|
74 dimension d of the input ), the number of hidden units ( the dimension
|
|
75 d' of the latent or hidden space ) and by giving a symbolic variable
|
|
76 for the input. Such a symbolic variable is useful when the input is
|
|
77 the result of some computations. For example when dealing with SDAEs,
|
|
78 the dA on layer 2 gets as input the output of the DAE on layer 1.
|
|
79 This output can be written as a function of the input to the entire
|
|
80 model, and as such can be computed by theano whenever needed.
|
|
81
|
|
82 :param n_visible: number of visible units
|
|
83
|
|
84 :param n_hidden: number of hidden units
|
|
85
|
|
86 :param input: a symbolic description of the input or None
|
|
87
|
|
88 """
|
|
89 self.n_visible = n_visible
|
|
90 self.n_hidden = n_hidden
|
|
91
|
|
92 # create a Theano random generator that gives symbolic random values
|
|
93 theano_rng = RandomStreams()
|
|
94 # create a numpy random generator
|
|
95 numpy_rng = numpy.random.RandomState()
|
|
96
|
|
97
|
|
98 # initial values for weights and biases
|
|
99 # note : W' was written as `W_prime` and b' as `b_prime`
|
|
100
|
|
101 # W is initialized with `initial_W` which is uniformely sampled
|
|
102 # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible)
|
|
103 # the output of uniform if converted using asarray to dtype
|
|
104 # theano.config.floatX so that the code is runable on GPU
|
|
105 initial_W = numpy.asarray( numpy.random.uniform( \
|
|
106 low = -numpy.sqrt(6./(n_visible+n_hidden)), \
|
|
107 high = numpy.sqrt(6./(n_visible+n_hidden)), \
|
|
108 size = (n_visible, n_hidden)), dtype = theano.config.floatX)
|
|
109 initial_b = numpy.zeros(n_hidden)
|
|
110
|
|
111 # W' is initialized with `initial_W_prime` which is uniformely sampled
|
|
112 # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible)
|
|
113 # the output of uniform if converted using asarray to dtype
|
|
114 # theano.config.floatX so that the code is runable on GPU
|
|
115 initial_b_prime= numpy.zeros(n_visible)
|
|
116
|
|
117
|
|
118 # theano shared variables for weights and biases
|
|
119 self.W = theano.shared(value = initial_W, name = "W")
|
|
120 self.b = theano.shared(value = initial_b, name = "b")
|
|
121 # tied weights, therefore W_prime is W transpose
|
|
122 self.W_prime = self.W.T
|
|
123 self.b_prime = theano.shared(value = initial_b_prime, name = "b'")
|
|
124
|
|
125 # if no input is given, generate a variable representing the input
|
|
126 if input == None :
|
|
127 # we use a matrix because we expect a minibatch of several examples,
|
|
128 # each example being a row
|
|
129 x = T.dmatrix(name = 'input')
|
|
130 else:
|
|
131 x = input
|
|
132 # Equation (1)
|
|
133 # note : first argument of theano.rng.binomial is the shape(size) of
|
|
134 # random numbers that it should produce
|
|
135 # second argument is the number of trials
|
|
136 # third argument is the probability of success of any trial
|
|
137 #
|
|
138 # this will produce an array of 0s and 1s where 1 has a
|
|
139 # probability of 0.9 and 0 of 0.1
|
|
140
|
|
141 tilde_x = theano_rng.binomial( x.shape, 1, 1-complexity) * x
|
|
142 # Equation (2)
|
|
143 # note : y is stored as an attribute of the class so that it can be
|
|
144 # used later when stacking dAs.
|
|
145 self.y = T.nnet.sigmoid(T.dot(tilde_x, self.W ) + self.b)
|
|
146 # Equation (3)
|
|
147 z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
|
|
148 # Equation (4)
|
|
149 self.L = - T.sum( x*T.log(z) + (1-x)*T.log(1-z), axis=1 )
|
|
150 # note : L is now a vector, where each element is the cross-entropy cost
|
|
151 # of the reconstruction of the corresponding example of the
|
|
152 # minibatch. We need to compute the average of all these to get
|
|
153 # the cost of the minibatch
|
|
154 self.cost = T.mean(self.L)
|
|
155 # note : y is computed from the corrupted `tilde_x`. Later on,
|
|
156 # we will need the hidden layer obtained from the uncorrupted
|
|
157 # input when for example we will pass this as input to the layer
|
|
158 # above
|
|
159 self.hidden_values = T.nnet.sigmoid( T.dot(x, self.W) + self.b)
|
|
160
|
|
161
|
|
162
|
|
163 def sgd_optimization_nist( learning_rate=0.01, \
|
|
164 n_iter = 300, n_code_layer = 400, \
|
|
165 complexity = 0.1):
|
|
166 """
|
|
167 Demonstrate stochastic gradient descent optimization for a denoising autoencoder
|
|
168
|
|
169 This is demonstrated on MNIST.
|
|
170
|
|
171 :param learning_rate: learning rate used (factor for the stochastic
|
|
172 gradient
|
|
173
|
|
174 :param pretraining_epochs: number of epoch to do pretraining
|
|
175
|
|
176 :param pretrain_lr: learning rate to be used during pre-training
|
|
177
|
|
178 :param n_iter: maximal number of iterations ot run the optimizer
|
|
179
|
|
180 """
|
|
181 #open file to save the validation and test curve
|
|
182 filename = 'lr_' + str(learning_rate) + 'ni_' + str(n_iter) + 'nc_' + str(n_code_layer) + \
|
|
183 'c_' + str(complexity) + '.txt'
|
|
184
|
|
185 result_file = open(filename, 'w')
|
|
186
|
|
187
|
|
188
|
|
189 data_path = '/data/lisa/data/nist/by_class/'
|
|
190 f = open(data_path+'all/all_train_data.ft')
|
|
191 g = open(data_path+'all/all_train_labels.ft')
|
|
192 h = open(data_path+'all/all_test_data.ft')
|
|
193 i = open(data_path+'all/all_test_labels.ft')
|
|
194
|
|
195 train_set_x = ft.read(f)
|
|
196 train_set_y = ft.read(g)
|
|
197 test_set_x = ft.read(h)
|
|
198 test_set_y = ft.read(i)
|
|
199
|
|
200 f.close()
|
|
201 g.close()
|
|
202 i.close()
|
|
203 h.close()
|
|
204
|
|
205 # make minibatches of size 20
|
|
206 batch_size = 20 # sized of the minibatch
|
|
207
|
|
208 #create a validation set the same size as the test size
|
|
209 #use the end of the training array for this purpose
|
|
210 #discard the last remaining so we get a %batch_size number
|
|
211 test_size=len(test_set_y)
|
|
212 test_size = int(test_size/batch_size)
|
|
213 test_size*=batch_size
|
|
214 train_size = len(train_set_x)
|
|
215 train_size = int(train_size/batch_size)
|
|
216 train_size*=batch_size
|
|
217 validation_size =test_size
|
|
218 offset = train_size-test_size
|
|
219 if True:
|
|
220 print 'train size = %d' %train_size
|
|
221 print 'test size = %d' %test_size
|
|
222 print 'valid size = %d' %validation_size
|
|
223 print 'offset = %d' %offset
|
|
224
|
|
225
|
|
226 #train_set = (train_set_x,train_set_y)
|
|
227 train_batches = []
|
|
228 for i in xrange(0, train_size-test_size, batch_size):
|
|
229 train_batches = train_batches + \
|
|
230 [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
|
|
231
|
|
232 test_batches = []
|
|
233 for i in xrange(0, test_size, batch_size):
|
|
234 test_batches = test_batches + \
|
|
235 [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
|
|
236
|
|
237 valid_batches = []
|
|
238 for i in xrange(0, test_size, batch_size):
|
|
239 valid_batches = valid_batches + \
|
|
240 [(train_set_x[offset+i:offset+i+batch_size], \
|
|
241 train_set_y[offset+i:offset+i+batch_size])]
|
|
242
|
|
243
|
|
244 ishape = (32,32) # this is the size of NIST images
|
|
245
|
|
246 # allocate symbolic variables for the data
|
|
247 x = T.fmatrix() # the data is presented as rasterized images
|
|
248 y = T.lvector() # the labels are presented as 1D vector of
|
|
249 # [long int] labels
|
|
250
|
|
251 # construct the denoising autoencoder class
|
|
252 n_ins = 32*32
|
|
253 encoder = dA(n_ins, n_code_layer, input = x.reshape((batch_size,n_ins)))
|
|
254
|
|
255 # Train autoencoder
|
|
256
|
|
257 # compute gradients of the layer parameters
|
|
258 gW = T.grad(encoder.cost, encoder.W)
|
|
259 gb = T.grad(encoder.cost, encoder.b)
|
|
260 gb_prime = T.grad(encoder.cost, encoder.b_prime)
|
|
261 # compute the updated value of the parameters after one step
|
|
262 updated_W = encoder.W - gW * learning_rate
|
|
263 updated_b = encoder.b - gb * learning_rate
|
|
264 updated_b_prime = encoder.b_prime - gb_prime * learning_rate
|
|
265
|
|
266 # defining the function that evaluate the symbolic description of
|
|
267 # one update step
|
|
268 train_model = theano.function([x], encoder.cost, updates=\
|
|
269 { encoder.W : updated_W, \
|
|
270 encoder.b : updated_b, \
|
|
271 encoder.b_prime : updated_b_prime } )
|
|
272
|
|
273
|
|
274
|
|
275
|
|
276 # compiling a theano function that computes the mistakes that are made
|
|
277 # by the model on a minibatch
|
|
278 test_model = theano.function([x], encoder.cost)
|
|
279
|
|
280 normalize = numpy.asarray(255, dtype=theano.config.floatX)
|
|
281
|
|
282
|
|
283 n_minibatches = len(train_batches)
|
|
284
|
|
285 # early-stopping parameters
|
|
286 patience = 10000000 / batch_size # look as this many examples regardless
|
|
287 patience_increase = 2 # wait this much longer when a new best is
|
|
288 # found
|
|
289 improvement_threshold = 0.995 # a relative improvement of this much is
|
|
290 # considered significant
|
|
291 validation_frequency = n_minibatches # go through this many
|
|
292 # minibatche before checking the network
|
|
293 # on the validation set; in this case we
|
|
294 # check every epoch
|
|
295
|
|
296
|
|
297 best_params = None
|
|
298 best_validation_loss = float('inf')
|
|
299 best_iter = 0
|
|
300 test_score = 0.
|
|
301 start_time = time.clock()
|
|
302 # have a maximum of `n_iter` iterations through the entire dataset
|
|
303 for iter in xrange(n_iter* n_minibatches):
|
|
304
|
|
305 # get epoch and minibatch index
|
|
306 epoch = iter / n_minibatches
|
|
307 minibatch_index = iter % n_minibatches
|
|
308
|
|
309 # get the minibatches corresponding to `iter` modulo
|
|
310 # `len(train_batches)`
|
|
311 x,y = train_batches[ minibatch_index ]
|
|
312 '''
|
|
313 if iter == 0:
|
|
314 b = numpy.asarray(255, dtype=theano.config.floatX)
|
|
315 x = x / b
|
|
316 print x
|
|
317 print y
|
|
318 print x.__class__
|
|
319 print x.shape
|
|
320 print x.dtype.name
|
|
321 print y.dtype.name
|
|
322 print x.min(), x.max()
|
|
323 '''
|
|
324
|
|
325 cost_ij = train_model(x/normalize)
|
|
326
|
|
327 if (iter+1) % validation_frequency == 0:
|
|
328 # compute zero-one loss on validation set
|
|
329 this_validation_loss = 0.
|
|
330 for x,y in valid_batches:
|
|
331 # sum up the errors for each minibatch
|
|
332 this_validation_loss += test_model(x/normalize)
|
|
333 # get the average by dividing with the number of minibatches
|
|
334 this_validation_loss /= len(valid_batches)
|
|
335
|
|
336 print('epoch %i, minibatch %i/%i, validation error %f ' % \
|
|
337 (epoch, minibatch_index+1, n_minibatches, \
|
|
338 this_validation_loss))
|
|
339
|
|
340 # save value in file
|
|
341 result_file.write(str(epoch) + ' ' + str(this_validation_loss)+ '\n')
|
|
342
|
|
343
|
|
344 # if we got the best validation score until now
|
|
345 if this_validation_loss < best_validation_loss:
|
|
346
|
|
347 #improve patience if loss improvement is good enough
|
|
348 if this_validation_loss < best_validation_loss * \
|
|
349 improvement_threshold :
|
|
350 patience = max(patience, iter * patience_increase)
|
|
351
|
|
352 best_validation_loss = this_validation_loss
|
|
353 best_iter = iter
|
|
354 # test it on the test set
|
|
355
|
|
356 test_score = 0.
|
|
357 for x,y in test_batches:
|
|
358 test_score += test_model(x/normalize)
|
|
359 test_score /= len(test_batches)
|
|
360 print((' epoch %i, minibatch %i/%i, test error of best '
|
|
361 'model %f ') %
|
|
362 (epoch, minibatch_index+1, n_minibatches,
|
|
363 test_score))
|
|
364
|
|
365 if patience <= iter :
|
|
366 print('iter (%i) is superior than patience(%i). break', iter, patience)
|
|
367 break
|
|
368
|
|
369
|
|
370
|
|
371 end_time = time.clock()
|
|
372 print(('Optimization complete with best validation score of %f ,'
|
|
373 'with test performance %f ') %
|
|
374 (best_validation_loss, test_score))
|
|
375 print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
|
|
376
|
|
377
|
|
378 result_file.close()
|
|
379
|
|
380 return (best_validation_loss, test_score, (end_time-start_time)/60, best_iter)
|
|
381
|
|
382 def sgd_optimization_mnist( learning_rate=0.01, \
|
|
383 n_iter = 1, n_code_layer = 400, \
|
|
384 complexity = 0.1):
|
|
385 """
|
|
386 Demonstrate stochastic gradient descent optimization for a denoising autoencoder
|
|
387
|
|
388 This is demonstrated on MNIST.
|
|
389
|
|
390 :param learning_rate: learning rate used (factor for the stochastic
|
|
391 gradient
|
|
392
|
|
393 :param pretraining_epochs: number of epoch to do pretraining
|
|
394
|
|
395 :param pretrain_lr: learning rate to be used during pre-training
|
|
396
|
|
397 :param n_iter: maximal number of iterations ot run the optimizer
|
|
398
|
|
399 """
|
|
400 #open file to save the validation and test curve
|
|
401 filename = 'lr_' + str(learning_rate) + 'ni_' + str(n_iter) + 'nc_' + str(n_code_layer) + \
|
|
402 'c_' + str(complexity) + '.txt'
|
|
403
|
|
404 result_file = open(filename, 'w')
|
|
405
|
|
406 # Load the dataset
|
|
407 f = gzip.open('/u/lisa/HTML/deep/data/mnist/mnist.pkl.gz','rb')
|
|
408 train_set, valid_set, test_set = cPickle.load(f)
|
|
409 f.close()
|
|
410
|
|
411 # make minibatches of size 20
|
|
412 batch_size = 20 # sized of the minibatch
|
|
413
|
|
414 # Dealing with the training set
|
|
415 # get the list of training images (x) and their labels (y)
|
|
416 (train_set_x, train_set_y) = train_set
|
|
417 # initialize the list of training minibatches with empty list
|
|
418 train_batches = []
|
|
419 for i in xrange(0, len(train_set_x), batch_size):
|
|
420 # add to the list of minibatches the minibatch starting at
|
|
421 # position i, ending at position i+batch_size
|
|
422 # a minibatch is a pair ; the first element of the pair is a list
|
|
423 # of datapoints, the second element is the list of corresponding
|
|
424 # labels
|
|
425 train_batches = train_batches + \
|
|
426 [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
|
|
427
|
|
428 # Dealing with the validation set
|
|
429 (valid_set_x, valid_set_y) = valid_set
|
|
430 # initialize the list of validation minibatches
|
|
431 valid_batches = []
|
|
432 for i in xrange(0, len(valid_set_x), batch_size):
|
|
433 valid_batches = valid_batches + \
|
|
434 [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]
|
|
435
|
|
436 # Dealing with the testing set
|
|
437 (test_set_x, test_set_y) = test_set
|
|
438 # initialize the list of testing minibatches
|
|
439 test_batches = []
|
|
440 for i in xrange(0, len(test_set_x), batch_size):
|
|
441 test_batches = test_batches + \
|
|
442 [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]
|
|
443
|
|
444
|
|
445 ishape = (28,28) # this is the size of MNIST images
|
|
446
|
|
447 # allocate symbolic variables for the data
|
|
448 x = T.fmatrix() # the data is presented as rasterized images
|
|
449 y = T.lvector() # the labels are presented as 1D vector of
|
|
450 # [long int] labels
|
|
451
|
|
452 # construct the denoising autoencoder class
|
|
453 n_ins = 28*28
|
|
454 encoder = dA(n_ins, n_code_layer, input = x.reshape((batch_size,n_ins)))
|
|
455
|
|
456 # Train autoencoder
|
|
457
|
|
458 # compute gradients of the layer parameters
|
|
459 gW = T.grad(encoder.cost, encoder.W)
|
|
460 gb = T.grad(encoder.cost, encoder.b)
|
|
461 gb_prime = T.grad(encoder.cost, encoder.b_prime)
|
|
462 # compute the updated value of the parameters after one step
|
|
463 updated_W = encoder.W - gW * learning_rate
|
|
464 updated_b = encoder.b - gb * learning_rate
|
|
465 updated_b_prime = encoder.b_prime - gb_prime * learning_rate
|
|
466
|
|
467 # defining the function that evaluate the symbolic description of
|
|
468 # one update step
|
|
469 train_model = theano.function([x], encoder.cost, updates=\
|
|
470 { encoder.W : updated_W, \
|
|
471 encoder.b : updated_b, \
|
|
472 encoder.b_prime : updated_b_prime } )
|
|
473
|
|
474
|
|
475
|
|
476
|
|
477 # compiling a theano function that computes the mistakes that are made
|
|
478 # by the model on a minibatch
|
|
479 test_model = theano.function([x], encoder.cost)
|
|
480
|
|
481
|
|
482
|
|
483
|
|
484 n_minibatches = len(train_batches)
|
|
485
|
|
486 # early-stopping parameters
|
|
487 patience = 10000# look as this many examples regardless
|
|
488 patience_increase = 2 # wait this much longer when a new best is
|
|
489 # found
|
|
490 improvement_threshold = 0.995 # a relative improvement of this much is
|
|
491 # considered significant
|
|
492 validation_frequency = n_minibatches # go through this many
|
|
493 # minibatche before checking the network
|
|
494 # on the validation set; in this case we
|
|
495 # check every epoch
|
|
496
|
|
497
|
|
498 best_params = None
|
|
499 best_validation_loss = float('inf')
|
|
500 best_iter = 0
|
|
501 test_score = 0.
|
|
502 start_time = time.clock()
|
|
503 # have a maximum of `n_iter` iterations through the entire dataset
|
|
504 for iter in xrange(n_iter* n_minibatches):
|
|
505
|
|
506 # get epoch and minibatch index
|
|
507 epoch = iter / n_minibatches
|
|
508 minibatch_index = iter % n_minibatches
|
|
509
|
|
510 # get the minibatches corresponding to `iter` modulo
|
|
511 # `len(train_batches)`
|
|
512 x,y = train_batches[ minibatch_index ]
|
|
513 cost_ij = train_model(x)
|
|
514
|
|
515 if (iter+1) % validation_frequency == 0:
|
|
516 # compute zero-one loss on validation set
|
|
517 this_validation_loss = 0.
|
|
518 for x,y in valid_batches:
|
|
519 # sum up the errors for each minibatch
|
|
520 this_validation_loss += test_model(x)
|
|
521 # get the average by dividing with the number of minibatches
|
|
522 this_validation_loss /= len(valid_batches)
|
|
523
|
|
524 print('epoch %i, minibatch %i/%i, validation error %f ' % \
|
|
525 (epoch, minibatch_index+1, n_minibatches, \
|
|
526 this_validation_loss))
|
|
527
|
|
528 # save value in file
|
|
529 result_file.write(str(epoch) + ' ' + str(this_validation_loss)+ '\n')
|
|
530
|
|
531
|
|
532 # if we got the best validation score until now
|
|
533 if this_validation_loss < best_validation_loss:
|
|
534
|
|
535 #improve patience if loss improvement is good enough
|
|
536 if this_validation_loss < best_validation_loss * \
|
|
537 improvement_threshold :
|
|
538 patience = max(patience, iter * patience_increase)
|
|
539
|
|
540 best_validation_loss = this_validation_loss
|
|
541 best_iter = iter
|
|
542 # test it on the test set
|
|
543
|
|
544 test_score = 0.
|
|
545 for x,y in test_batches:
|
|
546 test_score += test_model(x)
|
|
547 test_score /= len(test_batches)
|
|
548 print((' epoch %i, minibatch %i/%i, test error of best '
|
|
549 'model %f ') %
|
|
550 (epoch, minibatch_index+1, n_minibatches,
|
|
551 test_score))
|
|
552
|
|
553 if patience <= iter :
|
|
554 print('iter (%i) is superior than patience(%i). break', iter, patience)
|
|
555 break
|
|
556
|
|
557
|
|
558 end_time = time.clock()
|
|
559 print(('Optimization complete with best validation score of %f ,'
|
|
560 'with test performance %f ') %
|
|
561 (best_validation_loss, test_score))
|
|
562 print ('The code ran for %f minutes' % ((end_time-start_time)/60.))
|
|
563
|
|
564
|
|
565 result_file.close()
|
|
566
|
|
567 return (best_validation_loss, test_score, (end_time-start_time)/60, best_iter)
|
|
568
|
|
569
|
|
570 def experiment(state,channel):
|
|
571
|
|
572 (best_validation_loss, test_score, minutes_trained, iter) = \
|
|
573 sgd_optimization_mnist(state.learning_rate, state.n_iter, state.n_code_layer,
|
|
574 state.complexity)
|
|
575
|
|
576 state.best_validation_loss = best_validation_loss
|
|
577 state.test_score = test_score
|
|
578 state.minutes_trained = minutes_trained
|
|
579 state.iter = iter
|
|
580
|
|
581 return channel.COMPLETE
|
|
582
|
|
583 def experiment_nist(state,channel):
|
|
584
|
|
585 (best_validation_loss, test_score, minutes_trained, iter) = \
|
|
586 sgd_optimization_nist(state.learning_rate, state.n_iter, state.n_code_layer,
|
|
587 state.complexity)
|
|
588
|
|
589 state.best_validation_loss = best_validation_loss
|
|
590 state.test_score = test_score
|
|
591 state.minutes_trained = minutes_trained
|
|
592 state.iter = iter
|
|
593
|
|
594 return channel.COMPLETE
|
|
595
|
|
596
|
|
597 if __name__ == '__main__':
|
|
598
|
|
599 sgd_optimization_nist()
|
|
600
|
|
601
|