comparison code_tutoriel/deep.py @ 165:4bc5eeec6394

Updating the tutorial code to the latest revisions.
author Dumitru Erhan <dumitru.erhan@gmail.com>
date Fri, 26 Feb 2010 13:55:27 -0500
parents 164:e3de934a98b6
1 """
2 Draft of DBN, DAA, SDAA, RBM tutorial code
3
4 """
5 import sys
6 import numpy
7 import theano
8 import time
9 import theano.tensor as T
10 from theano.tensor.shared_randomstreams import RandomStreams
11 from theano import shared, function
12
13 import gzip
14 import cPickle
15 import pylearn.io.image_tiling
16 import PIL
17
18 # NNET STUFF
19
20 class LogisticRegression(object):
21 """Multi-class Logistic Regression Class
22
23 The logistic regression is fully described by a weight matrix :math:`W`
24 and bias vector :math:`b`. Classification is done by projecting data
25 points onto a set of hyperplanes, the distance to which is used to
26 determine a class membership probability.
27 """
28
29 def __init__(self, input, n_in, n_out):
30 """ Initialize the parameters of the logistic regression
31 :param input: symbolic variable that describes the input of the
32 architecture (one minibatch)
33 :type n_in: int
34 :param n_in: number of input units, the dimension of the space in
35 which the datapoints lie
36 :type n_out: int
37 :param n_out: number of output units, the dimension of the space in
38 which the labels lie
39 """
40
41 # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
42 self.W = theano.shared( value=numpy.zeros((n_in,n_out),
43 dtype = theano.config.floatX) )
44 # initialize the biases b as a vector of n_out 0s
45 self.b = theano.shared( value=numpy.zeros((n_out,),
46 dtype = theano.config.floatX) )
47 # compute vector of class-membership probabilities in symbolic form
48 self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b)
49
50 # compute prediction as class whose probability is maximal in
51 # symbolic form
52 self.y_pred=T.argmax(self.p_y_given_x, axis=1)
53
54 # list of parameters for this layer
55 self.params = [self.W, self.b]
56
57 def negative_log_likelihood(self, y):
58 """Return the mean of the negative log-likelihood of the prediction
59 of this model under a given target distribution.
60 :param y: a vector that gives, for each example in the minibatch, the
61 correct label
62 Note: we use the mean instead of the sum so that
63 the learning rate is less dependent on the batch size
64 """
65 return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y])
66
67 def errors(self, y):
68 """Return a float representing the fraction of examples in the
69 minibatch that are misclassified, i.e. the zero-one loss
70 averaged over the size of the minibatch
71 """
72 # check if y has same dimension of y_pred
73 if y.ndim != self.y_pred.ndim:
74 raise TypeError('y should have the same shape as self.y_pred',
75 ('y', y.type, 'y_pred', self.y_pred.type))
76
77 # check if y is of the correct datatype
78 if y.dtype.startswith('int'):
79 # the T.neq operator returns a vector of 0s and 1s, where 1
80 # represents a mistake in prediction
81 return T.mean(T.neq(self.y_pred, y))
82 else:
83 raise NotImplementedError()
84
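# --------------------------------------------------------------------------
# Usage sketch (illustrative only, guarded by ``if 0`` like the DAA code at
# the bottom of this file): compile prediction and error functions for a
# standalone LogisticRegression. The names x_demo, y_demo and clf are
# assumptions made for this sketch, not part of the tutorial model.
if 0:
    x_demo = T.matrix('x_demo')   # a minibatch of rasterized images
    y_demo = T.ivector('y_demo')  # the corresponding integer labels
    clf = LogisticRegression(input=x_demo, n_in=28*28, n_out=10)
    predict = theano.function([x_demo], clf.y_pred)
    error_rate = theano.function([x_demo, y_demo], clf.errors(y_demo))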
85 class SigmoidalLayer(object):
86 def __init__(self, rng, input, n_in, n_out):
87 """
88 Typical hidden layer of an MLP: units are fully-connected and have
89 sigmoidal activation function. Weight matrix W is of shape (n_in,n_out)
90 and the bias vector b is of shape (n_out,).
91
92 Hidden unit activation is given by: sigmoid(dot(input,W) + b)
93
94 :type rng: numpy.random.RandomState
95 :param rng: a random number generator used to initialize weights
96 :type input: theano.tensor.matrix
97 :param input: a symbolic tensor of shape (n_examples, n_in)
98 :type n_in: int
99 :param n_in: dimensionality of input
100 :type n_out: int
101 :param n_out: number of hidden units
102 """
103 self.input = input
104
105 W_values = numpy.asarray( rng.uniform( \
106 low = -numpy.sqrt(6./(n_in+n_out)), \
107 high = numpy.sqrt(6./(n_in+n_out)), \
108 size = (n_in, n_out)), dtype = theano.config.floatX)
109 self.W = theano.shared(value = W_values)
110
111 b_values = numpy.zeros((n_out,), dtype= theano.config.floatX)
112 self.b = theano.shared(value= b_values)
113
114 self.output = T.nnet.sigmoid(T.dot(input, self.W) + self.b)
115 self.params = [self.W, self.b]
116
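# --------------------------------------------------------------------------
# Usage sketch (illustrative only): compose a SigmoidalLayer with a
# LogisticRegression to get a one-hidden-layer MLP, which is what the DBN
# class below does repeatedly. The sizes and names here are arbitrary
# assumptions made for this sketch.
if 0:
    rng_demo = numpy.random.RandomState(123)
    x_demo = T.matrix('x_demo')
    hidden = SigmoidalLayer(rng_demo, x_demo, n_in=28*28, n_out=500)
    top = LogisticRegression(input=hidden.output, n_in=500, n_out=10)
    mlp_params = hidden.params + top.params  # what supervised fine-tuning would update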
117 # PRETRAINING LAYERS
118
119 class RBM(object):
120 """ Restricted Boltzmann Machine. Using the same letters as the code below
121 (visible v, hidden h), the energy is E(v,h) = -hbias.h - vbias.v - v.W.h, and the
122 free energy is F(v) = -vbias.v - sum_j log(1 + exp(hbias_j + (v.W)_j)). """
123
124 def __init__(self, input=None, n_visible=None, n_hidden=None,
125 W=None, hbias=None, vbias=None,
126 numpy_rng=None, theano_rng=None):
127 """
128 RBM constructor. Defines the parameters of the model along with
129 basic operations for inferring hidden from visible (and vice-versa),
130 as well as for performing CD updates.
131
132 :param input: None for standalone RBMs or symbolic variable if RBM is
133 part of a larger graph.
134
135 :param n_visible: number of visible units (necessary when W or vbias is None)
136
137 :param n_hidden: number of hidden units (necessary when W or hbias is None)
138
139 :param W: weights to use for the RBM. None means that a shared variable will be
140 created with a randomly chosen matrix of size (n_visible, n_hidden).
141
142 :param hbias: hidden-unit biases; None means a shared vector of n_hidden zeros will be created
143
144 :param vbias: visible-unit biases; None means a shared vector of n_visible zeros will be created
145
146 :param numpy_rng: random number generator (necessary when W is None)
147
148 """
149
150 params = []
151 if W is None:
152 # choose initial values for weight matrix of RBM
153 initial_W = numpy.asarray(
154 numpy_rng.uniform( \
155 low=-numpy.sqrt(6./(n_hidden+n_visible)), \
156 high=numpy.sqrt(6./(n_hidden+n_visible)), \
157 size=(n_visible, n_hidden)), \
158 dtype=theano.config.floatX)
159 W = theano.shared(value=initial_W, name='W')
160 params.append(W)
161
162 if hbias is None:
163 # theano shared variables for hidden biases
164 hbias = theano.shared(value=numpy.zeros(n_hidden,
165 dtype=theano.config.floatX), name='hbias')
166 params.append(hbias)
167
168 if vbias is None:
169 # theano shared variables for visible biases
170 vbias = theano.shared(value=numpy.zeros(n_visible,
171 dtype=theano.config.floatX), name='vbias')
172 params.append(vbias)
173
174 if input is None:
175 # initialize input layer for standalone RBM or layer0 of DBN
176 input = T.matrix('input')
177
178 # setup theano random number generator
179 if theano_rng is None:
180 theano_rng = RandomStreams(numpy_rng.randint(2**30))
181
182 self.visible = self.input = input
183 self.W = W
184 self.hbias = hbias
185 self.vbias = vbias
186 self.theano_rng = theano_rng
187 self.params = params
188 self.hidden_mean = T.nnet.sigmoid(T.dot(input, W)+hbias)
189 self.hidden_sample = theano_rng.binomial(self.hidden_mean.shape, 1, self.hidden_mean)
190
191 def gibbs_k(self, v_sample, k):
192 ''' This function implements k steps of Gibbs sampling '''
193
194 # We compute the state of the visible units after k steps of Gibbs
195 # sampling by applying ``gibbs_1`` k times; this can be done in Theano
196 # using the `scan op`. For a more comprehensive description of scan see
197 # http://deeplearning.net/software/theano/library/scan.html .
198
199 def gibbs_1(v0_sample, W, hbias, vbias):
200 ''' This function implements one Gibbs step '''
201
202 # compute the activation of the hidden units given a sample of the
203 # visibles
204 h0_mean = T.nnet.sigmoid(T.dot(v0_sample, W) + hbias)
205 # get a sample of the hiddens given their activation
206 h0_sample = self.theano_rng.binomial(h0_mean.shape, 1, h0_mean)
207 # compute the activation of the visible units given the hidden sample
208 v1_mean = T.nnet.sigmoid(T.dot(h0_sample, W.T) + vbias)
209 # get a sample of the visible units given their activation
210 v1_act = self.theano_rng.binomial(v1_mean.shape, 1, v1_mean)
211 return [v1_mean, v1_act]
212
213
214 # for k == 1 we can skip scan entirely (this also makes debugging easier)
215 if k == 1:
216 return gibbs_1(v_sample, self.W, self.hbias, self.vbias)
217
218
219 # Because we require as output two values, namely the mean field
220 # approximation of the visible and the sample obtained after k steps,
221 # scan needs to know the shape of those two outputs. Scan takes
222 # this information from the variables containing the initial state
223 # of the outputs. Since we do not need an initial state for ``v_mean``,
224 # we provide a dummy one used only to get the correct shape
225 v_mean = T.zeros_like(v_sample)
226
227 # ``outputs_taps`` is an argument of scan which describes at each
228 # time step what past values of the outputs the function applied
229 # recursively needs. This is given in the form of a dictionary,
230 # where the keys are output indices, and the values are lists of
231 # the offsets used by the corresponding outputs.
232 # In our case the recursively applied function ``gibbs_1`` requires,
233 # at time k, the past value k-1 of the first output (index 0) and
234 # no past value of the second output
235 outputs_taps = { 0 : [-1], 1 : [] }
236
237 v_means, v_samples = theano.scan( fn = gibbs_1,
238 sequences = [],
239 initial_states = [v_sample, v_mean],
240 non_sequences = [self.W, self.hbias, self.vbias],
241 outputs_taps = outputs_taps,
242 n_steps = k)
243 return v_means[-1], v_samples[-1]
244
245 def free_energy(self, v_sample):
246 wx_b = T.dot(v_sample, self.W) + self.hbias
247 vbias_term = T.sum(T.dot(v_sample, self.vbias))
248 hidden_term = T.sum(T.log(1+T.exp(wx_b)))
249 return -hidden_term - vbias_term
250
251 def cd(self, visible = None, persistent = None, steps = 1):
252 """
253 Return a 5-tuple of values related to contrastive divergence: (cost,
254 end-state of negative-phase chain, gradient on weights, gradient on
255 hidden bias, gradient on visible bias)
256
257 If visible is None, it defaults to self.input
258 If persistent is None, the chain starts from `visible` (plain CD); if it is a shared variable, the chain continues from it (PCD)
259
260 CD aka CD1 - cd()
261 CD-10 - cd(steps=10)
262 PCD - cd(persistent=shared(numpy.asarray(initializer)))
263 PCD-k - cd(persistent=shared(numpy.asarray(initializer)),
264 steps=k)
265 """
266 if visible is None:
267 visible = self.input
268
269 if visible is None:
270 raise TypeError('visible argument is required when self.input is None')
271
272 if steps is None:
273 steps = 1
274
275 if persistent is None:
276 chain_start = visible
277 else:
278 chain_start = persistent
279
280 chain_end_mean, chain_end_sample = self.gibbs_k(chain_start, steps)
281
282 #print >> sys.stderr, "WARNING: DEBUGGING with wrong FREE ENERGY"
283 #free_energy_delta = - self.free_energy(chain_end_sample)
284 free_energy_delta = self.free_energy(visible) - self.free_energy(chain_end_sample)
285
286 # we will return all of these regardless of what is in self.params
287 all_params = [self.W, self.hbias, self.vbias]
288
289 gparams = T.grad(free_energy_delta, all_params,
290 consider_constant = [chain_end_sample])
291
292 cross_entropy = -T.mean(T.sum(
293 visible*T.log(chain_end_mean) + (1 - visible)*T.log(1-chain_end_mean),
294 axis = 1))
295
296 return (cross_entropy, chain_end_sample,) + tuple(gparams)
297
298 def cd_updates(self, lr, visible = None, persistent = None, steps = 1):
299 """
300 Return the learning updates for the RBM parameters that are shared variables.
301
302 Also returns an update for ``persistent`` if it is a shared variable.
303
304 These updates are returned as a dictionary.
305
306 :param lr: [scalar] learning rate for contrastive divergence learning
307 :param visible: see `cd`
308 :param persistent: see `cd`
309 :param steps: see `cd`
310
311 """
312
313 cross_entropy, chain_end, gW, ghbias, gvbias = self.cd(visible,
314 persistent, steps)
315
316 updates = {}
317 if hasattr(self.W, 'value'):
318 updates[self.W] = self.W - lr * gW
319 if hasattr(self.hbias, 'value'):
320 updates[self.hbias] = self.hbias - lr * ghbias
321 if hasattr(self.vbias, 'value'):
322 updates[self.vbias] = self.vbias - lr * gvbias
323 if persistent is not None:
324 # if persistent is a shared variable, advance the PCD chain by storing the chain end in it
325 updates[persistent] = chain_end
326
327 return updates
328
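# --------------------------------------------------------------------------
# Usage sketch (illustrative only): the CD / PCD variants documented in
# ``cd`` above, for a standalone RBM. The sizes, the learning rate and the
# persistent-chain initializer are assumptions made for this sketch.
if 0:
    rbm_demo = RBM(n_visible=784, n_hidden=500,
                   numpy_rng=numpy.random.RandomState(123))
    # plain CD-1 updates for W, hbias and vbias
    cd1_updates = rbm_demo.cd_updates(lr=0.1)
    # PCD-10: the negative chain is kept in a shared variable, and the same
    # update dictionary also advances that chain
    persistent_chain = shared(numpy.zeros((20, 784),
                              dtype=theano.config.floatX))
    pcd_updates = rbm_demo.cd_updates(lr=0.1, persistent=persistent_chain,
                                      steps=10)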
329 # DEEP MODELS
330
331 class DBN(object):
332 """ A Deep Belief Network: a stack of RBMs whose learned weights initialize
333 the sigmoidal layers of an MLP that is topped by a logistic regression
334 layer and then fine-tuned with supervised training. """
335
336 def __init__(self, input_len, hidden_layers_sizes, n_classes, rng):
337 """ This class is made to support a variable number of layers.
338
339
340 :param input_len: dimension of the input to the DBN
341
342 :param hidden_layers_sizes: sizes of the intermediate layers; must
343 contain at least one value
344
345 :param n_classes: dimension of the output of the network
346
347 :param rng: numpy random number generator used to draw initial weights
348 """
360
361 self.sigmoid_layers = []
362 self.rbm_layers = []
363 self.pretrain_functions = []
364 self.params = []
365
366 theano_rng = RandomStreams(rng.randint(2**30))
367
368 # allocate symbolic variables for the data
369 index = T.lscalar() # index to a [mini]batch
370 self.x = T.matrix('x') # the data is presented as rasterized images
371 self.y = T.ivector('y') # the labels are presented as 1D vector of
372 # [int] labels
373 input = self.x
374
375 # The DBN is an MLP for which all weights of the intermediate layers
376 # are shared with RBMs. We first construct the DBN as a deep multilayer
377 # perceptron, and while constructing each sigmoidal layer we also
378 # construct an RBM that shares weights with that layer. Training
379 # functions for those RBMs are compiled separately by
380 # ``pretraining_functions``.
381
382 for n_hid in hidden_layers_sizes:
383 # construct the sigmoidal layer
384
385 sigmoid_layer = SigmoidalLayer(rng, input, input_len, n_hid)
386 self.sigmoid_layers.append(sigmoid_layer)
387
388 self.rbm_layers.append(RBM(input=input,
389 W=sigmoid_layer.W,
390 hbias=sigmoid_layer.b,
391 n_visible = input_len,
392 n_hidden = n_hid,
393 numpy_rng=rng,
394 theano_rng=theano_rng))
395
396 # it's arguably a philosophical question...
397 # but we are going to only declare that the parameters of the
398 # sigmoid_layers are parameters of the DBN;
399 # the visible biases (vbias) in the rbm_layers are parameters of those
400 # RBMs, but not of the DBN
401 self.params.extend(self.sigmoid_layers[-1].params)
402
403 # get ready for the next loop iteration
404 input_len = n_hid
405 input = self.sigmoid_layers[-1].output
406
407 # We now need to add a logistic layer on top of the MLP
408 self.logistic_regressor = LogisticRegression(input = input,
409 n_in = input_len, n_out = n_classes)
410
411 self.params.extend(self.logistic_regressor.params)
412
413 def pretraining_functions(self, train_set_x, batch_size, learning_rate, k=1):
414 if k!=1:
415 raise NotImplementedError()
416 index = T.lscalar() # index to a [mini]batch
417 n_train_batches = train_set_x.value.shape[0] / batch_size
418 batch_begin = (index % n_train_batches) * batch_size
419 batch_end = batch_begin+batch_size
420
421 print 'TRAIN_SET X', train_set_x.value.shape
422 rval = []
423 for rbm in self.rbm_layers:
424 # N.B. these cd() samples are independent from the
425 # samples used for learning
426 outputs = list(rbm.cd())[0:2]
427 rval.append(function([index], outputs,
428 updates = rbm.cd_updates(lr=learning_rate),
429 givens = {self.x: train_set_x[batch_begin:batch_end]}))
430 if rbm is self.rbm_layers[0]:
431 f = rval[-1]
432 # debugging: print the graphs of the outputs of the compiled function
433 for i, implicit_out in enumerate(f.maker.env.outputs):
434 print 'OUTPUT ', i
435 theano.printing.debugprint(implicit_out, file=sys.stdout)
436
437 return rval
438
439 def finetune(self, datasets, lr, batch_size):
440
441 # unpack the various datasets
442 (train_set_x, train_set_y) = datasets[0]
443 (valid_set_x, valid_set_y) = datasets[1]
444 (test_set_x, test_set_y) = datasets[2]
445
446 # compute number of minibatches for training, validation and testing
447 assert train_set_x.value.shape[0] % batch_size == 0
448 assert valid_set_x.value.shape[0] % batch_size == 0
449 assert test_set_x.value.shape[0] % batch_size == 0
450 n_train_batches = train_set_x.value.shape[0] / batch_size
451 n_valid_batches = valid_set_x.value.shape[0] / batch_size
452 n_test_batches = test_set_x.value.shape[0] / batch_size
453
454 index = T.lscalar() # index to a [mini]batch
455 target = self.y
456
457 train_index = index % n_train_batches
458
459 classifier = self.logistic_regressor
460 cost = classifier.negative_log_likelihood(target)
461 # compute the gradients with respect to the model parameters
462 gparams = T.grad(cost, self.params)
463
464 # compute list of fine-tuning updates
465 updates = [(param, param - gparam*lr)
466 for param,gparam in zip(self.params, gparams)]
467
468 train_fn = theano.function([index], cost,
469 updates = updates,
470 givens = {
471 self.x : train_set_x[train_index*batch_size:(train_index+1)*batch_size],
472 target : train_set_y[train_index*batch_size:(train_index+1)*batch_size]})
473
474 test_score_i = theano.function([index], classifier.errors(target),
475 givens = {
476 self.x: test_set_x[index*batch_size:(index+1)*batch_size],
477 target: test_set_y[index*batch_size:(index+1)*batch_size]})
478
479 valid_score_i = theano.function([index], classifier.errors(target),
480 givens = {
481 self.x: valid_set_x[index*batch_size:(index+1)*batch_size],
482 target: valid_set_y[index*batch_size:(index+1)*batch_size]})
483
484 def test_scores():
485 return [test_score_i(i) for i in xrange(n_test_batches)]
486
487 def valid_scores():
488 return [valid_score_i(i) for i in xrange(n_valid_batches)]
489
490 return train_fn, valid_scores, test_scores
491
492 def load_mnist(filename):
493 f = gzip.open(filename,'rb')
494 train_set, valid_set, test_set = cPickle.load(f)
495 f.close()
496
497 def shared_dataset(data_xy):
498 data_x, data_y = data_xy
499 shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX))
500 shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
501 return shared_x, T.cast(shared_y, 'int32')
502
503 n_train_examples = train_set[0].shape[0]
504 datasets = shared_dataset(train_set), shared_dataset(valid_set), shared_dataset(test_set)
505
506 return n_train_examples, datasets
507
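# --------------------------------------------------------------------------
# Usage sketch (illustrative only): ``load_mnist`` returns the number of
# training examples plus three (x, y) pairs of shared datasets. The path
# below is an assumption.
if 0:
    n_train, ((train_x, train_y),
              (valid_x, valid_y),
              (test_x, test_y)) = load_mnist('mnist.pkl.gz')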
508 def dbn_main(finetune_lr = 0.01,
509 pretraining_epochs = 10,
510 pretrain_lr = 0.1,
511 training_epochs = 1000,
512 batch_size = 20,
513 mnist_file='mnist.pkl.gz'):
514 """
515 Demonstrate stochastic gradient descent optimization for a multilayer perceptron
516
517 This is demonstrated on MNIST.
518
519 :param finetune_lr: learning rate used in the finetune stage
520 (factor for the stochastic gradient)
521
522 :param pretraining_epochs: number of epochs of pretraining
523
524 :param pretrain_lr: learning rate to be used during pre-training
525
526 :param training_epochs: maximal number of epochs to run the fine-tuning optimizer
527
528 :param mnist_file: path to the pickled MNIST file
529
530 """
531
532 n_train_examples, train_valid_test = load_mnist(mnist_file)
533
534 print "Creating a Deep Belief Network"
535 deep_model = DBN(
536 input_len=28*28,
537 hidden_layers_sizes = [500, 150, 100],
538 n_classes=10,
539 rng = numpy.random.RandomState())
540
541 ####
542 #### Phase 1: Pre-training
543 ####
544 print "Pretraining (unsupervised learning) ..."
545
546 pretrain_functions = deep_model.pretraining_functions(
547 batch_size=batch_size,
548 train_set_x=train_valid_test[0][0],
549 learning_rate=pretrain_lr,
550 )
551
552 start_time = time.clock()
553 for layer_idx, pretrain_fn in enumerate(pretrain_functions):
554 # go through pretraining epochs
555 print 'Pre-training layer %i'% layer_idx
556 for i in xrange(pretraining_epochs * n_train_examples / batch_size):
557 outstuff = pretrain_fn(i)
558 xe, negsample = outstuff[:2]
559 print (layer_idx, i,
560 n_train_examples / batch_size,
561 float(xe),
562 'Wmin', deep_model.rbm_layers[0].W.value.min(),
563 'Wmax', deep_model.rbm_layers[0].W.value.max(),
564 'vmin', deep_model.rbm_layers[0].vbias.value.min(),
565 'vmax', deep_model.rbm_layers[0].vbias.value.max(),
566 #'x>0.3', (input_i>0.3).sum(),
567 )
568 sys.stdout.flush()
569 if i % 1000 == 0:
570 PIL.Image.fromarray(
571 pylearn.io.image_tiling.tile_raster_images(negsample, (28,28), (10,10),
572 tile_spacing=(1,1))).save('samples_%i_%i.png'%(layer_idx,i))
573
574 PIL.Image.fromarray(
575 pylearn.io.image_tiling.tile_raster_images(
576 deep_model.rbm_layers[0].W.value.T,
577 (28,28), (10,10),
578 tile_spacing=(1,1))).save('filters_%i_%i.png'%(layer_idx,i))
579 end_time = time.clock()
580 print 'Pretraining took %f minutes' %((end_time - start_time)/60.)
581
583
584 print "Fine tuning (supervised learning) ..."
585 train_fn, valid_scores, test_scores =\
586 deep_model.finetune(train_valid_test,
587 lr=finetune_lr, # the learning rate
588 batch_size = batch_size) # number of examples to use at once
589
590 ####
591 #### Phase 2: Fine Tuning
592 ####
593
594 patience = 10000 # look as this many examples regardless
595 patience_increase = 2. # wait this much longer when a new best is
596 # found
597 improvement_threshold = 0.995 # a relative improvement of this much is
598 # considered significant
599 validation_frequency = min(n_train_examples, patience/2)
600 # go through this many
601 # minibatche before checking the network
602 # on the validation set; in this case we
603 # check every epoch
604
605 patience_max = n_train_examples * training_epochs
606
607 best_epoch = None
608 best_epoch_test_score = None
609 best_epoch_valid_score = float('inf')
610 start_time = time.clock()
611
612 for i in xrange(patience_max):
613 if i >= patience:
614 break
615
616 cost_i = train_fn(i)
617
618 if i % validation_frequency == 0:
619 validation_i = numpy.mean([score for score in valid_scores()])
620
621 # if we got the best validation score until now
622 if validation_i < best_epoch_valid_score:
623
624 # improve patience if loss improvement is good enough
625 threshold_i = best_epoch_valid_score * improvement_threshold
626 if validation_i < threshold_i:
627 patience = max(patience, i * patience_increase)
628
629 # save best validation score and iteration number
630 best_epoch_valid_score = validation_i
631 best_epoch = i/validation_frequency
632 best_epoch_test_score = numpy.mean(
633 [score for score in test_scores()])
634
635 print('epoch %i, validation error %f %%, test error %f %%'%(
636 i/validation_frequency, validation_i*100.,
637 best_epoch_test_score*100.))
638 else:
639 print('epoch %i, validation error %f %%' % (
640 i/validation_frequency, validation_i*100.))
641 end_time = time.clock()
642
643 print(('Optimization complete with best validation score of %f %%,'
644 ' with test performance %f %%') %
645 (best_epoch_valid_score*100., best_epoch_test_score*100.))
646 print ('The code ran for %f minutes' % ((end_time - start_time)/60.))
648
649 def rbm_main():
650 rbm = RBM(n_visible=20, n_hidden=30,
651 numpy_rng = numpy.random.RandomState(34))
652
653 cd_updates = rbm.cd_updates(lr=0.25)
654
655 print cd_updates
656
657 f = function([rbm.input], [],
658 updates={rbm.W:cd_updates[rbm.W]})
659
660 theano.printing.debugprint(f.maker.env.outputs[0],
661 file=sys.stdout)
662
663
664 if __name__ == '__main__':
665 dbn_main()
666 #rbm_main()
667
668
669 if 0:
670 class DAA(object):
671 def __init__(self, n_visible= 784, n_hidden= 500, corruption_level = 0.1,\
672 input = None, shared_W = None, shared_b = None):
673 """
674 Initialize the dA class by specifying the number of visible units (the
675 dimension d of the input ), the number of hidden units ( the dimension
676 d' of the latent or hidden space ) and the corruption level. The
677 constructor also receives symbolic variables for the input, weights and
678 bias. Such symbolic variables are useful when, for example, the input is
679 the result of some computation, or when weights are shared between the
680 dA and an MLP layer. When dealing with SdAs this always happens:
681 the dA on layer 2 gets as input the output of the dA on layer 1,
682 and the weights of the dA are used in the second stage of training
683 to construct an MLP.
684
685 :param n_visible: number of visible units
686
687 :param n_hidden: number of hidden units
688
689 :param input: a symbolic description of the input or None
690
691 :param corruption_level: the corruption mechanism picks up randomly this
692 fraction of entries of the input and turns them to 0
693
694
695 """
696 self.n_visible = n_visible
697 self.n_hidden = n_hidden
698
699 # create a Theano random generator that gives symbolic random values
700 theano_rng = RandomStreams()
701
702 if shared_W is not None and shared_b is not None:
703 self.W = shared_W
704 self.b = shared_b
705 else:
706 # initial values for weights and biases
707 # note : W' was written as `W_prime` and b' as `b_prime`
708
709 # W is initialized with `initial_W`, which is uniformly sampled
710 # from -sqrt(6./(n_visible+n_hidden)) to sqrt(6./(n_hidden+n_visible));
711 # the output of uniform is converted using asarray to dtype
712 # theano.config.floatX so that the code is runnable on a GPU
713 initial_W = numpy.asarray( numpy.random.uniform( \
714 low = -numpy.sqrt(6./(n_hidden+n_visible)), \
715 high = numpy.sqrt(6./(n_hidden+n_visible)), \
716 size = (n_visible, n_hidden)), dtype = theano.config.floatX)
717 initial_b = numpy.zeros(n_hidden, dtype = theano.config.floatX)
718
719
720 # theano shared variables for weights and biases
721 self.W = theano.shared(value = initial_W, name = "W")
722 self.b = theano.shared(value = initial_b, name = "b")
723
724
725 initial_b_prime = numpy.zeros(n_visible, dtype = theano.config.floatX)
726 # tied weights, therefore W_prime is W transpose
727 self.W_prime = self.W.T
728 self.b_prime = theano.shared(value = initial_b_prime, name = "b'")
729
730 # if no input is given, generate a variable representing the input
731 if input is None:
732 # we use a matrix because we expect a minibatch of several examples,
733 # each example being a row
734 self.x = T.matrix(name = 'input')
735 else:
736 self.x = input
737 # Equation (1)
738 # keep a fraction (1 - ``corruption_level``) of the inputs unchanged and zero out a randomly selected subset of the rest
739 # note : first argument of theano.rng.binomial is the shape(size) of
740 # random numbers that it should produce
741 # second argument is the number of trials
742 # third argument is the probability of success of any trial
743 #
744 # this will produce an array of 0s and 1s where 1 has a
745 # probability of 1 - ``corruption_level`` and 0 with
746 # ``corruption_level``
747 self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level) * self.x
748 # Equation (2)
749 # note : y is stored as an attribute of the class so that it can be
750 # used later when stacking dAs.
751 self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b)
752 # Equation (3)
753 self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
754 # Equation (4)
755 # note : we sum over the size of a datapoint; if we are using minibatches,
756 # L will be a vector, with one entry per example in minibatch
757 self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 )
758 # note : L is now a vector, where each element is the cross-entropy cost
759 # of the reconstruction of the corresponding example of the
760 # minibatch. We need to compute the average of all these to get
761 # the cost of the minibatch
762 self.cost = T.mean(self.L)
763
764 self.params = [ self.W, self.b, self.b_prime ]
765
766 class StackedDAA(object):
767 """Stacked denoising auto-encoder class (SdA)
768
769 A stacked denoising autoencoder model is obtained by stacking several
770 dAs. The hidden layer of the dA at layer `i` becomes the input of
771 the dA at layer `i+1`. The first layer dA gets as input the input of
772 the SdA, and the hidden layer of the last dA represents the output.
773 Note that after pretraining, the SdA is dealt with as a normal MLP,
774 the dAs are only used to initialize the weights.
775 """
776
777 def __init__(self, n_ins, hidden_layers_sizes, n_outs,
778 corruption_levels, rng, ):
779 """ This class is made to support a variable number of layers.
780
781 :param n_ins: dimension of the input to the SdA
782
783 :param hidden_layers_sizes: sizes of the intermediate layers; must
784 contain at least one value
785
786 :param n_outs: dimension of the output of the network
787
788 :param corruption_levels: amount of corruption to use for each
789 layer
790
791 :param rng: numpy random number generator used to draw initial weights
792 """
802
803 self.sigmoid_layers = []
804 self.daa_layers = []
805 self.pretrain_functions = []
806 self.params = []
807 self.n_layers = len(hidden_layers_sizes)
808
809 if len(hidden_layers_sizes) < 1 :
810 raise ValueError('You must have at least one hidden layer')
811
812 theano_rng = RandomStreams(rng.randint(2**30))
813
814 # allocate symbolic variables for the data
815 index = T.lscalar() # index to a [mini]batch
816 self.x = T.matrix('x') # the data is presented as rasterized images
817 self.y = T.ivector('y') # the labels are presented as 1D vector of
818 # [int] labels
819
820 # The SdA is an MLP, for which all weights of the intermediate layers
821 # are each shared with a denoising autoencoder.
822 # We will first construct the SdA as a deep multilayer perceptron,
823 # and when constructing each sigmoidal layer we also construct a
824 # denoising autoencoder that shares weights with that layer, and
825 # compile a training function for that denoising autoencoder
826
827 for i in xrange( self.n_layers ):
828 # construct the sigmoidal layer
829
830 sigmoid_layer = SigmoidalLayer(rng,
831 self.sigmoid_layers[-1].output if i else self.x,
832 hidden_layers_sizes[i-1] if i else n_ins,
833 hidden_layers_sizes[i])
834
835 daa_layer = DAA(corruption_level = corruption_levels[i],
836 input = sigmoid_layer.input,
837 shared_W = sigmoid_layer.W,
838 shared_b = sigmoid_layer.b)
839
840 # add the layers to the corresponding lists
841 self.sigmoid_layers.append(sigmoid_layer)
842 self.daa_layers.append(daa_layer)
843
844 # it's arguably a philosophical question...
845 # but we are going to only declare that the parameters of the
846 # sigmoid_layers are parameters of the StackedDAA;
847 # the reconstruction biases (b_prime) in the daa_layers are parameters
848 # of those daa_layers, but not of the StackedDAA
849 self.params.extend(sigmoid_layer.params)
850
851 # We now need to add a logistic layer on top of the MLP
852 self.logistic_regressor = LogisticRegression(
853 input = self.sigmoid_layers[-1].output,
854 n_in = hidden_layers_sizes[-1],
855 n_out = n_outs)
856
857 self.params.extend(self.logistic_regressor.params)
858
859 def pretraining_functions(self, train_set_x, batch_size, pretrain_lr):
860
861 # compiles update functions for each layer, and
862 # returns them as a list
863 index = T.lscalar() # index to a [mini]batch
864 for dA_layer in self.daa_layers:
865 # Construct a function that trains this dA:
866 # compute gradients of layer parameters
867 gparams = T.grad(dA_layer.cost, dA_layer.params)
868 # compute the list of updates
869 updates = {}
870 for param, gparam in zip(dA_layer.params, gparams):
871 updates[param] = param - gparam * pretrain_lr
872
873 # create a function that trains the dA
874 update_fn = theano.function([index], dA_layer.cost, \
875 updates = updates,
876 givens = {
877 self.x : train_set_x[index*batch_size:(index+1)*batch_size]})
878 # collect this function into a list
879 self.pretrain_functions += [update_fn]
880 return self.pretrain_functions