ift6266: comparison of code_tutoriel/deep.py @ 165:4bc5eeec6394 (parent 164:e3de934a98b6)
Updating the tutorial code to the latest revisions.
author: Dumitru Erhan <dumitru.erhan@gmail.com>
date:   Fri, 26 Feb 2010 13:55:27 -0500

1 """ | |
2 Draft of DBN, DAA, SDAA, RBM tutorial code | |
3 | |
4 """ | |
5 import sys | |
6 import numpy | |
7 import theano | |
8 import time | |
9 import theano.tensor as T | |
10 from theano.tensor.shared_randomstreams import RandomStreams | |
11 from theano import shared, function | |
12 | |
13 import gzip | |
14 import cPickle | |
15 import pylearn.io.image_tiling | |
16 import PIL | |
17 | |
18 # NNET STUFF | |
19 | |
class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
        """ Initialize the parameters of the logistic regression

        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)
        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie
        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie
        """

        # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
        self.W = theano.shared(value=numpy.zeros((n_in, n_out),
                                                 dtype=theano.config.floatX))
        # initialize the biases b as a vector of n_out 0s
        self.b = theano.shared(value=numpy.zeros((n_out,),
                                                 dtype=theano.config.floatX))
        # compute vector of class-membership probabilities in symbolic form
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)

        # compute prediction as class whose probability is maximal in
        # symbolic form
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

        # list of parameters for this layer
        self.params = [self.W, self.b]

    def negative_log_likelihood(self, y):
        """Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        :param y: corresponds to a vector that gives for each example the
                  correct label

        Note: we use the mean instead of the sum so that
        the learning rate is less dependent on the batch size
        """
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch; zero-one
        loss over the size of the minibatch
        """
        # check if y has the same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                            ('y', y.type, 'y_pred', self.y_pred.type))

        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()

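# A minimal sketch (not part of the original tutorial) illustrating the
# indexing used in negative_log_likelihood above: given a matrix of
# per-class log-probabilities, [arange(n), y] picks, for each row i, the
# entry in column y[i].  The numbers below are illustrative only.
def _nll_indexing_demo():
    log_p = numpy.log(numpy.array([[0.7, 0.2, 0.1],
                                   [0.1, 0.8, 0.1]]))
    y = numpy.array([0, 1])
    picked = log_p[numpy.arange(y.shape[0]), y]   # log-prob of the correct class
    return -picked.mean()                         # mean NLL over the minibatch
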
class SigmoidalLayer(object):
    def __init__(self, rng, input, n_in, n_out):
        """
        Typical hidden layer of an MLP: units are fully-connected and have
        a sigmoidal activation function. Weight matrix W is of shape (n_in, n_out)
        and the bias vector b is of shape (n_out,).

        Hidden unit activation is given by: sigmoid(dot(input,W) + b)

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights
        :type input: theano.tensor.matrix
        :param input: a symbolic tensor of shape (n_examples, n_in)
        :type n_in: int
        :param n_in: dimensionality of input
        :type n_out: int
        :param n_out: number of hidden units
        """
        self.input = input

        W_values = numpy.asarray(rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)), dtype=theano.config.floatX)
        self.W = theano.shared(value=W_values)

        b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
        self.b = theano.shared(value=b_values)

        self.output = T.nnet.sigmoid(T.dot(input, self.W) + self.b)
        self.params = [self.W, self.b]

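# A minimal usage sketch (assumed, not in the original file): compile the
# symbolic `output` of a SigmoidalLayer into a callable function and run it
# on one random minibatch.  The shapes and the seed below are arbitrary.
def _sigmoid_layer_demo():
    rng = numpy.random.RandomState(123)
    x = T.matrix('x')
    layer = SigmoidalLayer(rng, x, n_in=784, n_out=500)
    f = theano.function([x], layer.output)
    minibatch = numpy.asarray(rng.uniform(size=(20, 784)),
                              dtype=theano.config.floatX)
    return f(minibatch).shape    # -> (20, 500)
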
# PRETRAINING LAYERS

class RBM(object):
    """Restricted Boltzmann Machine (RBM).

    Using the same letters as the variable names in the code below, the
    energy of a joint configuration (v, h) of visible and hidden units is

        E(v, h) = - dot(vbias, v) - dot(hbias, h) - dot(v, dot(W, h))

    and the free energy of a visible vector v (implemented in ``free_energy``) is

        F(v) = - dot(vbias, v) - sum_j log(1 + exp(dot(v, W)_j + hbias_j))
    """

    def __init__(self, input=None, n_visible=None, n_hidden=None,
                 W=None, hbias=None, vbias=None,
                 numpy_rng=None, theano_rng=None):
        """
        RBM constructor. Defines the parameters of the model along with
        basic operations for inferring hidden from visible (and vice-versa),
        as well as for performing CD updates.

        :param input: None for standalone RBMs or symbolic variable if RBM is
                      part of a larger graph.

        :param n_visible: number of visible units (necessary when W or vbias is None)

        :param n_hidden: number of hidden units (necessary when W or hbias is None)

        :param W: weights to use for the RBM. None means that a shared variable will be
                  created with a randomly chosen matrix of size (n_visible, n_hidden).

        :param hbias: hidden-unit biases. None means a shared variable of zeros
                      of size (n_hidden,) will be created.

        :param vbias: visible-unit biases. None means a shared variable of zeros
                      of size (n_visible,) will be created.

        :param numpy_rng: random number generator (necessary when W is None)

        :param theano_rng: symbolic random number generator used for sampling;
                           created from numpy_rng when None.
        """

        params = []
        if W is None:
            # choose initial values for weight matrix of RBM
            initial_W = numpy.asarray(
                    numpy_rng.uniform(
                        low=-numpy.sqrt(6. / (n_hidden + n_visible)),
                        high=numpy.sqrt(6. / (n_hidden + n_visible)),
                        size=(n_visible, n_hidden)),
                    dtype=theano.config.floatX)
            W = theano.shared(value=initial_W, name='W')
            params.append(W)

        if hbias is None:
            # theano shared variables for hidden biases
            hbias = theano.shared(value=numpy.zeros(n_hidden,
                                  dtype=theano.config.floatX), name='hbias')
            params.append(hbias)

        if vbias is None:
            # theano shared variables for visible biases
            vbias = theano.shared(value=numpy.zeros(n_visible,
                                  dtype=theano.config.floatX), name='vbias')
            params.append(vbias)

        if input is None:
            # initialize input layer for standalone RBM or layer0 of DBN
            input = T.matrix('input')

        # setup theano random number generator
        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        self.visible = self.input = input
        self.W = W
        self.hbias = hbias
        self.vbias = vbias
        self.theano_rng = theano_rng
        self.params = params
        self.hidden_mean = T.nnet.sigmoid(T.dot(input, W) + hbias)
        self.hidden_sample = theano_rng.binomial(self.hidden_mean.shape, 1, self.hidden_mean)

    def gibbs_k(self, v_sample, k):
        ''' This function implements k steps of Gibbs sampling '''

        # We compute the visibles after k steps of Gibbs by iterating
        # over ``gibbs_1`` k times; this can be done in Theano using
        # the `scan op`. For a more comprehensive description of scan see
        # http://deeplearning.net/software/theano/library/scan.html .

        def gibbs_1(v0_sample, W, hbias, vbias):
            ''' This function implements one Gibbs step '''

            # compute the activation of the hidden units given a sample of
            # the visibles
            h0_mean = T.nnet.sigmoid(T.dot(v0_sample, W) + hbias)
            # get a sample of the hiddens given their activation
            h0_sample = self.theano_rng.binomial(h0_mean.shape, 1, h0_mean)
            # compute the activation of the visibles given the hidden sample
            v1_mean = T.nnet.sigmoid(T.dot(h0_sample, W.T) + vbias)
            # get a sample of the visibles given their activation
            v1_act = self.theano_rng.binomial(v1_mean.shape, 1, v1_mean)
            return [v1_mean, v1_act]

        # DEBUGGING: handle the k == 1 case directly, without scan
        if k == 1:
            return gibbs_1(v_sample, self.W, self.hbias, self.vbias)

        # Because we require as output two values, namely the mean-field
        # approximation of the visibles and the sample obtained after k steps,
        # scan needs to know the shape of those two outputs. Scan takes
        # this information from the variables containing the initial state
        # of the outputs. Since we do not need an initial state for ``v_mean``,
        # we provide a dummy one used only to get the correct shape.
        v_mean = T.zeros_like(v_sample)

        # ``outputs_taps`` is an argument of scan which describes at each
        # time step what past values of the outputs the function applied
        # recursively needs. This is given in the form of a dictionary,
        # where the keys are output indexes and the values are lists of
        # the offsets used by the corresponding outputs.
        # In our case the function ``gibbs_1``, applied recursively, requires
        # at time k the past value k-1 of the first output (index 0) and
        # no past value of the second output.
        outputs_taps = {0: [-1], 1: []}

        v_means, v_samples = theano.scan(fn=gibbs_1,
                                         sequences=[],
                                         initial_states=[v_sample, v_mean],
                                         non_sequences=[self.W, self.hbias, self.vbias],
                                         outputs_taps=outputs_taps,
                                         n_steps=k)
        return v_means[-1], v_samples[-1]

    def free_energy(self, v_sample):
        """Free energy of a visible (mini)batch:
        F(v) = - dot(v, vbias) - sum_j log(1 + exp(dot(v, W)_j + hbias_j)),
        summed over the examples in ``v_sample``."""
        wx_b = T.dot(v_sample, self.W) + self.hbias
        vbias_term = T.sum(T.dot(v_sample, self.vbias))
        hidden_term = T.sum(T.log(1 + T.exp(wx_b)))
        return -hidden_term - vbias_term

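    # --- illustrative check (not part of the original tutorial) -----------
    # For a single binary visible vector v, the closed form used above,
    #   F(v) = -dot(v, vbias) - sum_j log(1 + exp(dot(v, W)_j + hbias_j)),
    # agrees with the brute-force -log sum_h exp(-E(v, h)) over all hidden
    # configurations.  A tiny numpy sketch with arbitrary random parameters:
    @staticmethod
    def _free_energy_check(n_visible=4, n_hidden=3, seed=0):
        import itertools
        rng = numpy.random.RandomState(seed)
        W = rng.randn(n_visible, n_hidden)
        vbias = rng.randn(n_visible)
        hbias = rng.randn(n_hidden)
        v = rng.randint(0, 2, size=n_visible)
        total = 0.
        for h in itertools.product([0, 1], repeat=n_hidden):
            h = numpy.asarray(h)
            energy = -numpy.dot(v, vbias) - numpy.dot(h, hbias) \
                     - numpy.dot(numpy.dot(v, W), h)
            total += numpy.exp(-energy)
        brute_force = -numpy.log(total)
        closed_form = -numpy.dot(v, vbias) \
                      - numpy.sum(numpy.log(1 + numpy.exp(numpy.dot(v, W) + hbias)))
        return brute_force, closed_form    # equal up to floating-point error
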
    def cd(self, visible=None, persistent=None, steps=1):
        """
        Return a 5-tuple of values related to contrastive divergence: (cost,
        end-state of negative-phase chain, gradient on weights, gradient on
        hidden bias, gradient on visible bias)

        If visible is None, it defaults to self.input.
        If persistent is None, the negative-phase chain is started from the
        visible units (plain CD); if it is a shared variable, the chain is
        started from its contents (persistent CD).

        CD aka CD-1  - cd()
        CD-10        - cd(steps=10)
        PCD          - cd(persistent=shared(numpy.asarray(initializer)))
        PCD-10       - cd(persistent=shared(numpy.asarray(initializer)),
                          steps=10)
        """
        if visible is None:
            visible = self.input

        if visible is None:
            raise TypeError('visible argument is required when self.input is None')

        if steps is None:
            steps = 1

        if persistent is None:
            chain_start = visible
        else:
            chain_start = persistent

        chain_end_mean, chain_end_sample = self.gibbs_k(chain_start, steps)

        #print >> sys.stderr, "WARNING: DEBUGGING with wrong FREE ENERGY"
        #free_energy_delta = - self.free_energy(chain_end_sample)
        free_energy_delta = self.free_energy(visible) - self.free_energy(chain_end_sample)

        # we will return all of these regardless of what is in self.params
        all_params = [self.W, self.hbias, self.vbias]

        gparams = T.grad(free_energy_delta, all_params,
                         consider_constant=[chain_end_sample])

        # reconstruction log-likelihood of the visibles under the end of the
        # chain; it is only monitored, not optimized (the gradients above
        # come from the free-energy difference)
        cross_entropy = T.mean(T.sum(
            visible * T.log(chain_end_mean) + (1 - visible) * T.log(1 - chain_end_mean),
            axis=1))

        return (cross_entropy, chain_end_sample,) + tuple(gparams)

    def cd_updates(self, lr, visible=None, persistent=None, steps=1):
        """
        Return the learning updates for the RBM parameters that are shared variables.

        Also returns an update for the persistent chain if it is a shared variable.

        These updates are returned as a dictionary.

        :param lr: [scalar] learning rate for contrastive divergence learning
        :param visible: see `cd()`
        :param persistent: see `cd()`
        :param steps: see `cd()`
        """

        cross_entropy, chain_end, gW, ghbias, gvbias = self.cd(visible,
                                                               persistent, steps)

        updates = {}
        if hasattr(self.W, 'value'):
            updates[self.W] = self.W - lr * gW
        if hasattr(self.hbias, 'value'):
            updates[self.hbias] = self.hbias - lr * ghbias
        if hasattr(self.vbias, 'value'):
            updates[self.vbias] = self.vbias - lr * gvbias
        if persistent is not None:
            # if persistent is a shared variable, it carries the state of the
            # negative-phase chain (PCD): store the chain end back into it
            updates[persistent] = chain_end

        return updates

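# A minimal usage sketch (assumed, not in the original file) of cd_updates()
# for a standalone RBM: compile one CD-1 update step over a minibatch taken
# from a shared dataset.  `shared_data`, the batch size and the learning
# rate are placeholders; for PCD one would instead pass a shared matrix of
# visible-unit states as the `persistent` argument of cd()/cd_updates().
def _rbm_cd1_step(shared_data, n_visible, n_hidden, batch_size=20, lr=0.1):
    rng = numpy.random.RandomState(42)
    rbm = RBM(n_visible=n_visible, n_hidden=n_hidden, numpy_rng=rng)
    index = T.lscalar('index')          # minibatch index
    cost = rbm.cd()[0]                  # monitored reconstruction term
    return theano.function([index], cost,
            updates=rbm.cd_updates(lr=lr),
            givens={rbm.input: shared_data[index * batch_size:
                                           (index + 1) * batch_size]})
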
# DEEP MODELS

class DBN(object):
    """Deep Belief Network.

    A stack of sigmoidal layers whose weights are shared with a matching
    stack of RBMs (used for greedy layer-wise pretraining), topped by a
    LogisticRegression layer for supervised fine-tuning.
    """

    def __init__(self, input_len, hidden_layers_sizes, n_classes, rng):
        """ This class is made to support a variable number of layers.

        :param input_len: dimension of the input to the DBN

        :param hidden_layers_sizes: intermediate layer sizes, must contain
               at least one value

        :param n_classes: dimension of the output of the network

        :param rng: numpy random number generator used to draw initial weights
        """

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.pretrain_functions = []
        self.params = []

        theano_rng = RandomStreams(rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        index = T.lscalar()    # index to a [mini]batch
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as a 1D vector of
                                 # [int] labels
        input = self.x

        # The DBN is an MLP, for which the weights of the intermediate layers
        # are shared with a corresponding RBM.  We first construct the DBN as
        # a deep multilayer perceptron, and while constructing each sigmoidal
        # layer we also construct an RBM that shares weights with that layer.

        for n_hid in hidden_layers_sizes:
            # construct the sigmoidal layer

            sigmoid_layer = SigmoidalLayer(rng, input, input_len, n_hid)
            self.sigmoid_layers.append(sigmoid_layer)

            self.rbm_layers.append(RBM(input=input,
                                       W=sigmoid_layer.W,
                                       hbias=sigmoid_layer.b,
                                       n_visible=input_len,
                                       n_hidden=n_hid,
                                       numpy_rng=rng,
                                       theano_rng=theano_rng))

            # it's arguably a philosophical question...
            # but we are going to declare only the parameters of the
            # sigmoid_layers to be parameters of the DBN; the visible biases
            # created inside the rbm_layers are parameters of those RBMs,
            # but not of the DBN itself
            self.params.extend(self.sigmoid_layers[-1].params)

            # get ready for the next loop iteration
            input_len = n_hid
            input = self.sigmoid_layers[-1].output

        # We now need to add a logistic layer on top of the MLP
        self.logistic_regressor = LogisticRegression(input=input,
                n_in=input_len, n_out=n_classes)

        self.params.extend(self.logistic_regressor.params)

    def pretraining_functions(self, train_set_x, batch_size, learning_rate, k=1):
        if k != 1:
            raise NotImplementedError()
        index = T.lscalar()    # index to a [mini]batch
        n_train_batches = train_set_x.value.shape[0] / batch_size
        batch_begin = (index % n_train_batches) * batch_size
        batch_end = batch_begin + batch_size

        print 'TRAIN_SET X', train_set_x.value.shape
        rval = []
        for rbm in self.rbm_layers:
            # N.B. these cd() samples are independent from the
            # samples used for learning
            outputs = list(rbm.cd())[0:2]
            rval.append(function([index], outputs,
                                 updates=rbm.cd_updates(lr=learning_rate),
                                 givens={self.x: train_set_x[batch_begin:batch_end]}))
            if rbm is self.rbm_layers[0]:
                # debugging: print the graph of the first layer's function
                f = rval[-1]
                AA = len(outputs)    # (unused debugging helper)
                for i, implicit_out in enumerate(f.maker.env.outputs):  # [len(outputs):]:
                    print 'OUTPUT ', i
                    theano.printing.debugprint(implicit_out, file=sys.stdout)

        return rval

    def finetune(self, datasets, lr, batch_size):

        # unpack the various datasets
        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # compute number of minibatches for training, validation and testing
        assert train_set_x.value.shape[0] % batch_size == 0
        assert valid_set_x.value.shape[0] % batch_size == 0
        assert test_set_x.value.shape[0] % batch_size == 0
        n_train_batches = train_set_x.value.shape[0] / batch_size
        n_valid_batches = valid_set_x.value.shape[0] / batch_size
        n_test_batches = test_set_x.value.shape[0] / batch_size

        index = T.lscalar()    # index to a [mini]batch
        target = self.y

        train_index = index % n_train_batches

        classifier = self.logistic_regressor
        cost = classifier.negative_log_likelihood(target)
        # compute the gradients with respect to the model parameters
        gparams = T.grad(cost, self.params)

        # compute list of fine-tuning updates
        updates = [(param, param - gparam * lr)
                   for param, gparam in zip(self.params, gparams)]

        train_fn = theano.function([index], cost,
                updates=updates,
                givens={
                    self.x: train_set_x[train_index * batch_size:(train_index + 1) * batch_size],
                    target: train_set_y[train_index * batch_size:(train_index + 1) * batch_size]})

        test_score_i = theano.function([index], classifier.errors(target),
                givens={
                    self.x: test_set_x[index * batch_size:(index + 1) * batch_size],
                    target: test_set_y[index * batch_size:(index + 1) * batch_size]})

        valid_score_i = theano.function([index], classifier.errors(target),
                givens={
                    self.x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                    target: valid_set_y[index * batch_size:(index + 1) * batch_size]})

        def test_scores():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        def valid_scores():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        return train_fn, valid_scores, test_scores

def load_mnist(filename):
    f = gzip.open(filename, 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    def shared_dataset(data_xy):
        data_x, data_y = data_xy
        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX))
        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
        return shared_x, T.cast(shared_y, 'int32')

    n_train_examples = train_set[0].shape[0]
    datasets = shared_dataset(train_set), shared_dataset(valid_set), shared_dataset(test_set)

    return n_train_examples, datasets

def dbn_main(finetune_lr=0.01,
             pretraining_epochs=10,
             pretrain_lr=0.1,
             training_epochs=1000,
             batch_size=20,
             mnist_file='mnist.pkl.gz'):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer perceptron.

    This is demonstrated on MNIST.

    :param finetune_lr: learning rate used in the finetune stage
        (factor for the stochastic gradient)

    :param pretraining_epochs: number of epochs to do pretraining

    :param pretrain_lr: learning rate to be used during pre-training

    :param training_epochs: maximal number of epochs to run the optimizer

    :param mnist_file: path to the pickled mnist file

    """

    n_train_examples, train_valid_test = load_mnist(mnist_file)

    print "Creating a Deep Belief Network"
    deep_model = DBN(
            input_len=28 * 28,
            hidden_layers_sizes=[500, 150, 100],
            n_classes=10,
            rng=numpy.random.RandomState())

    ####
    #### Phase 1: Pre-training
    ####
    print "Pretraining (unsupervised learning) ..."

    pretrain_functions = deep_model.pretraining_functions(
            batch_size=batch_size,
            train_set_x=train_valid_test[0][0],
            learning_rate=pretrain_lr,
            )

    start_time = time.clock()
    for layer_idx, pretrain_fn in enumerate(pretrain_functions):
        # go through pretraining epochs
        print 'Pre-training layer %i' % layer_idx
        for i in xrange(pretraining_epochs * n_train_examples / batch_size):
            outstuff = pretrain_fn(i)
            xe, negsample = outstuff[:2]
            print (layer_idx, i,
                   n_train_examples / batch_size,
                   float(xe),
                   'Wmin', deep_model.rbm_layers[0].W.value.min(),
                   'Wmax', deep_model.rbm_layers[0].W.value.max(),
                   'vmin', deep_model.rbm_layers[0].vbias.value.min(),
                   'vmax', deep_model.rbm_layers[0].vbias.value.max(),
                   #'x>0.3', (input_i>0.3).sum(),
                   )
            sys.stdout.flush()
            if i % 1000 == 0:
                PIL.Image.fromarray(
                    pylearn.io.image_tiling.tile_raster_images(negsample, (28, 28), (10, 10),
                        tile_spacing=(1, 1))).save('samples_%i_%i.png' % (layer_idx, i))

                PIL.Image.fromarray(
                    pylearn.io.image_tiling.tile_raster_images(
                        deep_model.rbm_layers[0].W.value.T,
                        (28, 28), (10, 10),
                        tile_spacing=(1, 1))).save('filters_%i_%i.png' % (layer_idx, i))
    end_time = time.clock()
    print 'Pretraining took %f minutes' % ((end_time - start_time) / 60.)

    # NOTE: the draft stops here; the fine-tuning code below is not reached
    # until this early return is removed.
    return

584 print "Fine tuning (supervised learning) ..." | |
585 train_fn, valid_scores, test_scores =\ | |
586 deep_model.finetune_functions(train_valid_test[0][0], | |
587 learning_rate=finetune_lr, # the learning rate | |
588 batch_size = batch_size) # number of examples to use at once | |
589 | |
590 #### | |
591 #### Phase 2: Fine Tuning | |
592 #### | |
593 | |
594 patience = 10000 # look as this many examples regardless | |
595 patience_increase = 2. # wait this much longer when a new best is | |
596 # found | |
597 improvement_threshold = 0.995 # a relative improvement of this much is | |
598 # considered significant | |
599 validation_frequency = min(n_train_examples, patience/2) | |
600 # go through this many | |
601 # minibatche before checking the network | |
602 # on the validation set; in this case we | |
603 # check every epoch | |
604 | |
605 patience_max = n_train_examples * training_epochs | |
606 | |
607 best_epoch = None | |
608 best_epoch_test_score = None | |
609 best_epoch_valid_score = float('inf') | |
610 start_time = time.clock() | |
611 | |
612 for i in xrange(patience_max): | |
613 if i >= patience: | |
614 break | |
615 | |
616 cost_i = train_fn(i) | |
617 | |
618 if i % validation_frequency == 0: | |
619 validation_i = numpy.mean([score for score in valid_scores()]) | |
620 | |
621 # if we got the best validation score until now | |
622 if validation_i < best_epoch_valid_score: | |
623 | |
624 # improve patience if loss improvement is good enough | |
625 threshold_i = best_epoch_valid_score * improvement_threshold | |
626 if validation_i < threshold_i: | |
627 patience = max(patience, i * patience_increase) | |
628 | |
629 # save best validation score and iteration number | |
630 best_epoch_valid_score = validation_i | |
631 best_epoch = i/validation_i | |
632 best_epoch_test_score = numpy.mean( | |
633 [score for score in test_scores()]) | |
634 | |
635 print('epoch %i, validation error %f %%, test error %f %%'%( | |
636 i/validation_frequency, validation_i*100., | |
637 best_epoch_test_score*100.)) | |
638 else: | |
639 print('epoch %i, validation error %f %%' % ( | |
640 i/validation_frequency, validation_i*100.)) | |
641 end_time = time.clock() | |
642 | |
643 print(('Optimization complete with best validation score of %f %%,' | |
644 'with test performance %f %%') % | |
645 (finetune_status['best_validation_loss']*100., | |
646 finetune_status['test_score']*100.)) | |
647 print ('The code ran for %f minutes' % ((finetune_status['duration'])/60.)) | |
648 | |
def rbm_main():
    rbm = RBM(n_visible=20, n_hidden=30,
              numpy_rng=numpy.random.RandomState(34))

    cd_updates = rbm.cd_updates(lr=0.25)

    print cd_updates

    f = function([rbm.input], [],
                 updates={rbm.W: cd_updates[rbm.W]})

    theano.printing.debugprint(f.maker.env.outputs[0],
                               file=sys.stdout)


if __name__ == '__main__':
    dbn_main()
    #rbm_main()


if 0:
    class DAA(object):
        def __init__(self, n_visible=784, n_hidden=500, corruption_level=0.1,
                     input=None, shared_W=None, shared_b=None):
            """
            Initialize the dA class by specifying the number of visible units (the
            dimension d of the input), the number of hidden units (the dimension
            d' of the latent or hidden space) and the corruption level. The
            constructor also receives symbolic variables for the input, weights and
            bias. Such symbolic variables are useful when, for example, the input is
            the result of some computation, or when weights are shared between the
            dA and an MLP layer. When dealing with SdAs this always happens:
            the dA on layer 2 gets as input the output of the dA on layer 1,
            and the weights of the dA are used in the second stage of training
            to construct an MLP.

            :param n_visible: number of visible units

            :param n_hidden: number of hidden units

            :param input: a symbolic description of the input or None

            :param corruption_level: the corruption mechanism picks up randomly this
                   fraction of entries of the input and turns them to 0

            """
            self.n_visible = n_visible
            self.n_hidden = n_hidden

            # create a Theano random generator that gives symbolic random values
            theano_rng = RandomStreams()

            if shared_W is not None and shared_b is not None:
                self.W = shared_W
                self.b = shared_b
            else:
                # initial values for weights and biases
                # note : W' was written as `W_prime` and b' as `b_prime`

                # W is initialized with `initial_W`, which is uniformly sampled
                # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible);
                # the output of uniform is converted using asarray to dtype
                # theano.config.floatX so that the code is runnable on GPU
                initial_W = numpy.asarray(numpy.random.uniform(
                        low=-numpy.sqrt(6. / (n_hidden + n_visible)),
                        high=numpy.sqrt(6. / (n_hidden + n_visible)),
                        size=(n_visible, n_hidden)), dtype=theano.config.floatX)
                initial_b = numpy.zeros(n_hidden, dtype=theano.config.floatX)

                # theano shared variables for weights and biases
                self.W = theano.shared(value=initial_W, name="W")
                self.b = theano.shared(value=initial_b, name="b")

            initial_b_prime = numpy.zeros(n_visible, dtype=theano.config.floatX)
            # tied weights, therefore W_prime is W transpose
            self.W_prime = self.W.T
            self.b_prime = theano.shared(value=initial_b_prime, name="b'")

            # if no input is given, generate a variable representing the input
            if input is None:
                # we use a matrix because we expect a minibatch of several examples,
                # each example being a row
                self.x = T.matrix(name='input')
            else:
                self.x = input
            # Equation (1)
            # keep 90% of the inputs the same and zero out a randomly selected
            # subset of 10% of the inputs
            # note : the first argument of theano_rng.binomial is the shape (size) of
            #        the random numbers that it should produce
            #        the second argument is the number of trials
            #        the third argument is the probability of success of any trial
            #
            # this will produce an array of 0s and 1s where 1 has a
            # probability of 1 - ``corruption_level`` and 0 with
            # ``corruption_level``
            self.tilde_x = theano_rng.binomial(self.x.shape, 1, 1 - corruption_level) * self.x
            # Equation (2)
            # note : y is stored as an attribute of the class so that it can be
            #        used later when stacking dAs.
            self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W) + self.b)
            # Equation (3)
            self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime)
            # Equation (4)
            # note : we sum over the size of a datapoint; if we are using minibatches,
            #        L will be a vector, with one entry per example in the minibatch
            self.L = - T.sum(self.x * T.log(self.z) + (1 - self.x) * T.log(1 - self.z), axis=1)
            # note : L is now a vector, where each element is the cross-entropy cost
            #        of the reconstruction of the corresponding example of the
            #        minibatch. We need to compute the average of all these to get
            #        the cost of the minibatch
            self.cost = T.mean(self.L)

            self.params = [self.W, self.b, self.b_prime]

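    # A small numpy sketch (assumed, not part of the original file) of the
    # corruption mechanism described in DAA above: a binomial mask keeps each
    # input entry with probability 1 - corruption_level and zeroes the rest.
    def _corruption_demo(corruption_level=0.1):
        rng = numpy.random.RandomState(7)
        x = rng.uniform(size=(2, 8))
        mask = rng.binomial(n=1, p=1 - corruption_level, size=x.shape)
        return mask * x    # on average, corruption_level of the entries are zeroed
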
    class StackedDAA(DeepLayerwiseModel):
        """Stacked denoising auto-encoder class (SdA)

        A stacked denoising autoencoder model is obtained by stacking several
        dAs. The hidden layer of the dA at layer `i` becomes the input of
        the dA at layer `i+1`. The first layer dA gets as input the input of
        the SdA, and the hidden layer of the last dA represents the output.
        Note that after pretraining, the SdA is dealt with as a normal MLP;
        the dAs are only used to initialize the weights.
        """

        def __init__(self, n_ins, hidden_layers_sizes, n_outs,
                     corruption_levels, rng):
            """ This class is made to support a variable number of layers.

            :param n_ins: dimension of the input to the SdA

            :param hidden_layers_sizes: intermediate layer sizes, must contain
                   at least one value

            :param n_outs: dimension of the output of the network

            :param corruption_levels: amount of corruption to use for each
                   layer

            :param rng: numpy random number generator used to draw initial weights
            """

            self.sigmoid_layers = []
            self.daa_layers = []
            self.pretrain_functions = []
            self.params = []
            self.n_layers = len(hidden_layers_sizes)

            if len(hidden_layers_sizes) < 1:
                raise Exception('You must have at least one hidden layer')

            theano_rng = RandomStreams(rng.randint(2 ** 30))

            # allocate symbolic variables for the data
            index = T.lscalar()    # index to a [mini]batch
            self.x = T.matrix('x')  # the data is presented as rasterized images
            self.y = T.ivector('y')  # the labels are presented as a 1D vector
                                     # of [int] labels

            # The SdA is an MLP, for which all weights of the intermediate
            # layers are shared with a corresponding denoising autoencoder.
            # We first construct the SdA as a deep multilayer perceptron, and
            # while constructing each sigmoidal layer we also construct a
            # denoising autoencoder that shares weights with that layer, and
            # compile a training function for that denoising autoencoder.

            for i in xrange(self.n_layers):
                # construct the sigmoidal layer

                sigmoid_layer = SigmoidalLayer(rng,
                        self.sigmoid_layers[-1].output if i else self.x,
                        hidden_layers_sizes[i - 1] if i else n_ins,
                        hidden_layers_sizes[i])

                daa_layer = DAA(corruption_level=corruption_levels[i],
                                input=sigmoid_layer.input,
                                shared_W=sigmoid_layer.W,
                                shared_b=sigmoid_layer.b)

                # add the layers to the lists
                self.sigmoid_layers.append(sigmoid_layer)
                self.daa_layers.append(daa_layer)

                # it's arguably a philosophical question...
                # but we are going to declare only the parameters of the
                # sigmoid_layers to be parameters of the StackedDAA;
                # the remaining biases in the daa_layers are parameters of
                # those daa_layers, but not of the StackedDAA
                self.params.extend(sigmoid_layer.params)

            # We now need to add a logistic layer on top of the MLP
            self.logistic_regressor = LogisticRegression(
                    input=self.sigmoid_layers[-1].output,
                    n_in=hidden_layers_sizes[-1],
                    n_out=n_outs)

            self.params.extend(self.logistic_regressor.params)

        def pretraining_functions(self, train_set_x, batch_size, pretrain_lr):

            # compiles an update function for each layer and
            # returns them as a list
            index = T.lscalar()    # index to a [mini]batch

            for dA_layer in self.daa_layers:
                # Construct a function that trains this dA:
                # compute gradients of the layer parameters
                gparams = T.grad(dA_layer.cost, dA_layer.params)
                # compute the list of updates
                updates = {}
                for param, gparam in zip(dA_layer.params, gparams):
                    updates[param] = param - gparam * pretrain_lr

                # create a function that trains the dA
                update_fn = theano.function([index], dA_layer.cost,
                        updates=updates,
                        givens={
                            self.x: train_set_x[index * batch_size:(index + 1) * batch_size]})
                # collect this function into a list
                self.pretrain_functions += [update_fn]

            return self.pretrain_functions
