ift6266: deep/rbm/rbm.py @ 581:df749e70f637
"ajoute les auteurs" (adds the authors)
author:  Yoshua Bengio <bengioy@iro.umontreal.ca>
date:    Thu, 16 Sep 2010 16:21:52 -0400
parents: e36ccffb3870
347 | 1 """This tutorial introduces restricted boltzmann machines (RBM) using Theano. |
2 | |
3 Boltzmann Machines (BMs) are a particular form of energy-based model which | |
4 contain hidden variables. Restricted Boltzmann Machines further restrict BMs | |
5 to those without visible-visible and hidden-hidden connections. | |
6 """ | |

import os
import pdb
import time
import gzip

import numpy
import cPickle
import pylab
import PIL.Image

import theano
import theano.tensor as T
import theano.tensor.nnet
from theano.tensor.shared_randomstreams import RandomStreams

import pylearn
import pylearn.version
#import ift6266
#from ift6266 import datasets
from pylearn.io import filetensor as ft

from jobman.tools import DD, flatten
from jobman import sql

from utils import tile_raster_images
from logistic_sgd import load_data

class RBM(object):
    """Restricted Boltzmann Machine (RBM) """

    def __init__(self, input=None, n_visible=32*32, n_hidden=500,
                 W=None, hbias=None, vbias=None, numpy_rng=None,
                 theano_rng=None):
37 """ | |
38 RBM constructor. Defines the parameters of the model along with | |
39 basic operations for inferring hidden from visible (and vice-versa), | |
40 as well as for performing CD updates. | |
41 | |
42 :param input: None for standalone RBMs or symbolic variable if RBM is | |
43 part of a larger graph. | |
44 | |
45 :param n_visible: number of visible units | |
46 | |
47 :param n_hidden: number of hidden units | |
48 | |
49 :param W: None for standalone RBMs or symbolic variable pointing to a | |
50 shared weight matrix in case RBM is part of a DBN network; in a DBN, | |
51 the weights are shared between RBMs and layers of a MLP | |
52 | |
53 :param hbias: None for standalone RBMs or symbolic variable pointing | |
54 to a shared hidden units bias vector in case RBM is part of a | |
55 different network | |
56 | |
57 :param vbias: None for standalone RBMs or a symbolic variable | |
58 pointing to a shared visible units bias | |
59 """ | |

        self.n_visible = n_visible
        self.n_hidden = n_hidden

        if W is None:
            # W is initialized with `initial_W`, which is uniformly sampled
            # from -6./sqrt(n_visible+n_hidden) to 6./sqrt(n_hidden+n_visible);
            # the output of uniform is converted using asarray to dtype
            # theano.config.floatX so that the code is runnable on GPU
            initial_W = numpy.asarray(numpy.random.uniform(
                      low=-numpy.sqrt(6./(n_hidden+n_visible)),
                      high=numpy.sqrt(6./(n_hidden+n_visible)),
                      size=(n_visible, n_hidden)),
                      dtype=theano.config.floatX)
            # theano shared variables for weights and biases
            W = theano.shared(value=initial_W, name='W')

        if hbias is None:
            # create shared variable for hidden units bias
            hbias = theano.shared(value=numpy.zeros(n_hidden,
                                  dtype=theano.config.floatX), name='hbias')

        if vbias is None:
            # create shared variable for visible units bias
            vbias = theano.shared(value=numpy.zeros(n_visible,
                                  dtype=theano.config.floatX), name='vbias')

        if numpy_rng is None:
            # create a number generator
            numpy_rng = numpy.random.RandomState(1234)

        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        # initialize input layer for standalone RBM or layer0 of DBN
        self.input = input if input else T.dmatrix('input')

        self.W = W
        self.hbias = hbias
        self.vbias = vbias
        self.theano_rng = theano_rng

        # **** WARNING: It is not a good idea to put things in this list
        # other than shared variables created in this function.
        self.params = [self.W, self.hbias, self.vbias]
        self.batch_size = self.input.shape[0]

    def free_energy(self, v_sample):
        ''' Function to compute the free energy '''
        wx_b = T.dot(v_sample, self.W) + self.hbias
        vbias_term = T.sum(T.dot(v_sample, self.vbias))
        hidden_term = T.sum(T.log(1+T.exp(wx_b)))
        return -hidden_term - vbias_term

    def sample_h_given_v(self, v0_sample):
        ''' This function infers state of hidden units given visible units '''
        # compute the activation of the hidden units given a sample of the visibles
        h1_mean = T.nnet.sigmoid(T.dot(v0_sample, self.W) + self.hbias)
        # get a sample of the hiddens given their activation
        h1_sample = self.theano_rng.binomial(size=h1_mean.shape, n=1, prob=h1_mean)
        return [h1_mean, h1_sample]

    def sample_v_given_h(self, h0_sample):
        ''' This function infers state of visible units given hidden units '''
        # compute the activation of the visible given the hidden sample
        v1_mean = T.nnet.sigmoid(T.dot(h0_sample, self.W.T) + self.vbias)
        # get a sample of the visible given their activation
        v1_sample = self.theano_rng.binomial(size=v1_mean.shape, n=1, prob=v1_mean)
        return [v1_mean, v1_sample]

    def gibbs_hvh(self, h0_sample):
        ''' This function implements one step of Gibbs sampling,
            starting from the hidden state'''
        v1_mean, v1_sample = self.sample_v_given_h(h0_sample)
        h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
        return [v1_mean, v1_sample, h1_mean, h1_sample]

    def gibbs_vhv(self, v0_sample):
        ''' This function implements one step of Gibbs sampling,
            starting from the visible state'''
        h1_mean, h1_sample = self.sample_h_given_v(v0_sample)
        v1_mean, v1_sample = self.sample_v_given_h(h1_sample)
        return [h1_mean, h1_sample, v1_mean, v1_sample]
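
    # Note: alternating sample_h_given_v and sample_v_given_h defines a
    # Markov chain whose stationary distribution is the RBM's joint p(v,h);
    # gibbs_hvh and gibbs_vhv above each perform one transition of that
    # chain, starting from the hidden and the visible state respectively.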

    def cd(self, lr=0.1, persistent=None, k=1):
        """
        This function implements one step of CD-k or PCD-k.

        :param lr: learning rate used to train the RBM
        :param persistent: None for CD. For PCD, shared variable containing old state
        of Gibbs chain. This must be a shared variable of size (batch size, number of
        hidden units).
        :param k: number of Gibbs steps to perform

        Returns a proxy cost along with the updates dictionary. The dictionary
        contains the update rules for weights and biases but also an update of
        the shared variable used to store the persistent chain, if one is used.
        """

        # compute positive phase
        ph_mean, ph_sample = self.sample_h_given_v(self.input)

        # decide how to initialize persistent chain:
        # for CD, we use the newly generated hidden sample
        # for PCD, we initialize from the old state of the chain
        if persistent is None:
            chain_start = ph_sample
        else:
            chain_start = persistent

        # perform the actual negative phase (the first Gibbs step)
        [nv_mean, nv_sample, nh_mean, nh_sample] = self.gibbs_hvh(chain_start)

        # perform the remaining k-1 Gibbs steps of CD-k
        for i in range(k-1):
            [nv_mean, nv_sample, nh_mean, nh_sample] = self.gibbs_hvh(nh_sample)

        # determine gradients on RBM parameters
        g_vbias = T.sum(self.input - nv_mean, axis=0)/self.batch_size
        g_hbias = T.sum(ph_mean - nh_mean, axis=0)/self.batch_size
        g_W = T.dot(ph_mean.T, self.input)/self.batch_size - \
              T.dot(nh_mean.T, nv_mean)/self.batch_size

        gparams = [g_W.T, g_hbias, g_vbias]

        # construct the updates dictionary
        updates = {}
        for gparam, param in zip(gparams, self.params):
            updates[param] = param + gparam * lr

        if persistent:
            # Note that this works only if persistent is a shared variable
            updates[persistent] = T.cast(nh_sample, dtype=theano.config.floatX)
            # pseudo-likelihood is a better proxy for PCD
            cost = self.get_pseudo_likelihood_cost(updates)
        else:
            # reconstruction cross-entropy is a better proxy for CD
            cost = self.get_reconstruction_cost(updates, nv_mean)

        return cost, updates
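
    # The updates built in `cd` follow the usual CD-k approximation of the
    # log-likelihood gradient,
    #   d/d(theta) log p(v)  ~=  -d/d(theta) FE(v) + d/d(theta) FE(v_k),
    # where v_k is the sample after k Gibbs steps: g_W, g_hbias and g_vbias
    # are the corresponding differences of positive- and negative-phase
    # statistics averaged over the minibatch, applied in the ascent
    # direction (param + lr * gparam).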

    def get_pseudo_likelihood_cost(self, updates):
        """Stochastic approximation to the pseudo-likelihood"""

        # index of bit i in expression p(x_i | x_{\i})
        bit_i_idx = theano.shared(value=0, name='bit_i_idx')

        # binarize the input image by rounding to nearest integer
        xi = T.iround(self.input)

        # calculate free energy for the given bit configuration
        fe_xi = self.free_energy(xi)

        # flip bit x_i of matrix xi and preserve all other bits x_{\i}
        # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx]
        # NB: slice(start,stop,step) is the python object used for
        # slicing, e.g. to index matrix x as follows: x[start:stop:step]
        xi_flip = T.setsubtensor(xi, 1-xi[:, bit_i_idx],
                                 idx_list=(slice(None, None, None), bit_i_idx))

        # calculate free energy with bit flipped
        fe_xi_flip = self.free_energy(xi_flip)

        # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i})))
        cost = self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi))

        # increment bit_i_idx modulo n_visible as part of the updates
        updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible

        return cost
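
    # The cost above is a stochastic estimator of the log pseudo-likelihood,
    #   log PL(x) = sum_i log p(x_i | x_{\i}),
    # with p(x_i | x_{\i}) = sigmoid(FE(x flipped at i) - FE(x)): a single
    # bit index is visited per update, and that one term is scaled by
    # n_visible to approximate the full sum over bits.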

    def get_reconstruction_cost(self, updates, nv_mean):
        """Approximation to the reconstruction error"""

        cross_entropy = T.mean(
                T.sum(self.input*T.log(nv_mean) +
                      (1 - self.input)*T.log(1-nv_mean), axis=1))

        return cross_entropy
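

# Minimal usage sketch (illustrative only, not part of the original script):
# train the RBM class above with plain CD-1 on a small random binary dataset.
# All shapes, names and hyperparameters below are arbitrary assumptions.
def _rbm_usage_sketch():
    rng = numpy.random.RandomState(0)
    # 100 random binary "images" of 32*32 pixels
    data = numpy.asarray(rng.uniform(size=(100, 32*32)) < 0.5,
                         dtype=theano.config.floatX)
    x = T.matrix('x')
    rbm = RBM(input=x, n_visible=32*32, n_hidden=100,
              numpy_rng=rng, theano_rng=RandomStreams(rng.randint(2**30)))
    # one CD-1 step: cost is the reconstruction cross-entropy proxy
    cost, updates = rbm.cd(lr=0.1, persistent=None, k=1)
    train = theano.function([x], cost, updates=updates)
    for epoch in range(5):
        print 'epoch %i, proxy cost %f' % (epoch, train(data))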


def test_rbm(b_size=20, nhidden=1000, kk=1, persistance=0):
    """
    Demonstrate how to train an RBM and sample from it.

    This is demonstrated on NIST 32x32 images.

    :param b_size: size of the minibatch

    :param nhidden: number of hidden units

    :param kk: number of Gibbs steps per CD/PCD update

    :param persistance: 1 to use a persistent chain (PCD), 0 for plain CD

    """

    learning_rate = 0.1

    # if data_set==0:
    #     datasets = datasets.nist_all()
    # elif data_set==1:
    #     datasets = datasets.nist_P07()
    # elif data_set==2:
    #     datasets = datasets.PNIST07()

    data_path = '/data/lisa/data/nist/by_class/'
    f = open(data_path+'all/all_train_data.ft')
    g = open(data_path+'all/all_train_labels.ft')
    h = open(data_path+'all/all_test_data.ft')
    i = open(data_path+'all/all_test_labels.ft')

    train_set_x_uint8 = theano.shared(ft.read(f))
    test_set_x_uint8 = theano.shared(ft.read(h))

    # cast the uint8 NIST data to floatX in [0, 1]
    train_set_x = T.cast(train_set_x_uint8/255., theano.config.floatX)
    train_set_y = ft.read(g)
    test_set_x = T.cast(test_set_x_uint8/255., theano.config.floatX)
    test_set_y = ft.read(i)

    f.close()
    g.close()
    i.close()
    h.close()

    #t = len(train_set_x)

    # TODO: revisit how the data is loaded
    ## dataset = load_data(dataset)
    ##
    ## train_set_x, train_set_y = datasets[0]
    ## test_set_x , test_set_y = datasets[2]
    training_epochs = 1  # to be determined

    batch_size = b_size    # size of the minibatch

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x_uint8.value.shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')      # the data is presented as rasterized images

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    # construct the RBM class
    rbm = RBM(input=x, n_visible=32*32,
              n_hidden=nhidden, numpy_rng=rng, theano_rng=theano_rng)

    # initialize storage for the persistent chain (state = hidden layer of chain)
    if persistance == 1:
        # the chain needs one state per hidden unit for each example in the batch
        persistent_chain = theano.shared(numpy.zeros((batch_size, nhidden),
                                         dtype=theano.config.floatX))
        # get the cost and the gradient corresponding to one step of CD
        cost, updates = rbm.cd(lr=learning_rate, persistent=persistent_chain, k=kk)

    else:
        # get the cost and the gradient corresponding to one step of CD
        #persistance_chain = None
        cost, updates = rbm.cd(lr=learning_rate, persistent=None, k=kk)

    #################################
    #     Training the RBM          #
    #################################
    #os.chdir('~')
    dirname = str(persistance) + '_' + str(nhidden) + '_' + str(b_size) + '_' + str(kk)
    os.makedirs(dirname)
    os.chdir(dirname)
    print 'yes'
    # it is ok for a theano function to have no output
    # the purpose of train_rbm is solely to update the RBM parameters
    print type(batch_size)
    print index.dtype
    train_rbm = theano.function([index], cost,
                                updates=updates,
                                givens={x: train_set_x[index*batch_size:(index+1)*batch_size]})

    print 'yep'
    plotting_time = 0.0
    start_time = time.clock()
    bufsize = 1000

    # go through training epochs
    costs = []
    for epoch in xrange(training_epochs):

        # go through the training set
        mean_cost = []
        for batch_index in xrange(n_train_batches):
            mean_cost += [train_rbm(batch_index)]
        # for mini_x, mini_y in datasets.train(b_size):
        #     mean_cost += [train_rbm(mini_x)]
        ##     learning_rate = learning_rate - 0.0001
        ##     learning_rate = learning_rate/(tau+( epoch*batch_index*batch_size))

        #learning_rate = learning_rate/10

        costs.append(numpy.mean(mean_cost))

        # Plot filters after each training epoch
        plotting_start = time.clock()
        # Construct image from the weight matrix
        image = PIL.Image.fromarray(tile_raster_images(X=rbm.W.value.T,
                 img_shape=(32, 32), tile_shape=(10, 10),
                 tile_spacing=(1, 1)))
        image.save('filters_at_epoch_%i.png' % epoch)
        plotting_stop = time.clock()
        plotting_time += (plotting_stop - plotting_start)

    end_time = time.clock()

    pretraining_time = (end_time - start_time) - plotting_time

    #################################
    #     Sampling from the RBM     #
    #################################

    # find out the number of test samples
    #number_of_test_samples = 100
    # (read the shape from the uint8 shared variable; test_set_x itself is a
    # symbolic expression and has no .value)
    number_of_test_samples = test_set_x_uint8.value.shape[0]

    #test_set_x, test_y = datasets.test(100*b_size)
    # pick random test examples, with which to initialize the persistent chain
    # (initialize from the raw uint8 data, rescaled to [0, 1] as floatX)
    test_idx = rng.randint(number_of_test_samples - b_size)
    persistent_vis_chain = theano.shared(numpy.asarray(
            test_set_x_uint8.value[test_idx:test_idx+b_size]/255.,
            dtype=theano.config.floatX))

    # define one step of Gibbs sampling (mf = mean-field)
    [hid_mf, hid_sample, vis_mf, vis_sample] = rbm.gibbs_vhv(persistent_vis_chain)

    # the sample at the end of the chain is returned by ``gibbs_vhv`` as
    # its last output; note that this is computed as a binomial draw,
    # therefore it is formed of ints (0 and 1) and therefore needs to
    # be converted to the same dtype as ``persistent_vis_chain``
    vis_sample = T.cast(vis_sample, dtype=theano.config.floatX)

    # construct the function that implements our persistent chain
    # we generate the "mean field" activations for plotting and the actual samples for
    # reinitializing the state of our persistent chain
    sample_fn = theano.function([], [vis_mf, vis_sample],
                                updates={persistent_vis_chain: vis_sample})

    # sample the RBM, plotting every `plot_every`-th sample; do this
    # until you plot at least `n_samples`
    n_samples = 10
    # run minibatch size chains for gibbs samples (number of negative particles)
    plot_every = b_size

    for idx in xrange(n_samples):

        # do `plot_every` intermediate samplings of which we do not care
        for jdx in xrange(plot_every):
            vis_mf, vis_sample = sample_fn()

        # construct image
        image = PIL.Image.fromarray(tile_raster_images(
                                        X=vis_mf,
                                        img_shape=(32, 32),
                                        tile_shape=(10, 10),
                                        tile_spacing=(1, 1)))
        #print ' ... plotting sample ', idx
        image.save('sample_%i_step_%i.png' % (idx, idx*jdx))

    # save the model
    model = [rbm.W, rbm.vbias, rbm.hbias]
    f = open('params.txt', 'wb')
    cPickle.dump(model, f, protocol=-1)
    f.close()
    #os.chdir('./..')
    return numpy.mean(costs), pretraining_time*36


def experiment(state, channel):

    (mean_cost, time_execution) = test_rbm(b_size=state.b_size,
                                           nhidden=state.nhidden,
                                           kk=state.kk,
                                           persistance=state.persistance,
                                           )

    state.mean_costs = mean_cost
    state.time_execution = time_execution
    # ift6266 is omitted here because its import is commented out above
    pylearn.version.record_versions(state, [theano, pylearn])
    return channel.COMPLETE


if __name__ == '__main__':

    TABLE_NAME = 'RBM_tapha'

    # DB path...
    test_rbm()
    #db = sql.db('postgres://ift6266h10:f0572cd63b@gershwin/ift6266h10_db/'+ TABLE_NAME)

    #state = DD()
    #for b_size in 50, 75, 100:
    #    state.b_size = b_size
    #    for nhidden in 1000,1250,1500:
    #        state.nhidden = nhidden
    #        for kk in 1,2,3,4:
    #            state.kk = kk
    #            for persistance in 0,1:
    #                state.persistance = persistance
    #                sql.insert_job(rbm.experiment, flatten(state), db)


    #db.createView(TABLE_NAME + 'view')