"""This tutorial introduces restricted Boltzmann machines (RBM) using Theano.

Boltzmann Machines (BMs) are a particular form of energy-based model which
contain hidden variables. Restricted Boltzmann Machines further restrict BMs
to those without visible-visible and hidden-hidden connections.
"""

import numpy, time, cPickle, gzip, PIL.Image

import theano
import theano.tensor as T
import os
import pdb
import pylab
import theano.tensor.nnet
import pylearn
#import ift6266
import pylearn.version  # ,ift6266
from pylearn.io import filetensor as ft
#from ift6266 import datasets

from jobman.tools import DD, flatten
from jobman import sql

from theano.tensor.shared_randomstreams import RandomStreams

from utils import tile_raster_images
from logistic_sgd import load_data


class RBM(object):
    """Restricted Boltzmann Machine (RBM) """

    def __init__(self, input=None, n_visible=32*32, n_hidden=500,
                 W=None, hbias=None, vbias=None, numpy_rng=None,
                 theano_rng=None):
        """
        RBM constructor. Defines the parameters of the model along with
        basic operations for inferring hidden from visible (and vice-versa),
        as well as for performing CD updates.

        :param input: None for standalone RBMs or symbolic variable if RBM is
        part of a larger graph.

        :param n_visible: number of visible units

        :param n_hidden: number of hidden units

        :param W: None for standalone RBMs or symbolic variable pointing to a
        shared weight matrix in case RBM is part of a DBN network; in a DBN,
        the weights are shared between RBMs and layers of a MLP

        :param hbias: None for standalone RBMs or symbolic variable pointing
        to a shared hidden units bias vector in case RBM is part of a
        different network

        :param vbias: None for standalone RBMs or a symbolic variable
        pointing to a shared visible units bias
        """

        self.n_visible = n_visible
        self.n_hidden = n_hidden

        if W is None:
            # W is initialized with `initial_W`, which is uniformly sampled
            # between -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible).
            # The output of uniform is converted with asarray to dtype
            # theano.config.floatX so that the code is runnable on GPU.
            initial_W = numpy.asarray(numpy.random.uniform(
                      low=-numpy.sqrt(6. / (n_hidden + n_visible)),
                      high=numpy.sqrt(6. / (n_hidden + n_visible)),
                      size=(n_visible, n_hidden)),
                      dtype=theano.config.floatX)
            # theano shared variable for the weights
            W = theano.shared(value=initial_W, name='W')

        if hbias is None:
            # create shared variable for hidden units bias
            hbias = theano.shared(value=numpy.zeros(n_hidden,
                                  dtype=theano.config.floatX), name='hbias')

        if vbias is None:
            # create shared variable for visible units bias
            vbias = theano.shared(value=numpy.zeros(n_visible,
                                  dtype=theano.config.floatX), name='vbias')

        if numpy_rng is None:
            # create a numpy random number generator
            numpy_rng = numpy.random.RandomState(1234)

        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        # initialize input layer for standalone RBM or layer0 of DBN
        self.input = input if input else T.dmatrix('input')

        self.W = W
        self.hbias = hbias
        self.vbias = vbias
        self.theano_rng = theano_rng

        # **** WARNING: It is not a good idea to put things in this list
        # other than shared variables created in this function.
        self.params = [self.W, self.hbias, self.vbias]
        self.batch_size = self.input.shape[0]
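
    # For reference: for a binary RBM the joint energy is
    #     E(v, h) = - vbias.v - hbias.h - v.W.h
    # and the free energy, obtained by summing h out, is
    #     F(v) = - vbias.v - sum_j log(1 + exp(hbias_j + (v.W)_j)).
    # free_energy() below computes this quantity, summed over all the rows
    # (examples) of v_sample.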
    def free_energy(self, v_sample):
        ''' Function to compute the free energy '''
        wx_b = T.dot(v_sample, self.W) + self.hbias
        vbias_term = T.sum(T.dot(v_sample, self.vbias))
        hidden_term = T.sum(T.log(1 + T.exp(wx_b)))
        return -hidden_term - vbias_term
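
    # The factorial conditionals implied by the bipartite structure are
    #     P(h_j = 1 | v) = sigmoid(hbias_j + (v.W)_j)
    #     P(v_i = 1 | h) = sigmoid(vbias_i + (h.W')_i),
    # which is exactly what the two sampling helpers below compute before
    # drawing binary (0/1) samples from them.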
    def sample_h_given_v(self, v0_sample):
        ''' This function infers state of hidden units given visible units '''
        # compute the activation of the hidden units given a sample of the visibles
        h1_mean = T.nnet.sigmoid(T.dot(v0_sample, self.W) + self.hbias)
        # get a sample of the hiddens given their activation
        h1_sample = self.theano_rng.binomial(size=h1_mean.shape, n=1, prob=h1_mean)
        return [h1_mean, h1_sample]

    def sample_v_given_h(self, h0_sample):
        ''' This function infers state of visible units given hidden units '''
        # compute the activation of the visible given the hidden sample
        v1_mean = T.nnet.sigmoid(T.dot(h0_sample, self.W.T) + self.vbias)
        # get a sample of the visible given their activation
        v1_sample = self.theano_rng.binomial(size=v1_mean.shape, n=1, prob=v1_mean)
        return [v1_mean, v1_sample]
    def gibbs_hvh(self, h0_sample):
        ''' This function implements one step of Gibbs sampling,
            starting from the hidden state'''
        v1_mean, v1_sample = self.sample_v_given_h(h0_sample)
        h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
        return [v1_mean, v1_sample, h1_mean, h1_sample]

    def gibbs_vhv(self, v0_sample):
        ''' This function implements one step of Gibbs sampling,
            starting from the visible state'''
        h1_mean, h1_sample = self.sample_h_given_v(v0_sample)
        v1_mean, v1_sample = self.sample_v_given_h(h1_sample)
        return [h1_mean, h1_sample, v1_mean, v1_sample]
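
    # gibbs_hvh is the step used during CD/PCD training in cd() below (the
    # chain state is kept on the hidden units), while gibbs_vhv is the step
    # used in test_rbm() to sample visible configurations for visualization.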
    def cd(self, lr=0.1, persistent=None, k=1):
        """
        This function implements one step of CD-k or PCD-k.

        :param lr: learning rate used to train the RBM

        :param persistent: None for CD. For PCD, shared variable containing old state
        of Gibbs chain. This must be a shared variable of size (batch size, number of
        hidden units).

        :param k: number of Gibbs steps to perform

        Returns the cost and the updates dictionary. The dictionary contains the
        update rules for weights and biases but also an update of the shared
        variable used to store the persistent chain, if one is used.
        """

        # compute positive phase
        ph_mean, ph_sample = self.sample_h_given_v(self.input)

        # decide how to initialize persistent chain:
        # for CD, we use the newly generated hidden sample
        # for PCD, we initialize from the old state of the chain
        if persistent is None:
            chain_start = ph_sample
        else:
            chain_start = persistent

        # perform actual negative phase (the first Gibbs step)
        [nv_mean, nv_sample, nh_mean, nh_sample] = self.gibbs_hvh(chain_start)

        # perform the remaining k-1 Gibbs steps for CD-k / PCD-k
        for i in range(k - 1):
            [nv_mean, nv_sample, nh_mean, nh_sample] = self.gibbs_hvh(nh_sample)

        # determine gradients on RBM parameters
        g_vbias = T.sum(self.input - nv_mean, axis=0) / self.batch_size
        g_hbias = T.sum(ph_mean - nh_mean, axis=0) / self.batch_size
        g_W = T.dot(ph_mean.T, self.input) / self.batch_size - \
              T.dot(nh_mean.T, nv_mean) / self.batch_size

        gparams = [g_W.T, g_hbias, g_vbias]

        # constructs the update dictionary
        updates = {}
        for gparam, param in zip(gparams, self.params):
            updates[param] = param + gparam * lr

        if persistent is not None:
            # Note that this works only if persistent is a shared variable
            updates[persistent] = T.cast(nh_sample, dtype=theano.config.floatX)
            # pseudo-likelihood is a better proxy for PCD
            cost = self.get_pseudo_likelihood_cost(updates)
        else:
            # reconstruction cross-entropy is a better proxy for CD
            cost = self.get_reconstruction_cost(updates, nv_mean)

        return cost, updates
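
    # The update rule implemented above is the usual CD-k / PCD-k
    # approximation to the log-likelihood gradient:
    #     dW     ~  <v h>_data  -  <v h>_model
    #     dvbias ~  <v>_data    -  <v>_model
    #     dhbias ~  <h>_data    -  <h>_model
    # where the data-side expectation uses ph_mean and the model-side
    # expectation is estimated from the state at the end of the (persistent)
    # Gibbs chain, averaged over the minibatch.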
    def get_pseudo_likelihood_cost(self, updates):
        """Stochastic approximation to the pseudo-likelihood"""

        # index of bit i in expression p(x_i | x_{\i})
        bit_i_idx = theano.shared(value=0, name='bit_i_idx')

        # binarize the input image by rounding to nearest integer
        xi = T.iround(self.input)

        # calculate free energy for the given bit configuration
        fe_xi = self.free_energy(xi)

        # flip bit x_i of matrix xi and preserve all other bits x_{\i},
        # i.e. xi_flip[:, bit_i_idx] = 1 - xi[:, bit_i_idx]
        xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx])

        # calculate free energy with bit flipped
        fe_xi_flip = self.free_energy(xi_flip)

        # equivalent to e^(-FE(xi)) / (e^(-FE(xi)) + e^(-FE(xi_flip)))
        cost = self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi))

        # increment bit_i_idx modulo n_visible as part of updates
        updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible

        return cost
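
    # Rationale: the log pseudo-likelihood of a sample x is
    #     log PL(x) = sum_i log P(x_i | x_{\i})
    #               = sum_i log sigmoid(FE(x flipped at bit i) - FE(x)).
    # Instead of summing over all n_visible bits, the sum is estimated
    # stochastically by looking at a single (rotating) bit per update and
    # multiplying by n_visible, which is what the cost above computes.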
    def get_reconstruction_cost(self, updates, nv_mean):
        """Approximation to the reconstruction error"""

        cross_entropy = T.mean(
                T.sum(self.input * T.log(nv_mean) +
                (1 - self.input) * T.log(1 - nv_mean), axis=1))

        return cross_entropy

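
def toy_rbm_example():
    """Minimal usage sketch of the RBM class above (illustrative only, never
    called by this script): train with CD-1 on a small random binary dataset.
    All sizes and values below are made up for the example."""
    rng = numpy.random.RandomState(0)
    toy_data = numpy.asarray(rng.uniform(size=(100, 32 * 32)) < 0.5,
                             dtype=theano.config.floatX)
    toy_x = theano.shared(toy_data)
    x = T.matrix('x')
    rbm = RBM(input=x, n_visible=32 * 32, n_hidden=50, numpy_rng=rng,
              theano_rng=RandomStreams(rng.randint(2 ** 30)))
    # one step of CD-1; the cost is the reconstruction cross-entropy proxy
    cost, updates = rbm.cd(lr=0.1, persistent=None, k=1)
    train = theano.function([], cost, updates=updates, givens={x: toy_x})
    for i in xrange(5):
        print 'toy example, step %i, cost %f' % (i, train())
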
def test_rbm(b_size=20, nhidden=1000, kk=1, persistance=0):
    """
    Demonstrate how to train and afterwards sample from an RBM.

    This is demonstrated on 32x32 NIST images.

    :param b_size: size of the minibatch

    :param nhidden: number of hidden units

    :param kk: number of Gibbs steps used in CD-k / PCD-k

    :param persistance: 1 to use persistent CD (PCD), 0 for plain CD
    """

    learning_rate = 0.1

    # if data_set == 0:
    #     datasets = datasets.nist_all()
    # elif data_set == 1:
    #     datasets = datasets.nist_P07()
    # elif data_set == 2:
    #     datasets = datasets.PNIST07()

    data_path = '/data/lisa/data/nist/by_class/'
    f = open(data_path + 'all/all_train_data.ft')
    g = open(data_path + 'all/all_train_labels.ft')
    h = open(data_path + 'all/all_test_data.ft')
    i = open(data_path + 'all/all_test_labels.ft')

    train_set_x = theano.shared(ft.read(f))
    train_set_y = ft.read(g)
    # the test set is also put in a shared variable so that its .value can
    # be sliced below when initializing the sampling chain
    test_set_x = theano.shared(ft.read(h))
    test_set_y = ft.read(i)

    f.close()
    g.close()
    i.close()
    h.close()

    #t = len(train_set_x)
    print train_set_x.value.shape

    # TODO: revisit how the data is loaded
    ## dataset = load_data(dataset)
    ##
    ## train_set_x, train_set_y = datasets[0]
    ## test_set_x , test_set_y = datasets[2]
    training_epochs = 1  # to be determined

    batch_size = b_size    # size of the minibatch

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.value.shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')      # the data is presented as rasterized images

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    # construct the RBM class
    rbm = RBM(input=x, n_visible=32*32,
              n_hidden=nhidden, numpy_rng=rng, theano_rng=theano_rng)

    # initialize storage for the persistent chain (state = hidden layer of chain)
    if persistance == 1:
        persistent_chain = theano.shared(numpy.zeros((batch_size, nhidden),
                                         dtype=theano.config.floatX))
        # get the cost and the gradient corresponding to one step of PCD
        cost, updates = rbm.cd(lr=learning_rate, persistent=persistent_chain, k=kk)

    else:
        # get the cost and the gradient corresponding to one step of CD
        #persistance_chain = None
        cost, updates = rbm.cd(lr=learning_rate, persistent=None, k=kk)

    #################################
    #     Training the RBM          #
    #################################
    #os.chdir('~')
    dirname = str(persistance) + '_' + str(nhidden) + '_' + str(b_size) + '_' + str(kk)
    os.makedirs(dirname)
    os.chdir(dirname)
    print 'Saving output in directory: ' + dirname

    # it is ok for a theano function to have no output
    # the purpose of train_rbm is solely to update the RBM parameters
    train_rbm = theano.function([index], cost,
           updates=updates,
           givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})
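
    # train_rbm(i) processes minibatch i: the `givens` substitution replaces
    # the symbolic input x with the corresponding slice of the shared
    # training matrix, and the `updates` dictionary returned by rbm.cd()
    # modifies W, hbias, vbias (and the persistent chain, if any) in place.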

    print 'Starting training'
    plotting_time = 0.0
    start_time = time.clock()
    bufsize = 1000

    # go through training epochs
    costs = []
    for epoch in xrange(training_epochs):

        # go through the training set
        mean_cost = []
        for batch_index in xrange(n_train_batches):
            mean_cost += [train_rbm(batch_index)]
        # for mini_x, mini_y in datasets.train(b_size):
        #     mean_cost += [train_rbm(mini_x)]
        ##    learning_rate = learning_rate - 0.0001
        ##    learning_rate = learning_rate/(tau+( epoch*batch_index*batch_size))

        #learning_rate = learning_rate/10

        costs.append(numpy.mean(mean_cost))

        # Plot filters after each training epoch
        plotting_start = time.clock()
        # Construct image from the weight matrix; each row of rbm.W.value.T
        # is one filter, tiled by tile_raster_images as a 32x32 patch
        image = PIL.Image.fromarray(tile_raster_images(X=rbm.W.value.T,
                 img_shape=(32, 32), tile_shape=(10, 10),
                 tile_spacing=(1, 1)))
        image.save('filters_at_epoch_%i.png' % epoch)
        plotting_stop = time.clock()
        plotting_time += (plotting_stop - plotting_start)

    end_time = time.clock()

    pretraining_time = (end_time - start_time) - plotting_time

    #################################
    #     Sampling from the RBM     #
    #################################

    # find out the number of test samples
    #number_of_test_samples = 100
    number_of_test_samples = test_set_x.value.shape[0]

    #test_set_x, test_y = datasets.test(100*b_size)
    # pick random test examples, with which to initialize the persistent chain
    test_idx = rng.randint(number_of_test_samples - b_size)
    persistent_vis_chain = theano.shared(test_set_x.value[test_idx:test_idx + b_size])

    # define one step of Gibbs sampling (mf = mean-field)
    [hid_mf, hid_sample, vis_mf, vis_sample] = rbm.gibbs_vhv(persistent_vis_chain)

    # the sample at the end of the chain is returned by ``gibbs_vhv`` as
    # its last output; note that this is computed as a binomial draw,
    # therefore it is formed of ints (0 and 1) and therefore needs to
    # be converted to the same dtype as ``persistent_vis_chain``
    vis_sample = T.cast(vis_sample, dtype=theano.config.floatX)

    # construct the function that implements our persistent chain
    # we generate the "mean field" activations for plotting and the actual samples for
    # reinitializing the state of our persistent chain
    sample_fn = theano.function([], [vis_mf, vis_sample],
                      updates={persistent_vis_chain: vis_sample})
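
    # each call to sample_fn() thus advances every chain in the minibatch by
    # exactly one Gibbs step: it returns the current mean-field activations
    # and binary samples, and its `updates` entry overwrites
    # persistent_vis_chain with the new samples for the next call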

    # sample the RBM, plotting every `plot_every`-th sample; do this
    # until you plot at least `n_samples`
    n_samples = 10
    # run minibatch-size chains for the Gibbs samples (number of negative particles)
    plot_every = b_size

    for idx in xrange(n_samples):

        # run `plot_every` intermediate sampling steps whose output we discard
        for jdx in xrange(plot_every):
            vis_mf, vis_sample = sample_fn()

        # construct image
        image = PIL.Image.fromarray(tile_raster_images(
                         X=vis_mf,
                         img_shape=(32, 32),
                         tile_shape=(10, 10),
                         tile_spacing=(1, 1)))
        #print ' ... plotting sample ', idx
        image.save('sample_%i_step_%i.png' % (idx, idx * jdx))

    # save the model
    model = [rbm.W, rbm.vbias, rbm.hbias]
    f = open('params.txt', 'wb')
    cPickle.dump(model, f, protocol=-1)
    f.close()
    #os.chdir('./..')
    return numpy.mean(costs), pretraining_time * 36


def experiment(state, channel):

    (mean_cost, time_execution) = test_rbm(b_size=state.b_size,
                                           nhidden=state.nhidden,
                                           kk=state.kk,
                                           persistance=state.persistance)

    state.mean_costs = mean_cost
    state.time_execution = time_execution
    pylearn.version.record_versions(state, [theano, pylearn])
    return channel.COMPLETE

if __name__ == '__main__':

    TABLE_NAME = 'RBM_tapha'

    # DB path...
    test_rbm()
    #db = sql.db('postgres://ift6266h10:f0572cd63b@gershwin/ift6266h10_db/'+ TABLE_NAME)

    #state = DD()
    #for b_size in 50, 75, 100:
    #    state.b_size = b_size
    #    for nhidden in 1000, 1250, 1500:
    #        state.nhidden = nhidden
    #        for kk in 1, 2, 3, 4:
    #            state.kk = kk
    #            for persistance in 0, 1:
    #                state.persistance = persistance
    #                sql.insert_job(rbm.experiment, flatten(state), db)

    #db.createView(TABLE_NAME + 'view')