"""This tutorial introduces restricted Boltzmann machines (RBMs) using Theano.

Boltzmann Machines (BMs) are a particular form of energy-based model which
contain hidden variables. Restricted Boltzmann Machines further restrict BMs
to those without visible-visible and hidden-hidden connections.
"""

import os
import time

import numpy
import cPickle
import PIL.Image

import theano
import theano.tensor as T
import theano.tensor.nnet
from theano.tensor.shared_randomstreams import RandomStreams

import pylearn.version
import ift6266
from ift6266 import datasets

from utils import tile_raster_images
from logistic_sgd import load_data

class RBM(object):
    """Restricted Boltzmann Machine (RBM) """

    def __init__(self, input=None, n_visible=32*32, n_hidden=500,
                 W=None, hbias=None, vbias=None, numpy_rng=None,
                 theano_rng=None):
        """
        RBM constructor. Defines the parameters of the model along with
        basic operations for inferring hidden from visible (and vice-versa),
        as well as for performing CD updates.

        :param input: None for standalone RBMs or symbolic variable if RBM is
        part of a larger graph.

        :param n_visible: number of visible units

        :param n_hidden: number of hidden units

        :param W: None for standalone RBMs or symbolic variable pointing to a
        shared weight matrix in case RBM is part of a DBN network; in a DBN,
        the weights are shared between RBMs and layers of an MLP

        :param hbias: None for standalone RBMs or symbolic variable pointing
        to a shared hidden units bias vector in case RBM is part of a
        different network

        :param vbias: None for standalone RBMs or a symbolic variable
        pointing to a shared visible units bias
        """

        self.n_visible = n_visible
        self.n_hidden = n_hidden

        if numpy_rng is None:
            # create a numpy random number generator
            numpy_rng = numpy.random.RandomState(1234)

        if W is None:
            # W is initialized with `initial_W`, which is uniformly sampled
            # from [-6./sqrt(n_visible+n_hidden), 6./sqrt(n_visible+n_hidden)];
            # the output of uniform is converted using asarray to dtype
            # theano.config.floatX so that the code is runnable on a GPU
            initial_W = numpy.asarray(numpy_rng.uniform(
                      low=-numpy.sqrt(6. / (n_hidden + n_visible)),
                      high=numpy.sqrt(6. / (n_hidden + n_visible)),
                      size=(n_visible, n_hidden)),
                      dtype=theano.config.floatX)
            # theano shared variable for the weights
            W = theano.shared(value=initial_W, name='W')

        if hbias is None:
            # create shared variable for hidden units bias
            hbias = theano.shared(value=numpy.zeros(n_hidden,
                                dtype=theano.config.floatX), name='hbias')

        if vbias is None:
            # create shared variable for visible units bias
            vbias = theano.shared(value=numpy.zeros(n_visible,
                                dtype=theano.config.floatX), name='vbias')

        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        # initialize input layer for standalone RBM or layer0 of DBN
        self.input = input if input is not None else T.dmatrix('input')

        self.W = W
        self.hbias = hbias
        self.vbias = vbias
        self.theano_rng = theano_rng

        # **** WARNING: It is not a good idea to put things in this list
        # other than shared variables created in this function.
        self.params = [self.W, self.hbias, self.vbias]
        # symbolic minibatch size, taken from the input
        self.batch_size = self.input.shape[0]

    def free_energy(self, v_sample):
        ''' Function to compute the free energy '''
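        # For a binary RBM the free energy has the closed form
        #     F(v) = -vbias . v - sum_j log(1 + exp(hbias_j + (v.W)_j));
        # `vbias_term` and `hidden_term` below compute these two pieces,
        # summed over the minibatch, so the result is a single scalar.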
        wx_b = T.dot(v_sample, self.W) + self.hbias
        vbias_term = T.sum(T.dot(v_sample, self.vbias))
        hidden_term = T.sum(T.log(1 + T.exp(wx_b)))
        return -hidden_term - vbias_term

    def sample_h_given_v(self, v0_sample):
        ''' This function infers state of hidden units given visible units '''
        # compute the activation of the hidden units given a sample of the visibles
        h1_mean = T.nnet.sigmoid(T.dot(v0_sample, self.W) + self.hbias)
        # get a sample of the hiddens given their activation
        h1_sample = self.theano_rng.binomial(size=h1_mean.shape, n=1, prob=h1_mean)
        return [h1_mean, h1_sample]

    def sample_v_given_h(self, h0_sample):
        ''' This function infers state of visible units given hidden units '''
        # compute the activation of the visible units given the hidden sample
        v1_mean = T.nnet.sigmoid(T.dot(h0_sample, self.W.T) + self.vbias)
        # get a sample of the visibles given their activation
        v1_sample = self.theano_rng.binomial(size=v1_mean.shape, n=1, prob=v1_mean)
        return [v1_mean, v1_sample]

    def gibbs_hvh(self, h0_sample):
        ''' This function implements one step of Gibbs sampling,
            starting from the hidden state'''
        v1_mean, v1_sample = self.sample_v_given_h(h0_sample)
        h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
        return [v1_mean, v1_sample, h1_mean, h1_sample]

    def gibbs_vhv(self, v0_sample):
        ''' This function implements one step of Gibbs sampling,
            starting from the visible state'''
        h1_mean, h1_sample = self.sample_h_given_v(v0_sample)
        v1_mean, v1_sample = self.sample_v_given_h(h1_sample)
        return [h1_mean, h1_sample, v1_mean, v1_sample]

    def cd(self, lr=0.1, persistent=None, k=1):
        """
        This function implements one step of CD-k or PCD-k.

        :param lr: learning rate used to train the RBM

        :param persistent: None for CD. For PCD, shared variable containing old state
        of Gibbs chain. This must be a shared variable of size (batch size, number of
        hidden units).

        :param k: number of Gibbs steps used in the negative phase

        Returns the cost and the updates dictionary. The dictionary contains the
        update rules for weights and biases but also an update of the shared
        variable used to store the persistent chain, if one is used.
        """

        # compute positive phase
        ph_mean, ph_sample = self.sample_h_given_v(self.input)

        # decide how to initialize the persistent chain:
        # for CD, we use the newly generated hidden sample
        # for PCD, we initialize from the old state of the chain
        if persistent is None:
            chain_start = ph_sample
        else:
            chain_start = persistent

        # perform the negative phase: k steps of Gibbs sampling,
        # starting from chain_start (this is CD-k / PCD-k)
        [nv_mean, nv_sample, nh_mean, nh_sample] = self.gibbs_hvh(chain_start)
        for step in xrange(k - 1):
            [nv_mean, nv_sample, nh_mean, nh_sample] = self.gibbs_hvh(nh_sample)

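        # The updates below implement a stochastic gradient step on the
        # log-likelihood proxy: for each parameter theta,
        #     d log p(v) / d theta ~= -d FE(v_data)/d theta + d FE(v_model)/d theta,
        # with the model expectation estimated from the negative-phase
        # statistics (nh_mean, nv_mean) of the k-step Gibbs chain and the
        # data expectation from the positive-phase statistics (ph_mean, input).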
        # determine gradients on RBM parameters
        g_vbias = T.sum(self.input - nv_mean, axis=0) / self.batch_size
        g_hbias = T.sum(ph_mean - nh_mean, axis=0) / self.batch_size
        g_W = T.dot(ph_mean.T, self.input) / self.batch_size - \
              T.dot(nh_mean.T, nv_mean) / self.batch_size

        gparams = [g_W.T, g_hbias, g_vbias]

        # construct the updates dictionary
        updates = {}
        for gparam, param in zip(gparams, self.params):
            updates[param] = param + gparam * lr

        if persistent is not None:
            # Note that this works only if persistent is a shared variable
            updates[persistent] = T.cast(nh_sample, dtype=theano.config.floatX)
            # pseudo-likelihood is a better proxy for PCD
            cost = self.get_pseudo_likelihood_cost(updates)
        else:
            # reconstruction cross-entropy is a better proxy for CD
            cost = self.get_reconstruction_cost(updates, nv_mean)

        return cost, updates

    def get_pseudo_likelihood_cost(self, updates):
        """Stochastic approximation to the pseudo-likelihood"""

        # index of bit i in expression p(x_i | x_{\i})
        bit_i_idx = theano.shared(value=0, name='bit_i_idx')

        # binarize the input image by rounding to the nearest integer
        xi = T.iround(self.input)

        # calculate free energy for the given bit configuration
        fe_xi = self.free_energy(xi)

        # flip bit x_i of matrix xi and preserve all other bits x_{\i}
        # Equivalent to xi[:, bit_i_idx] = 1 - xi[:, bit_i_idx]
        xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx])

        # calculate free energy with bit flipped
        fe_xi_flip = self.free_energy(xi_flip)

        # equivalent to log of e^(-FE(x)) / (e^(-FE(x)) + e^(-FE(x_flip)))
        cost = self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi))

        # increment bit_i_idx modulo n_visible as part of the updates
        updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible

        return cost

    def get_reconstruction_cost(self, updates, nv_mean):
        """Approximation to the reconstruction error"""

        cross_entropy = T.mean(
                T.sum(self.input * T.log(nv_mean) +
                      (1 - self.input) * T.log(1 - nv_mean), axis=1))

        return cross_entropy

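# A minimal usage sketch of the class above: build a tiny RBM on random
# binary data and compile a CD-1 training function. The shapes and the toy
# data here are illustrative assumptions only, not values from the
# experiments below.
def toy_rbm_example():
    x = T.matrix('x')
    rbm = RBM(input=x, n_visible=4, n_hidden=3)
    cost, updates = rbm.cd(lr=0.1, persistent=None, k=1)
    # each call applies one CD-1 update to W, hbias and vbias in place
    # and returns the reconstruction-cost proxy
    train = theano.function([x], cost, updates=updates)
    toy_data = numpy.random.binomial(n=1, p=0.5, size=(5, 4)).astype(
            theano.config.floatX)
    for i in xrange(3):
        print train(toy_data)
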

def test_rbm(b_size=25, nhidden=1000, kk=1, persistance=0, dataset=0):
    """
    Demonstrate how to train an RBM with contrastive divergence (CD-k) or
    persistent contrastive divergence (PCD-k).

    This is demonstrated on the NIST datasets of 32x32 character images.

    :param b_size: size of a minibatch

    :param nhidden: number of hidden units

    :param kk: number of Gibbs steps per CD/PCD update

    :param persistance: 1 to use a persistent chain (PCD), 0 for plain CD

    :param dataset: which dataset to train on (0: nist_all, 1: nist_P07,
    2: PNIST07)
    """

    learning_rate = 0.1
    training_epochs = 10  # to be tuned

    if dataset == 0:
        data = datasets.nist_all()
    elif dataset == 1:
        data = datasets.nist_P07()
    elif dataset == 2:
        data = datasets.PNIST07()

    # TODO: revisit how the data is retrieved
    ## dataset = load_data(dataset)
    ##
    ## train_set_x, train_set_y = datasets[0]
    ## test_set_x , test_set_y = datasets[2]

    batch_size = b_size  # size of the minibatch

    # compute number of minibatches for training, validation and testing
    #n_train_batches = train_set_x.value.shape[0] / batch_size

    # allocate symbolic variables for the data
    x = T.matrix('x')  # the data is presented as rasterized images

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    # construct the RBM class
    rbm = RBM(input=x, n_visible=32*32,
              n_hidden=nhidden, numpy_rng=rng, theano_rng=theano_rng)

    # initialize storage for the persistent chain (state = hidden layer of chain)
    if persistance == 1:
        persistent_chain = theano.shared(numpy.zeros((batch_size, nhidden),
                                         dtype=theano.config.floatX))
        # get the cost and the gradient corresponding to one step of PCD
        cost, updates = rbm.cd(lr=learning_rate, persistent=persistent_chain, k=kk)
    else:
        # get the cost and the gradient corresponding to one step of CD
        cost, updates = rbm.cd(lr=learning_rate, persistent=None, k=kk)

    #################################
    #     Training the RBM          #
    #################################
    dirname = 'data=%i persistance=%i n_hidden=%i batch_size=%i' \
              % (dataset, persistance, nhidden, b_size)
    os.makedirs(dirname)
    os.chdir(dirname)

    # compile the theano function that performs one training update on a
    # minibatch and returns the proxy cost
    train_rbm = theano.function([x], cost,
           updates=updates)

    plotting_time = 0.0
    start_time = time.clock()

    # go through training epochs
    costs = []
    for epoch in xrange(training_epochs):

        # go through the training set
        mean_cost = []
        for mini_x, mini_y in data.train(b_size):
            mean_cost += [train_rbm(mini_x)]
            ## learning_rate = learning_rate - 0.0001
            ## learning_rate = learning_rate/(tau+( epoch*batch_index*batch_size))

        #learning_rate = learning_rate/10

        costs.append(numpy.mean(mean_cost))

        # Plot filters after each training epoch
        plotting_start = time.clock()
        # Construct image from the weight matrix
        image = PIL.Image.fromarray(tile_raster_images(X=rbm.W.value.T,
                 img_shape=(32, 32), tile_shape=(10, 10),
                 tile_spacing=(1, 1)))
        image.save('filters_at_epoch_%i.png' % epoch)
        plotting_stop = time.clock()
        plotting_time += (plotting_stop - plotting_start)

    end_time = time.clock()

    pretraining_time = (end_time - start_time) - plotting_time

    #################################
    #     Sampling from the RBM     #
    #################################

    # number of test samples from which the chain initialization is drawn
    number_of_test_samples = 1000

    # pick random test examples, with which to initialize the persistent chain
    # (take the first large minibatch yielded by the test-set iterator)
    test_set_x, test_y = iter(data.test(100 * b_size)).next()
    test_idx = rng.randint(number_of_test_samples - b_size)
    persistent_vis_chain = theano.shared(numpy.asarray(
            test_set_x[test_idx:test_idx + b_size],
            dtype=theano.config.floatX))

    # define one step of Gibbs sampling (mf = mean-field)
    [hid_mf, hid_sample, vis_mf, vis_sample] = rbm.gibbs_vhv(persistent_vis_chain)

    # the sample at the end of the chain is returned by ``gibbs_vhv`` as
    # its fourth output; note that this is computed as a binomial draw,
    # therefore it is formed of ints (0 and 1) and therefore needs to
    # be converted to the same dtype as ``persistent_vis_chain``
    vis_sample = T.cast(vis_sample, dtype=theano.config.floatX)

    # construct the function that implements our persistent chain
    # we generate the "mean field" activations for plotting and the actual samples
    # for reinitializing the state of our persistent chain
    sample_fn = theano.function([], [vis_mf, vis_sample],
                      updates={persistent_vis_chain: vis_sample})

    # sample the RBM, plotting every `plot_every`-th sample; do this
    # until you have plotted at least `n_samples`
    n_samples = 10
    # run minibatch-size chains for the Gibbs samples (number of negative particles)
    plot_every = b_size

    for idx in xrange(n_samples):

        # do `plot_every` intermediate sampling steps that we discard
        for jdx in xrange(plot_every):
            vis_mf, vis_sample = sample_fn()

        # construct image
        image = PIL.Image.fromarray(tile_raster_images(
                                         X=vis_mf,
                                         img_shape=(32, 32),
                                         tile_shape=(10, 10),
                                         tile_spacing=(1, 1)))
        #print ' ... plotting sample ', idx
        image.save('sample_%i_step_%i.png' % (idx, (idx + 1) * plot_every))

    # save the model
    model = [rbm.W, rbm.vbias, rbm.hbias]
    f = open('params.txt', 'w')
    cPickle.dump(model, f)
    f.close()
    os.chdir('./..')

    return numpy.mean(costs), pretraining_time / 360

def experiment(state, channel):

    (mean_cost, time_execution) = test_rbm(b_size=state.b_size,
                                           nhidden=state.nhidden,
                                           kk=state.kk,
                                           persistance=state.persistance,
                                           dataset=state.dataset)

    state.mean_costs = mean_cost
    state.time_execution = time_execution
    pylearn.version.record_versions(state, [theano, ift6266, pylearn])
    return channel.COMPLETE


if __name__ == '__main__':
    test_rbm()