code_tutoriel/rbm.py @ 165:4bc5eeec6394

Updating the tutorial code to the latest revisions.
author Dumitru Erhan <dumitru.erhan@gmail.com>
date Fri, 26 Feb 2010 13:55:27 -0500
1 """This tutorial introduces restricted boltzmann machines (RBM) using Theano.
2
3 Boltzmann Machines (BMs) are a particular form of energy-based model which
4 contain hidden variables. Restricted Boltzmann Machines further restrict BMs
5 to those without visible-visible and hidden-hidden connections.
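
For reference, with weight matrix W, visible bias vbias and hidden bias
hbias (the parameters created below), the energy of a joint configuration
(v, h) is

    E(v, h) = - vbias.v - hbias.h - v.W.h

and the free energy of a visible vector v, obtained by summing out h, is

    F(v) = - vbias.v - sum_j log(1 + exp(hbias_j + (v.W)_j))

which is the quantity the ``free_energy`` method computes.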
6 """


import numpy, time, cPickle, gzip, PIL.Image

import theano
import theano.tensor as T
import os

from theano.tensor.shared_randomstreams import RandomStreams

from utils import tile_raster_images
from logistic_sgd import load_data


class RBM(object):
    """Restricted Boltzmann Machine (RBM)"""

    def __init__(self, input=None, n_visible=784, n_hidden=500,
                 W=None, hbias=None, vbias=None, numpy_rng=None,
                 theano_rng=None):
25 """
26 RBM constructor. Defines the parameters of the model along with
27 basic operations for inferring hidden from visible (and vice-versa),
28 as well as for performing CD updates.
29
30 :param input: None for standalone RBMs or symbolic variable if RBM is
31 part of a larger graph.
32
33 :param n_visible: number of visible units
34
35 :param n_hidden: number of hidden units
36
37 :param W: None for standalone RBMs or symbolic variable pointing to a
38 shared weight matrix in case RBM is part of a DBN network; in a DBN,
39 the weights are shared between RBMs and layers of a MLP
40
41 :param hbias: None for standalone RBMs or symbolic variable pointing
42 to a shared hidden units bias vector in case RBM is part of a
43 different network
44
45 :param vbias: None for standalone RBMs or a symbolic variable
46 pointing to a shared visible units bias
47 """

        self.n_visible = n_visible
        self.n_hidden = n_hidden

        if numpy_rng is None:
            # create a number generator
            numpy_rng = numpy.random.RandomState(1234)

        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        if W is None:
            # W is initialized with `initial_W`, which is uniformly sampled
            # from the range [-sqrt(6./(n_visible+n_hidden)),
            # sqrt(6./(n_visible+n_hidden))]; the output of uniform is
            # converted using asarray to dtype theano.config.floatX so that
            # the code is runnable on GPU
            initial_W = numpy.asarray(numpy_rng.uniform(
                      low=-numpy.sqrt(6. / (n_hidden + n_visible)),
                      high=numpy.sqrt(6. / (n_hidden + n_visible)),
                      size=(n_visible, n_hidden)),
                      dtype=theano.config.floatX)
            # theano shared variable for the weights
            W = theano.shared(value=initial_W, name='W')

        if hbias is None:
            # create shared variable for hidden units bias
            hbias = theano.shared(value=numpy.zeros(n_hidden,
                                  dtype=theano.config.floatX), name='hbias')

        if vbias is None:
            # create shared variable for visible units bias
            vbias = theano.shared(value=numpy.zeros(n_visible,
                                  dtype=theano.config.floatX), name='vbias')

        # initialize input layer for standalone RBM or layer0 of DBN
        self.input = input if input is not None else T.dmatrix('input')

        self.W = W
        self.hbias = hbias
        self.vbias = vbias
        self.theano_rng = theano_rng
        # **** WARNING: It is not a good idea to put things in this list
        # other than shared variables created in this function.
        self.params = [self.W, self.hbias, self.vbias]
        self.batch_size = self.input.shape[0]

    def free_energy(self, v_sample):
        ''' Function to compute the free energy '''
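        # computes F(v) = - vbias.v - sum_j log(1 + exp(hbias_j + (v.W)_j)),
        # summed here over the whole minibatch rather than per example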
        wx_b = T.dot(v_sample, self.W) + self.hbias
        vbias_term = T.sum(T.dot(v_sample, self.vbias))
        hidden_term = T.sum(T.log(1 + T.exp(wx_b)))
        return -hidden_term - vbias_term

    def sample_h_given_v(self, v0_sample):
        ''' This function infers the state of the hidden units given the visible units '''
        # compute the activation of the hidden units given a sample of the visibles
        h1_mean = T.nnet.sigmoid(T.dot(v0_sample, self.W) + self.hbias)
        # get a sample of the hiddens given their activation
        h1_sample = self.theano_rng.binomial(size=h1_mean.shape, n=1, prob=h1_mean)
        return [h1_mean, h1_sample]

    def sample_v_given_h(self, h0_sample):
        ''' This function infers the state of the visible units given the hidden units '''
        # compute the activation of the visible units given the hidden sample
        v1_mean = T.nnet.sigmoid(T.dot(h0_sample, self.W.T) + self.vbias)
        # get a sample of the visibles given their activation
        v1_sample = self.theano_rng.binomial(size=v1_mean.shape, n=1, prob=v1_mean)
        return [v1_mean, v1_sample]

    def gibbs_hvh(self, h0_sample):
        ''' This function implements one step of Gibbs sampling,
            starting from the hidden state '''
        v1_mean, v1_sample = self.sample_v_given_h(h0_sample)
        h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
        return [v1_mean, v1_sample, h1_mean, h1_sample]

    def gibbs_vhv(self, v0_sample):
        ''' This function implements one step of Gibbs sampling,
            starting from the visible state '''
        h1_mean, h1_sample = self.sample_h_given_v(v0_sample)
        v1_mean, v1_sample = self.sample_v_given_h(h1_sample)
        return [h1_mean, h1_sample, v1_mean, v1_sample]

    def cd(self, lr=0.1, persistent=None):
        """
        This function implements one step of CD-1 or PCD-1.

        :param lr: learning rate used to train the RBM

        :param persistent: None for CD. For PCD, shared variable containing
        the old state of the Gibbs chain. This must be a shared variable of
        size (batch size, number of hidden units).

        Returns a proxy cost and the updates dictionary. The dictionary
        contains the update rules for the weights and biases, but also an
        update of the shared variable used to store the persistent chain,
        if one is used.
        """

        # compute the positive phase
        ph_mean, ph_sample = self.sample_h_given_v(self.input)

        # decide how to initialize the persistent chain:
        # for CD, we use the newly generated hidden sample
        # for PCD, we initialize from the old state of the chain
        if persistent is None:
            chain_start = ph_sample
        else:
            chain_start = persistent

        # perform the actual negative phase
        [nv_mean, nv_sample, nh_mean, nh_sample] = self.gibbs_hvh(chain_start)

        # determine gradients on RBM parameters
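        # these expressions approximate the log-likelihood gradient
        #     d log p(v) / d W_ij    =  <v_i h_j>_data - <v_i h_j>_model
        #     d log p(v) / d vbias_i =  <v_i>_data     - <v_i>_model
        #     d log p(v) / d hbias_j =  <h_j>_data     - <h_j>_model
        # where the data expectation is the minibatch average and the model
        # expectation is estimated from the (P)CD negative sample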
        g_vbias = T.sum(self.input - nv_mean, axis=0) / self.batch_size
        g_hbias = T.sum(ph_mean - nh_mean, axis=0) / self.batch_size
        g_W = T.dot(ph_mean.T, self.input) / self.batch_size - \
              T.dot(nh_mean.T, nv_mean) / self.batch_size

        gparams = [g_W.T, g_hbias, g_vbias]

        # construct the update dictionary
        updates = {}
        for gparam, param in zip(gparams, self.params):
            updates[param] = param + gparam * lr

        if persistent is not None:
            # Note that this works only if persistent is a shared variable
            updates[persistent] = T.cast(nh_sample, dtype=theano.config.floatX)
            # pseudo-likelihood is a better proxy for PCD
            cost = self.get_pseudo_likelihood_cost(updates)
        else:
            # reconstruction cross-entropy is a better proxy for CD
            cost = self.get_reconstruction_cost(updates, nv_mean)

        return cost, updates

    def get_pseudo_likelihood_cost(self, updates):
        """Stochastic approximation to the pseudo-likelihood"""

        # index of bit i in the expression p(x_i | x_{\i})
        bit_i_idx = theano.shared(value=0, name='bit_i_idx')

        # binarize the input image by rounding to the nearest integer
        xi = T.iround(self.input)

        # calculate the free energy for the given bit configuration
        fe_xi = self.free_energy(xi)

        # flip bit x_i of matrix xi and preserve all other bits x_{\i}
        # Equivalent to xi[:, bit_i_idx] = 1 - xi[:, bit_i_idx]
        # NB: slice(start, stop, step) is the python object used for
        # slicing, e.g. to index matrix x as follows: x[start:stop:step]
        xi_flip = T.setsubtensor(xi, 1 - xi[:, bit_i_idx],
                                 idx_list=(slice(None, None, None), bit_i_idx))

        # calculate the free energy with bit i flipped
        fe_xi_flip = self.free_energy(xi_flip)

        # equivalent to e^(-FE(xi)) / (e^(-FE(xi)) + e^(-FE(xi_flip)))
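        # i.e. P(x_i = xi_i | x_{\i}) = sigmoid(FE(xi_flip) - FE(xi)); only
        # one (rotating) bit i is evaluated per update, so multiplying by
        # n_visible gives a stochastic estimate of the pseudo-likelihood
        # sum over all n_visible bits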
        cost = self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi))

        # increment bit_i_idx modulo n_visible as part of the updates
        updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible

        return cost

    def get_reconstruction_cost(self, updates, nv_mean):
        """Approximation to the reconstruction error"""

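        # note the sign convention: this is the mean per-example
        # log-likelihood of the input under the mean-field reconstruction
        # (i.e. the negative of the usual cross-entropy), so larger values
        # mean better reconstructions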
        cross_entropy = T.mean(
                T.sum(self.input * T.log(nv_mean) +
                      (1 - self.input) * T.log(1 - nv_mean), axis=1))

        return cross_entropy


def test_rbm(learning_rate=0.1, training_epochs=15,
             dataset='mnist.pkl.gz'):
    """
    Demonstrate how to train an RBM and afterwards sample from it.

    This is demonstrated on MNIST.

    :param learning_rate: learning rate used for training the RBM

    :param training_epochs: number of epochs used for training

    :param dataset: path to the pickled dataset

    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    test_set_x, test_set_y = datasets[2]

    batch_size = 20    # size of the minibatch

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.value.shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')      # the data is presented as rasterized images

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    # initialize storage for the persistent chain (state = hidden layer of chain)
    persistent_chain = theano.shared(numpy.zeros((batch_size, 500),
                                                 dtype=theano.config.floatX))

    # construct the RBM class
    rbm = RBM(input=x, n_visible=28 * 28,
              n_hidden=500, numpy_rng=rng, theano_rng=theano_rng)

    # get the cost and the updates corresponding to one step of PCD
    cost, updates = rbm.cd(lr=learning_rate, persistent=persistent_chain)

    #################################
    #     Training the RBM          #
    #################################
    dirname = 'lr=%.5f' % learning_rate
    os.makedirs(dirname)
    os.chdir(dirname)

    # it is ok for a theano function to have no output;
    # the purpose of train_rbm is solely to update the RBM parameters
    train_rbm = theano.function([index], cost,
           updates=updates,
           givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    plotting_time = 0.
    start_time = time.clock()

    # go through training epochs
    for epoch in xrange(training_epochs):

        # go through the training set
        mean_cost = []
        for batch_index in xrange(n_train_batches):
            mean_cost += [train_rbm(batch_index)]

        print 'Training epoch %d, cost is ' % epoch, numpy.mean(mean_cost)

        # Plot filters after each training epoch
        plotting_start = time.clock()
        # Construct image from the weight matrix
        image = PIL.Image.fromarray(tile_raster_images(X=rbm.W.value.T,
                 img_shape=(28, 28), tile_shape=(10, 10),
                 tile_spacing=(1, 1)))
        image.save('filters_at_epoch_%i.png' % epoch)
        plotting_stop = time.clock()
        plotting_time += (plotting_stop - plotting_start)

    end_time = time.clock()

    pretraining_time = (end_time - start_time) - plotting_time

    print ('Training took %f minutes' % (pretraining_time / 60.))

    #################################
    #     Sampling from the RBM     #
    #################################

    # find out the number of test samples
    number_of_test_samples = test_set_x.value.shape[0]

    # pick random test examples with which to initialize the persistent chain
    test_idx = rng.randint(number_of_test_samples - 20)
    persistent_vis_chain = theano.shared(test_set_x.value[test_idx:test_idx + 20])

    # define one step of Gibbs sampling (mf = mean-field)
    [hid_mf, hid_sample, vis_mf, vis_sample] = rbm.gibbs_vhv(persistent_vis_chain)

    # the visible sample at the end of the Gibbs step is returned by
    # ``gibbs_vhv`` as its fourth output; note that it is computed as a
    # binomial draw, therefore it consists of ints (0 and 1) and needs to
    # be converted to the same dtype as ``persistent_vis_chain``
    vis_sample = T.cast(vis_sample, dtype=theano.config.floatX)

    # construct the function that implements our persistent chain;
    # we generate the "mean field" activations for plotting and the actual
    # samples for reinitializing the state of our persistent chain
    sample_fn = theano.function([], [vis_mf, vis_sample],
                                updates={persistent_vis_chain: vis_sample})

    # sample from the RBM, plotting one image every `plot_every` steps,
    # until `n_samples` images have been plotted
    n_samples = 10
    plot_every = 1000

    for idx in xrange(n_samples):

        # run `plot_every` intermediate sampling steps whose output we discard
        for jdx in xrange(plot_every):
            vis_mf, vis_sample = sample_fn()

        # construct image
        image = PIL.Image.fromarray(tile_raster_images(
                 X=vis_mf,
                 img_shape=(28, 28),
                 tile_shape=(10, 10),
                 tile_spacing=(1, 1)))
        print ' ... plotting sample ', idx
        image.save('sample_%i_step_%i.png' % (idx, (idx + 1) * plot_every))


if __name__ == '__main__':
    test_rbm()