Mercurial > ift6266
comparison: code_tutoriel/rbm.py @ 165:4bc5eeec6394
Updating the tutorial code to the latest revisions.
author: Dumitru Erhan <dumitru.erhan@gmail.com>
date: Fri, 26 Feb 2010 13:55:27 -0500
comparing 164:e3de934a98b6 with 165:4bc5eeec6394
1 """This tutorial introduces restricted boltzmann machines (RBM) using Theano. | |
2 | |
3 Boltzmann Machines (BMs) are a particular form of energy-based model which | |
4 contain hidden variables. Restricted Boltzmann Machines further restrict BMs | |
5 to those without visible-visible and hidden-hidden connections. | |
6 """ | |

import numpy, time, cPickle, gzip, PIL.Image

import theano
import theano.tensor as T
import os

from theano.tensor.shared_randomstreams import RandomStreams

from utils import tile_raster_images
from logistic_sgd import load_data

class RBM(object):
    """Restricted Boltzmann Machine (RBM) """
    def __init__(self, input=None, n_visible=784, n_hidden=500,
                 W=None, hbias=None, vbias=None, numpy_rng=None,
                 theano_rng=None):
        """
        RBM constructor. Defines the parameters of the model along with
        basic operations for inferring hidden from visible (and vice-versa),
        as well as for performing CD updates.

        :param input: None for standalone RBMs or symbolic variable if RBM is
        part of a larger graph.

        :param n_visible: number of visible units

        :param n_hidden: number of hidden units

        :param W: None for standalone RBMs or symbolic variable pointing to a
        shared weight matrix in case RBM is part of a DBN; in a DBN, the
        weights are shared between RBMs and the layers of an MLP

        :param hbias: None for standalone RBMs or symbolic variable pointing
        to a shared hidden units bias vector in case RBM is part of a
        different network

        :param vbias: None for standalone RBMs or a symbolic variable
        pointing to a shared visible units bias
        """

        self.n_visible = n_visible
        self.n_hidden = n_hidden

        if numpy_rng is None:
            # create a number generator
            numpy_rng = numpy.random.RandomState(1234)

        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        if W is None:
            # W is initialized with `initial_W`, which is uniformly sampled
            # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible);
            # the output of uniform is converted using asarray to dtype
            # theano.config.floatX so that the code is runnable on GPU.
            # Drawing from `numpy_rng` (created above) keeps the
            # initialization reproducible.
            initial_W = numpy.asarray(numpy_rng.uniform(
                  low=-numpy.sqrt(6. / (n_hidden + n_visible)),
                  high=numpy.sqrt(6. / (n_hidden + n_visible)),
                  size=(n_visible, n_hidden)),
                  dtype=theano.config.floatX)
            # theano shared variables for weights and biases
            W = theano.shared(value=initial_W, name='W')

        if hbias is None:
            # create shared variable for hidden units bias
            hbias = theano.shared(value=numpy.zeros(n_hidden,
                                  dtype=theano.config.floatX), name='hbias')

        if vbias is None:
            # create shared variable for visible units bias
            vbias = theano.shared(value=numpy.zeros(n_visible,
                                  dtype=theano.config.floatX), name='vbias')

        # initialize input layer for standalone RBM or layer0 of DBN
        self.input = input if input is not None else T.dmatrix('input')

        self.W = W
        self.hbias = hbias
        self.vbias = vbias
        self.theano_rng = theano_rng
        # **** WARNING: It is not a good idea to put things in this list
        # other than shared variables created in this function.
        self.params = [self.W, self.hbias, self.vbias]
        self.batch_size = self.input.shape[0]

    def free_energy(self, v_sample):
        ''' Function to compute the free energy '''
        wx_b = T.dot(v_sample, self.W) + self.hbias
        vbias_term = T.sum(T.dot(v_sample, self.vbias))
        hidden_term = T.sum(T.log(1 + T.exp(wx_b)))
        return -hidden_term - vbias_term

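    # Note on free_energy above: marginalizing out the hidden units
    # analytically gives
    #   F(v) = - vbias.v - sum_j log(1 + exp(hbias_j + (v.W)_j));
    # `vbias_term` and `hidden_term` compute these two pieces, summed over
    # the minibatch.
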
    def sample_h_given_v(self, v0_sample):
        ''' This function infers the state of the hidden units given the visible units '''
        # compute the activation of the hidden units given a sample of the visibles
        h1_mean = T.nnet.sigmoid(T.dot(v0_sample, self.W) + self.hbias)
        # get a sample of the hiddens given their activation
        h1_sample = self.theano_rng.binomial(size=h1_mean.shape, n=1, prob=h1_mean)
        return [h1_mean, h1_sample]

    def sample_v_given_h(self, h0_sample):
        ''' This function infers the state of the visible units given the hidden units '''
        # compute the activation of the visible units given the hidden sample
        v1_mean = T.nnet.sigmoid(T.dot(h0_sample, self.W.T) + self.vbias)
        # get a sample of the visibles given their activation
        v1_sample = self.theano_rng.binomial(size=v1_mean.shape, n=1, prob=v1_mean)
        return [v1_mean, v1_sample]

    def gibbs_hvh(self, h0_sample):
        ''' This function implements one step of Gibbs sampling,
            starting from the hidden state'''
        v1_mean, v1_sample = self.sample_v_given_h(h0_sample)
        h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
        return [v1_mean, v1_sample, h1_mean, h1_sample]

    def gibbs_vhv(self, v0_sample):
        ''' This function implements one step of Gibbs sampling,
            starting from the visible state'''
        h1_mean, h1_sample = self.sample_h_given_v(v0_sample)
        v1_mean, v1_sample = self.sample_v_given_h(h1_sample)
        return [h1_mean, h1_sample, v1_mean, v1_sample]

    def cd(self, lr=0.1, persistent=None):
        """
        This function implements one step of CD-1 or PCD-1.

        :param lr: learning rate used to train the RBM
        :param persistent: None for CD. For PCD, shared variable containing the old
        state of the Gibbs chain. This must be a shared variable of size (batch size,
        number of hidden units).

        Returns a proxy cost and the updates dictionary. The dictionary contains the
        update rules for weights and biases but also an update of the shared variable
        used to store the persistent chain, if one is used.
        """

        # compute positive phase
        ph_mean, ph_sample = self.sample_h_given_v(self.input)

        # decide how to initialize the persistent chain:
        # for CD, we use the newly generated hidden sample
        # for PCD, we initialize from the old state of the chain
        if persistent is None:
            chain_start = ph_sample
        else:
            chain_start = persistent

        # perform the actual negative phase
        [nv_mean, nv_sample, nh_mean, nh_sample] = self.gibbs_hvh(chain_start)

        # determine gradients on RBM parameters
        g_vbias = T.sum(self.input - nv_mean, axis=0) / self.batch_size
        g_hbias = T.sum(ph_mean - nh_mean, axis=0) / self.batch_size
        g_W = T.dot(ph_mean.T, self.input) / self.batch_size - \
              T.dot(nh_mean.T, nv_mean) / self.batch_size

        gparams = [g_W.T, g_hbias, g_vbias]

        # construct the updates dictionary
        updates = {}
        for gparam, param in zip(gparams, self.params):
            updates[param] = param + gparam * lr

        if persistent is not None:
            # Note that this works only if persistent is a shared variable
            updates[persistent] = T.cast(nh_sample, dtype=theano.config.floatX)
            # pseudo-likelihood is a better proxy for PCD
            cost = self.get_pseudo_likelihood_cost(updates)
        else:
            # reconstruction cross-entropy is a better proxy for CD
            cost = self.get_reconstruction_cost(updates, nv_mean)

        return cost, updates

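    # Note on cd above: the updates follow the stochastic approximation to
    # the log-likelihood gradient,
    #   d log p(v) / d theta = - dF(v)/d theta + dF(v~)/d theta,
    # where v~ is the sample at the end of the Gibbs chain. For W this is
    # the positive phase <v,h>_data minus the negative phase <v,h>_model,
    # which g_W computes from ph_mean, nh_mean and nv_mean.
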
    def get_pseudo_likelihood_cost(self, updates):
        """Stochastic approximation to the pseudo-likelihood"""

        # index of bit i in expression p(x_i | x_{\i})
        bit_i_idx = theano.shared(value=0, name='bit_i_idx')

        # binarize the input image by rounding to the nearest integer
        xi = T.iround(self.input)

        # calculate the free energy for the given bit configuration
        fe_xi = self.free_energy(xi)

        # flip bit x_i of matrix xi and preserve all other bits x_{\i}
        # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx]
        # NB: slice(start,stop,step) is the python object used for
        # slicing, e.g. to index matrix x as follows: x[start:stop:step]
        xi_flip = T.setsubtensor(xi, 1 - xi[:, bit_i_idx],
                                 idx_list=(slice(None, None, None), bit_i_idx))

        # calculate the free energy with bit x_i flipped
        fe_xi_flip = self.free_energy(xi_flip)

        # equivalent to n_visible * log P(x_i | x_{\i}), since
        # P(x_i | x_{\i}) = sigmoid(FE(xi_flip) - FE(xi))
        #                 = exp(-FE(xi)) / (exp(-FE(xi)) + exp(-FE(xi_flip)))
        cost = self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi))

        # increment bit_i_idx modulo n_visible as part of updates
        updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible

        return cost

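    # Note on get_pseudo_likelihood_cost above: the full log pseudo-likelihood
    # sums log P(x_i | x_{\i}) over all n_visible bits; evaluating a single,
    # rotating bit per update and scaling by n_visible is an unbiased
    # stochastic estimate of that sum.
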
    def get_reconstruction_cost(self, updates, nv_mean):
        """Approximation to the reconstruction error"""

        cross_entropy = T.mean(
                T.sum(self.input * T.log(nv_mean) +
                (1 - self.input) * T.log(1 - nv_mean), axis=1))

        return cross_entropy

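
# A minimal NumPy transcription of one CD-1 step, useful for sanity-checking
# the symbolic graph built by RBM.cd() on toy data. The helper name
# `numpy_cd1_step` is illustrative and not part of the tutorial; it assumes
# binary visibles in a (batch_size, n_visible) float matrix `v0`.
def numpy_cd1_step(v0, W, hbias, vbias, lr=0.1, rng=None):
    if rng is None:
        rng = numpy.random.RandomState(1234)
    sigmoid = lambda a: 1. / (1. + numpy.exp(-a))
    # positive phase: hidden means and a binary sample of the hiddens
    ph_mean = sigmoid(numpy.dot(v0, W) + hbias)
    ph_sample = (rng.uniform(size=ph_mean.shape) < ph_mean) * 1.
    # one Gibbs step: down to the visibles, back up to the hiddens
    nv_mean = sigmoid(numpy.dot(ph_sample, W.T) + vbias)
    nv_sample = (rng.uniform(size=nv_mean.shape) < nv_mean) * 1.
    nh_mean = sigmoid(numpy.dot(nv_sample, W) + hbias)
    # positive-phase statistics minus negative-phase statistics
    batch_size = v0.shape[0]
    g_W = (numpy.dot(v0.T, ph_mean) - numpy.dot(nv_mean.T, nh_mean)) / batch_size
    g_hbias = numpy.mean(ph_mean - nh_mean, axis=0)
    g_vbias = numpy.mean(v0 - nv_mean, axis=0)
    return W + lr * g_W, hbias + lr * g_hbias, vbias + lr * g_vbias

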
def test_rbm(learning_rate=0.1, training_epochs=15,
             dataset='mnist.pkl.gz'):
    """
    Demonstrate how to train an RBM and afterwards sample from it using Theano.

    This is demonstrated on MNIST.

    :param learning_rate: learning rate used for training the RBM

    :param training_epochs: number of epochs used for training

    :param dataset: path to the pickled dataset

    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    test_set_x, test_set_y = datasets[2]

    batch_size = 20    # size of the minibatch

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.value.shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')      # the data is presented as rasterized images

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    # initialize storage for the persistent chain (state = hidden layer of chain)
    persistent_chain = theano.shared(numpy.zeros((batch_size, 500),
                                                 dtype=theano.config.floatX))

    # construct the RBM class
    rbm = RBM(input=x, n_visible=28*28,
              n_hidden=500, numpy_rng=rng, theano_rng=theano_rng)

    # get the cost and the gradient corresponding to one step of CD
    cost, updates = rbm.cd(lr=learning_rate, persistent=persistent_chain)


    #################################
    #     Training the RBM          #
    #################################
    dirname = 'lr=%.5f' % learning_rate
    os.makedirs(dirname)
    os.chdir(dirname)

    # train_rbm applies the parameter updates and returns the current proxy
    # cost so that training progress can be monitored
    train_rbm = theano.function([index], cost,
           updates=updates,
           givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    plotting_time = 0.
    start_time = time.clock()


    # go through training epochs
    for epoch in xrange(training_epochs):

        # go through the training set
        mean_cost = []
        for batch_index in xrange(n_train_batches):
            mean_cost += [train_rbm(batch_index)]

        print 'Training epoch %d, cost is ' % epoch, numpy.mean(mean_cost)

        # Plot filters after each training epoch
        plotting_start = time.clock()
        # Construct image from the weight matrix
        image = PIL.Image.fromarray(tile_raster_images(X=rbm.W.value.T,
                 img_shape=(28, 28), tile_shape=(10, 10),
                 tile_spacing=(1, 1)))
        image.save('filters_at_epoch_%i.png' % epoch)
        plotting_stop = time.clock()
        plotting_time += (plotting_stop - plotting_start)

    end_time = time.clock()

    pretraining_time = (end_time - start_time) - plotting_time

    print ('Training took %f minutes' % (pretraining_time / 60.))


    #################################
    #     Sampling from the RBM     #
    #################################

    # find out the number of test samples
    number_of_test_samples = test_set_x.value.shape[0]

    # pick random test examples, with which to initialize the persistent chain
    test_idx = rng.randint(number_of_test_samples - 20)
    persistent_vis_chain = theano.shared(test_set_x.value[test_idx:test_idx + 20])

    # define one step of Gibbs sampling (mf = mean-field)
    [hid_mf, hid_sample, vis_mf, vis_sample] = rbm.gibbs_vhv(persistent_vis_chain)

    # the visible sample at the end of the chain is returned by ``gibbs_vhv``
    # as its fourth output; note that this is computed as a binomial draw,
    # therefore it is formed of ints (0 and 1) and therefore needs to
    # be converted to the same dtype as ``persistent_vis_chain``
    vis_sample = T.cast(vis_sample, dtype=theano.config.floatX)

    # construct the function that implements our persistent chain
    # we generate the "mean field" activations for plotting and the actual samples
    # for reinitializing the state of our persistent chain
    sample_fn = theano.function([], [vis_mf, vis_sample],
                      updates={persistent_vis_chain: vis_sample})


    # sample the RBM, plotting every `plot_every`-th sample; do this
    # until you plot at least `n_samples`
    n_samples = 10
    plot_every = 1000

    for idx in xrange(n_samples):

        # run `plot_every` intermediate sampling steps, discarding the results
        for jdx in xrange(plot_every):
            vis_mf, vis_sample = sample_fn()

        # construct image from the current mean-field activations
        image = PIL.Image.fromarray(tile_raster_images(
                                        X=vis_mf,
                                        img_shape=(28, 28),
                                        tile_shape=(10, 10),
                                        tile_spacing=(1, 1)))
        print ' ... plotting sample ', idx
        image.save('sample_%i_step_%i.png' % (idx, (idx + 1) * plot_every))

if __name__ == '__main__':
    test_rbm()