ift6266: comparison of deep/crbm/crbm.py @ 377:0b7e64e8e93f
branch merge

author:    Arnaud Bergeron <abergeron@gmail.com>
date:      Sun, 25 Apr 2010 17:12:03 -0400
parents:   8d116d4a7593
children:  (none)
comparing: 376:01445a75c702 vs 377:0b7e64e8e93f
import sys
import os, os.path

import numpy

import theano

USING_GPU = "gpu" in theano.config.device

import theano.tensor as T
from theano.tensor.nnet import conv, sigmoid

if not USING_GPU:
    from theano.tensor.shared_randomstreams import RandomStreams
else:
    from theano.sandbox.rng_mrg import MRG_RandomStreams

_PRINT_GRAPHS = True

def _init_conv_biases(num_filters, varname, rng=numpy.random):
    b_shp = (num_filters,)
    b = theano.shared(numpy.asarray(
            rng.uniform(low=-.5, high=.5, size=b_shp),
            dtype=theano.config.floatX), name=varname)
    return b

def _init_conv_weights(conv_params, varname, rng=numpy.random):
    cp = conv_params

    # initialize shared variable for weights
    w_shp = conv_params.as_conv2d_shape_tuple()
    w_bound = numpy.sqrt(cp.num_input_planes * \
                cp.height_filters * cp.width_filters)
    W = theano.shared(numpy.asarray(
            rng.uniform(
                low=-1.0 / w_bound,
                high=1.0 / w_bound,
                size=w_shp),
            dtype=theano.config.floatX), name=varname)

    return W

# Shape of W for conv2d
class ConvolutionParams:
    def __init__(self, num_filters, num_input_planes, height_filters, width_filters):
        self.num_filters = num_filters
        self.num_input_planes = num_input_planes
        self.height_filters = height_filters
        self.width_filters = width_filters

    def as_conv2d_shape_tuple(self):
        cp = self
        return (cp.num_filters, cp.num_input_planes,
                cp.height_filters, cp.width_filters)

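# For example (purely illustrative numbers, not taken from the original
# experiments): ConvolutionParams(num_filters=16, num_input_planes=1,
# height_filters=5, width_filters=5).as_conv2d_shape_tuple() gives
# (16, 1, 5, 5), i.e. the filter_shape layout conv2d expects
# (nb filters, nb input planes, filter height, filter width).
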
class CRBM:
    def __init__(self, minibatch_size, image_size, conv_params,
                 learning_rate, sparsity_lambda, sparsity_p):
        '''
        Parameters
        ----------
        minibatch_size
            number of examples per minibatch
        image_size
            (height, width) of the input images
        conv_params
            a ConvolutionParams instance describing the filters
        learning_rate
            learning rate for the CD updates
        sparsity_lambda, sparsity_p
            weight and target mean activation for the sparsity updates
        '''
        self.minibatch_size = minibatch_size
        self.image_size = image_size
        self.conv_params = conv_params

        '''
        Dimensions:
        0- minibatch
        1- plane/color
        2- y (rows)
        3- x (cols)
        '''
        self.x = T.tensor4('x')
        self.h = T.tensor4('h')

        self.lr = theano.shared(numpy.asarray(learning_rate,
                                    dtype=theano.config.floatX))
        self.sparsity_lambda = \
            theano.shared(
                numpy.asarray(
                    sparsity_lambda,
                    dtype=theano.config.floatX))
        self.sparsity_p = \
            theano.shared(
                numpy.asarray(sparsity_p,
                    dtype=theano.config.floatX))

        self.numpy_rng = numpy.random.RandomState(1234)

        if not USING_GPU:
            self.theano_rng = RandomStreams(self.numpy_rng.randint(2**30))
        else:
            self.theano_rng = MRG_RandomStreams(234, use_cuda=True)

        self._init_params()
        self._init_functions()

    def _get_visibles_shape(self):
        imsz = self.image_size
        return (self.minibatch_size,
                self.conv_params.num_input_planes,
                imsz[0], imsz[1])

    def _get_hiddens_shape(self):
        cp = self.conv_params
        imsz = self.image_size
        # filter height and width (note the order: height first)
        hf, wf = cp.height_filters, cp.width_filters
        return (self.minibatch_size, cp.num_filters,
                imsz[0] - hf + 1, imsz[1] - wf + 1)

    def _init_params(self):
        cp = self.conv_params

        self.W = _init_conv_weights(cp, 'W')
        self.b_h = _init_conv_biases(cp.num_filters, 'b_h')
        '''
        Lee09 mentions "all visible units share a single bias c",
        but for upper layers it's pretty clear we need one
        per plane, by symmetry.
        '''
        self.b_x = _init_conv_biases(cp.num_input_planes, 'b_x')

        self.params = [self.W, self.b_h, self.b_x]

        # flip filters horizontally and vertically
        W_flipped = self.W[:, :, ::-1, ::-1]
        # also have to swap the filters/num_planes axes
        self.W_tilde = W_flipped.dimshuffle(1,0,2,3)

    '''
    I_up and I_down come from the symbols used in the
    Lee 2009 CRBM paper.
    '''
    def _I_up(self, visibles_mb):
        '''
        The output of the conv is feature maps of size
        image_size - filter_size + 1.
        The dimshuffle serves to broadcast b_h so that it
        corresponds to output planes.
        '''
        fshp = self.conv_params.as_conv2d_shape_tuple()
        return conv.conv2d(visibles_mb, self.W,
                    filter_shape=fshp) + \
                self.b_h.dimshuffle('x',0,'x','x')

    def _I_down(self, hiddens_mb):
        '''
        Notice border_mode='full': we want to get
        back the original size,
        so we get feature_map_size + filter_size - 1.
        The dimshuffle serves to broadcast b_x so that
        it corresponds to output planes.
        '''
        fshp = list(self.conv_params.as_conv2d_shape_tuple())
        # num_filters and num_planes swapped
        fshp[0], fshp[1] = fshp[1], fshp[0]
        return conv.conv2d(hiddens_mb, self.W_tilde,
                    border_mode='full', filter_shape=tuple(fshp)) + \
                self.b_x.dimshuffle('x',0,'x','x')

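    # Size check with illustrative numbers (not tied to any particular
    # dataset): for a 32x32 image and 5x5 filters, _I_up's 'valid'
    # convolution yields 28x28 feature maps (32 - 5 + 1), and _I_down's
    # 'full' convolution maps those back to 32x32 (28 + 5 - 1).
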
    def _mean_free_energy(self, visibles_mb):
        '''
        visibles_mb is mb_size x num_planes x h x w.

        We want to match the summed input planes
        (second dimension; the first is the mb index)
        to the respective bias terms for the visibles.
        The dimshuffle isn't really necessary,
        but I put it there for clarity.
        '''
        vbias_term = \
            self.b_x.dimshuffle('x',0) * \
            T.sum(visibles_mb, axis=[2,3])
        # now sum the per-plane terms to get one free energy
        # contribution per element of the minibatch
        vbias_term = - T.sum(vbias_term, axis=1)

        '''
        Here it's a bit more complex, a few points:
        - The usual free energy, in the fully connected case,
          is a sum over all hiddens.
          We do the same thing here, but each unit has limited
          connectivity and there's weight reuse.
          Therefore we only need to first do the convolutions
          (with I_up), which gives us what would normally be
          the Wx+b_h for each hidden. Once we have this,
          we take log(1+exp(sum for this hidden)) elemwise
          for each hidden, then we sum over all hiddens in one
          example of the minibatch.

        - Notice that we reuse the same b_h everywhere instead of
          using one b per hidden, so the broadcasting for b_h
          done in I_up is all right.

        That sum is over all hiddens, so all filters
        (planes of hiddens), x, and y.
        In the end we get one free energy contribution per
        example of the minibatch.
        '''
        softplused = T.log(1.0 + T.exp(self._I_up(visibles_mb)))
        # h_sz = self._get_hiddens_shape()
        # this simplifies the sum
        # num_hiddens = h_sz[1] * h_sz[2] * h_sz[3]
        # reshaped = T.reshape(softplused,
        #        (self.minibatch_size, num_hiddens))

        # this is because the 0,1,1,1 sum pattern is not
        # implemented on gpu, but the 1,0,1,1 pattern is
        dimshuffled = softplused.dimshuffle(1,0,2,3)
        xh_and_hbias_term = - T.sum(dimshuffled, axis=[0,2,3])

        '''
        Both xh_and_hbias_term and vbias_term end up with one
        contribution to the free energy per minibatch example,
        so we take the mean over the minibatch.
        '''
        return T.mean(vbias_term + xh_and_hbias_term)

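    # In equation form, for one minibatch example x (matching the code
    # above):
    #   F(x) = - sum_k b_x[k] * sum_{i,j} x[k,i,j]
    #          - sum_{f,i,j} log(1 + exp(I_up(x)[f,i,j]))
    # where k indexes the visible planes and f the hidden feature maps;
    # this method returns the mean of F over the minibatch.
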
    def _init_functions(self):
        # propup
        # b_h is broadcasted keeping in mind we want it to
        # correspond to each new plane (corresponding to filters)
        I_up = self._I_up(self.x)
        # expected values for the distributions for each hidden
        E_h_given_x = sigmoid(I_up)
        # might be needed if we ever want a version where we
        # take expectations instead of samples for CD learning
        self.E_h_given_x_func = theano.function([self.x], E_h_given_x)

        if _PRINT_GRAPHS:
            print "----------------------\nE_h_given_x_func"
            theano.printing.debugprint(self.E_h_given_x_func)

        h_sample_given_x = \
            self.theano_rng.binomial( \
                size = self._get_hiddens_shape(),
                n = 1,
                p = E_h_given_x,
                dtype = theano.config.floatX)

        self.h_sample_given_x_func = \
            theano.function([self.x],
                    h_sample_given_x)

        if _PRINT_GRAPHS:
            print "----------------------\nh_sample_given_x_func"
            theano.printing.debugprint(self.h_sample_given_x_func)

        # propdown
        I_down = self._I_down(self.h)
        E_x_given_h = sigmoid(I_down)
        self.E_x_given_h_func = theano.function([self.h], E_x_given_h)

        if _PRINT_GRAPHS:
            print "----------------------\nE_x_given_h_func"
            theano.printing.debugprint(self.E_x_given_h_func)

        x_sample_given_h = \
            self.theano_rng.binomial( \
                size = self._get_visibles_shape(),
                n = 1,
                p = E_x_given_h,
                dtype = theano.config.floatX)

        self.x_sample_given_h_func = \
            theano.function([self.h],
                    x_sample_given_h)

        if _PRINT_GRAPHS:
            print "----------------------\nx_sample_given_h_func"
            theano.printing.debugprint(self.x_sample_given_h_func)

        ##############################################
        # cd update done by grad of free energy

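        # Contrastive divergence in one sentence: the NLL gradient is
        # the gradient of F at the data minus the expectation of that
        # gradient under the model; CD-1 approximates the model term
        # with x_tilde, a reconstruction obtained from a single Gibbs
        # up-down step starting at the data (see CD_step below).
        # Taking T.grad of F(x) - F(x_tilde) therefore gives the
        # approximate gradient used in the updates below.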
        x_tilde = T.tensor4('x_tilde')
        cd_update_cost = self._mean_free_energy(self.x) - \
                            self._mean_free_energy(x_tilde)

        cd_grad = T.grad(cd_update_cost, self.params)
        # This is NLL minimization so we use a -
        cd_updates = {self.W: self.W - self.lr * cd_grad[0],
                      self.b_h: self.b_h - self.lr * cd_grad[1],
                      self.b_x: self.b_x - self.lr * cd_grad[2]}

        cd_returned = [cd_update_cost,
                       cd_grad[0], cd_grad[1], cd_grad[2],
                       self.lr * cd_grad[0],
                       self.lr * cd_grad[1],
                       self.lr * cd_grad[2]]
        self.cd_return_desc = \
            ['cd_update_cost',
             'cd_grad_W', 'cd_grad_b_h', 'cd_grad_b_x',
             'lr_times_cd_grad_W',
             'lr_times_cd_grad_b_h',
             'lr_times_cd_grad_b_x']

        self.cd_update_function = \
            theano.function([self.x, x_tilde],
                    cd_returned, updates=cd_updates)

        if _PRINT_GRAPHS:
            print "----------------------\ncd_update_function"
            theano.printing.debugprint(self.cd_update_function)

        ##############
        # sparsity update, based on the grad for W and b_h (not b_x)

        '''
        This mean returns an array of shape
        (num_hidden_planes, feature_map_height, feature_map_width),
        so it's a mean over each unit's activation.
        '''
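        # In symbols, the cost implemented just below is
        #   sum_{f,i,j} (p - mean_over_minibatch E[h[f,i,j] | x])^2,
        # and the parameter updates scale its gradient by
        # sparsity_lambda * lr.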
        mean_expected_activation = T.mean(E_h_given_x, axis=0)
        # sparsity_p is broadcasted everywhere
        sparsity_update_cost = \
            T.sqr(self.sparsity_p - mean_expected_activation)
        sparsity_update_cost = \
            T.sum(T.sum(T.sum( \
                sparsity_update_cost, axis=2), axis=1), axis=0)
        sparsity_grad = T.grad(sparsity_update_cost, [self.W, self.b_h])

        sparsity_returned = \
            [sparsity_update_cost,
             sparsity_grad[0], sparsity_grad[1],
             self.sparsity_lambda * self.lr * sparsity_grad[0],
             self.sparsity_lambda * self.lr * sparsity_grad[1]]
        self.sparsity_return_desc = \
            ['sparsity_update_cost',
             'sparsity_grad_W',
             'sparsity_grad_b_h',
             'lambda_lr_times_sparsity_grad_W',
             'lambda_lr_times_sparsity_grad_b_h']

        # gradient _descent_ so we use a -
        sparsity_update = \
            {self.b_h: self.b_h - \
                self.sparsity_lambda * self.lr * sparsity_grad[1],
             self.W: self.W - \
                self.sparsity_lambda * self.lr * sparsity_grad[0]}
        self.sparsity_update_function = \
            theano.function([self.x],
                    sparsity_returned, updates=sparsity_update)

        if _PRINT_GRAPHS:
            print "----------------------\nsparsity_update_function"
            theano.printing.debugprint(self.sparsity_update_function)

    def CD_step(self, x):
        h1 = self.h_sample_given_x_func(x)
        x2 = self.x_sample_given_h_func(h1)
        return self.cd_update_function(x, x2)

    def sparsity_step(self, x):
        return self.sparsity_update_function(x)

    # these two also operate on minibatches

    def random_gibbs_samples(self, num_updown_steps):
        start_x = self.numpy_rng.rand(*self._get_visibles_shape())
        return self.gibbs_samples_from(start_x, num_updown_steps)

    def gibbs_samples_from(self, start_x, num_updown_steps):
        x_sample = start_x
        for i in xrange(num_updown_steps):
            h_sample = self.h_sample_given_x_func(x_sample)
            x_sample = self.x_sample_given_h_func(h_sample)
        return x_sample

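
# -----------------------------------------------------------------
# Illustrative usage sketch. The hyperparameters and shapes below are
# made up for the example (they are not taken from the original
# training scripts); the intent is just to show how ConvolutionParams,
# CRBM, CD_step and sparsity_step fit together.
if __name__ == '__main__':
    conv_params = ConvolutionParams(num_filters=16,
                                    num_input_planes=1,
                                    height_filters=5,
                                    width_filters=5)
    crbm = CRBM(minibatch_size=10,
                image_size=(32, 32),
                conv_params=conv_params,
                learning_rate=0.1,
                sparsity_lambda=0.1,
                sparsity_p=0.05)

    # one CD-1 step followed by a sparsity step on a random minibatch
    x = numpy.random.rand(10, 1, 32, 32).astype(theano.config.floatX)
    print crbm.CD_step(x)[0]        # cd_update_cost
    print crbm.sparsity_step(x)[0]  # sparsity_update_cost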