comparison deep/crbm/crbm.py @ 377:0b7e64e8e93f

branch merge
author Arnaud Bergeron <abergeron@gmail.com>
date Sun, 25 Apr 2010 17:12:03 -0400
parents 8d116d4a7593
children
comparing 376:01445a75c702 with 377:0b7e64e8e93f
import sys
import os, os.path

import numpy

import theano

USING_GPU = "gpu" in theano.config.device

import theano.tensor as T
from theano.tensor.nnet import conv, sigmoid

if not USING_GPU:
    from theano.tensor.shared_randomstreams import RandomStreams
else:
    from theano.sandbox.rng_mrg import MRG_RandomStreams

_PRINT_GRAPHS = True

def _init_conv_biases(num_filters, varname, rng=numpy.random):
    b_shp = (num_filters,)
    b = theano.shared(numpy.asarray(
            rng.uniform(low=-.5, high=.5, size=b_shp),
            dtype=theano.config.floatX), name=varname)
    return b

def _init_conv_weights(conv_params, varname, rng=numpy.random):
    cp = conv_params

    # initialize shared variable for weights.
    w_shp = conv_params.as_conv2d_shape_tuple()
    w_bound = numpy.sqrt(cp.num_input_planes *
                         cp.height_filters * cp.width_filters)
    W = theano.shared(numpy.asarray(
            rng.uniform(
                low=-1.0 / w_bound,
                high=1.0 / w_bound,
                size=w_shp),
            dtype=theano.config.floatX), name=varname)

    return W

# Shape of W for conv2d
class ConvolutionParams:
    def __init__(self, num_filters, num_input_planes, height_filters, width_filters):
        self.num_filters = num_filters
        self.num_input_planes = num_input_planes
        self.height_filters = height_filters
        self.width_filters = width_filters

    def as_conv2d_shape_tuple(self):
        cp = self
        return (cp.num_filters, cp.num_input_planes,
                cp.height_filters, cp.width_filters)

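# Illustration (example values, not taken from the code above): 4 filters of
# size 5x5 over a single input plane would be described as
#   ConvolutionParams(num_filters=4, num_input_planes=1,
#                     height_filters=5, width_filters=5)
# and as_conv2d_shape_tuple() would return (4, 1, 5, 5), i.e. the
# (num_filters, num_input_planes, height, width) layout expected by conv2d.
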
class CRBM:
    def __init__(self, minibatch_size, image_size, conv_params,
                 learning_rate, sparsity_lambda, sparsity_p):
        '''
        Parameters
        ----------
        image_size
            height, width
        '''
        self.minibatch_size = minibatch_size
        self.image_size = image_size
        self.conv_params = conv_params

        '''
        Dimensions:
        0- minibatch
        1- plane/color
        2- y (rows)
        3- x (cols)
        '''
        self.x = T.tensor4('x')
        self.h = T.tensor4('h')

        self.lr = theano.shared(numpy.asarray(learning_rate,
                                    dtype=theano.config.floatX))
        self.sparsity_lambda = \
            theano.shared(
                numpy.asarray(
                    sparsity_lambda,
                    dtype=theano.config.floatX))
        self.sparsity_p = \
            theano.shared(
                numpy.asarray(sparsity_p,
                    dtype=theano.config.floatX))

        self.numpy_rng = numpy.random.RandomState(1234)

        if not USING_GPU:
            self.theano_rng = RandomStreams(self.numpy_rng.randint(2**30))
        else:
            self.theano_rng = MRG_RandomStreams(234, use_cuda=True)

        self._init_params()
        self._init_functions()

    def _get_visibles_shape(self):
        imsz = self.image_size
        return (self.minibatch_size,
                self.conv_params.num_input_planes,
                imsz[0], imsz[1])

    def _get_hiddens_shape(self):
        cp = self.conv_params
        imsz = self.image_size
        hf, wf = cp.height_filters, cp.width_filters
        return (self.minibatch_size, cp.num_filters,
                imsz[0] - hf + 1, imsz[1] - wf + 1)
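    # Shape illustration (example values): with minibatch_size=10, one input
    # plane, 32x32 images and 4 filters of 5x5, the visibles have shape
    # (10, 1, 32, 32) and the hiddens, produced by a 'valid' convolution,
    # have shape (10, 4, 32-5+1, 32-5+1) = (10, 4, 28, 28).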

    def _init_params(self):
        cp = self.conv_params

        self.W = _init_conv_weights(cp, 'W')
        self.b_h = _init_conv_biases(cp.num_filters, 'b_h')
        '''
        Lee09 mentions that "all visible units share a single bias c",
        but for upper layers it's pretty clear we need one per plane,
        by symmetry.
        '''
        self.b_x = _init_conv_biases(cp.num_input_planes, 'b_x')

        self.params = [self.W, self.b_h, self.b_x]

        # flip filters horizontally and vertically
        W_flipped = self.W[:, :, ::-1, ::-1]
        # also have to swap the filter and plane dimensions
        self.W_tilde = W_flipped.dimshuffle(1,0,2,3)

    '''
    I_up and I_down are named after the symbols used in the
    Lee 2009 CRBM paper.
    '''
    def _I_up(self, visibles_mb):
        '''
        The output of the convolution is feature maps of size
        image_size - filter_size + 1.
        The dimshuffle serves to broadcast b_h so that it
        corresponds to the output planes.
        '''
        fshp = self.conv_params.as_conv2d_shape_tuple()
        return conv.conv2d(visibles_mb, self.W,
                           filter_shape=fshp) + \
               self.b_h.dimshuffle('x',0,'x','x')

    def _I_down(self, hiddens_mb):
        '''
        Notice border_mode='full': we want to get back the original
        size, so we get feature_map_size + filter_size - 1.
        The dimshuffle serves to broadcast b_x so that it
        corresponds to the output planes.
        '''
        fshp = list(self.conv_params.as_conv2d_shape_tuple())
        # num_filters and num_planes swapped
        fshp[0], fshp[1] = fshp[1], fshp[0]
        return conv.conv2d(hiddens_mb, self.W_tilde,
                           border_mode='full', filter_shape=tuple(fshp)) + \
               self.b_x.dimshuffle('x',0,'x','x')
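
    # Illustration (example values): for 32x32 visibles and 5x5 filters,
    # _I_up uses a 'valid' convolution and yields 28x28 hidden maps
    # (32 - 5 + 1), while _I_down convolves the flipped filters W_tilde in
    # 'full' mode and yields 28 + 5 - 1 = 32, recovering the visible size.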

    def _mean_free_energy(self, visibles_mb):
        '''
        visibles_mb is mb_size x num_planes x h x w.

        We want to match the summed input planes (second dimension;
        the first is the minibatch index) to their respective visible
        bias terms. The dimshuffle isn't really necessary, but I put
        it there for clarity.
        '''
        vbias_term = \
            self.b_x.dimshuffle('x',0) * \
            T.sum(visibles_mb,axis=[2,3])
        # now sum the per-plane terms to get one free energy
        # contribution per element of the minibatch
        vbias_term = - T.sum(vbias_term, axis=1)

        '''
        Here it's a bit more complex; a few points:

        - The usual free energy, in the fully connected case, is a sum
          over all hiddens. We do the same thing here, but each unit has
          limited connectivity and there's weight reuse. So we first do
          the convolutions (with I_up), which gives what would normally
          be Wx+b_h for each hidden. We then take log(1+exp(.)) elemwise
          for each hidden, and sum over all hiddens of a given example
          of the minibatch.

        - Notice that we reuse the same b_h everywhere instead of using
          one bias per hidden, so the broadcasting of b_h done in I_up
          is all right.

        The sum is over all hiddens, i.e. all filters (planes of
        hiddens), x and y. In the end we get one free energy
        contribution per example of the minibatch.
        '''
        softplused = T.log(1.0+T.exp(self._I_up(visibles_mb)))
        # h_sz = self._get_hiddens_shape()
        # this simplifies the sum
        # num_hiddens = h_sz[1] * h_sz[2] * h_sz[3]
        # reshaped = T.reshape(softplused,
        #                      (self.minibatch_size, num_hiddens))

        # this is because the 0,1,1,1 sum pattern is not
        # implemented on gpu, but the 1,0,1,1 pattern is
        dimshuffled = softplused.dimshuffle(1,0,2,3)
        xh_and_hbias_term = - T.sum(dimshuffled, axis=[0,2,3])

        '''
        Both vbias_term and xh_and_hbias_term end up with one
        contribution to the free energy per minibatch example,
        so we take the mean over the minibatch.
        '''
        return T.mean(vbias_term + xh_and_hbias_term)
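
    # In equation form, what the graph above computes for each example x is
    #   F(x) = - sum_c b_x[c] * sum_{i,j} x[c,i,j]
    #          - sum_k sum_{i,j} log(1 + exp(I_up(x)[k,i,j]))
    # where c indexes visible planes, k indexes filters and (i,j) are the
    # positions within a plane; the returned value is the mean of F over
    # the minibatch.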

    def _init_functions(self):
        # propup
        # b_h is broadcasted keeping in mind we want it to correspond
        # to each new plane (i.e. to each filter)
        I_up = self._I_up(self.x)
        # expected values of the distributions for each hidden
        E_h_given_x = sigmoid(I_up)
        # might be needed if we ever want a version where we
        # take expectations instead of samples for CD learning
        self.E_h_given_x_func = theano.function([self.x], E_h_given_x)

        if _PRINT_GRAPHS:
            print "----------------------\nE_h_given_x_func"
            theano.printing.debugprint(self.E_h_given_x_func)

        h_sample_given_x = \
            self.theano_rng.binomial(
                size=self._get_hiddens_shape(),
                n=1,
                p=E_h_given_x,
                dtype=theano.config.floatX)

        self.h_sample_given_x_func = \
            theano.function([self.x],
                            h_sample_given_x)

        if _PRINT_GRAPHS:
            print "----------------------\nh_sample_given_x_func"
            theano.printing.debugprint(self.h_sample_given_x_func)

        # propdown
        I_down = self._I_down(self.h)
        E_x_given_h = sigmoid(I_down)
        self.E_x_given_h_func = theano.function([self.h], E_x_given_h)

        if _PRINT_GRAPHS:
            print "----------------------\nE_x_given_h_func"
            theano.printing.debugprint(self.E_x_given_h_func)

        x_sample_given_h = \
            self.theano_rng.binomial(
                size=self._get_visibles_shape(),
                n=1,
                p=E_x_given_h,
                dtype=theano.config.floatX)

        self.x_sample_given_h_func = \
            theano.function([self.h],
                            x_sample_given_h)

        if _PRINT_GRAPHS:
            print "----------------------\nx_sample_given_h_func"
            theano.printing.debugprint(self.x_sample_given_h_func)

        ##############################################
        # CD update done by taking the grad of the free energy

        x_tilde = T.tensor4('x_tilde')
        cd_update_cost = self._mean_free_energy(self.x) - \
                         self._mean_free_energy(x_tilde)

        cd_grad = T.grad(cd_update_cost, self.params)
        # This is NLL minimization so we use a -
        cd_updates = {self.W: self.W - self.lr * cd_grad[0],
                      self.b_h: self.b_h - self.lr * cd_grad[1],
                      self.b_x: self.b_x - self.lr * cd_grad[2]}

        cd_returned = [cd_update_cost,
                       cd_grad[0], cd_grad[1], cd_grad[2],
                       self.lr * cd_grad[0],
                       self.lr * cd_grad[1],
                       self.lr * cd_grad[2]]
        self.cd_return_desc = \
            ['cd_update_cost',
             'cd_grad_W', 'cd_grad_b_h', 'cd_grad_b_x',
             'lr_times_cd_grad_W',
             'lr_times_cd_grad_b_h',
             'lr_times_cd_grad_b_x']

        self.cd_update_function = \
            theano.function([self.x, x_tilde],
                            cd_returned, updates=cd_updates)

        if _PRINT_GRAPHS:
            print "----------------------\ncd_update_function"
            theano.printing.debugprint(self.cd_update_function)

        ##############
        # sparsity update, based on the grads for W and b_h

        '''
        This mean returns an array of shape
        (num_hidden_planes, feature_map_height, feature_map_width),
        i.e. the mean over the minibatch of each hidden unit's activation.
        '''
        mean_expected_activation = T.mean(E_h_given_x, axis=0)
        # sparsity_p is broadcasted everywhere
        sparsity_update_cost = \
            T.sqr(self.sparsity_p - mean_expected_activation)
        sparsity_update_cost = \
            T.sum(T.sum(T.sum(
                sparsity_update_cost, axis=2), axis=1), axis=0)
        sparsity_grad = T.grad(sparsity_update_cost, [self.W, self.b_h])

        sparsity_returned = \
            [sparsity_update_cost,
             sparsity_grad[0], sparsity_grad[1],
             self.sparsity_lambda * self.lr * sparsity_grad[0],
             self.sparsity_lambda * self.lr * sparsity_grad[1]]
        self.sparsity_return_desc = \
            ['sparsity_update_cost',
             'sparsity_grad_W',
             'sparsity_grad_b_h',
             'lambda_lr_times_sparsity_grad_W',
             'lambda_lr_times_sparsity_grad_b_h']

        # gradient _descent_ so we use a -
        sparsity_update = \
            {self.b_h: self.b_h -
                self.sparsity_lambda * self.lr * sparsity_grad[1],
             self.W: self.W -
                self.sparsity_lambda * self.lr * sparsity_grad[0]}
        self.sparsity_update_function = \
            theano.function([self.x],
                            sparsity_returned, updates=sparsity_update)

        if _PRINT_GRAPHS:
            print "----------------------\nsparsity_update_function"
            theano.printing.debugprint(self.sparsity_update_function)

    def CD_step(self, x):
        h1 = self.h_sample_given_x_func(x)
        x2 = self.x_sample_given_h_func(h1)
        return self.cd_update_function(x, x2)
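
    # Put together, CD_step is one step of CD-1: sample h1 ~ P(h|x) from the
    # data, resample x2 ~ P(x|h1), then apply
    #   theta <- theta - lr * (dF(x)/dtheta - dF(x2)/dtheta)
    # for theta in (W, b_h, b_x), i.e. the contrastive divergence
    # approximation to the negative log-likelihood gradient.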

    def sparsity_step(self, x):
        return self.sparsity_update_function(x)

    # these two also operate on minibatches

    def random_gibbs_samples(self, num_updown_steps):
        # cast to floatX so the compiled functions also accept the input
        # when floatX is float32 (e.g. on GPU)
        start_x = numpy.asarray(
            self.numpy_rng.rand(*self._get_visibles_shape()),
            dtype=theano.config.floatX)
        return self.gibbs_samples_from(start_x, num_updown_steps)

    def gibbs_samples_from(self, start_x, num_updown_steps):
        x_sample = start_x
        for i in xrange(num_updown_steps):
            h_sample = self.h_sample_given_x_func(x_sample)
            x_sample = self.x_sample_given_h_func(h_sample)
        return x_sample

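
# ---------------------------------------------------------------------------
# Illustrative usage sketch. The sizes and hyperparameters below are example
# values, not taken from the changeset; only the constructor and method
# signatures come from the class above. Assumes a working Theano install
# (CPU or GPU).
if __name__ == '__main__':
    conv_params = ConvolutionParams(num_filters=4, num_input_planes=1,
                                    height_filters=5, width_filters=5)
    crbm = CRBM(minibatch_size=10, image_size=(28, 28),
                conv_params=conv_params, learning_rate=0.1,
                sparsity_lambda=0.1, sparsity_p=0.05)

    # one minibatch of random "images" in [0,1), as floatX
    x = numpy.asarray(numpy.random.rand(10, 1, 28, 28),
                      dtype=theano.config.floatX)

    # one step of CD-1 followed by a sparsity update
    cd_results = crbm.CD_step(x)
    sparsity_results = crbm.sparsity_step(x)

    # a few Gibbs samples from the model, starting from random visibles
    samples = crbm.random_gibbs_samples(5)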