comparison pylearn/algorithms/mcRBM.py @ 1272:ba25c6e4f55d

mcRBM working with whole learning algo in theano
author James Bergstra <bergstrj@iro.umontreal.ca>
date Sat, 04 Sep 2010 19:32:27 -0400
parents d38cb039c662
children 7bb5dd98e671
1271:cc6c6d7234a7 1272:ba25c6e4f55d
197 import theano 197 import theano
198 from theano import function, shared, dot 198 from theano import function, shared, dot
199 from theano import tensor as TT 199 from theano import tensor as TT
200 floatX = theano.config.floatX 200 floatX = theano.config.floatX
201 201
202 sharedX = lambda X, name : shared(numpy.asarray(X, dtype=floatX), name=name)
203
202 import pylearn 204 import pylearn
203 #TODO: clean up the HMC_sampler code 205 #TODO: clean up the HMC_sampler code
204 #TODO: think of naming convention for acronyms + suffix? 206 #TODO: think of naming convention for acronyms + suffix?
205 from pylearn.sampling.hmc import HMC_sampler 207 from pylearn.sampling.hmc import HMC_sampler
206 from pylearn.io import image_tiling 208 from pylearn.io import image_tiling
211 # 213 #
212 # Candidates for factoring 214 # Candidates for factoring
213 # 215 #
214 ########################################### 216 ###########################################
215 217
216 #TODO: Document, move to pylearn's math lib
217 def l1(X): 218 def l1(X):
219 """
220 :param X: TensorType variable
221
222 :rtype: TensorType scalar
223
224 :returns: the sum of absolute values of the terms in X
225
226 :math: \sum_i |X_i|
227
228 Where i is an appropriately dimensioned index.
229
230 """
218 return abs(X).sum() 231 return abs(X).sum()
219 232
220 #TODO: Document, move to pylearn's math lib
221 def l2(X): 233 def l2(X):
234 """
235 :param X: TensorType variable
236
237 :rtype: TensorType scalar
238
239 :returns: the square root of the sum of squared terms in X
240
241 :math: \sqrt{ \sum_i X_i^2 }
242
243 Where i is an appropriately dimensioned index.
244
245 """
222 return TT.sqrt((X**2).sum()) 246 return TT.sqrt((X**2).sum())
223 247
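As a quick illustration of what the two helpers above compute, a minimal standalone Theano sketch (not part of the changeset itself):

    import numpy
    import theano
    from theano import tensor as TT

    X = TT.matrix('X')
    l1_expr = abs(X).sum()              # \sum_i |X_i|
    l2_expr = TT.sqrt((X ** 2).sum())   # \sqrt{\sum_i X_i^2}
    f = theano.function([X], [l1_expr, l2_expr])

    x = numpy.asarray([[1., -2.], [3., -4.]], dtype=theano.config.floatX)
    print(f(x))   # l1 = 10.0, l2 = sqrt(30) ~= 5.48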
224 #TODO: Document, move to pylearn's math lib
225 def contrastive_cost(free_energy_fn, pos_v, neg_v): 248 def contrastive_cost(free_energy_fn, pos_v, neg_v):
249 """
250 :param free_energy_fn: lambda (TensorType matrix MxN) -> TensorType vector of M free energies
251 :param pos_v: TensorType matrix MxN of M "positive phase" particles
252 :param neg_v: TensorType matrix MxN of M "negative phase" particles
253
254 :returns: TensorType scalar that's the sum of the differences of free energies
255
256 :math: \sum_i free_energy(pos_v[i]) - free_energy(neg_v[i])
257
258 """
226 return (free_energy_fn(pos_v) - free_energy_fn(neg_v)).sum() 259 return (free_energy_fn(pos_v) - free_energy_fn(neg_v)).sum()
227 260
228 #TODO: Typical use of contrastive_cost is to later use tensor.grad, but in that case we want to 261 def contrastive_grad(free_energy_fn, pos_v, neg_v, wrt, other_cost=0):
229 # block gradient going through neg_v 262 """
230 def contrastive_grad(free_energy_fn, pos_v, neg_v, params, other_cost=0): 263 :param free_energy_fn: lambda (TensorType matrix MxN) -> TensorType vector of M free energies
231 """
232 :param pos_v: positive-phase sample of visible units 264 :param pos_v: positive-phase sample of visible units
233 :param neg_v: negative-phase sample of visible units 265 :param neg_v: negative-phase sample of visible units
234 """ 266 :param wrt: TensorType variables with respect to which we want gradients (similar to the
235 #block the grad through neg_v 267 'wrt' argument to tensor.grad)
268 :param other_cost: TensorType scalar
269
270 :returns: TensorType variables for the gradient on each of the 'wrt' arguments
271
272
273 :math: Cost = other_cost + \sum_i free_energy(pos_v[i]) - free_energy(neg_v[i])
274 :math: d Cost / dW for W in `wrt`
275
276
277 This function is similar to tensor.grad - it returns the gradient[s] on a cost with respect
278 to one or more parameters. The difference between tensor.grad and this function is that
279 the negative phase term (`neg_v`) is considered constant, i.e. d `Cost` / d `neg_v` = 0.
280 This is desirable because `neg_v` might be the result of a sampling expression involving
281 some of the parameters, but the contrastive divergence algorithm does not call for
282 backpropagating through the sampling procedure.
283
284 Warning - if other_cost depends on pos_v or neg_v and you *do* want to backpropagate from
285 the `other_cost` through those terms, then this function is inappropriate. In that case,
286 you should call tensor.grad separately for the other_cost and add the gradient expressions
287 you get from ``contrastive_grad(..., other_cost=0)``
288
289 """
236 cost=contrastive_cost(free_energy_fn, pos_v, neg_v) 290 cost=contrastive_cost(free_energy_fn, pos_v, neg_v)
237 if other_cost: 291 if other_cost:
238 cost = cost + other_cost 292 cost = cost + other_cost
239 return theano.tensor.grad(cost, 293 return theano.tensor.grad(cost,
240 wrt=params, 294 wrt=wrt,
241 consider_constant=[neg_v]) 295 consider_constant=[neg_v])
242 296
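The key difference from a plain tensor.grad call is the consider_constant=[neg_v] argument. A small standalone sketch, using a made-up quadratic "free energy" (the names below are illustrative, not taken from this module), shows how the negative-phase term is held constant:

    import numpy
    import theano
    from theano import tensor as TT, shared

    floatX = theano.config.floatX
    w = shared(numpy.ones(3, dtype=floatX), name='w')

    def toy_free_energy(v):
        # one scalar per row of v; stands in for free_energy_given_v
        return TT.dot(v, w) ** 2

    pos_v = TT.matrix('pos_v')
    # pretend neg_v came from a sampling expression that also involves w
    neg_v = TT.dot(pos_v, TT.outer(w, w))

    cost = (toy_free_energy(pos_v) - toy_free_energy(neg_v)).sum()

    # gradient with the negative phase treated as constant (what contrastive_grad does)
    g_blocked = theano.tensor.grad(cost, wrt=w, consider_constant=[neg_v])
    # ordinary gradient, which would also backpropagate through the sampling expression
    g_full = theano.tensor.grad(cost, wrt=w)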
243 ########################################### 297 ###########################################
244 # 298 #
245 # Expressions that are mcRBM-specific 299 # Expressions that are mcRBM-specific
254 - U - the covariance filters (theano shared variable) 308 - U - the covariance filters (theano shared variable)
255 - W - the mean filters (theano shared variable) 309 - W - the mean filters (theano shared variable)
256 - a - the visible bias (theano shared variable) 310 - a - the visible bias (theano shared variable)
257 - b - the covariance bias (theano shared variable) 311 - b - the covariance bias (theano shared variable)
258 - c - the mean bias (theano shared variable) 312 - c - the mean bias (theano shared variable)
313
259 """ 314 """
260 def __init__(self, U, W, a, b, c): 315 def __init__(self, U, W, a, b, c):
261 self.U = U 316 self.U = U
262 self.W = W 317 self.W = W
263 self.a = a 318 self.a = a
327 if not hasattr(rng, 'randn'): 382 if not hasattr(rng, 'randn'):
328 rng = np.random.RandomState(rng) 383 rng = np.random.RandomState(rng)
329 if n_visible is None: 384 if n_visible is None:
330 n_visible = self.n_visible_units() 385 n_visible = self.n_visible_units()
331 rval = HMC_sampler.new_from_shared_positions( 386 rval = HMC_sampler.new_from_shared_positions(
332 shared_positions = shared( 387 shared_positions = sharedX(
333 rng.randn( 388 rng.randn(
334 n_particles, 389 n_particles,
335 n_visible).astype(floatX), 390 n_visible),
336 name='particles'), 391 name='particles'),
337 energy_fn=self.free_energy_given_v, 392 energy_fn=self.free_energy_given_v,
338 seed=int(rng.randint(2**30))) 393 seed=int(rng.randint(2**30)))
339 return rval 394 return rval
340 395
341 def as_feedforward_layer(self, v): 396 def as_feedforward_layer(self, v):
397 """Return a dictionary with keys: inputs, outputs and params
398
399 The inputs is [v]
400
401 The outputs is :math:`[E[h|v], E[g|v]]` where `h` is the covariance hidden units and `g` is
402 the mean hidden units.
403
404 The params are ``[U, W, b, c]``, the model parameters that enter into the conditional
405 expectations.
406
407 :TODO: add an optional parameter to return only one of the expectations.
408
409 """
342 return dict( 410 return dict(
343 outputs = self.expected_h_g_given_v(v), 411 inputs = [v],
412 outputs = list(self.expected_h_g_given_v(v)),
344 params = [self.U, self.W, self.b, self.c], 413 params = [self.U, self.W, self.b, self.c],
345 ) 414 )
346 415
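A hedged sketch of how the returned dictionary might be consumed to compile a feature extractor. Here rbm is assumed to be an allocated model instance (as in the sampler sketch above), expected_h_g_given_v is not shown in this excerpt, and patch_array is a hypothetical (M, n_I) numpy array:

    import theano
    from theano import tensor as TT

    v = TT.matrix('v')
    layer = rbm.as_feedforward_layer(v)

    # layer['inputs']  == [v]
    # layer['outputs'] == [E[h|v], E[g|v]]   (covariance then mean hidden units)
    # layer['params']  == [rbm.U, rbm.W, rbm.b, rbm.c]
    features = theano.function(layer['inputs'], layer['outputs'])
    h, g = features(patch_array)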
347 @classmethod 416 @classmethod
348 def alloc(cls, n_I, n_K, n_J, rng = 8923402190): 417 def alloc(cls, n_I, n_K, n_J, rng = 8923402190,
349 """ 418 U_range=0.02,
350 Return a MeanCovRBM instance with randomly-initialized parameters. 419 W_range=0.05,
420 a_ival=0,
421 b_ival=2,
422 c_ival=-2):
423 """
424 Return a MeanCovRBM instance with randomly-initialized shared variable parameters.
351 425
352 :param n_I: input dimensionality 426 :param n_I: input dimensionality
353 :param n_K: number of covariance hidden units 427 :param n_K: number of covariance hidden units
354 :param n_J: number of mean filters (linear) 428 :param n_J: number of mean filters (linear)
355 :param rng: seed or numpy RandomState object to initialize params 429 :param rng: seed or numpy RandomState object to initialize parameters
430
431 :note:
432 Constants for initial ranges and values taken from train_mcRBM.py.
356 """ 433 """
357 if not hasattr(rng, 'randn'): 434 if not hasattr(rng, 'randn'):
358 rng = np.random.RandomState(rng) 435 rng = np.random.RandomState(rng)
359 436
360 def shrd(X,name):
361 return shared(X.astype(floatX), name=name)
362
363 # initialization taken from train_mcRBM.py
364 rval = cls( 437 rval = cls(
365 U = shrd(0.02 * rng.randn(n_I, n_K),'U'), 438 U = sharedX(U_range * rng.randn(n_I, n_K),'U'),
366 W = shrd(0.05 * rng.randn(n_I, n_J),'W'), 439 W = sharedX(W_range * rng.randn(n_I, n_J),'W'),
367 a = shrd(np.ones(n_I)*(0),'a'), 440 a = sharedX(np.ones(n_I)*a_ival,'a'),
368 b = shrd(np.ones(n_K)*2,'b'), 441 b = sharedX(np.ones(n_K)*b_ival,'b'),
369 c = shrd(np.ones(n_J)*(-2),'c')) 442 c = sharedX(np.ones(n_J)*c_ival,'c'),)
370 443
371 rval.params = [rval.U, rval.W, rval.a, rval.b, rval.c] 444 rval.params = lambda : [rval.U, rval.W, rval.a, rval.b, rval.c]
372 return rval 445 return rval
373 446
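A sketch of allocating a model with the hyperparameters shown above; the import path and class name are assumptions based on the file path and the alloc() docstring ("Return a MeanCovRBM instance ..."):

    from pylearn.algorithms.mcRBM import MeanCovRBM   # assumed name

    rbm = MeanCovRBM.alloc(
        n_I=16 * 16,   # visible dimensionality (e.g. 16x16 image patches)
        n_K=256,       # covariance hidden units
        n_J=100,       # mean hidden units
        rng=8923402190)

    # params is now a callable returning [U, W, a, b, c]
    print([p.get_value().shape for p in rbm.params()])
    # [(256, 256), (256, 100), (256,), (256,), (100,)]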
374 class mcRBMTrainer(object): 447 class mcRBMTrainer(object):
375 """ 448 """Light-weight class encapsulating math for mcRBM training
376 449
377 Attributes: 450 Attributes:
378 - rbm 451 - rbm - an mcRBM instance
379 - sampler 452 - sampler - an HMC_sampler instance
380 - normVF 453 - normVF - geometrically updated norm of U matrix columns (shared var)
381 - learn_rate 454 - learn_rate - SGD learning rate [un-annealed]
382 - learn_rate_multipliers 455 - learn_rate_multipliers - the learning rates for each of the parameters of the rbm (in
383 456 order corresponding to what's returned by ``rbm.params()``)
384 """ 457 - l1_penalty - float or TensorType scalar to modulate l1 penalty of rbm.U and rbm.W
458 - iter - number of cd_updates (shared var) - used to anneal the effective learn_rate
459 - lr_anneal_start - scalar or TensorType scalar - iter at which time to start decreasing
460 the learning rate proportional to 1/iter
461
462 """
463 # TODO: accept a GD algo as an argument?
464 @classmethod
465 def alloc(cls, rbm, visible_batch, batchsize, initial_lr=0.075, rng=234,
466 l1_penalty=0,
467 learn_rate_multipliers=[2, .2, .02, .1, .02],
468 lr_anneal_start=2000,
469 ):
470
471 """
472 :param rbm: mcRBM instance to train
473 :param visible_batch: TensorType variable for training data
474 :param batchsize: the number of rows in visible_batch
475 :param initial_lr: the learning rate (may be annealed)
476 :param rng: seed or RandomState to initialize PCD sampler
477 :param l1_penalty: see class doc
478 :param learn_rate_multipliers: see class doc
479 :param lr_anneal_start: see class doc
480 """
481 #TODO: :param lr_anneal_iter: the iteration at which 1/t annealing will begin
482
483 #TODO: get batchsize from visible_batch??
484 # allocates shared var for negative phase particles
485
486
487 # TODO: should normVF be initialized to match the size of rbm.U ?
488
489 return cls(
490 rbm=rbm,
491 visible_batch=visible_batch,
492 sampler=rbm.sampler(batchsize, rng=rng),
493 normVF=sharedX(1.0, 'normVF'),
494 learn_rate=sharedX(initial_lr/batchsize, 'learn_rate'),
495 iter=sharedX(0, 'iter'),
496 l1_penalty=l1_penalty,
497 learn_rate_multipliers=learn_rate_multipliers,
498 lr_anneal_start=lr_anneal_start)
499
385 def __init__(self, **kwargs): 500 def __init__(self, **kwargs):
386 self.__dict__.update(kwargs) 501 self.__dict__.update(kwargs)
387 502
388 def normalize_U(self, new_U): 503 def normalize_U(self, new_U):
389 #TODO: write the docstring 504 """
505 :param new_U: a proposed new value for rbm.U
506
507 :returns: a pair of TensorType variables:
508 a corrected new value for U, and a new value for self.normVF
509
510 This is a weird normalization procedure, but the sample code for the paper has it, and
511 it seems to be important.
512 """
390 U_norms = TT.sqrt((new_U**2).sum(axis=0)) 513 U_norms = TT.sqrt((new_U**2).sum(axis=0))
391 new_normVF = .95 * self.normVF + .05 * TT.mean(U_norms) 514 new_normVF = .95 * self.normVF + .05 * TT.mean(U_norms)
392 return (new_U * this_normVF / U_norms), new_normVF 515 return (new_U * new_normVF / U_norms), new_normVF
393 516
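To make the column rescaling concrete, a standalone numpy sketch of the same update (the array sizes and values are made up):

    import numpy as np

    normVF = 1.0                               # running estimate of the mean column norm
    new_U = 0.02 * np.random.randn(256, 400)   # a proposed new value for U

    U_norms = np.sqrt((new_U ** 2).sum(axis=0))          # one norm per column of U
    new_normVF = 0.95 * normVF + 0.05 * U_norms.mean()   # slow-moving average of the mean norm
    corrected_U = new_U * new_normVF / U_norms           # rescale every column to new_normVF

    assert np.allclose(np.sqrt((corrected_U ** 2).sum(axis=0)), new_normVF)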
394 def contrastive_grads(self, visible_batch, params=None): 517 def contrastive_grads(self):
395 if params is not None: 518 """Return the contrastive divergence gradients on the parameters of self.rbm """
396 params = self.rbm.params
397 return contrastive_grad( 519 return contrastive_grad(
398 free_energy_fn=self.rbm.free_energy_given_v, 520 free_energy_fn=self.rbm.free_energy_given_v,
399 pos_v=visible_batch, 521 pos_v=self.visible_batch,
400 neg_v=self.sampler.positions, 522 neg_v=self.sampler.positions,
401 params=params, 523 wrt = self.rbm.params(),
402 other_cost=(l1(self.rbm.U)+l1(self.rbm.W)) * self.l1_penalty) 524 other_cost=(l1(self.rbm.U)+l1(self.rbm.W)) * self.l1_penalty)
403 525
404 526 def cd_updates(self):
405 def cd_updates(self, visible_batch, params=None, rng=89234): 527 """
406 if params is not None: 528 Return a dictionary of shared variable updates that implements contrastive divergence
407 params = self.rbm.params 529 learning by stochastic gradient descent with an annealed learning rate.
408 530 """
409 grads = self.contrastive_grads(visible_batch, params) 531
532 grads = self.contrastive_grads()
410 533
411 # contrastive divergence updates 534 # contrastive divergence updates
412 # TODO: sgd_updates is a particular optimization algo (others are possible) 535 # TODO: sgd_updates is a particular optimization algo (others are possible)
413 # parametrize so that algo is plugin 536 # parametrize so that algo is plugin
414 # the normalization normVF might be sgd-specific though... 537 # the normalization normVF might be sgd-specific though...
415 538
416 # TODO: when sgd has an annealing schedule, this should 539 # TODO: when sgd has an annealing schedule, this should
417 # go through that mechanism. 540 # go through that mechanism.
418 541
419 # TODO: parametrize these constants (e.g. 2000)
420
421 ups[self.iter] = self.iter + 1
422 lr = TT.clip( 542 lr = TT.clip(
423 self.learn_rate * 2000 / (self.iter+1), 543 self.learn_rate * TT.cast(self.lr_anneal_start / (self.iter+1), floatX),
424 0.0, #min 544 0.0, #min
425 self.learn_rate) #max 545 self.learn_rate) #max
426 546
427 ups = sgd_updates( 547 ups = dict(sgd_updates(
428 params, 548 self.rbm.params(),
429 grads, 549 grads,
430 stepsizes=[a*lr for a in learn_rate_multipliers]) 550 stepsizes=[a*lr for a in self.learn_rate_multipliers]))
551
552 ups[self.iter] = self.iter + 1
431 553
432 # sampler updates 554 # sampler updates
433 ups.update(dict(self.sampler.updates())) 555 ups.update(dict(self.sampler.updates()))
434 556
435 # add trainer updates (replace CD update of U) 557 # add trainer updates (replace CD update of U)
436 ups[self.rbm.U], ups[self.normVF] = self.normalize_U(ups[U]) 558 ups[self.rbm.U], ups[self.normVF] = self.normalize_U(ups[self.rbm.U])
437 559
438 return ups 560 return ups
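A pure-Python sketch of the annealing schedule used for lr above: the effective rate stays at learn_rate until iter reaches lr_anneal_start, then decays like 1/iter. The batch size of 128 below is only an example value:

    def effective_lr(learn_rate, lr_anneal_start, it):
        # mirrors TT.clip(learn_rate * lr_anneal_start / (it + 1), 0.0, learn_rate)
        return min(max(learn_rate * lr_anneal_start / (it + 1), 0.0), learn_rate)

    base = 0.075 / 128   # initial_lr / batchsize, as in alloc()
    for it in (0, 1000, 2000, 4000, 8000):
        print(it, effective_lr(base, 2000, it))
    # flat at ~5.9e-4 until iter ~2000, then roughly halves each time iter doubles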
439
440 # TODO: accept a GD algo as an argument?
441 @classmethod
442 def alloc(cls, rbm, visible_batch, batchsize, initial_lr=0.075, rng=234,
443 l1_penalty=0,
444 learn_rate_multipliers=[2, .2, .02, .1, .02]):
445 # allocates shared var for negative phase particles
446
447 return cls(
448 rbm=rbm,
449 sampler=rbm.sampler(batchsize, rng=rng),
450 normVF=shared(1.0, 'normVF'),
451 learn_rate=shared(initial_lr/batchsize, 'learn_rate'),
452 iter=shared(0, 'iter'),
453 l1_penalty=l1_penalty,
454 learn_rate_multipliers=learn_rate_multipliers)
455
456 561
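Putting the pieces together, a hedged end-to-end sketch of one way the trainer might be driven. The class names follow the docstrings above, and patches is a stand-in for real training data:

    import numpy as np
    import theano
    from theano import tensor as TT

    # assumed names, per the file path and docstrings above
    from pylearn.algorithms.mcRBM import MeanCovRBM, mcRBMTrainer

    batchsize = 128
    patches = np.random.randn(10000, 256).astype(theano.config.floatX)  # stand-in data

    visible_batch = TT.matrix('visible_batch')
    rbm = MeanCovRBM.alloc(n_I=patches.shape[1], n_K=256, n_J=100)
    trainer = mcRBMTrainer.alloc(rbm, visible_batch, batchsize,
                                 initial_lr=0.075, l1_penalty=1e-3)

    # compile one CD/SGD step from the update dictionary
    train_step = theano.function([visible_batch], [], updates=trainer.cd_updates())

    for epoch in range(10):
        for i in range(0, patches.shape[0] - batchsize + 1, batchsize):
            train_step(patches[i:i + batchsize])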
457 if __name__ == '__main__': 562 if __name__ == '__main__':
458 import pylearn.algorithms.tests.test_mcRBM 563 import pylearn.algorithms.tests.test_mcRBM
459 pylearn.algorithms.tests.test_mcRBM.test_reproduce_ranzato_hinton_2010(as_unittest=True) 564 pylearn.algorithms.tests.test_mcRBM.test_reproduce_ranzato_hinton_2010(as_unittest=True)