Mercurial > pylearn
comparison pylearn/algorithms/mcRBM.py @ 1272:ba25c6e4f55d
mcRBM working with whole learning algo in theano
author | James Bergstra <bergstrj@iro.umontreal.ca> |
---|---|
date | Sat, 04 Sep 2010 19:32:27 -0400 |
parents | d38cb039c662 |
children | 7bb5dd98e671 |
comparison
equal
deleted
inserted
replaced
1271:cc6c6d7234a7 | 1272:ba25c6e4f55d |
---|---|
197 import theano | 197 import theano |
198 from theano import function, shared, dot | 198 from theano import function, shared, dot |
199 from theano import tensor as TT | 199 from theano import tensor as TT |
200 floatX = theano.config.floatX | 200 floatX = theano.config.floatX |
201 | 201 |
202 sharedX = lambda X, name : shared(numpy.asarray(X, dtype=floatX), name=name) | |
203 | |
202 import pylearn | 204 import pylearn |
203 #TODO: clean up the HMC_sampler code | 205 #TODO: clean up the HMC_sampler code |
204 #TODO: think of naming convention for acronyms + suffix? | 206 #TODO: think of naming convention for acronyms + suffix? |
205 from pylearn.sampling.hmc import HMC_sampler | 207 from pylearn.sampling.hmc import HMC_sampler |
206 from pylearn.io import image_tiling | 208 from pylearn.io import image_tiling |
211 # | 213 # |
212 # Candidates for factoring | 214 # Candidates for factoring |
213 # | 215 # |
214 ########################################### | 216 ########################################### |
215 | 217 |
216 #TODO: Document, move to pylearn's math lib | |
217 def l1(X): | 218 def l1(X): |
219 """ | |
220 :param X: TensorType variable | |
221 | |
222 :rtype: TensorType scalar | |
223 | |
224 :returns: the sum of absolute values of the terms in X | |
225 | |
226 :math: \sum_i |X_i| | |
227 | |
228 Where i is an appropriately dimensioned index. | |
229 | |
230 """ | |
218 return abs(X).sum() | 231 return abs(X).sum() |
219 | 232 |
220 #TODO: Document, move to pylearn's math lib | |
221 def l2(X): | 233 def l2(X): |
234 """ | |
235 :param X: TensorType variable | |
236 | |
237 :rtype: TensorType scalar | |
238 | |
239 :returns: the sum of absolute values of the terms in X | |
240 | |
241 :math: \sqrt{ \sum_i X_i^2 } | |
242 | |
243 Where i is an appropriately dimensioned index. | |
244 | |
245 """ | |
222 return TT.sqrt((X**2).sum()) | 246 return TT.sqrt((X**2).sum()) |
223 | 247 |
224 #TODO: Document, move to pylearn's math lib | |
225 def contrastive_cost(free_energy_fn, pos_v, neg_v): | 248 def contrastive_cost(free_energy_fn, pos_v, neg_v): |
249 """ | |
250 :param free_energy_fn: lambda (TensorType matrix MxN) -> TensorType vector of M free energies | |
251 :param pos_v: TensorType matrix MxN of M "positive phase" particles | |
252 :param neg_v: TensorType matrix MxN of M "negative phase" particles | |
253 | |
254 :returns: TensorType scalar that's the sum of the difference of free energies | |
255 | |
256 :math: \sum_i free_energy(pos_v[i]) - free_energy(neg_v[i]) | |
257 | |
258 """ | |
226 return (free_energy_fn(pos_v) - free_energy_fn(neg_v)).sum() | 259 return (free_energy_fn(pos_v) - free_energy_fn(neg_v)).sum() |
227 | 260 |
228 #TODO: Typical use of contrastive_cost is to later use tensor.grad, but in that case we want to | 261 def contrastive_grad(free_energy_fn, pos_v, neg_v, wrt, other_cost=0): |
229 # block gradient going through neg_v | 262 """ |
230 def contrastive_grad(free_energy_fn, pos_v, neg_v, params, other_cost=0): | 263 :param free_energy_fn: lambda (TensorType matrix MxN) -> TensorType vector of M free energies |
231 """ | |
232 :param pos_v: positive-phase sample of visible units | 264 :param pos_v: positive-phase sample of visible units |
233 :param neg_v: negative-phase sample of visible units | 265 :param neg_v: negative-phase sample of visible units |
234 """ | 266 :param wrt: TensorType variables with respect to which we want gradients (similar to the |
235 #block the grad through neg_v | 267 'wrt' argument to tensor.grad) |
268 :param other_cost: TensorType scalar | |
269 | |
270 :returns: TensorType variables for the gradient on each of the 'wrt' arguments | |
271 | |
272 | |
273 :math: Cost = other_cost + \sum_i free_energy(pos_v[i]) - free_energy(neg_v[i]) | |
274 :math: d Cost / dW for W in `wrt` | |
275 | |
276 | |
277 This function is similar to tensor.grad - it returns the gradient[s] on a cost with respect | |
278 to one or more parameters. The difference between tensor.grad and this function is that | |
279 the negative phase term (`neg_v`) is considered constant, i.e. d `Cost` / d `neg_v` = 0. | |
280 This is desirable because `neg_v` might be the result of a sampling expression involving | |
281 some of the parameters, but the contrastive divergence algorithm does not call for | |
282 backpropagating through the sampling procedure. | |
283 | |
284 Warning - if other_cost depends on pos_v or neg_v and you *do* want to backpropagate from | |
285 the `other_cost` through those terms, then this function is inappropriate. In that case, | |
286 you should call tensor.grad separately for the other_cost and add the gradient expressions | |
287 you get from ``contrastive_grad(..., other_cost=0)`` | |
288 | |
289 """ | |
236 cost=contrastive_cost(free_energy_fn, pos_v, neg_v) | 290 cost=contrastive_cost(free_energy_fn, pos_v, neg_v) |
237 if other_cost: | 291 if other_cost: |
238 cost = cost + other_cost | 292 cost = cost + other_cost |
239 return theano.tensor.grad(cost, | 293 return theano.tensor.grad(cost, |
240 wrt=params, | 294 wrt=wrt, |
241 consider_constant=[neg_v]) | 295 consider_constant=[neg_v]) |
242 | 296 |
243 ########################################### | 297 ########################################### |
244 # | 298 # |
245 # Expressions that are mcRBM-specific | 299 # Expressions that are mcRBM-specific |
254 - U - the covariance filters (theano shared variable) | 308 - U - the covariance filters (theano shared variable) |
255 - W - the mean filters (theano shared variable) | 309 - W - the mean filters (theano shared variable) |
256 - a - the visible bias (theano shared variable) | 310 - a - the visible bias (theano shared variable) |
257 - b - the covariance bias (theano shared variable) | 311 - b - the covariance bias (theano shared variable) |
258 - c - the mean bias (theano shared variable) | 312 - c - the mean bias (theano shared variable) |
313 | |
259 """ | 314 """ |
260 def __init__(self, U, W, a, b, c): | 315 def __init__(self, U, W, a, b, c): |
261 self.U = U | 316 self.U = U |
262 self.W = W | 317 self.W = W |
263 self.a = a | 318 self.a = a |
327 if not hasattr(rng, 'randn'): | 382 if not hasattr(rng, 'randn'): |
328 rng = np.random.RandomState(rng) | 383 rng = np.random.RandomState(rng) |
329 if n_visible is None: | 384 if n_visible is None: |
330 n_visible = self.n_visible_units() | 385 n_visible = self.n_visible_units() |
331 rval = HMC_sampler.new_from_shared_positions( | 386 rval = HMC_sampler.new_from_shared_positions( |
332 shared_positions = shared( | 387 shared_positions = sharedX( |
333 rng.randn( | 388 rng.randn( |
334 n_particles, | 389 n_particles, |
335 n_visible).astype(floatX), | 390 n_visible), |
336 name='particles'), | 391 name='particles'), |
337 energy_fn=self.free_energy_given_v, | 392 energy_fn=self.free_energy_given_v, |
338 seed=int(rng.randint(2**30))) | 393 seed=int(rng.randint(2**30))) |
339 return rval | 394 return rval |
340 | 395 |
341 def as_feedforward_layer(self, v): | 396 def as_feedforward_layer(self, v): |
397 """Return a dictionary with keys: inputs, outputs and params | |
398 | |
399 The inputs is [v] | |
400 | |
401 The outputs is :math:`[E[h|v], E[g|v]]` where `h` is the covariance hidden units and `g` is | |
402 the mean hidden units. | |
403 | |
404 The params are ``[U, W, b, c]``, the model parameters that enter into the conditional | |
405 expectations. | |
406 | |
407 :TODO: add an optional parameter to return only one of the expections. | |
408 | |
409 """ | |
342 return dict( | 410 return dict( |
343 outputs = self.expected_h_g_given_v(v), | 411 inputs = [v], |
412 outputs = list(self.expected_h_g_given_v(v)), | |
344 params = [self.U, self.W, self.b, self.c], | 413 params = [self.U, self.W, self.b, self.c], |
345 ) | 414 ) |
346 | 415 |
347 @classmethod | 416 @classmethod |
348 def alloc(cls, n_I, n_K, n_J, rng = 8923402190): | 417 def alloc(cls, n_I, n_K, n_J, rng = 8923402190, |
349 """ | 418 U_range=0.02, |
350 Return a MeanCovRBM instance with randomly-initialized parameters. | 419 W_range=0.05, |
420 a_ival=0, | |
421 b_ival=2, | |
422 c_ival=-2): | |
423 """ | |
424 Return a MeanCovRBM instance with randomly-initialized shared variable parameters. | |
351 | 425 |
352 :param n_I: input dimensionality | 426 :param n_I: input dimensionality |
353 :param n_K: number of covariance hidden units | 427 :param n_K: number of covariance hidden units |
354 :param n_J: number of mean filters (linear) | 428 :param n_J: number of mean filters (linear) |
355 :param rng: seed or numpy RandomState object to initialize params | 429 :param rng: seed or numpy RandomState object to initialize parameters |
430 | |
431 :note: | |
432 Constants for initial ranges and values taken from train_mcRBM.py. | |
356 """ | 433 """ |
357 if not hasattr(rng, 'randn'): | 434 if not hasattr(rng, 'randn'): |
358 rng = np.random.RandomState(rng) | 435 rng = np.random.RandomState(rng) |
359 | 436 |
360 def shrd(X,name): | |
361 return shared(X.astype(floatX), name=name) | |
362 | |
363 # initialization taken from train_mcRBM.py | |
364 rval = cls( | 437 rval = cls( |
365 U = shrd(0.02 * rng.randn(n_I, n_K),'U'), | 438 U = sharedX(U_range * rng.randn(n_I, n_K),'U'), |
366 W = shrd(0.05 * rng.randn(n_I, n_J),'W'), | 439 W = sharedX(W_range * rng.randn(n_I, n_J),'W'), |
367 a = shrd(np.ones(n_I)*(0),'a'), | 440 a = sharedX(np.ones(n_I)*a_ival,'a'), |
368 b = shrd(np.ones(n_K)*2,'b'), | 441 b = sharedX(np.ones(n_K)*b_ival,'b'), |
369 c = shrd(np.ones(n_J)*(-2),'c')) | 442 c = sharedX(np.ones(n_J)*c_ival,'c'),) |
370 | 443 |
371 rval.params = [rval.U, rval.W, rval.a, rval.b, rval.c] | 444 rval.params = lambda : [rval.U, rval.W, rval.a, rval.b, rval.c] |
372 return rval | 445 return rval |
373 | 446 |
374 class mcRBMTrainer(object): | 447 class mcRBMTrainer(object): |
375 """ | 448 """Light-weight class encapsulating math for mcRBM training |
376 | 449 |
377 Attributes: | 450 Attributes: |
378 - rbm | 451 - rbm - an mcRBM instance |
379 - sampler | 452 - sampler - an HMC_sampler instance |
380 - normVF | 453 - normVF - geometrically updated norm of U matrix columns (shared var) |
381 - learn_rate | 454 - learn_rate - SGD learning rate [un-annealed] |
382 - learn_rate_multipliers | 455 - learn_rate_multipliers - the learning rates for each of the parameters of the rbm (in |
383 | 456 order corresponding to what's returned by ``rbm.params()``) |
384 """ | 457 - l1_penalty - float or TensorType scalar to modulate l1 penalty of rbm.U and rbm.W |
458 - iter - number of cd_updates (shared var) - used to anneal the effective learn_rate | |
459 - lr_anneal_start - scalar or TensorType scalar - iter at which time to start decreasing | |
460 the learning rate proportional to 1/iter | |
461 | |
462 """ | |
463 # TODO: accept a GD algo as an argument? | |
464 @classmethod | |
465 def alloc(cls, rbm, visible_batch, batchsize, initial_lr=0.075, rng=234, | |
466 l1_penalty=0, | |
467 learn_rate_multipliers=[2, .2, .02, .1, .02], | |
468 lr_anneal_start=2000, | |
469 ): | |
470 | |
471 """ | |
472 :param rbm: mcRBM instance to train | |
473 :param visible_batch: TensorType variable for training data | |
474 :param batchsize: the number of rows in visible_batch | |
475 :param initial_lr: the learning rate (may be annealed) | |
476 :param rng: seed or RandomState to initialze PCD sampler | |
477 :param l1_penalty: see class doc | |
478 :param learn_rate_multipliers: see class doc | |
479 :param lr_anneal_start: see class doc | |
480 """ | |
481 #TODO: :param lr_anneal_iter: the iteration at which 1/t annealing will begin | |
482 | |
483 #TODO: get batchsize from visible_batch?? | |
484 # allocates shared var for negative phase particles | |
485 | |
486 | |
487 # TODO: should normVF be initialized to match the size of rbm.U ? | |
488 | |
489 return cls( | |
490 rbm=rbm, | |
491 visible_batch=visible_batch, | |
492 sampler=rbm.sampler(batchsize, rng=rng), | |
493 normVF=sharedX(1.0, 'normVF'), | |
494 learn_rate=sharedX(initial_lr/batchsize, 'learn_rate'), | |
495 iter=sharedX(0, 'iter'), | |
496 l1_penalty=l1_penalty, | |
497 learn_rate_multipliers=learn_rate_multipliers, | |
498 lr_anneal_start=lr_anneal_start) | |
499 | |
385 def __init__(self, **kwargs): | 500 def __init__(self, **kwargs): |
386 self.__dict__.update(kwargs) | 501 self.__dict__.update(kwargs) |
387 | 502 |
388 def normalize_U(self, new_U): | 503 def normalize_U(self, new_U): |
389 #TODO: write the docstring | 504 """ |
505 :param new_U: a proposed new value for rbm.U | |
506 | |
507 :returns: a pair of TensorType variables: | |
508 a corrected new value for U, and a new value for self.normVF | |
509 | |
510 This is a weird normalization procedure, but the sample code for the paper has it, and | |
511 it seems to be important. | |
512 """ | |
390 U_norms = TT.sqrt((new_U**2).sum(axis=0)) | 513 U_norms = TT.sqrt((new_U**2).sum(axis=0)) |
391 new_normVF = .95 * self.normVF + .05 * TT.mean(U_norms) | 514 new_normVF = .95 * self.normVF + .05 * TT.mean(U_norms) |
392 return (new_U * this_normVF / U_norms), new_normVF | 515 return (new_U * new_normVF / U_norms), new_normVF |
393 | 516 |
394 def contrastive_grads(self, visible_batch, params=None): | 517 def contrastive_grads(self): |
395 if params is not None: | 518 """Return the contrastive divergence gradients on the parameters of self.rbm """ |
396 params = self.rbm.params | |
397 return contrastive_grad( | 519 return contrastive_grad( |
398 free_energy_fn=self.rbm.free_energy_given_v, | 520 free_energy_fn=self.rbm.free_energy_given_v, |
399 pos_v=visible_batch, | 521 pos_v=self.visible_batch, |
400 neg_v=self.sampler.positions, | 522 neg_v=self.sampler.positions, |
401 params=params, | 523 wrt = self.rbm.params(), |
402 other_cost=(l1(self.rbm.U)+l1(self.rbm.W)) * self.l1_penalty) | 524 other_cost=(l1(self.rbm.U)+l1(self.rbm.W)) * self.l1_penalty) |
403 | 525 |
404 | 526 def cd_updates(self): |
405 def cd_updates(self, visible_batch, params=None, rng=89234): | 527 """ |
406 if params is not None: | 528 Return a dictionary of shared variable updates that implements contrastive divergence |
407 params = self.rbm.params | 529 learning by stochastic gradient descent with an annealed learning rate. |
408 | 530 """ |
409 grads = self.contrastive_grads(visible_batch, params) | 531 |
532 grads = self.contrastive_grads() | |
410 | 533 |
411 # contrastive divergence updates | 534 # contrastive divergence updates |
412 # TODO: sgd_updates is a particular optization algo (others are possible) | 535 # TODO: sgd_updates is a particular optization algo (others are possible) |
413 # parametrize so that algo is plugin | 536 # parametrize so that algo is plugin |
414 # the normalization normVF might be sgd-specific though... | 537 # the normalization normVF might be sgd-specific though... |
415 | 538 |
416 # TODO: when sgd has an annealing schedule, this should | 539 # TODO: when sgd has an annealing schedule, this should |
417 # go through that mechanism. | 540 # go through that mechanism. |
418 | 541 |
419 # TODO: parametrize these constants (e.g. 2000) | |
420 | |
421 ups[self.iter] = self.iter + 1 | |
422 lr = TT.clip( | 542 lr = TT.clip( |
423 self.learn_rate * 2000 / (self.iter+1), | 543 self.learn_rate * TT.cast(self.lr_anneal_start / (self.iter+1), floatX), |
424 0.0, #min | 544 0.0, #min |
425 self.learn_rate) #max | 545 self.learn_rate) #max |
426 | 546 |
427 ups = sgd_updates( | 547 ups = dict(sgd_updates( |
428 params, | 548 self.rbm.params(), |
429 grads, | 549 grads, |
430 stepsizes=[a*lr for a in learn_rate_multipliers]) | 550 stepsizes=[a*lr for a in self.learn_rate_multipliers])) |
551 | |
552 ups[self.iter] = self.iter + 1 | |
431 | 553 |
432 # sampler updates | 554 # sampler updates |
433 ups.update(dict(self.sampler.updates())) | 555 ups.update(dict(self.sampler.updates())) |
434 | 556 |
435 # add trainer updates (replace CD update of U) | 557 # add trainer updates (replace CD update of U) |
436 ups[self.rbm.U], ups[self.normVF] = self.normalize_U(ups[U]) | 558 ups[self.rbm.U], ups[self.normVF] = self.normalize_U(ups[self.rbm.U]) |
437 | 559 |
438 return ups | 560 return ups |
439 | |
440 # TODO: accept a GD algo as an argument? | |
441 @classmethod | |
442 def alloc(cls, rbm, visible_batch, batchsize, initial_lr=0.075, rng=234, | |
443 l1_penalty=0, | |
444 learn_rate_multipliers=[2, .2, .02, .1, .02]): | |
445 # allocates shared var for negative phase particles | |
446 | |
447 return cls( | |
448 rbm=rbm, | |
449 sampler=rbm.sampler(batchsize, rng=rng), | |
450 normVF=shared(1.0, 'normVF'), | |
451 learn_rate=shared(initial_lr/batchsize, 'learn_rate'), | |
452 iter=shared(0, 'iter'), | |
453 l1_penalty=l1_penalty, | |
454 learn_rate_multipliers=learn_rate_multipliers) | |
455 | |
456 | 561 |
457 if __name__ == '__main__': | 562 if __name__ == '__main__': |
458 import pylearn.algorithms.tests.test_mcRBM | 563 import pylearn.algorithms.tests.test_mcRBM |
459 pylearn.algorithms.tests.test_mcRBM.test_reproduce_ranzato_hinton_2010(as_unittest=True) | 564 pylearn.algorithms.tests.test_mcRBM.test_reproduce_ranzato_hinton_2010(as_unittest=True) |