pylearn: pylearn/algorithms/mcRBM.py comparison

comparison pylearn/algorithms/mcRBM.py @ 1272:ba25c6e4f55d

mcRBM working with whole learning algo in theano

author	James Bergstra <bergstrj@iro.umontreal.ca>
date	Sat, 04 Sep 2010 19:32:27 -0400
parents	d38cb039c662
children	7bb5dd98e671

comparison

equal deleted inserted replaced

-:cc6c6d7234a7
+:ba25c6e4f55d
 import theano
 from theano import function, shared, dot
 from theano import tensor as TT
 floatX = theano.config.floatX
+sharedX = lambda X, name : shared(numpy.asarray(X, dtype=floatX), name=name)
 import pylearn
 #TODO: clean up the HMC_sampler code
 #TODO: think of naming convention for acronyms + suffix?
 from pylearn.sampling.hmc import HMC_sampler
 from pylearn.io import image_tiling
 #
 # Candidates for factoring
 #
 ###########################################
-#TODO: Document, move to pylearn's math lib
 def l1(X):
+"""
+:param X: TensorType variable
+:rtype: TensorType scalar
+:returns: the sum of absolute values of the terms in X
+:math: \sum_i |X_i|
+Where i is an appropriately dimensioned index.
+"""
 return abs(X).sum()
-#TODO: Document, move to pylearn's math lib
 def l2(X):
+"""
+:param X: TensorType variable
+:rtype: TensorType scalar
+:returns: the square root of the sum of squares of the terms in X (the Euclidean norm)
+:math: \sqrt{ \sum_i X_i^2 }
+Where i is an appropriately dimensioned index.
+"""
 return TT.sqrt((X**2).sum())
-#TODO: Document, move to pylearn's math lib
 def contrastive_cost(free_energy_fn, pos_v, neg_v):
+"""
+:param free_energy_fn: lambda (TensorType matrix MxN) ->  TensorType vector of M free energies
+:param pos_v: TensorType matrix MxN of M "positive phase" particles
+:param neg_v: TensorType matrix MxN of M "negative phase" particles
+:returns: TensorType scalar that's the sum of the difference of free energies
+:math: \sum_i free_energy(pos_v[i]) - free_energy(neg_v[i])
+"""
 return (free_energy_fn(pos_v) - free_energy_fn(neg_v)).sum()
-#TODO: Typical use of contrastive_cost is to later use tensor.grad, but in that case we want to
+def contrastive_grad(free_energy_fn, pos_v, neg_v, wrt, other_cost=0):
-#      block  gradient going through neg_v
+"""
-def contrastive_grad(free_energy_fn, pos_v, neg_v, params, other_cost=0):
+:param free_energy_fn: lambda (TensorType matrix MxN) ->  TensorType vector of M free energies
-"""
 :param pos_v: positive-phase sample of visible units
 :param neg_v: negative-phase sample of visible units
-"""
+:param wrt: TensorType variables with respect to which we want gradients (similar to the
-#block the grad through neg_v
+'wrt' argument to tensor.grad)
+:param other_cost: TensorType scalar
+:returns: TensorType variables for the gradient on each of the 'wrt' arguments
+:math: Cost = other_cost + \sum_i free_energy(pos_v[i]) - free_energy(neg_v[i])
+:math: d Cost / dW for W in `wrt`
+This function is similar to tensor.grad - it returns the gradient[s] on a cost with respect
+to one or more parameters.  The difference between tensor.grad and this function is that
+the negative phase term (`neg_v`) is considered constant, i.e. d `Cost` / d `neg_v` = 0.
+This is desirable because `neg_v` might be the result of a sampling expression involving
+some of the parameters, but the contrastive divergence algorithm does not call for
+backpropagating through the sampling procedure.
+Warning - if other_cost depends on pos_v or neg_v and you *do* want to backpropagate from
+the `other_cost` through those terms, then this function is inappropriate.  In that case,
+you should call tensor.grad separately for the other_cost and add the gradient expressions
+you get from ``contrastive_grad(..., other_cost=0)``
+"""
 cost=contrastive_cost(free_energy_fn, pos_v, neg_v)
 if other_cost:
 cost = cost + other_cost
 return theano.tensor.grad(cost,
-wrt=params,
+wrt=wrt,
 consider_constant=[neg_v])
 ###########################################
 #
 # Expressions that are mcRBM-specific
 - U - the covariance filters (theano shared variable)
 - W - the mean filters (theano shared variable)
 - a - the visible bias (theano shared variable)
 - b - the covariance bias (theano shared variable)
 - c - the mean bias (theano shared variable)
 """
 def __init__(self, U, W, a, b, c):
 self.U = U
 self.W = W
 self.a = a
 if not hasattr(rng, 'randn'):
 rng = np.random.RandomState(rng)
 if n_visible is None:
 n_visible = self.n_visible_units()
 rval = HMC_sampler.new_from_shared_positions(
-shared_positions = shared(
+shared_positions = sharedX(
 rng.randn(
 n_particles,
-n_visible).astype(floatX),
+n_visible),
 name='particles'),
 energy_fn=self.free_energy_given_v,
 seed=int(rng.randint(2**30)))
 return rval
 def as_feedforward_layer(self, v):
+"""Return a dictionary with keys: inputs, outputs and params
+The inputs are [v]
+The outputs are :math:`[E[h|v], E[g|v]]` where `h` is the covariance hidden units and `g` is
+the mean hidden units.
+The params are ``[U, W, b, c]``, the model parameters that enter into the conditional
+expectations.
+:TODO: add an optional parameter to return only one of the expectations.
+"""
 return dict(
-outputs = self.expected_h_g_given_v(v),
+inputs = [v],
+outputs = list(self.expected_h_g_given_v(v)),
 params = [self.U, self.W, self.b, self.c],
 )
 @classmethod
-def alloc(cls, n_I, n_K, n_J, rng = 8923402190):
+def alloc(cls, n_I, n_K, n_J, rng = 8923402190,
-"""
+U_range=0.02,
-Return a MeanCovRBM instance with randomly-initialized parameters.
+W_range=0.05,
+a_ival=0,
+b_ival=2,
+c_ival=-2):
+"""
+Return a MeanCovRBM instance with randomly-initialized shared variable parameters.
 :param n_I: input dimensionality
 :param n_K: number of covariance hidden units
 :param n_J: number of mean filters (linear)
-:param rng: seed or numpy RandomState object to initialize params
+:param rng: seed or numpy RandomState object to initialize parameters
+:note:
+Constants for initial ranges and values taken from train_mcRBM.py.
 """
 if not hasattr(rng, 'randn'):
 rng = np.random.RandomState(rng)
-def shrd(X,name):
-return shared(X.astype(floatX), name=name)
-# initialization taken from train_mcRBM.py
 rval =  cls(
-U = shrd(0.02 * rng.randn(n_I, n_K),'U'),
+U = sharedX(U_range * rng.randn(n_I, n_K),'U'),
-W = shrd(0.05 * rng.randn(n_I, n_J),'W'),
+W = sharedX(W_range * rng.randn(n_I, n_J),'W'),
-a = shrd(np.ones(n_I)*(0),'a'),
+a = sharedX(np.ones(n_I)*a_ival,'a'),
-b = shrd(np.ones(n_K)*2,'b'),
+b = sharedX(np.ones(n_K)*b_ival,'b'),
-c = shrd(np.ones(n_J)*(-2),'c'))
+c = sharedX(np.ones(n_J)*c_ival,'c'),)
-rval.params = [rval.U, rval.W, rval.a, rval.b, rval.c]
+rval.params = lambda : [rval.U, rval.W, rval.a, rval.b, rval.c]
 return rval
 class mcRBMTrainer(object):
-"""
+"""Light-weight class encapsulating math for mcRBM training
 Attributes:
-- rbm
+- rbm  - an mcRBM instance
-- sampler
+- sampler - an HMC_sampler instance
-- normVF
+- normVF - geometrically updated norm of U matrix columns (shared var)
-- learn_rate
+- learn_rate - SGD learning rate [un-annealed]
-- learn_rate_multipliers
+- learn_rate_multipliers - the learning rates for each of the parameters of the rbm (in
+order corresponding to what's returned by ``rbm.params()``)
-"""
+- l1_penalty - float or TensorType scalar to modulate l1 penalty of rbm.U and rbm.W
+- iter - number of cd_updates (shared var) - used to anneal the effective learn_rate
+- lr_anneal_start - scalar or TensorType scalar - iter at which time to start decreasing
+the learning rate proportional to 1/iter
+"""
+# TODO: accept a GD algo as an argument?
+@classmethod
+def alloc(cls, rbm, visible_batch, batchsize, initial_lr=0.075, rng=234,
+l1_penalty=0,
+learn_rate_multipliers=[2, .2, .02, .1, .02],
+lr_anneal_start=2000,
+):
+"""
+:param rbm: mcRBM instance to train
+:param visible_batch: TensorType variable for training data
+:param batchsize: the number of rows in visible_batch
+:param initial_lr: the learning rate (may be annealed)
+:param rng: seed or RandomState to initialize PCD sampler
+:param l1_penalty: see class doc
+:param learn_rate_multipliers: see class doc
+:param lr_anneal_start: see class doc
+"""
+#TODO: :param lr_anneal_iter: the iteration at which 1/t annealing will begin
+#TODO: get batchsize from visible_batch??
+# allocates shared var for negative phase particles
+# TODO: should normVF be initialized to match the size of rbm.U ?
+return cls(
+rbm=rbm,
+visible_batch=visible_batch,
+sampler=rbm.sampler(batchsize, rng=rng),
+normVF=sharedX(1.0, 'normVF'),
+learn_rate=sharedX(initial_lr/batchsize, 'learn_rate'),
+iter=sharedX(0, 'iter'),
+l1_penalty=l1_penalty,
+learn_rate_multipliers=learn_rate_multipliers,
+lr_anneal_start=lr_anneal_start)
 def __init__(self, **kwargs):
 self.__dict__.update(kwargs)
 def normalize_U(self, new_U):
-#TODO: write the docstring
+"""
+:param new_U: a proposed new value for rbm.U
+:returns: a pair of TensorType variables:
+a corrected new value for U, and a new value for self.normVF
+This is a weird normalization procedure, but the sample code for the paper has it, and
+it seems to be important.
+"""
 U_norms = TT.sqrt((new_U**2).sum(axis=0))
 new_normVF = .95 * self.normVF + .05 * TT.mean(U_norms)
-return (new_U * this_normVF / U_norms), new_normVF
+return (new_U * new_normVF / U_norms), new_normVF
-def contrastive_grads(self, visible_batch, params=None):
+def contrastive_grads(self):
-if params is not None:
+"""Return the contrastive divergence gradients on the parameters of self.rbm """
-params = self.rbm.params
 return contrastive_grad(
 free_energy_fn=self.rbm.free_energy_given_v,
-pos_v=visible_batch,
+pos_v=self.visible_batch,
 neg_v=self.sampler.positions,
-params=params,
+wrt = self.rbm.params(),
 other_cost=(l1(self.rbm.U)+l1(self.rbm.W)) * self.l1_penalty)
+def cd_updates(self):
-def cd_updates(self, visible_batch, params=None, rng=89234):
+"""
-if params is not None:
+Return a dictionary of shared variable updates that implements contrastive divergence
-params = self.rbm.params
+learning by stochastic gradient descent with an annealed learning rate.
+"""
-grads = self.contrastive_grads(visible_batch, params)
+grads = self.contrastive_grads()
 # contrastive divergence updates
 # TODO: sgd_updates is a particular optimization algo (others are possible)
 #       parametrize so that algo is plugin
 #       the normalization normVF might be sgd-specific though...
 # TODO: when sgd has an annealing schedule, this should
 #       go through that mechanism.
-# TODO: parametrize these constants (e.g. 2000)
-ups[self.iter] = self.iter + 1
 lr = TT.clip(
-self.learn_rate * 2000 / (self.iter+1),
+self.learn_rate * TT.cast(self.lr_anneal_start / (self.iter+1), floatX),
 0.0, #min
 self.learn_rate) #max
-ups = sgd_updates(
+ups = dict(sgd_updates(
-params,
+self.rbm.params(),
 grads,
-stepsizes=[a*lr for a in learn_rate_multipliers])
+stepsizes=[a*lr for a in self.learn_rate_multipliers]))
+ups[self.iter] = self.iter + 1
 # sampler updates
 ups.update(dict(self.sampler.updates()))
 # add trainer updates (replace CD update of U)
-ups[self.rbm.U], ups[self.normVF] = self.normalize_U(ups[U])
+ups[self.rbm.U], ups[self.normVF] = self.normalize_U(ups[self.rbm.U])
 return ups
-# TODO: accept a GD algo as an argument?
-@classmethod
-def alloc(cls, rbm, visible_batch, batchsize, initial_lr=0.075, rng=234,
-l1_penalty=0,
-learn_rate_multipliers=[2, .2, .02, .1, .02]):
-# allocates shared var for negative phase particles
-return cls(
-rbm=rbm,
-sampler=rbm.sampler(batchsize, rng=rng),
-normVF=shared(1.0, 'normVF'),
-learn_rate=shared(initial_lr/batchsize, 'learn_rate'),
-iter=shared(0, 'iter'),
-l1_penalty=l1_penalty,
-learn_rate_multipliers=learn_rate_multipliers)
 if __name__ == '__main__':
 import pylearn.algorithms.tests.test_mcRBM
 pylearn.algorithms.tests.test_mcRBM.test_reproduce_ranzato_hinton_2010(as_unittest=True)

Mercurial > pylearn

comparison pylearn/algorithms/mcRBM.py @ 1272:ba25c6e4f55d