# HG changeset patch
# User James Bergstra
# Date 1283531710 14400
# Node ID 075c193afd1bb45a2874ac17d269342cc6d7e390
# Parent  bc4d98995badfd3adb8799f39bfbefccf87dd831
refactoring mcRBM

diff -r bc4d98995bad -r 075c193afd1b pylearn/algorithms/mcRBM.py
--- a/pylearn/algorithms/mcRBM.py	Thu Sep 02 16:48:33 2010 -0400
+++ b/pylearn/algorithms/mcRBM.py	Fri Sep 03 12:35:10 2010 -0400
@@ -246,97 +246,8 @@
 #
 ###########################################
 
-#TODO: make global function to initialize parameter
-
-def hidden_cov_units_preactivation_given_v(rbm, v, small=0.5):
-    """Return argument to the sigmoid that would give mean of covariance hid units
-
-    See the math at the top of this file for what 'adjusted' means.
-
-    return b - 0.5 * dot(adjusted(v), U)**2
-    """
-    (U,W,a,b,c) = rbm
-    unit_v = v / (TT.sqrt(TT.mean(v**2, axis=1)+small)).dimshuffle(0,'x') # adjust row norm
-    return b - 0.5 * dot(unit_v, U)**2
-
-def free_energy_terms_given_v(rbm, v):
-    """Returns theano expression for the terms that are added to form the free energy of
-    visible vector `v` in an mcRBM.
-
-    1. Free energy related to covariance hiddens
-    2. Free energy related to mean hiddens
-    3. Free energy related to L2-Norm of `v`
-    4. Free energy related to projection of `v` onto biases `a`
-    """
-    U, W, a, b, c = rbm
-    t0 = -TT.sum(TT.nnet.softplus(hidden_cov_units_preactivation_given_v(rbm, v)),axis=1)
-    t1 = -TT.sum(TT.nnet.softplus(c + dot(v,W)), axis=1)
-    t2 = 0.5 * TT.sum(v**2, axis=1)
-    t3 = -TT.dot(v, a)
-    return [t0, t1, t2, t3]
-
-def free_energy_given_v(rbm, v):
-    """Returns theano expression for free energy of visible vector `v` in an mcRBM
-    """
-    return sum(free_energy_terms_given_v(rbm,v))
-
-def expected_h_g_given_v(rbm, v):
-    """Returns tuple (`h`, `g`) of theano expression conditional expectations in an mcRBM.
-
-    `h` is the conditional on the covariance units.
-    `g` is the conditional on the mean units.
-
-    """
-    (U, W, a, b, c) = rbm
-    h = TT.nnet.sigmoid(hidden_cov_units_preactivation_given_v(rbm, v))
-    g = nnet.sigmoid(c + dot(v,W))
-    return (h, g)
-
-def n_visible_units(rbm):
-    """Return the number of visible units of this RBM
-
-    For an RBM made from shared variables, this will return an integer,
-    for a purely symbolic RBM this will return a theano expression.
-
-    """
-    W = rbm[1]
-    try:
-        return W.value.shape[0]
-    except AttributeError:
-        return W.shape[0]
-
-def sampler(rbm, n_particles, n_visible=None, rng=7823748):
-    """Return an `HMC_sampler` that will draw samples from the distribution over visible
-    units specified by this RBM.
-
-    :param n_particles: this many parallel chains will be simulated.
-    :param rng: seed or numpy RandomState object to initialize particles, and to drive the simulation.
-    """
-    if not hasattr(rng, 'randn'):
-        rng = np.random.RandomState(rng)
-    if n_visible is None:
-        n_visible = n_visible_units(rbm)
-    rval = HMC_sampler(
-        positions = [shared(
-            rng.randn(
-                n_particles,
-                n_visible).astype(floatX),
-            name='particles')],
-        energy_fn = lambda p : free_energy_given_v(rbm, p[0]),
-        seed=int(rng.randint(2**30)))
-    return rval
-
-#############################
-#
-# Convenient data container
-#
-#############################
-
-class MeanCovRBM(object):
-    """Container for mcRBM parameters
-
-    It provides parameter lookup by name, as well as a heuristic for initializing the
-    parameters for effective learning.
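
The removed helpers above reappear as methods of the new `mcRBM` class below.
For reference, the "adjusted" visible vector that
`hidden_cov_units_preactivation_given_v` computes is a per-row RMS-norm
rescaling. A minimal NumPy sketch of just that step (illustrative, not part of
the patch; `adjusted` is a hypothetical name taken from the docstring):

    import numpy as np

    def adjusted(v, small=0.5):
        # divide each row of v by a smoothed version of its RMS norm,
        # mirroring v / TT.sqrt(TT.mean(v**2, axis=1) + small).dimshuffle(0,'x')
        return v / np.sqrt(np.mean(v ** 2, axis=1) + small)[:, None]

The covariance-unit preactivation is then b - 0.5 * dot(adjusted(v), U)**2.
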
+class mcRBM(object):
+    """Light-weight class that provides the math related to inference
 
     Attributes:
 
@@ -345,23 +256,90 @@
      - a - the visible bias (theano shared variable)
      - b - the covariance bias (theano shared variable)
      - c - the mean bias (theano shared variable)
-    """
+    """
+    def __init__(self, U, W, a, b, c):
+        self.U = U
+        self.W = W
+        self.a = a
+        self.b = b
+        self.c = c
+
+    def hidden_cov_units_preactivation_given_v(self, v, small=0.5):
+        """Return argument to the sigmoid that would give mean of covariance hid units
+
+        See the math at the top of this file for what 'adjusted' means.
+
+        return b - 0.5 * dot(adjusted(v), U)**2
+        """
+        unit_v = v / (TT.sqrt(TT.mean(v**2, axis=1)+small)).dimshuffle(0,'x') # adjust row norm
+        return self.b - 0.5 * dot(unit_v, self.U)**2
 
-    params = property(lambda s: [s.U, s.W, s.a, s.b, s.c])
+    def free_energy_terms_given_v(self, v):
+        """Returns theano expression for the terms that are added to form the free energy of
+        visible vector `v` in an mcRBM.
 
-    n_visible = property(lambda s: s.W.value.shape[0])
+        1. Free energy related to covariance hiddens
+        2. Free energy related to mean hiddens
+        3. Free energy related to L2-Norm of `v`
+        4. Free energy related to projection of `v` onto biases `a`
+        """
+        t0 = -TT.sum(TT.nnet.softplus(self.hidden_cov_units_preactivation_given_v(v)),axis=1)
+        t1 = -TT.sum(TT.nnet.softplus(self.c + dot(v,self.W)), axis=1)
+        t2 = 0.5 * TT.sum(v**2, axis=1)
+        t3 = -TT.dot(v, self.a)
+        return [t0, t1, t2, t3]
+
+    def free_energy_given_v(self, v):
+        """Returns theano expression for free energy of visible vector `v` in an mcRBM
+        """
+        return TT.add(*self.free_energy_terms_given_v(v))
+
+    def expected_h_g_given_v(self, v):
+        """Returns tuple (`h`, `g`) of theano expression conditional expectations in an mcRBM.
 
-    def __init__(self, U, W, a, b, c):
-        self.__dict__.update(locals())
-        del self.__dict__['self']
+        `h` is the conditional on the covariance units.
+        `g` is the conditional on the mean units.
+
+        """
+        h = TT.nnet.sigmoid(self.hidden_cov_units_preactivation_given_v(v))
+        g = TT.nnet.sigmoid(self.c + dot(v,self.W))
+        return (h, g)
+
+    def n_visible_units(self):
+        """Return the number of visible units of this RBM
+
+        For an RBM made from shared variables, this will return an integer,
+        for a purely symbolic RBM this will return a theano expression.
+
+        """
+        try:
+            return self.W.value.shape[0]
+        except AttributeError:
+            return self.W.shape[0]
 
-    def __getitem__(self, idx):
-        # support unpacking of this container as if it were a tuple
-        return self.params[idx]
+    def sampler(self, n_particles, n_visible=None, rng=7823748):
+        """Return an `HMC_sampler` that will draw samples from the distribution over visible
+        units specified by this RBM.
+
+        :param n_particles: this many parallel chains will be simulated.
+        :param rng: seed or numpy RandomState object to initialize particles, and to drive the simulation.
+        """
+        if not hasattr(rng, 'randn'):
+            rng = np.random.RandomState(rng)
+        if n_visible is None:
+            n_visible = self.n_visible_units()
+        rval = HMC_sampler(
+            positions = [shared(
+                rng.randn(
+                    n_particles,
+                    n_visible).astype(floatX),
+                name='particles')],
+            energy_fn=self.free_energy_given_v,
+            seed=int(rng.randint(2**30)))
+        return rval
 
     @classmethod
-    def new_from_dims(cls, n_I, n_K, n_J, rng = 8923402190):
+    def alloc(cls, n_I, n_K, n_J, rng = 8923402190):
         """
-        Return a MeanCovRBM instance with randomly-initialized parameters.
+        Return an mcRBM instance with randomly-initialized parameters.
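
Taken together, these methods give the new class a compact inference API:
allocate parameters, build free-energy expressions, and draw negative-phase
samples. A hypothetical usage sketch (assumes Theano and the module's
HMC_sampler dependency are available; the sizes are arbitrary, not taken from
the patch):

    import theano
    from theano import tensor as TT
    from pylearn.algorithms.mcRBM import mcRBM

    rbm = mcRBM.alloc(n_I=96, n_K=256, n_J=100)  # arbitrary example sizes
    v = TT.matrix('v')                           # one visible example per row
    free_energy = theano.function([v], rbm.free_energy_given_v(v))
    smplr = rbm.sampler(n_particles=128)         # HMC chains over the visibles
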
@@ -377,33 +355,98 @@
             return shared(X.astype(floatX), name=name)
 
         # initialization taken from train_mcRBM.py
-        return cls(
+        rval = cls(
             U = shrd(0.02 * rng.randn(n_I, n_K),'U'),
             W = shrd(0.05 * rng.randn(n_I, n_J),'W'),
             a = shrd(np.ones(n_I)*(0),'a'),
             b = shrd(np.ones(n_K)*2,'b'),
             c = shrd(np.ones(n_J)*(-2),'c'))
-
-    def __getstate__(self):
-        # unpack shared containers, which may have references to Theano stuff and are not a
-        # long-term stable data type.
-        return dict(
-                U = self.U.value,
-                W = self.W.value,
-                b = self.b.value,
-                c = self.c.value)
+        rval.params = [rval.U, rval.W, rval.a, rval.b, rval.c]
+        return rval
+
+class mcRBMTrainer(object):
+    """
+
+    Attributes:
+    - rbm
+    - sampler
+    - normVF
+    - learn_rate
+    - learn_rate_multipliers
 
-    def __setstate__(self, dct):
-        d = dict(dct)
-        for key in ['U', 'W', 'a', 'b', 'c']:
-            d[key] = shared(d[key], name=key)
-        self.__init__(**d)
+    """
+    def __init__(self, **kwargs):
+        self.__dict__.update(kwargs)
+
+    def normalize_U(self, new_U):
+        #TODO: write the docstring
+        U_norms = TT.sqrt((new_U**2).sum(axis=0))
+        new_normVF = .95 * self.normVF + .05 * TT.mean(U_norms)
+        return (new_U * new_normVF / U_norms), new_normVF
+
+    def contrastive_grads(self, visible_batch, params=None):
+        if params is None:
+            params = self.rbm.params
+        return contrastive_grad(
+                free_energy_fn=self.rbm.free_energy_given_v,
+                pos_v=visible_batch,
+                neg_v=self.sampler.positions,
+                params=params,
+                other_cost=(l1(self.rbm.U)+l1(self.rbm.W)) * self.l1_penalty)
 
-#TODO: put the normalization of U as a global function
+    def cd_updates(self, visible_batch, params=None, rng=89234):
+        if params is None:
+            params = self.rbm.params
+
+        grads = self.contrastive_grads(visible_batch, params)
+
+        # contrastive divergence updates
+        # TODO: sgd_updates is a particular optimization algo (others are possible)
+        #       parametrize so that algo is plugin
+        #       the normalization normVF might be sgd-specific though...
+
+        # TODO: when sgd has an annealing schedule, this should
+        #       go through that mechanism.
+
+        # TODO: parametrize these constants (e.g. 2000)
+
+        lr = TT.clip(
+                self.learn_rate * 2000 / (self.iter+1),
+                0.0, #min
+                self.learn_rate) #max
+        ups = dict(sgd_updates(
+                params,
+                grads,
+                stepsizes=[a*lr for a in self.learn_rate_multipliers]))
+        ups[self.iter] = self.iter + 1
 
-#TODO: put the learning loop as a global function or class, so that someone could load and *TRAIN* an mcRBM!!!
+        # sampler updates
+        ups.update(dict(self.sampler.updates()))
+
+        # add trainer updates (replace CD update of U)
+        ups[self.rbm.U], ups[self.normVF] = self.normalize_U(ups[self.rbm.U])
+
+        return ups
+
+    # TODO: accept a GD algo as an argument?
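
The trainer above is meant to bundle everything the old test-script loop did
by hand: contrastive gradients, SGD updates with an annealed learning rate,
sampler updates, and the U-column renormalization. A hypothetical wiring
sketch (assumes the module's contrastive_grad, sgd_updates, l1, and
HMC_sampler dependencies behave as used above; sizes are arbitrary):

    import theano
    from theano import tensor as TT
    from pylearn.algorithms.mcRBM import mcRBM, mcRBMTrainer

    s_batch = TT.matrix('s_batch')               # symbolic visible minibatch
    rbm = mcRBM.alloc(n_I=96, n_K=256, n_J=100)  # arbitrary example sizes
    trainer = mcRBMTrainer.alloc(rbm, s_batch, batchsize=128, l1_penalty=1e-3)

    # one compiled CD step: sgd updates + sampler updates + U renormalization
    learn = theano.function([s_batch], [], updates=trainer.cd_updates(s_batch))
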
+    @classmethod
+    def alloc(cls, rbm, visible_batch, batchsize, initial_lr=0.075, rng=234,
+            l1_penalty=0,
+            learn_rate_multipliers=[2, .2, .02, .1, .02]):
+        # allocates shared var for negative phase particles
+
+        return cls(
+                rbm=rbm,
+                sampler=rbm.sampler(batchsize, rng=rng),
+                normVF=shared(1.0, 'normVF'),
+                learn_rate=shared(initial_lr/batchsize, 'learn_rate'),
+                iter=shared(0, 'iter'),
+                l1_penalty=l1_penalty,
+                learn_rate_multipliers=learn_rate_multipliers)
+
 
 if __name__ == '__main__':
     import pylearn.algorithms.tests.test_mcRBM
diff -r bc4d98995bad -r 075c193afd1b pylearn/algorithms/tests/test_mcRBM.py
--- a/pylearn/algorithms/tests/test_mcRBM.py	Thu Sep 02 16:48:33 2010 -0400
+++ b/pylearn/algorithms/tests/test_mcRBM.py	Fri Sep 03 12:35:10 2010 -0400
@@ -1,8 +1,5 @@
-
-
 from pylearn.algorithms.mcRBM import *
 
-
 def test_reproduce_ranzato_hinton_2010(dataset='MAR', as_unittest=True):
     dataset='MAR'
     if dataset == 'MAR':
@@ -17,6 +14,7 @@
 
     n_burnin_steps=10000
 
+    l1_penalty=1e-3
     no_l1_epochs = 10
     effective_l1_penalty=0.0
 
@@ -25,14 +23,9 @@
     batchsize = 128
     lr = 0.075 / batchsize
     s_lr = TT.scalar()
-    s_l1_penalty=TT.scalar()
     n_K=256
     n_J=100
 
-    rbm = MeanCovRBM.new_from_dims(n_I=n_vis, n_K=n_K, n_J=n_J)
-
-    smplr = sampler(rbm, n_particles=batchsize)
-
     def l2(X):
         return numpy.sqrt((X**2).sum())
     if dataset == 'MAR':
@@ -60,26 +53,19 @@
     if not as_unittest:
         imgs_fn = function([batch_idx], outputs=train_batch)
 
-    grads = contrastive_grad(
-            free_energy_fn=lambda v: free_energy_given_v(rbm, v),
-            pos_v=train_batch,
-            neg_v=smplr.positions[0],
-            params=list(rbm),
-            other_cost=(l1(rbm.U)+l1(rbm.W)) * s_l1_penalty)
-    sgd_ups = sgd_updates(
-                rbm.params,
-                grads,
-                stepsizes=[2*s_lr, .2*s_lr, .02*s_lr, .1*s_lr, .02*s_lr ])
-    learn_fn = function([batch_idx, s_lr, s_l1_penalty],
-            outputs=[
-                grads[0].norm(2),
-                (sgd_ups[0][1] - sgd_ups[0][0]).norm(2),
-                (sgd_ups[1][1] - sgd_ups[1][0]).norm(2),
-                ],
-            updates = sgd_ups)
+    trainer = mcRBMTrainer.alloc(
+            mcRBM.alloc(n_I=n_vis, n_K=n_K, n_J=n_J),
+            train_batch,
+            batchsize, l1_penalty=TT.scalar())
+    rbm = trainer.rbm
+    smplr = trainer.sampler
+
+    grads = trainer.contrastive_grads(train_batch)
+    learn_fn = function([batch_idx, trainer.l1_penalty],
+            outputs=[grads[0].norm(2), grads[0].norm(2), grads[1].norm(2)],
+            updates=trainer.cd_updates(train_batch))
 
     print "Learning..."
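
Note that learn_fn no longer takes an explicit learning rate: cd_updates now
anneals it internally as TT.clip(learn_rate * 2000 / (iter+1), 0.0,
learn_rate). In plain Python the schedule looks like this (a sketch; the
function name is illustrative, not part of the patch):

    def annealed_lr(base_lr, itr):
        # flat at base_lr for roughly the first 2000 iterations,
        # then decaying like base_lr * 2000 / itr
        return max(0.0, min(base_lr * 2000.0 / (itr + 1), base_lr))

    assert annealed_lr(0.075, 0) == 0.075
    assert abs(annealed_lr(0.075, 3999) - 0.0375) < 1e-12
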
-    normVF=1
     last_epoch = -1
     for jj in xrange(n_train_iters):
         epoch = jj*batchsize / epoch_size
@@ -87,9 +73,6 @@
         print_jj = epoch != last_epoch
         last_epoch = epoch
 
-        if epoch > 10:
-            break
-
         if as_unittest and epoch == 5:
             U = rbm.U.value
             W = rbm.W.value
@@ -117,7 +100,7 @@
         if print_jj:
             if not as_unittest:
                 tile(imgs_fn(jj), "imgs_%06i.png"%jj)
-                tile(smplr.positions[0].value, "sample_%06i.png"%jj)
+                tile(smplr.positions.value, "sample_%06i.png"%jj)
                 tile(rbm.U.value.T, "U_%06i.png"%jj)
                 tile(rbm.W.value.T, "W_%06i.png"%jj)
@@ -132,18 +115,22 @@
             print 'b min max', rbm.b.value.min(), rbm.b.value.max(),
             print 'c min max', rbm.c.value.min(), rbm.c.value.max()
 
-            print 'parts min', smplr.positions[0].value.min(),
-            print 'max',smplr.positions[0].value.max(),
+            print 'parts min', smplr.positions.value.min(),
+            print 'max',smplr.positions.value.max(),
             print 'HMC step', smplr.stepsize,
             print 'arate', smplr.avg_acceptance_rate
 
-        # Continue HMC chain
-        smplr.simulate()
+
+        if 0:
+            # Continue HMC chain
+            smplr.simulate()
 
-        # Do CD update
-        l2_of_Ugrad = learn_fn(jj,
-                lr/max(1, jj/(20*epoch_size/batchsize)),
-                effective_l1_penalty)
+            # Do CD update
+            l2_of_Ugrad = learn_fn(jj,
+                    lr/max(1, jj/(20*epoch_size/batchsize)),
+                    effective_l1_penalty)
+
+        l2_of_Ugrad = learn_fn(jj, effective_l1_penalty)
 
         if print_jj:
             print 'l2(U_grad)', float(l2_of_Ugrad[0]),
@@ -161,9 +148,10 @@
 
         # weird normalization technique...
         # It constrains all the columns of the matrix to have the same length
-        # But the matrix itself is re-scaled to have an arbitrary abslute size.
-        U = rbm.U.value
-        U_norms = np.sqrt((U*U).sum(axis=0))
-        assert len(U_norms) == n_K
-        normVF = .95 * normVF + .05 * np.mean(U_norms)
-        rbm.U.value = rbm.U.value * normVF/U_norms
+        if 0:
+            U = rbm.U.value
+            U_norms = np.sqrt((U*U).sum(axis=0))
+            assert len(U_norms) == n_K
+            normVF = .95 * normVF + .05 * np.mean(U_norms)
+            rbm.U.value = rbm.U.value * normVF/U_norms
+
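
The disabled `if 0:` blocks above were the manual versions of what the trainer
now does inside cd_updates: the HMC step comes from sampler.updates(), and the
column renormalization from mcRBMTrainer.normalize_U. A NumPy sketch of that
normalization rule (illustrative, not part of the patch):

    import numpy as np

    def normalize_U(U, normVF):
        # rescale every column of U to a common length normVF, where normVF
        # is a running average of the observed mean column norm
        U_norms = np.sqrt((U * U).sum(axis=0))
        new_normVF = .95 * normVF + .05 * np.mean(U_norms)
        return U * new_normVF / U_norms, new_normVF

    U, normVF = normalize_U(np.random.randn(16, 8), 1.0)
    print(np.sqrt((U * U).sum(axis=0)))  # every column now has length normVF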