diff pylearn/algorithms/mcRBM.py @ 1284:1817485d586d
mcRBM - many changes incl. adding support for pooling matrix
author:   James Bergstra <bergstrj@iro.umontreal.ca>
date:     Wed, 15 Sep 2010 17:49:21 -0400
parents:  f0129e37a8ef
children: 4fa2a32e8fde
--- a/pylearn/algorithms/mcRBM.py	Wed Sep 15 17:46:21 2010 -0400
+++ b/pylearn/algorithms/mcRBM.py	Wed Sep 15 17:49:21 2010 -0400
@@ -45,9 +45,9 @@
     - \sum_j c_j g_j
 
 For the energy function to correspond to a probability distribution, P must be non-positive.  P
-is initialized to be a diagonal, and in our experience it can be left as such because even in
-the paper it has a very low learning rate, and is only allowed to be updated after the filters
-in U are learned (in effect).
+is initialized to be a diagonal or a topological pooling matrix, and in our experience it can
+be left as such because even in the paper it has a very low learning rate, and is only allowed
+to be updated after the filters in U are learned (in effect).
 
 Version in published train_mcRBM code
 -------------------------------------
@@ -90,6 +90,13 @@
     - \sum_j \sum_i W_{ij} g_j v_i
     - \sum_j c_j g_j
 
+E (v, h, g) =
+    - 0.5 \sum_f \sum_k P_{fk} h_k (\sum_i U_{if} v_i / sqrt(\sum_i v_i^2/I + 0.5))^2
+    - \sum_k b_k h_k
+    + 0.5 \sum_i v_i^2
+    - \sum_j \sum_i W_{ij} g_j v_i
+    - \sum_j c_j g_j
+
 
 Conventions in this file
 ========================
@@ -101,12 +108,14 @@
 Global functions like `free_energy` work on an mcRBM as parametrized in a particular way.
 Suppose we have
-I input dimensions,
-F squared filters,
-J mean variables, and
-K covariance variables.
-The mcRBM is parametrized by 5 variables:
+ - I input dimensions,
+ - F squared filters,
+ - J mean variables, and
+ - K covariance variables.
+The mcRBM is parametrized by 6 variables:
+
+ - `P`, a matrix whose rows indicate covariance filter groups (F x K)
 - `U`, a matrix whose rows are visible covariance directions (I x F)
 - `W`, a matrix whose rows are visible mean directions (I x J)
 - `b`, a vector of hidden covariance biases (K)
@@ -202,8 +211,6 @@
 sharedX = lambda X, name : shared(numpy.asarray(X, dtype=floatX), name=name)
 
 import pylearn
-#TODO: clean up the HMC_sampler code
-#TODO: think of naming convention for acronyms + suffix?
 from pylearn.sampling.hmc import HMC_sampler
 from pylearn.io import image_tiling
 from pylearn.gd.sgd import sgd_updates
@@ -357,7 +364,7 @@
         """
         h = TT.nnet.sigmoid(self.hidden_cov_units_preactivation_given_v(v))
-        g = nnet.sigmoid(self.c + dot(v,self.W))
+        g = TT.nnet.sigmoid(self.c + dot(v,self.W))
         return (h, g)
 
     def n_visible_units(self):
@@ -372,6 +379,51 @@
         except AttributeError:
             return self.W.shape[0]
 
+    def n_hidden_cov_units(self):
+        """Return the number of hidden units for the covariance in this RBM
+
+        For an RBM made from shared variables, this will return an integer,
+        for a purely symbolic RBM this will return a theano expression.
+
+        """
+        try:
+            return self.U.value.shape[1]
+        except AttributeError:
+            return self.U.shape[1]
+
+    def n_hidden_mean_units(self):
+        """Return the number of hidden units for the mean in this RBM
+
+        For an RBM made from shared variables, this will return an integer,
+        for a purely symbolic RBM this will return a theano expression.
+
+        """
+        try:
+            return self.W.value.shape[1]
+        except AttributeError:
+            return self.W.shape[1]
+
+    def CD1_sampler(self, v, n_particles, n_visible=None, rng=8923984):
+        """Return a symbolic negative-phase particle obtained by simulating the Hamiltonian
+        associated with the energy function.
+        """
+        #TODO: why not expose all HMC arguments somehow?
+        if not hasattr(rng, 'randn'):
+            rng = np.random.RandomState(rng)
+        if n_visible is None:
+            n_visible = self.n_visible_units()
+
+        # create a dummy hmc object because we want to use *some* of it
+        hmc = HMC_sampler.new_from_shared_positions(
+                shared_positions=v, # v is not shared, so some functionality will not work
+                energy_fn=self.free_energy_given_v,
+                seed=int(rng.randint(2**30)),
+                shared_positions_shape=(n_particles,n_visible),
+                compile_simulate=False)
+        updates = dict(hmc.updates())
+        final_p = updates.pop(v)
+        return hmc, final_p, updates
+
     def sampler(self, n_particles, n_visible=None, rng=7823748):
         """Return an `HMC_sampler` that will draw samples from the distribution over visible
         units specified by this RBM.
@@ -379,6 +431,8 @@
         :param n_particles: this many parallel chains will be simulated.
         :param rng: seed or numpy RandomState object to initialize particles, and to drive
             the simulation.
         """
+        #TODO: why not expose all HMC arguments somehow?
+        #TODO: Consider returning a sample kwargs for passing to HMC_sampler?
         if not hasattr(rng, 'randn'):
             rng = np.random.RandomState(rng)
         if n_visible is None:
@@ -393,25 +447,26 @@
                 seed=int(rng.randint(2**30)))
         return rval
 
-    def as_feedforward_layer(self, v):
-        """Return a dictionary with keys: inputs, outputs and params
+    if 0:
+        def as_feedforward_layer(self, v):
+            """Return a dictionary with keys: inputs, outputs and params
 
-        The inputs is [v]
+            The inputs is [v]
 
-        The outputs is :math:`[E[h|v], E[g|v]]` where `h` is the covariance hidden units and `g` is
-        the mean hidden units.
+            The outputs is :math:`[E[h|v], E[g|v]]` where `h` is the covariance hidden units and `g` is
+            the mean hidden units.
 
-        The params are ``[U, W, b, c]``, the model parameters that enter into the conditional
-        expectations.
+            The params are ``[U, W, b, c]``, the model parameters that enter into the conditional
+            expectations.
 
-        :TODO: add an optional parameter to return only one of the expections.
+            :TODO: add an optional parameter to return only one of the expections.
 
- """ - return dict( - inputs = [v], - outputs = list(self.expected_h_g_given_v(v)), - params = [self.U, self.W, self.b, self.c], - ) + """ + return dict( + inputs = [v], + outputs = list(self.expected_h_g_given_v(v)), + params = [self.U, self.W, self.b, self.c], + ) def params(self): """Return the elements of [U,W,a,b,c] that are shared variables @@ -452,6 +507,118 @@ rval._params = [rval.U, rval.W, rval.a, rval.b, rval.c] return rval +def topological_connectivity(out_shape=(12,12), window_shape=(3,3), window_stride=(2,2), + **kwargs): + + in_shape = (window_stride[0] * out_shape[0], + window_stride[1] * out_shape[1]) + + rval = numpy.zeros(in_shape + out_shape, dtype=theano.config.floatX) + A,B,C,D = rval.shape + + # for each output position (out_r, out_c) + for out_r in range(out_shape[0]): + for out_c in range(out_shape[1]): + # for each window position (win_r, win_c) + for win_r in range(window_shape[0]): + for win_c in range(window_shape[1]): + # add 1 to the corresponding input location + in_r = out_r * window_stride[0] + win_r + in_c = out_c * window_stride[1] + win_c + rval[in_r%A, in_c%B, out_r%C, out_c%D] += 1 + + # This normalization algorithm is a guess, based on inspection of the matrix loaded from + # see CVPR2010paper_material/topo2D_3x3_stride2_576filt.mat + rval = rval.reshape((A*B, C*D)) + rval = (rval.T / rval.sum(axis=1)).T + + rval /= rval.sum(axis=0) + return rval + +class mcRBM_withP(mcRBM): + """Light-weight class that provides the math related to inference + + Attributes: + + - U - the covariance filters (theano shared variable) + - W - the mean filters (theano shared variable) + - a - the visible bias (theano shared variable) + - b - the covariance bias (theano shared variable) + - c - the mean bias (theano shared variable) + + """ + def __init__(self, U, W, a, b, c, P): + self.P = P + super(mcRBM_withP, self).__init__(U,W,a,b,c) + + def hidden_cov_units_preactivation_given_v(self, v, small=0.5): + """Return argument to the sigmoid that would give mean of covariance hid units + + See the math at the top of this file for what 'adjusted' means. + + return b - 0.5 * dot(adjusted(v), U)**2 + """ + unit_v = v / (TT.sqrt(TT.mean(v**2, axis=1)+small)).dimshuffle(0,'x') # adjust row norm + return self.b + 0.5 * dot(dot(unit_v, self.U)**2, self.P) + + def n_hidden_cov_units(self): + """Return the number of hidden units for the covariance in this RBM + + For an RBM made from shared variables, this will return an integer, + for a purely symbolic RBM this will return a theano expression. + + """ + try: + return self.P.value.shape[1] + except AttributeError: + return self.P.shape[1] + + @classmethod + def alloc(cls, n_I, n_K, n_J, *args, **kwargs): + """ + Return a MeanCovRBM instance with randomly-initialized shared variable parameters. + + :param n_I: input dimensionality + :param n_K: number of covariance hidden units + :param n_J: number of mean filters (linear) + :param rng: seed or numpy RandomState object to initialize parameters + + :note: + Constants for initial ranges and values taken from train_mcRBM.py. 
+ """ + return cls.alloc_with_P( + -numpy.eye((n_K, n_K)).astype(theano.config.floatX), + n_I, + n_J, + *args, **kwargs) + + @classmethod + def alloc_topo_P(cls, n_I, n_J, p_out_shape=(12,12), p_win_shape=(3,3), p_win_stride=(2,2), + **kwargs): + return cls.alloc_with_P( + -topological_connectivity(p_out_shape, p_win_shape, p_win_stride), + n_I=n_I, n_J=n_J, **kwargs) + + @classmethod + def alloc_with_P(cls, Pval, n_I, n_J, rng = 8923402190, + U_range=0.02, + W_range=0.05, + a_ival=0, + b_ival=2, + c_ival=-2): + n_F, n_K = Pval.shape + if not hasattr(rng, 'randn'): + rng = np.random.RandomState(rng) + rval = cls( + U = sharedX(U_range * rng.randn(n_I, n_F),'U'), + W = sharedX(W_range * rng.randn(n_I, n_J),'W'), + a = sharedX(np.ones(n_I)*a_ival,'a'), + b = sharedX(np.ones(n_K)*b_ival,'b'), + c = sharedX(np.ones(n_J)*c_ival,'c'), + P = sharedX(Pval, 'P'),) + rval._params = [rval.U, rval.W, rval.a, rval.b, rval.c, rval.P] + return rval + class mcRBMTrainer(object): """Light-weight class encapsulating math for mcRBM training @@ -470,17 +637,43 @@ """ # TODO: accept a GD algo as an argument? @classmethod - def alloc(cls, rbm, visible_batch, batchsize, initial_lr=0.075, rng=234, + def alloc_for_P(cls, rbm, visible_batch, batchsize, initial_lr_per_example=0.075, rng=234, l1_penalty=0, + l1_penalty_start=0, + learn_rate_multipliers=None, + lr_anneal_start=2000, + p_training_start=4000, + p_training_lr=0.02, + persistent_chains=True + ): + if learn_rate_multipliers is None: + p_lr = sharedX(0.0, 'P_lr_multiplier') + learn_rate_multipliers = [2, .2, .02, .1, .02, p_lr] + else: + p_lr = None + rval = cls.alloc(rbm, visible_batch, batchsize, initial_lr_per_example, rng, l1_penalty, + l1_penalty_start, learn_rate_multipliers, lr_anneal_start, persistent_chains) + + rval.p_lr = p_lr + rval.p_training_start=p_training_start + rval.p_training_lr=p_training_lr + return rval + + + @classmethod + def alloc(cls, rbm, visible_batch, batchsize, initial_lr_per_example=0.075, rng=234, + l1_penalty=0, + l1_penalty_start=0, learn_rate_multipliers=[2, .2, .02, .1, .02], lr_anneal_start=2000, + persistent_chains=True ): """ :param rbm: mcRBM instance to train :param visible_batch: TensorType variable for training data :param batchsize: the number of rows in visible_batch - :param initial_lr: the learning rate (may be annealed) + :param initial_lr_per_example: the learning rate (may be annealed) :param rng: seed or RandomState to initialze PCD sampler :param l1_penalty: see class doc :param learn_rate_multipliers: see class doc @@ -494,16 +687,30 @@ # TODO: should normVF be initialized to match the size of rbm.U ? 
+        if (l1_penalty_start > 0) and (l1_penalty != 0.0):
+            effective_l1_penalty = sharedX(0.0, 'l1_penalty')
+        else:
+            effective_l1_penalty = l1_penalty
+
+        if persistent_chains:
+            sampler = rbm.sampler(batchsize, rng=rng)
+        else:
+            sampler = None
+
         return cls(
                 rbm=rbm,
+                batchsize=batchsize,
                 visible_batch=visible_batch,
-                sampler=rbm.sampler(batchsize, rng=rng),
+                sampler=sampler,
                 normVF=sharedX(1.0, 'normVF'),
-                learn_rate=sharedX(initial_lr/batchsize, 'learn_rate'),
+                learn_rate=sharedX(initial_lr_per_example/batchsize, 'learn_rate'),
                 iter=sharedX(0, 'iter'),
+                effective_l1_penalty=effective_l1_penalty,
                 l1_penalty=l1_penalty,
+                l1_penalty_start=l1_penalty_start,
                 learn_rate_multipliers=learn_rate_multipliers,
-                lr_anneal_start=lr_anneal_start)
+                lr_anneal_start=lr_anneal_start,
+                persistent_chains=persistent_chains,)
 
     def __init__(self, **kwargs):
         self.__dict__.update(kwargs)
@@ -522,14 +729,16 @@
         new_normVF = .95 * self.normVF + .05 * TT.mean(U_norms)
         return (new_U * new_normVF / U_norms), new_normVF
 
-    def contrastive_grads(self):
+    def contrastive_grads(self, neg_v = None):
         """Return the contrastive divergence gradients on the parameters of self.rbm """
+        if neg_v is None:
+            neg_v = self.sampler.positions
         return contrastive_grad(
                 free_energy_fn=self.rbm.free_energy_given_v,
                 pos_v=self.visible_batch,
-                neg_v=self.sampler.positions,
+                neg_v=neg_v,
                 wrt = self.rbm.params(),
-                other_cost=(l1(self.rbm.U)+l1(self.rbm.W)) * self.l1_penalty)
+                other_cost=(l1(self.rbm.U)+l1(self.rbm.W)) * self.effective_l1_penalty)
 
     def cd_updates(self):
         """
@@ -537,7 +746,19 @@
         learning by stochastic gradient descent with an annealed learning rate.
         """
 
-        grads = self.contrastive_grads()
+        ups = {}
+
+        if self.persistent_chains:
+            grads = self.contrastive_grads()
+            ups.update(dict(self.sampler.updates()))
+        else:
+            cd1_sampler, final_p, cd1_updates = self.rbm.CD1_sampler(self.visible_batch,
+                    self.batchsize)
+            self._last_cd1_sampler = cd1_sampler # hacked in here for the unit test
+            #ignore the cd1_sampler
+            grads = self.contrastive_grads(neg_v = final_p)
+            ups.update(dict(cd1_updates))
+
 
         # contrastive divergence updates
         # TODO: sgd_updates is a particular optization algo (others are possible)
@@ -552,30 +773,29 @@
                 0.0, #min
                 self.learn_rate) #max
 
-        ups = dict(sgd_updates(
+        ups.update(dict(sgd_updates(
                     self.rbm.params(),
                     grads,
-                    stepsizes=[a*lr for a in self.learn_rate_multipliers]))
+                    stepsizes=[a*lr for a in self.learn_rate_multipliers])))
 
         ups[self.iter] = self.iter + 1
 
-        # sampler updates
-        ups.update(dict(self.sampler.updates()))
 
         # add trainer updates (replace CD update of U)
         ups[self.rbm.U], ups[self.normVF] = self.normalize_U(ups[self.rbm.U])
 
+        #l1_updates:
+        if (self.l1_penalty_start > 0) and (self.l1_penalty != 0.0):
+            ups[self.effective_l1_penalty] = TT.switch(
+                    self.iter >= self.l1_penalty_start,
+                    self.l1_penalty,
+                    0.0)
+
+        if getattr(self,'p_lr', None):
+            ups[self.p_lr] = TT.switch(self.iter > self.p_training_start,
+                    self.p_training_lr,
+                    0)
+            ups[self.rbm.P] = TT.clip(ups[self.rbm.P], -5, 0)
+
         return ups
 
-if __name__ == '__main__':
-    import pylearn.algorithms.tests.test_mcRBM
-    rbm,smplr = pylearn.algorithms.tests.test_mcRBM.test_reproduce_ranzato_hinton_2010(
-            as_unittest=False,
-            n_train_iters=10)
-    import cPickle
-    print ''
-    print 'Saving rbm...'
-    cPickle.dump(rbm, open('mcRBM.rbm.pkl', 'w'), -1)
-    print 'Saving sampler...'
-    cPickle.dump(smplr, open('mcRBM.smplr.pkl', 'w'), -1)
-
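
Usage note (not part of the commit): the new topological_connectivity() helper builds
the pooling matrix from a 2D grid of covariance filters, where each pooling unit sums
a small window of filters; the final normalization makes each column sum to one. A
minimal sketch of its behavior with the default arguments, assuming the module is
importable as pylearn.algorithms.mcRBM:

    import numpy
    from pylearn.algorithms.mcRBM import topological_connectivity

    # default geometry: a 24x24 grid of filters pooled into a 12x12 map
    P = topological_connectivity(out_shape=(12, 12),
                                 window_shape=(3, 3),
                                 window_stride=(2, 2))
    print P.shape                              # (576, 144): F=24*24 filters, K=12*12 pools
    print P.min() >= 0                         # True; callers negate it, since energy needs P <= 0
    print numpy.allclose(P.sum(axis=0), 1.0)   # True: columns are normalized last
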
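And a sketch (also not from the commit; sizes and penalty settings are illustrative,
not taken from the paper) of wiring the new classes together. Note that
mcRBM_withP.alloc as committed passes a tuple to numpy.eye, which raises TypeError
(numpy.eye takes the matrix order as an integer), so this sketch builds the diagonal
P explicitly and calls alloc_with_P:

    import numpy
    import theano
    import theano.tensor as TT
    from pylearn.algorithms.mcRBM import mcRBM_withP, mcRBMTrainer

    n_I, n_K, n_J, batchsize = 96, 64, 64, 128

    # P <= 0; a negated identity reproduces the plain (ungrouped) mcRBM
    rbm = mcRBM_withP.alloc_with_P(
            -numpy.eye(n_K).astype(theano.config.floatX),
            n_I=n_I, n_J=n_J)

    v = TT.matrix('v')
    trainer = mcRBMTrainer.alloc_for_P(rbm, v, batchsize,
            l1_penalty=1e-3, l1_penalty_start=1000,
            persistent_chains=True)

    # one compiled PCD training step; P only starts moving after p_training_start
    step = theano.function([v], [], updates=trainer.cd_updates())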