diff pylearn/algorithms/mcRBM.py @ 1284:1817485d586d

mcRBM - many changes incl. adding support for pooling matrix
author James Bergstra <bergstrj@iro.umontreal.ca>
date Wed, 15 Sep 2010 17:49:21 -0400
parents f0129e37a8ef
children 4fa2a32e8fde
--- a/pylearn/algorithms/mcRBM.py	Wed Sep 15 17:46:21 2010 -0400
+++ b/pylearn/algorithms/mcRBM.py	Wed Sep 15 17:49:21 2010 -0400
@@ -45,9 +45,9 @@
         - \sum_j c_j g_j
 
 For the energy function to correspond to a probability distribution, P must be non-positive.  P
-is initialized to be a diagonal, and in our experience it can be left as such because even in
-the paper it has a very low learning rate, and is only allowed to be updated after the filters
-in U are learned (in effect).
+is initialized to (the negative of) a diagonal or a topological pooling matrix, and in our
+experience it can be left as such: even in the paper it has a very low learning rate, and in
+effect it is only updated after the filters in U have been learned.
 
 Version in published train_mcRBM code
 -------------------------------------
@@ -90,6 +90,13 @@
         - \sum_j \sum_i W_{ij} g_j v_i
         - \sum_j c_j g_j
 
+    E (v, h, g) =
+        - 0.5 \sum_f \sum_k P_{fk} h_k (\sum_i U_{if} v_i / sqrt(\sum_i v_i^2/I + 0.5))^2 
+        - \sum_k b_k h_k
+        + 0.5 \sum_i v_i^2
+        - \sum_j \sum_i W_{ij} g_j v_i
+        - \sum_j c_j g_j
+
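(Editorial sketch, not part of the changeset: the pooled covariance term of the energy above,
written out in numpy; the function and variable names are illustrative only.)

    import numpy as np

    def cov_energy_term(v, h, U, P, small=0.5):
        # first line of E: -0.5 * sum_f sum_k P[f,k] * h[k] * (sum_i U[i,f] * adjusted(v)[i])**2
        unit_v = v / np.sqrt(np.mean(v ** 2) + small)   # adjusted(v): divide by sqrt(sum_i v_i^2 / I + 0.5)
        filt = np.dot(unit_v, U) ** 2                   # squared covariance-filter responses, shape (F,)
        return -0.5 * np.dot(np.dot(filt, P), h)        # pooled through P (F x K), weighted by h (K,)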
       
 
 Conventions in this file
@@ -101,12 +108,14 @@
 
 Global functions like `free_energy` work on an mcRBM as parametrized in a particular way.
 Suppose we have 
-I input dimensions, 
-F squared filters, 
-J mean variables, and 
-K covariance variables.
-The mcRBM is parametrized by 5 variables:
+ - I input dimensions, 
+ - F squared filters, 
+ - J mean variables, and
+ - K covariance variables.
 
+The mcRBM is parametrized by 6 variables:
+
+ - `P`, a matrix whose rows indicate covariance filter groups (F x K)
  - `U`, a matrix whose rows are visible covariance directions (I x F)
  - `W`, a matrix whose rows are visible mean directions (I x J)
  - `b`, a vector of hidden covariance biases (K)
@@ -202,8 +211,6 @@
 sharedX = lambda X, name : shared(numpy.asarray(X, dtype=floatX), name=name)
 
 import pylearn
-#TODO: clean up the HMC_sampler code
-#TODO: think of naming convention for acronyms + suffix?
 from pylearn.sampling.hmc import HMC_sampler
 from pylearn.io import image_tiling
 from pylearn.gd.sgd import sgd_updates
@@ -357,7 +364,7 @@
         
         """
         h = TT.nnet.sigmoid(self.hidden_cov_units_preactivation_given_v(v))
-        g = nnet.sigmoid(self.c + dot(v,self.W))
+        g = TT.nnet.sigmoid(self.c + dot(v,self.W))
         return (h, g)
 
     def n_visible_units(self):
@@ -372,6 +379,51 @@
         except AttributeError:
             return self.W.shape[0]
 
+    def n_hidden_cov_units(self):
+        """Return the number of hidden units for the covariance in this RBM
+
+        For an RBM made from shared variables, this will return an integer;
+        for a purely symbolic RBM it will return a theano expression.
+        
+        """
+        try:
+            return self.U.value.shape[1]
+        except AttributeError:
+            return self.U.shape[1]
+
+    def n_hidden_mean_units(self):
+        """Return the number of hidden units for the mean in this RBM
+
+        For an RBM made from shared variables, this will return an integer;
+        for a purely symbolic RBM it will return a theano expression.
+        
+        """
+        try:
+            return self.W.value.shape[1]
+        except AttributeError:
+            return self.W.shape[1]
+
+    def CD1_sampler(self, v, n_particles, n_visible=None, rng=8923984):
+        """Return a symbolic negative-phase particle obtained by simulating the Hamiltonian
+        associated with the energy function.
+        """
+        #TODO: why not expose all HMC arguments somehow?
+        if not hasattr(rng, 'randn'):
+            rng = np.random.RandomState(rng)
+        if n_visible is None:
+            n_visible = self.n_visible_units()
+
+        # create a dummy hmc object because we want to use *some* of it
+        hmc = HMC_sampler.new_from_shared_positions(
+                shared_positions=v, # v is not shared, so some functionality will not work
+                energy_fn=self.free_energy_given_v,
+                seed=int(rng.randint(2**30)),
+                shared_positions_shape=(n_particles,n_visible),
+                compile_simulate=False)
+        updates = dict(hmc.updates())
+        final_p = updates.pop(v)
+        return hmc, final_p, updates
+
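(Editorial usage sketch, not part of the changeset: `rbm` is assumed to be an already-allocated
mcRBM; the batch size is illustrative.)

    import theano.tensor as TT

    v_batch = TT.matrix('v_batch')     # minibatch of visible vectors, shape (n_particles, n_I)
    hmc, neg_v, hmc_updates = rbm.CD1_sampler(v_batch, n_particles=128)
    # neg_v is the particle produced by one HMC simulation started at v_batch.  It can be fed
    # to contrastive_grads(neg_v=neg_v), and hmc_updates must be merged into the training
    # function's update dictionary; cd_updates() below does this when persistent_chains=False.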
     def sampler(self, n_particles, n_visible=None, rng=7823748):
         """Return an `HMC_sampler` that will draw samples from the distribution over visible
         units specified by this RBM.
@@ -379,6 +431,8 @@
         :param n_particles: this many parallel chains will be simulated.
         :param rng: seed or numpy RandomState object to initialize particles, and to drive the simulation.
         """
+        #TODO: why not expose all HMC arguments somehow?
+        #TODO: Consider returning a sample kwargs for passing to HMC_sampler?
         if not hasattr(rng, 'randn'):
             rng = np.random.RandomState(rng)
         if n_visible is None:
@@ -393,25 +447,26 @@
             seed=int(rng.randint(2**30)))
         return rval
 
-    def as_feedforward_layer(self, v):
-        """Return a dictionary with keys: inputs, outputs and params
+    if 0:
+        def as_feedforward_layer(self, v):
+            """Return a dictionary with keys: inputs, outputs and params
 
-        The inputs is [v]
+            The inputs are [v]
 
-        The outputs is :math:`[E[h|v], E[g|v]]` where `h` is the covariance hidden units and `g` is
-        the mean hidden units.
+            The outputs are :math:`[E[h|v], E[g|v]]`, where `h` denotes the covariance hidden
+            units and `g` the mean hidden units.
 
-        The params are ``[U, W, b, c]``, the model parameters that enter into the conditional
-        expectations.
+            The params are ``[U, W, b, c]``, the model parameters that enter into the conditional
+            expectations.
 
-        :TODO: add an optional parameter to return only one of the expections.
+            :TODO: add an optional parameter to return only one of the expectations.
 
-        """
-        return dict(
-                inputs = [v],
-                outputs = list(self.expected_h_g_given_v(v)),
-                params = [self.U, self.W, self.b, self.c],
-                )
+            """
+            return dict(
+                    inputs = [v],
+                    outputs = list(self.expected_h_g_given_v(v)),
+                    params = [self.U, self.W, self.b, self.c],
+                    )
 
     def params(self):
         """Return the elements of [U,W,a,b,c] that are shared variables
@@ -452,6 +507,118 @@
         rval._params = [rval.U, rval.W, rval.a, rval.b, rval.c]
         return rval
 
+def topological_connectivity(out_shape=(12,12), window_shape=(3,3), window_stride=(2,2),
+        **kwargs):
+
+    in_shape = (window_stride[0] * out_shape[0],
+            window_stride[1] * out_shape[1])
+
+    rval = numpy.zeros(in_shape + out_shape, dtype=theano.config.floatX)
+    A,B,C,D = rval.shape
+
+    # for each output position (out_r, out_c)
+    for out_r in range(out_shape[0]):
+        for out_c in range(out_shape[1]):
+            # for each window position (win_r, win_c)
+            for win_r in range(window_shape[0]):
+                for win_c in range(window_shape[1]):
+                    # add 1 to the corresponding input location
+                    in_r = out_r * window_stride[0] + win_r
+                    in_c = out_c * window_stride[1] + win_c
+                    rval[in_r%A, in_c%B, out_r%C, out_c%D] += 1
+
+    # This normalization algorithm is a guess, based on inspection of the matrix loaded from
+    # CVPR2010paper_material/topo2D_3x3_stride2_576filt.mat
+    rval = rval.reshape((A*B, C*D))
+    rval = (rval.T / rval.sum(axis=1)).T
+
+    rval /= rval.sum(axis=0)
+    return rval
+
+class mcRBM_withP(mcRBM):
+    """Light-weight class that provides the math related to inference
+
+    Attributes:
+
+      - U - the covariance filters (theano shared variable)
+      - W - the mean filters (theano shared variable)
+      - a - the visible bias (theano shared variable)
+      - b - the covariance bias (theano shared variable)
+      - c - the mean bias (theano shared variable)
+
+    """
+    def __init__(self, U, W, a, b, c, P):
+        self.P = P
+        super(mcRBM_withP, self).__init__(U,W,a,b,c)
+
+    def hidden_cov_units_preactivation_given_v(self, v, small=0.5):
+        """Return argument to the sigmoid that would give mean of covariance hid units
+
+        See the math at the top of this file for what 'adjusted' means.
+
+        return b + 0.5 * dot(dot(adjusted(v), U)**2, P)
+        """
+        unit_v = v / (TT.sqrt(TT.mean(v**2, axis=1)+small)).dimshuffle(0,'x') # adjust row norm
+        return self.b + 0.5 * dot(dot(unit_v, self.U)**2, self.P)
+
+    def n_hidden_cov_units(self):
+        """Return the number of hidden units for the covariance in this RBM
+
+        For an RBM made from shared variables, this will return an integer;
+        for a purely symbolic RBM it will return a theano expression.
+        
+        """
+        try:
+            return self.P.value.shape[1]
+        except AttributeError:
+            return self.P.shape[1]
+
+    @classmethod
+    def alloc(cls, n_I, n_K, n_J, *args, **kwargs):
+        """
+        Return an mcRBM_withP instance with randomly-initialized shared variable parameters.
+
+        :param n_I: input dimensionality
+        :param n_K: number of covariance hidden units
+        :param n_J: number of mean filters (linear)
+        :param rng: seed or numpy RandomState object to initialize parameters
+        
+        :note:
+        Constants for initial ranges and values taken from train_mcRBM.py.
+        """
+        return cls.alloc_with_P(
+            -numpy.eye(n_K).astype(theano.config.floatX),
+            n_I,
+            n_J,
+            *args, **kwargs)
+
+    @classmethod
+    def alloc_topo_P(cls, n_I, n_J, p_out_shape=(12,12), p_win_shape=(3,3), p_win_stride=(2,2),
+            **kwargs):
+        return cls.alloc_with_P(
+                -topological_connectivity(p_out_shape, p_win_shape, p_win_stride),
+                n_I=n_I, n_J=n_J, **kwargs)
+
+    @classmethod
+    def alloc_with_P(cls, Pval, n_I, n_J, rng = 8923402190,
+            U_range=0.02,
+            W_range=0.05,
+            a_ival=0,
+            b_ival=2,
+            c_ival=-2):
+        n_F, n_K = Pval.shape
+        if not hasattr(rng, 'randn'):
+            rng = np.random.RandomState(rng)
+        rval =  cls(
+                U = sharedX(U_range * rng.randn(n_I, n_F),'U'),
+                W = sharedX(W_range * rng.randn(n_I, n_J),'W'),
+                a = sharedX(np.ones(n_I)*a_ival,'a'),
+                b = sharedX(np.ones(n_K)*b_ival,'b'),
+                c = sharedX(np.ones(n_J)*c_ival,'c'),
+                P = sharedX(Pval, 'P'),)
+        rval._params = [rval.U, rval.W, rval.a, rval.b, rval.c, rval.P]
+        return rval
+
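(Editorial sketch, not part of the changeset: a shape sanity-check for the pooling construction
above; the input and mean-unit sizes are chosen arbitrarily.)

    import numpy
    from pylearn.algorithms.mcRBM import mcRBM_withP, topological_connectivity

    P = topological_connectivity(out_shape=(12, 12), window_shape=(3, 3), window_stride=(2, 2))
    assert P.shape == (24 * 24, 12 * 12)         # (F, K) = (576, 144): 24x24 filters pooled to 12x12 units
    assert numpy.allclose(P.sum(axis=0), 1.0)    # the final normalization step makes each column sum to 1

    rbm = mcRBM_withP.alloc_topo_P(n_I=105, n_J=100)
    assert rbm.n_hidden_cov_units() == 144       # K is read off P, not given as an explicit n_K
    assert rbm.n_hidden_mean_units() == 100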
 class mcRBMTrainer(object):
     """Light-weight class encapsulating math for mcRBM training 
 
@@ -470,17 +637,43 @@
     """
     # TODO: accept a GD algo as an argument?
     @classmethod
-    def alloc(cls, rbm, visible_batch, batchsize, initial_lr=0.075, rng=234,
+    def alloc_for_P(cls, rbm, visible_batch, batchsize, initial_lr_per_example=0.075, rng=234,
             l1_penalty=0,
+            l1_penalty_start=0,
+            learn_rate_multipliers=None,
+            lr_anneal_start=2000,
+            p_training_start=4000,
+            p_training_lr=0.02,
+            persistent_chains=True
+            ):
+        if learn_rate_multipliers is None:
+            p_lr = sharedX(0.0, 'P_lr_multiplier')
+            learn_rate_multipliers = [2, .2, .02, .1, .02, p_lr]
+        else:
+            p_lr = None
+        rval = cls.alloc(rbm, visible_batch, batchsize, initial_lr_per_example, rng, l1_penalty,
+                l1_penalty_start, learn_rate_multipliers, lr_anneal_start, persistent_chains)
+
+        rval.p_lr = p_lr
+        rval.p_training_start=p_training_start
+        rval.p_training_lr=p_training_lr
+        return rval
+
+
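(Editorial usage sketch, not part of the changeset: wiring alloc_for_P to a pooled model; sizes
and penalty settings are illustrative.)

    import theano.tensor as TT
    from pylearn.algorithms.mcRBM import mcRBM_withP, mcRBMTrainer

    batchsize = 128
    v_batch = TT.matrix('v_batch')                     # (batchsize, n_I) training minibatch
    rbm = mcRBM_withP.alloc_topo_P(n_I=105, n_J=100)
    trainer = mcRBMTrainer.alloc_for_P(rbm, v_batch, batchsize,
            l1_penalty=1e-3,
            l1_penalty_start=1000,   # effective_l1_penalty switches on after 1000 iterations
            p_training_start=4000,   # P only receives a non-zero learning rate after 4000 iterations
            p_training_lr=0.02,
            persistent_chains=True)  # negative phase comes from the persistent HMC sampler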
+    @classmethod
+    def alloc(cls, rbm, visible_batch, batchsize, initial_lr_per_example=0.075, rng=234,
+            l1_penalty=0,
+            l1_penalty_start=0,
             learn_rate_multipliers=[2, .2, .02, .1, .02],
             lr_anneal_start=2000,
+            persistent_chains=True
             ):
 
         """
         :param rbm: mcRBM instance to train
         :param visible_batch: TensorType variable for training data
         :param batchsize: the number of rows in visible_batch
-        :param initial_lr: the learning rate (may be annealed)
+        :param initial_lr_per_example: the learning rate (may be annealed)
         :param rng: seed or RandomState to initialize PCD sampler
         :param l1_penalty: see class doc
         :param learn_rate_multipliers: see class doc
@@ -494,16 +687,30 @@
 
         # TODO: should normVF be initialized to match the size of rbm.U ?
 
+        if (l1_penalty_start > 0) and (l1_penalty != 0.0):
+            effective_l1_penalty = sharedX(0.0, 'l1_penalty')
+        else:
+            effective_l1_penalty = l1_penalty
+
+        if persistent_chains:
+            sampler = rbm.sampler(batchsize, rng=rng)
+        else:
+            sampler = None
+
         return cls(
                 rbm=rbm,
+                batchsize=batchsize,
                 visible_batch=visible_batch,
-                sampler=rbm.sampler(batchsize, rng=rng),
+                sampler=sampler,
                 normVF=sharedX(1.0, 'normVF'),
-                learn_rate=sharedX(initial_lr/batchsize, 'learn_rate'),
+                learn_rate=sharedX(initial_lr_per_example/batchsize, 'learn_rate'),
                 iter=sharedX(0, 'iter'),
+                effective_l1_penalty=effective_l1_penalty,
                 l1_penalty=l1_penalty,
+                l1_penalty_start=l1_penalty_start,
                 learn_rate_multipliers=learn_rate_multipliers,
-                lr_anneal_start=lr_anneal_start)
+                lr_anneal_start=lr_anneal_start,
+                persistent_chains=persistent_chains,)
 
     def __init__(self, **kwargs):
         self.__dict__.update(kwargs)
@@ -522,14 +729,16 @@
         new_normVF = .95 * self.normVF + .05 * TT.mean(U_norms)
         return (new_U * new_normVF / U_norms), new_normVF
 
-    def contrastive_grads(self):
+    def contrastive_grads(self, neg_v = None):
         """Return the contrastive divergence gradients on the parameters of self.rbm """
+        if neg_v is None:
+            neg_v = self.sampler.positions
         return contrastive_grad(
                 free_energy_fn=self.rbm.free_energy_given_v,
                 pos_v=self.visible_batch, 
-                neg_v=self.sampler.positions,
+                neg_v=neg_v,
                 wrt = self.rbm.params(),
-                other_cost=(l1(self.rbm.U)+l1(self.rbm.W)) * self.l1_penalty)
+                other_cost=(l1(self.rbm.U)+l1(self.rbm.W)) * self.effective_l1_penalty)
 
     def cd_updates(self):
         """
@@ -537,7 +746,19 @@
         learning by stochastic gradient descent with an annealed learning rate.
         """
 
-        grads = self.contrastive_grads()
+        ups = {}
+
+        if self.persistent_chains:
+            grads = self.contrastive_grads()
+            ups.update(dict(self.sampler.updates()))
+        else:
+            cd1_sampler, final_p, cd1_updates = self.rbm.CD1_sampler(self.visible_batch,
+                    self.batchsize)
+            self._last_cd1_sampler = cd1_sampler # hacked in here for the unit test
+            # the sampler object itself is not needed below; only final_p and cd1_updates are used
+            grads = self.contrastive_grads(neg_v = final_p)
+            ups.update(dict(cd1_updates))
+
 
         # contrastive divergence updates
         # TODO: sgd_updates is a particular optization algo (others are possible)
@@ -552,30 +773,29 @@
                 0.0, #min
                 self.learn_rate) #max
 
-        ups = dict(sgd_updates(
+        ups.update(dict(sgd_updates(
                     self.rbm.params(),
                     grads,
-                    stepsizes=[a*lr for a in self.learn_rate_multipliers]))
+                    stepsizes=[a*lr for a in self.learn_rate_multipliers])))
 
         ups[self.iter] = self.iter + 1
 
-        # sampler updates
-        ups.update(dict(self.sampler.updates()))
 
         # add trainer updates (replace CD update of U)
         ups[self.rbm.U], ups[self.normVF] = self.normalize_U(ups[self.rbm.U])
 
+        #l1_updates:
+        if (self.l1_penalty_start > 0) and (self.l1_penalty != 0.0):
+            ups[self.effective_l1_penalty] = TT.switch(
+                    self.iter >= self.l1_penalty_start,
+                    self.l1_penalty,
+                    0.0)
+
+        if getattr(self,'p_lr', None):
+            ups[self.p_lr] = TT.switch(self.iter > self.p_training_start,
+                    self.p_training_lr,
+                    0)
+            ups[self.rbm.P] = TT.clip(ups[self.rbm.P], -5, 0)
+
         return ups
 
-if __name__ == '__main__':
-    import pylearn.algorithms.tests.test_mcRBM
-    rbm,smplr = pylearn.algorithms.tests.test_mcRBM.test_reproduce_ranzato_hinton_2010(
-            as_unittest=False,
-            n_train_iters=10)
-    import cPickle
-    print ''
-    print 'Saving rbm...'
-    cPickle.dump(rbm, open('mcRBM.rbm.pkl', 'w'), -1)
-    print 'Saving sampler...'
-    cPickle.dump(smplr, open('mcRBM.smplr.pkl', 'w'), -1)
-
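(Editorial sketch, not part of the changeset, in the spirit of the removed __main__ demo and
continuing the alloc_for_P example above; `train_batches` is a hypothetical iterator over
float arrays of shape (batchsize, n_I).)

    import theano

    learn_fn = theano.function([v_batch], [], updates=trainer.cd_updates())
    for X in train_batches():
        learn_fn(X)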