changeset 1267:075c193afd1b

refactoring mcRBM
author James Bergstra <bergstrj@iro.umontreal.ca>
date Fri, 03 Sep 2010 12:35:10 -0400
parents bc4d98995bad
children 78a09cdd449b
files pylearn/algorithms/mcRBM.py pylearn/algorithms/tests/test_mcRBM.py
diffstat 2 files changed, 192 insertions(+), 161 deletions(-)
--- a/pylearn/algorithms/mcRBM.py	Thu Sep 02 16:48:33 2010 -0400
+++ b/pylearn/algorithms/mcRBM.py	Fri Sep 03 12:35:10 2010 -0400
@@ -246,97 +246,8 @@
 #
 ###########################################
 
-#TODO: make global function to initialize parameter
-
-def hidden_cov_units_preactivation_given_v(rbm, v, small=0.5):
-    """Return argument to the sigmoid that would give mean of covariance hid units
-
-    See the math at the top of this file for what 'adjusted' means.
-
-    return b - 0.5 * dot(adjusted(v), U)**2
-    """
-    (U,W,a,b,c) = rbm
-    unit_v = v / (TT.sqrt(TT.mean(v**2, axis=1)+small)).dimshuffle(0,'x') # adjust row norm
-    return b - 0.5 * dot(unit_v, U)**2
-
-def free_energy_terms_given_v(rbm, v):
-    """Returns theano expression for the terms that are added to form the free energy of
-    visible vector `v` in an mcRBM.
-
-     1.  Free energy related to covariance hiddens
-     2.  Free energy related to mean hiddens
-     3.  Free energy related to L2-Norm of `v`
-     4.  Free energy related to projection of `v` onto biases `a`
-    """
-    U, W, a, b, c = rbm
-    t0 = -TT.sum(TT.nnet.softplus(hidden_cov_units_preactivation_given_v(rbm, v)),axis=1)
-    t1 = -TT.sum(TT.nnet.softplus(c + dot(v,W)), axis=1)
-    t2 =  0.5 * TT.sum(v**2, axis=1)
-    t3 = -TT.dot(v, a)
-    return [t0, t1, t2, t3]
-
-def free_energy_given_v(rbm, v):
-    """Returns theano expression for free energy of visible vector `v` in an mcRBM
-    """
-    return sum(free_energy_terms_given_v(rbm,v))
-
-def expected_h_g_given_v(rbm, v):
-    """Returns tuple (`h`, `g`) of theano expression conditional expectations in an mcRBM.
-
-    `h` is the conditional on the covariance units.
-    `g` is the conditional on the mean units.
-    
-    """
-    (U, W, a, b, c) = rbm
-    h = TT.nnet.sigmoid(hidden_cov_units_preactivation_given_v(rbm, v))
-    g = nnet.sigmoid(c + dot(v,W))
-    return (h, g)
-
-def n_visible_units(rbm):
-    """Return the number of visible units of this RBM
-
-    For an RBM made from shared variables, this will return an integer,
-    for a purely symbolic RBM this will return a theano expression.
-    
-    """
-    W = rbm[1]
-    try:
-        return W.value.shape[0]
-    except AttributeError:
-        return W.shape[0]
-
-def sampler(rbm, n_particles, n_visible=None, rng=7823748):
-    """Return an `HMC_sampler` that will draw samples from the distribution over visible
-    units specified by this RBM.
-
-    :param n_particles: this many parallel chains will be simulated.
-    :param rng: seed or numpy RandomState object to initialize particles, and to drive the simulation.
-    """
-    if not hasattr(rng, 'randn'):
-        rng = np.random.RandomState(rng)
-    if n_visible is None:
-        n_visible = n_visible_units(rbm)
-    rval = HMC_sampler(
-        positions = [shared(
-            rng.randn(
-                n_particles,
-                n_visible).astype(floatX),
-            name='particles')],
-        energy_fn = lambda p : free_energy_given_v(rbm, p[0]),
-        seed=int(rng.randint(2**30)))
-    return rval
-
-#############################
-#
-# Convenient data container
-#
-#############################
-
-class MeanCovRBM(object):
-    """Container for mcRBM parameters
-
-    It provides parameter lookup by name, as well as a heuristic for initializing the
-    parameters for effective learning.
+class mcRBM(object):
+    """Light-weight class that provides the math related to inference
 
     Attributes:
 
@@ -345,23 +256,90 @@
       - a - the visible bias (theano shared variable)
       - b - the covariance bias (theano shared variable)
       - c - the mean bias (theano shared variable)
-    
     """
+    def __init__(self, U, W, a, b, c):
+        self.U = U
+        self.W = W
+        self.a = a
+        self.b = b
+        self.c = c
+
+    def hidden_cov_units_preactivation_given_v(self, v, small=0.5):
+        """Return argument to the sigmoid that would give mean of covariance hid units
+
+        See the math at the top of this file for what 'adjusted' means.
+
+        return b - 0.5 * dot(adjusted(v), U)**2
+        """
+        unit_v = v / (TT.sqrt(TT.mean(v**2, axis=1)+small)).dimshuffle(0,'x') # adjust row norm
+        return self.b - 0.5 * dot(unit_v, self.U)**2
 
-    params = property(lambda s: [s.U, s.W, s.a, s.b, s.c])
+    def free_energy_terms_given_v(self, v):
+        """Returns theano expression for the terms that are added to form the free energy of
+        visible vector `v` in an mcRBM.
 
-    n_visible = property(lambda s: s.W.value.shape[0])
+         1.  Free energy related to covariance hiddens
+         2.  Free energy related to mean hiddens
+         3.  Free energy related to L2-Norm of `v`
+         4.  Free energy related to projection of `v` onto biases `a`
+        """
+        t0 = -TT.sum(TT.nnet.softplus(self.hidden_cov_units_preactivation_given_v(v)),axis=1)
+        t1 = -TT.sum(TT.nnet.softplus(self.c + dot(v,self.W)), axis=1)
+        t2 =  0.5 * TT.sum(v**2, axis=1)
+        t3 = -TT.dot(v, self.a)
+        return [t0, t1, t2, t3]
+
+    def free_energy_given_v(self, v):
+        """Returns theano expression for free energy of visible vector `v` in an mcRBM
+        """
+        return TT.add(*self.free_energy_terms_given_v(v))
+
+    def expected_h_g_given_v(self, v):
+        """Returns tuple (`h`, `g`) of theano expression conditional expectations in an mcRBM.
 
-    def __init__(self, U, W, a, b, c):
-        self.__dict__.update(locals())
-        del self.__dict__['self']
+        `h` is the conditional expectation of the covariance units.
+        `g` is the conditional expectation of the mean units.
+        """
+        h = TT.nnet.sigmoid(self.hidden_cov_units_preactivation_given_v(v))
+        g = TT.nnet.sigmoid(self.c + dot(v, self.W))
+        return (h, g)
+
+    def n_visible_units(self):
+        """Return the number of visible units of this RBM
+
+        For an RBM made from shared variables, this will return an integer;
+        for a purely symbolic RBM it will return a theano expression.
+        """
+        try:
+            return self.W.value.shape[0]
+        except AttributeError:
+            return self.W.shape[0]
 
-    def __getitem__(self, idx):
-        # support unpacking of this container as if it were a tuple
-        return self.params[idx]
+    def sampler(self, n_particles, n_visible=None, rng=7823748):
+        """Return an `HMC_sampler` that will draw samples from the distribution over visible
+        units specified by this RBM.
+
+        :param n_particles: this many parallel chains will be simulated.
+        :param rng: seed or numpy RandomState object to initialize particles, and to drive the simulation.
+        """
+        if not hasattr(rng, 'randn'):
+            rng = np.random.RandomState(rng)
+        if n_visible is None:
+            n_visible = self.n_visible_units()
+        rval = HMC_sampler(
+            # one shared matrix of particles (not a list), to match
+            # free_energy_given_v above and `positions.value` in the test
+            positions = shared(
+                rng.randn(
+                    n_particles,
+                    n_visible).astype(floatX),
+                name='particles'),
+            energy_fn=self.free_energy_given_v,
+            seed=int(rng.randint(2**30)))
+        return rval
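+
+    # A minimal usage sketch (not from this changeset; the sizes are made
+    # up, and the HMC_sampler interface is the one assumed above):
+    #
+    #   rbm = mcRBM.alloc(n_I=196, n_K=256, n_J=100)
+    #   smplr = rbm.sampler(n_particles=128)
+    #   for i in xrange(n_burnin_steps):
+    #       smplr.simulate()            # advance the persistent chains
+    #   v_samples = smplr.positions.value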
 
     @classmethod
-    def new_from_dims(cls, n_I, n_K, n_J, rng = 8923402190):
+    def alloc(cls, n_I, n_K, n_J, rng = 8923402190):
         """
-        Return a MeanCovRBM instance with randomly-initialized parameters.
+        Return an mcRBM instance with randomly-initialized parameters.
 
@@ -377,33 +355,98 @@
             return shared(X.astype(floatX), name=name)
 
         # initialization taken from train_mcRBM.py
-        return cls(
+        rval = cls(
                 U = shrd(0.02 * rng.randn(n_I, n_K),'U'),
                 W = shrd(0.05 * rng.randn(n_I, n_J),'W'),
                 a = shrd(np.ones(n_I)*(0),'a'),
                 b = shrd(np.ones(n_K)*2,'b'),
                 c = shrd(np.ones(n_J)*(-2),'c'))
 
-    def __getstate__(self):
-        # unpack shared containers, which may have references to Theano stuff and are not a
-        # long-term stable data type.
-        return dict(
-                U = self.U.value,
-                W = self.W.value,
-                b = self.b.value,
-                c = self.c.value)
+        rval.params = [rval.U, rval.W, rval.a, rval.b, rval.c]
+        return rval
+
+class mcRBMTrainer(object):
+    """
+
+    Attributes:
+      - rbm 
+      - sampler
+      - normVF
+      - learn_rate
+      - learn_rate_multipliers
 
-    def __setstate__(self, dct):
-        d = dict(dct)
-        for key in ['U', 'W', 'a', 'b', 'c']:
-            d[key] = shared(d[key], name=key)
-        self.__init__(**d)
+    """
+    def __init__(self, **kwargs):
+        self.__dict__.update(kwargs)
+
+    def normalize_U(self, new_U):
+        """Return a normalized copy of `new_U`, and the updated normVF.
+
+        Every column of U is rescaled to a common length: normVF, a
+        moving average of the mean column norm.
+        """
+        U_norms = TT.sqrt((new_U**2).sum(axis=0))
+        new_normVF = .95 * self.normVF + .05 * TT.mean(U_norms)
+        return new_U * (new_normVF / U_norms), new_normVF
+
+    def contrastive_grads(self, visible_batch, params=None):
+        """Return contrastive-divergence gradients on `params` (all the
+        rbm params by default), including an L1 penalty on U and W.
+        """
+        if params is None:
+            params = self.rbm.params
+        return contrastive_grad(
+                free_energy_fn=self.rbm.free_energy_given_v,
+                pos_v=visible_batch, 
+                neg_v=self.sampler.positions,
+                params=params,
+                other_cost=(l1(self.rbm.U)+l1(self.rbm.W)) * self.l1_penalty)
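+
+    # NB: the grads line up with `params` (U, W, a, b, c for rbm.params);
+    # the per-parameter learn_rate_multipliers in cd_updates below assume
+    # this ordering, just as the old hard-coded stepsizes list did.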
 
 
-#TODO: put the normalization of U as a global function
+    def cd_updates(self, visible_batch, params=None, rng=89234):
+        if params is None:
+            params = self.rbm.params
+
+        grads = self.contrastive_grads(visible_batch, params)
+
+        # contrastive divergence updates
+        # TODO: sgd_updates is a particular optimization algo (others are possible)
+        #       parametrize so that algo is plugin
+        #       the normalization normVF might be sgd-specific though...
+
+        # TODO: when sgd has an annealing schedule, this should
+        #       go through that mechanism.
+
+        # TODO: parametrize these constants (e.g. 2000)
+
+        lr = TT.clip(
+                self.learn_rate * 2000 / (self.iter+1), 
+                0.0, #min
+                self.learn_rate) #max
 
+        ups = dict(sgd_updates(
+                    params,
+                    grads,
+                    stepsizes=[a*lr for a in self.learn_rate_multipliers]))
+        ups[self.iter] = self.iter + 1
 
-#TODO: put the learning loop as a global function or class, so that someone could load and *TRAIN* an mcRBM!!!
+        # sampler updates
+        ups.update(dict(self.sampler.updates()))
+
+        # add trainer updates (replace CD update of U)
+        ups[self.rbm.U], ups[self.normVF] = self.normalize_U(ups[self.rbm.U])
+
+        return ups
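+
+    # The returned dict is meant for theano.function; a sketch mirroring
+    # the test below (batch_idx and train_batch as defined there):
+    #
+    #   learn_fn = function([batch_idx, trainer.l1_penalty],
+    #                       outputs=[],
+    #                       updates=trainer.cd_updates(train_batch))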
+
+    # TODO: accept a GD algo as an argument?
+    @classmethod
+    def alloc(cls, rbm, visible_batch, batchsize, initial_lr=0.075, rng=234,
+            l1_penalty=0,
+            learn_rate_multipliers=[2, .2, .02, .1, .02]):
+        # allocates shared var for negative phase particles
+
+        return cls(
+                rbm=rbm,
+                sampler=rbm.sampler(batchsize, rng=rng),
+                normVF=shared(1.0, 'normVF'),
+                learn_rate=shared(initial_lr/batchsize, 'learn_rate'),
+                iter=shared(0, 'iter'),
+                l1_penalty=l1_penalty,
+                learn_rate_multipliers=learn_rate_multipliers)
+
 
 if __name__ == '__main__':
     import pylearn.algorithms.tests.test_mcRBM
--- a/pylearn/algorithms/tests/test_mcRBM.py	Thu Sep 02 16:48:33 2010 -0400
+++ b/pylearn/algorithms/tests/test_mcRBM.py	Fri Sep 03 12:35:10 2010 -0400
@@ -1,8 +1,5 @@
-
-
 from pylearn.algorithms.mcRBM import *
 
-
 def test_reproduce_ranzato_hinton_2010(dataset='MAR', as_unittest=True):
     dataset='MAR'
     if dataset == 'MAR':
@@ -17,6 +14,7 @@
 
     n_burnin_steps=10000
 
+
     l1_penalty=1e-3
     no_l1_epochs = 10
     effective_l1_penalty=0.0
@@ -25,14 +23,9 @@
     batchsize = 128
     lr = 0.075 / batchsize
     s_lr = TT.scalar()
-    s_l1_penalty=TT.scalar()
     n_K=256
     n_J=100
 
-    rbm = MeanCovRBM.new_from_dims(n_I=n_vis, n_K=n_K, n_J=n_J) 
-
-    smplr = sampler(rbm, n_particles=batchsize)
-
     def l2(X):
         return numpy.sqrt((X**2).sum())
     if dataset == 'MAR':
@@ -60,26 +53,19 @@
     if not as_unittest:
         imgs_fn = function([batch_idx], outputs=train_batch)
 
-    grads = contrastive_grad(
-            free_energy_fn=lambda v: free_energy_given_v(rbm, v),
-            pos_v=train_batch, 
-            neg_v=smplr.positions[0],
-            params=list(rbm),
-            other_cost=(l1(rbm.U)+l1(rbm.W)) * s_l1_penalty)
-    sgd_ups = sgd_updates(
-                rbm.params,
-                grads,
-                stepsizes=[2*s_lr, .2*s_lr, .02*s_lr, .1*s_lr, .02*s_lr ])
-    learn_fn = function([batch_idx, s_lr, s_l1_penalty], 
-            outputs=[ 
-                grads[0].norm(2),
-                (sgd_ups[0][1] - sgd_ups[0][0]).norm(2),
-                (sgd_ups[1][1] - sgd_ups[1][0]).norm(2),
-                ],
-            updates = sgd_ups)
+    trainer = mcRBMTrainer.alloc(
+            mcRBM.alloc(n_I=n_vis, n_K=n_K, n_J=n_J),
+            train_batch,
+            batchsize, l1_penalty=TT.scalar())
+    rbm=trainer.rbm
+    smplr = trainer.sampler
+
+    grads = trainer.contrastive_grads(train_batch)
+    learn_fn = function([batch_idx, trainer.l1_penalty], 
+            outputs=[grads[0].norm(2), grads[0].norm(2), grads[1].norm(2)],
+            updates=trainer.cd_updates(train_batch))
 
     print "Learning..."
-    normVF=1
     last_epoch = -1
     for jj in xrange(n_train_iters):
         epoch = jj*batchsize / epoch_size
@@ -87,9 +73,6 @@
         print_jj = epoch != last_epoch
         last_epoch = epoch
 
-        if epoch > 10:
-            break
-
         if as_unittest and epoch == 5:
             U = rbm.U.value
             W = rbm.W.value
@@ -117,7 +100,7 @@
         if print_jj:
             if not as_unittest:
                 tile(imgs_fn(jj), "imgs_%06i.png"%jj)
-                tile(smplr.positions[0].value, "sample_%06i.png"%jj)
+                tile(smplr.positions.value, "sample_%06i.png"%jj)
                 tile(rbm.U.value.T, "U_%06i.png"%jj)
                 tile(rbm.W.value.T, "W_%06i.png"%jj)
 
@@ -132,18 +115,22 @@
             print 'b min max', rbm.b.value.min(), rbm.b.value.max(),
             print 'c min max', rbm.c.value.min(), rbm.c.value.max()
 
-            print 'parts min', smplr.positions[0].value.min(), 
-            print 'max',smplr.positions[0].value.max(),
+            print 'parts min', smplr.positions.value.min(), 
+            print 'max',smplr.positions.value.max(),
             print 'HMC step', smplr.stepsize,
             print 'arate', smplr.avg_acceptance_rate
 
-        # Continue HMC chain
-        smplr.simulate()
+
+        if 0:
+            # Continue HMC chain
+            smplr.simulate()
 
-        # Do CD update
-        l2_of_Ugrad = learn_fn(jj, 
-                lr/max(1, jj/(20*epoch_size/batchsize)),
-                effective_l1_penalty)
+            # Do CD update
+            l2_of_Ugrad = learn_fn(jj, 
+                    lr/max(1, jj/(20*epoch_size/batchsize)),
+                    effective_l1_penalty)
+
+        l2_of_Ugrad = learn_fn(jj, effective_l1_penalty)
 
         if print_jj:
             print 'l2(U_grad)', float(l2_of_Ugrad[0]),
@@ -161,9 +148,10 @@
 
         # weird normalization technique...
         # It constrains all the columns of the matrix to have the same length
-        # But the matrix itself is re-scaled to have an arbitrary abslute size.
-        U = rbm.U.value
-        U_norms = np.sqrt((U*U).sum(axis=0))
-        assert len(U_norms) == n_K
-        normVF = .95 * normVF + .05 * np.mean(U_norms)
-        rbm.U.value = rbm.U.value * normVF/U_norms
+        if 0:
+            U = rbm.U.value
+            U_norms = np.sqrt((U*U).sum(axis=0))
+            assert len(U_norms) == n_K
+            normVF = .95 * normVF + .05 * np.mean(U_norms)
+            rbm.U.value = rbm.U.value * normVF/U_norms
+