# HG changeset patch
# User James Bergstra
# Date 1282675929 14400
# Node ID 30b7c4defb6ce8a047b798f2644d76790d10345b
# Parent d68828c98c3863020e22d0061dba0aa372971ee0
mcRBM - it works and committing it is taking forever... lets try this approach

diff -r d68828c98c38 -r 30b7c4defb6c pylearn/algorithms/mcRBM.py
--- a/pylearn/algorithms/mcRBM.py	Tue Aug 24 14:12:53 2010 -0400
+++ b/pylearn/algorithms/mcRBM.py	Tue Aug 24 14:52:09 2010 -0400
@@ -256,24 +256,26 @@
     unit_v = v / (TT.sqrt(TT.mean(v**2, axis=1)+small)).dimshuffle(0,'x') # adjust row norm
     return b - 0.5 * dot(unit_v, U)**2
 
-def free_energy_given_v(rbm, v):
-    """Returns theano expression for free energy of visible vector `v` in an mcRBM
-
-    An mcRBM is parametrized
-    by `U`, `W`, `b`, `c`.
-    See module - level documentation for explanations of the `U`, `W`, `b` and `c` parameters.
+def free_energy_terms_given_v(rbm, v):
+    """Returns theano expression for the terms that are added to form the free energy of
+    visible vector `v` in an mcRBM.
 
-
-    The free energy of v is what we need for learning and hybrid Monte-carlo negative-phase
-    sampling.
-
+    1. Free energy related to covariance hiddens
+    2. Free energy related to mean hiddens
+    3. Free energy related to L2-Norm of `v`
+    4. Free energy related to projection of `v` onto biases `a`
     """
     U, W, a, b, c = rbm
     t0 = -TT.sum(TT.nnet.softplus(hidden_cov_units_preactivation_given_v(rbm, v)),axis=1)
     t1 = -TT.sum(TT.nnet.softplus(c + dot(v,W)), axis=1)
     t2 = 0.5 * TT.sum(v**2, axis=1)
     t3 = -TT.dot(v, a)
-    return t0 + t1 + t2 + t3, (t0, t1, t2, t3)
+    return [t0, t1, t2, t3]
+
+def free_energy_given_v(rbm, v):
+    """Returns theano expression for free energy of visible vector `v` in an mcRBM
+    """
+    return sum(free_energy_terms_given_v(rbm,v))
 
 def contrastive_gradient(rbm, pos_v, neg_v, U_l1_penalty=0, W_l1_penalty=0):
     """Return a list of gradient expressions for the rbm parameters
@@ -363,20 +365,22 @@
                 np.random.RandomState(seed^20893).randn(
                     n_particles,
                     self.n_visible ))],
-            energy_fn = lambda p : self.free_energy_given_v(p[0]),
+            energy_fn = lambda p : free_energy_given_v(self.params, p[0]),
             seed=seed)
 
     def free_energy_given_v(self, v, extra=False):
+        assert 0
         rval = free_energy_given_v(self.params, v)
         if extra:
             return rval
         else:
             return rval[0]
 
-    def contrastive_gradient(self, *args, **kwargs)
+    def contrastive_gradient(self, *args, **kwargs):
         """Return a list of gradient expressions for self.params
 
-        See `contrastive_gradient` for parameters.
+        :param pos_v: positive-phase sample of visible units
+        :param neg_v: negative-phase sample of visible units
         """
         return contrastive_gradient(self.params, *args, **kwargs)
 
@@ -394,7 +398,7 @@
 
     R,C= 16,16 # the size of image patches
     n_patches=100000
-    n_train_iters=30000
+    n_train_iters=5000
 
     n_burnin_steps=10000
 
@@ -402,7 +406,7 @@
 
     no_l1_epochs = 10
     effective_l1_penalty=0.0
-    epoch_size=50000
+    epoch_size=n_patches
     batchsize = 128
     lr = 0.075 / batchsize
     s_lr = TT.scalar()
@@ -420,7 +424,7 @@
     sampler = rbm.hmc_sampler(n_particles=batchsize)
 
     def l2(X):
-        return (X**2).sum()
+        return numpy.sqrt((X**2).sum())
     def tile(X, fname):
         if dataset == 'MAR':
             X = np.dot(X, demodata['invpcatransf'].T)
@@ -450,7 +454,13 @@
 
     batch_idx = TT.iscalar()
 
-    if 0:
+    if dataset == 'MAR':
+        op = TensorFnDataset(floatX,
+                bcast=(False,),
+                fn=load_mcRBM_demo_patches,
+                single_shape=(105,))
+        train_batch = op((batch_idx * batchsize + np.arange(batchsize))%n_patches)
+    else:
         from pylearn.dataset_ops import image_patches
         train_batch = image_patches.image_patches(
                 s_idx = (batch_idx * batchsize + np.arange(batchsize)),
@@ -459,12 +469,6 @@
                 unitvar=True,
                 dtype=floatX,
                 rasterized=True)
-    else:
-        op = TensorFnDataset(floatX,
-                bcast=(False,),
-                fn=load_mcRBM_demo_patches,
-                single_shape=(105,))
-        train_batch = op((batch_idx * batchsize + np.arange(batchsize))%n_patches)
 
     imgs_fn = function([batch_idx], outputs=train_batch)
 
@@ -473,31 +477,36 @@
             neg_v=sampler.positions[0],
             U_l1_penalty=s_l1_penalty,
             W_l1_penalty=s_l1_penalty)
 
+    sgd_ups = sgd_updates(
+            rbm.params,
+            grads,
+            lr=[2*s_lr, .2*s_lr, .02*s_lr, .1*s_lr, .02*s_lr ])
     learn_fn = function([batch_idx, s_lr, s_l1_penalty],
             outputs=[
                 grads[0].norm(2),
-                rbm.free_energy_given_v(train_batch).sum(),
-                rbm.free_energy_given_v(train_batch,extra=1)[1][0].sum(),
-                rbm.free_energy_given_v(train_batch,extra=1)[1][1].sum(),
-                rbm.free_energy_given_v(train_batch,extra=1)[1][2].sum(),
-                rbm.free_energy_given_v(train_batch,extra=1)[1][3].sum(),
+                (sgd_ups[0][1] - sgd_ups[0][0]).norm(2),
+                (sgd_ups[1][1] - sgd_ups[1][0]).norm(2),
                 ],
-            updates = sgd_updates(
-                rbm.params,
-                grads,
-                lr=[2*s_lr, .2*s_lr, .02*s_lr, .1*s_lr, .02*s_lr ]))
-    theano.printing.pydotprint(learn_fn, 'learn_fn.png')
+            updates = sgd_ups)
+            #rbm.free_energy_given_v(train_batch).sum(),
+            #rbm.free_energy_given_v(train_batch,extra=1)[1][0].sum(),
+            #rbm.free_energy_given_v(train_batch,extra=1)[1][1].sum(),
+            #rbm.free_energy_given_v(train_batch,extra=1)[1][2].sum(),
+            #rbm.free_energy_given_v(train_batch,extra=1)[1][3].sum(),
+    theano.printing.pydotprint(function([batch_idx, s_l1_penalty], grads[0]), 'grads0.png')
 
     print "Learning..."
     normVF=1
+    last_epoch = -1
     for jj in xrange(n_train_iters):
+        epoch = jj*batchsize / epoch_size
 
-        print_jj = ((1 and jj < 100)
-                or (0 and jj < 100 and 0==jj%10)
-                or (jj < 1000 and 0==jj%100)
-                or (1 and jj < 10000 and 0==jj%1000))
+        print_jj = epoch != last_epoch
+        last_epoch = epoch
+        if epoch > 10:
+            break
 
         if print_jj:
             tile(imgs_fn(jj), "imgs_%06i.png"%jj)
@@ -505,14 +514,16 @@
             tile(rbm.U.value.T, "U_%06i.png"%jj)
             tile(rbm.W.value.T, "W_%06i.png"%jj)
 
-            print 'saving samples', jj, 'epoch', jj/(epoch_size/batchsize),
+            print 'saving samples', jj, 'epoch', jj/(epoch_size/batchsize)
+
             print 'l2(U)', l2(rbm.U.value),
-            print 'l2(W)', l2(rbm.W.value),
+            print 'l2(W)', l2(rbm.W.value)
+
             print 'U min max', rbm.U.value.min(), rbm.U.value.max(),
             print 'W min max', rbm.W.value.min(), rbm.W.value.max(),
             print 'a min max', rbm.a.value.min(), rbm.a.value.max(),
             print 'b min max', rbm.b.value.min(), rbm.b.value.max(),
-            print 'c min max', rbm.c.value.min(), rbm.c.value.max(),
+            print 'c min max', rbm.c.value.min(), rbm.c.value.max()
 
             print 'parts min', sampler.positions[0].value.min(),
             print 'max',sampler.positions[0].value.max(),
@@ -526,28 +537,27 @@
                 effective_l1_penalty)
 
         if print_jj:
-            print 'l2(gU)', float(l2_of_Ugrad[0]),
-            print 'FE+', float(l2_of_Ugrad[1]),
-            print 'FE+[0]', float(l2_of_Ugrad[2]),
-            print 'FE+[1]', float(l2_of_Ugrad[3]),
-            print 'FE+[2]', float(l2_of_Ugrad[4]),
-            print 'FE+[3]', float(l2_of_Ugrad[5]),
+            print 'l2(U_grad)', float(l2_of_Ugrad[0]),
+            print 'l2(U_inc)', float(l2_of_Ugrad[1]),
+            print 'l2(W_inc)', float(l2_of_Ugrad[2]),
+            #print 'FE+', float(l2_of_Ugrad[2]),
+            #print 'FE+[0]', float(l2_of_Ugrad[3]),
+            #print 'FE+[1]', float(l2_of_Ugrad[4]),
+            #print 'FE+[2]', float(l2_of_Ugrad[5]),
+            #print 'FE+[3]', float(l2_of_Ugrad[6])
 
         if jj == no_l1_epochs * epoch_size/batchsize:
             print "Activating L1 weight decay"
             effective_l1_penalty = 1e-3
 
-        if 0:
-            rbm.U.value = numpy_project_onto_ball(rbm.U.value.T).T
-        else:
-            # weird normalization technique...
-            # It constrains all the columns of the matrix to have the same length
-            # But the matrix itself is re-scaled to have an arbitrary abslute size.
-            U = rbm.U.value
-            U_norms = np.sqrt((U*U).sum(axis=0))
-            assert len(U_norms) == n_F
-            normVF = .95 * normVF + .05 * np.mean(U_norms)
-            rbm.U.value = rbm.U.value * normVF/U_norms
+        # weird normalization technique...
+        # It constrains all the columns of the matrix to have the same length
+        # But the matrix itself is re-scaled to have an arbitrary abslute size.
+        U = rbm.U.value
+        U_norms = np.sqrt((U*U).sum(axis=0))
+        assert len(U_norms) == n_F
+        normVF = .95 * normVF + .05 * np.mean(U_norms)
+        rbm.U.value = rbm.U.value * normVF/U_norms
 
     #
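
For reference, the free energy that the new free_energy_terms_given_v / free_energy_given_v pair computes can be written out in plain NumPy roughly as follows. This is a sketch, not part of the patch: the helper names ending in _np are illustrative, and the default value of `small` is an assumption standing in for the module-level constant that this hunk does not show. `U`, `W`, `a`, `b`, `c` are the mcRBM parameters and rows of `v` are visible vectors.

    import numpy as np

    def softplus(x):
        # numerically-naive softplus, fine for illustration
        return np.log1p(np.exp(x))

    def free_energy_terms_np(params, v, small=0.5):
        # `small` stands in for the module-level constant (value assumed here)
        U, W, a, b, c = params
        # row-normalize v, as in hidden_cov_units_preactivation_given_v
        unit_v = v / np.sqrt(np.mean(v ** 2, axis=1) + small)[:, None]
        t0 = -softplus(b - 0.5 * np.dot(unit_v, U) ** 2).sum(axis=1)  # covariance hiddens
        t1 = -softplus(c + np.dot(v, W)).sum(axis=1)                  # mean hiddens
        t2 = 0.5 * (v ** 2).sum(axis=1)                               # L2-Norm of v
        t3 = -np.dot(v, a)                                            # projection onto biases a
        return [t0, t1, t2, t3]

    def free_energy_np(params, v):
        # the free energy is just the sum of the four terms, as in the patch
        return sum(free_energy_terms_np(params, v))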
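
The block at the end of the training loop (the "weird normalization technique") forces every column of U to a common length, and lets that common length drift toward the average column norm through the running average normVF, which starts at 1 and is carried across iterations. A standalone NumPy sketch of the same idea, with a hypothetical function name, looks like this:

    import numpy as np

    def rescale_U_columns(U, normVF, rate=0.05):
        # per-column L2 norms of U
        U_norms = np.sqrt((U * U).sum(axis=0))
        # slow-moving target length: running average of the mean column norm
        normVF = (1 - rate) * normVF + rate * U_norms.mean()
        # force every column to the target length
        return U * normVF / U_norms, normVF

    # applied once per training iteration, mirroring the loop above:
    # U, normVF = rescale_U_columns(U, normVF)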