# HG changeset patch
# User James Bergstra
# Date 1282593931 14400
# Node ID 2a53384d97429977f89a2f82543c29a867d48f2f
# Parent  ab4bc97ca060c3fbabfa6315a76467ff122e992a
mcRBM - hacks to driver

diff -r ab4bc97ca060 -r 2a53384d9742 pylearn/algorithms/mcRBM.py
--- a/pylearn/algorithms/mcRBM.py	Mon Aug 23 16:04:31 2010 -0400
+++ b/pylearn/algorithms/mcRBM.py	Mon Aug 23 16:05:31 2010 -0400
@@ -332,14 +332,28 @@
     print >> sys.stderr, "TODO: use P matrix (aka FH matrix)"
 
-    R,C= 8,8 # the size of image patches
+    dataset='MAR'
+    if dataset == 'MAR':
+        R,C= 21,5
+        n_patches=10240
+        demodata = scipy.io.loadmat('/u/bergstrj/cvs/articles/2010/spike_slab_RBM/src/marcaurelio/training_colorpatches_16x16_demo.mat')
+    else:
+        R,C= 16,16 # the size of image patches
+        n_patches=100000
+
+    n_train_iters=30000
+
+    n_burnin_steps=10000
+
 
     l1_penalty=1e-3
     no_l1_epochs = 10
+    effective_l1_penalty=0.0
 
     epoch_size=50000
     batchsize = 128
     lr = 0.075 / batchsize
     s_lr = TT.scalar()
+    s_l1_penalty=TT.scalar()
     n_K=256
     n_F=256
     n_J=100
@@ -350,56 +364,137 @@
             n_F=n_F,
             )
 
-    sampler = rbm.hmc_sampler(n_particles=100)
+    sampler = rbm.hmc_sampler(n_particles=batchsize)
 
-    from pylearn.dataset_ops import image_patches
+    def l2(X):
+        return (X**2).sum()
+    def tile(X, fname):
+        if dataset == 'MAR':
+            X = np.dot(X, demodata['invpcatransf'].T)
+            R=16
+            C=16
+            #X = X.reshape((X.shape[0], 3, 16, 16)).transpose([0,2,3,1]).copy()
+            X = (X[:,:256], X[:,256:512], X[:,512:], None)
+        _img = image_tiling.tile_raster_images(X,
+                img_shape=(R,C),
+                min_dynamic_range=1e-2)
+        image_tiling.save_tiled_raster_images(_img, fname)
+    #print "Burning in..."
+    #for burnin in xrange(n_burnin_steps):
+    #    sampler.simulate()
+
+    if 0:
+        print "Just SAMPLING..."
+        for jj in xrange(n_burnin_steps):
+            if 0 == jj % 100:
+                tile(sampler.positions[0].value, "sampler_%06i.png"%jj)
+                tile(numpy.random.randn(100, 105), "random_%06i.png"%jj)
+            print "burning in... ", jj
+            sys.stdout.flush()
+            sampler.simulate()
+
+        sys.exit()
 
     batch_idx = TT.iscalar()
 
-    train_batch = image_patches.image_patches(
-            s_idx = (batch_idx * batchsize + np.arange(batchsize)),
-            dims = (1000,R,C),
-            dtype=floatX,
-            rasterized=True)
-    grads = rbm.contrastive_gradient(pos_v=train_batch, neg_v=sampler.positions[0])
+    if 0:
+        from pylearn.dataset_ops import image_patches
+        train_batch = image_patches.image_patches(
+                s_idx = (batch_idx * batchsize + np.arange(batchsize)),
+                dims = (n_patches,R,C),
+                center=True,
+                unitvar=True,
+                dtype=floatX,
+                rasterized=True)
+    else:
+        op = TensorFnDataset(floatX,
+                bcast=(False,),
+                fn=load_mcRBM_demo_patches,
+                single_shape=(105,))
+        train_batch = op((batch_idx * batchsize + np.arange(batchsize))%n_patches)
 
-    learn_fn = function([batch_idx, s_lr],
+    imgs_fn = function([batch_idx], outputs=train_batch)
+
+    grads = rbm.contrastive_gradient(
+            pos_v=train_batch,
+            neg_v=sampler.positions[0],
+            U_l1_penalty=s_l1_penalty,
+            W_l1_penalty=s_l1_penalty)
+
+    learn_fn = function([batch_idx, s_lr, s_l1_penalty],
             outputs=[
                 grads[0].norm(2),
-                rbm.U.norm(2)
+                rbm.free_energy_given_v(train_batch).sum(),
+                rbm.free_energy_given_v(train_batch,extra=1)[1][0].sum(),
+                rbm.free_energy_given_v(train_batch,extra=1)[1][1].sum(),
+                rbm.free_energy_given_v(train_batch,extra=1)[1][2].sum(),
+                rbm.free_energy_given_v(train_batch,extra=1)[1][3].sum(),
                 ],
             updates = sgd_updates(
                 rbm.params,
                 grads,
                 lr=[2*s_lr, .2*s_lr, .02*s_lr, .1*s_lr, .02*s_lr ]))
+    theano.printing.pydotprint(learn_fn, 'learn_fn.png')
 
-    for jj in xrange(10000):
-        sampler.simulate()
-        l2_of_Ugrad = learn_fn(jj, lr/max(1, jj/(20*epoch_size/batchsize)))
+    print "Learning..."
+    normVF=1
+    for jj in xrange(n_train_iters):
+
+        print_jj = ((1 and jj < 100)
+                or (0 and jj < 100 and 0==jj%10)
+                or (jj < 1000 and 0==jj%100)
+                or (1 and jj < 10000 and 0==jj%1000))
+
 
-        if jj > no_l1_epochs * epoch_size/batchsize:
-            rbm.U.value -= l1_penalty * np.sign(rbm.U.value)
-            rbm.W.value -= l1_penalty * np.sign(rbm.W.value)
+        if print_jj:
+            tile(imgs_fn(jj), "imgs_%06i.png"%jj)
+            tile(sampler.positions[0].value, "sample_%06i.png"%jj)
+            tile(rbm.U.value.T, "U_%06i.png"%jj)
+            tile(rbm.W.value.T, "W_%06i.png"%jj)
 
-        if jj % 5 == 0:
-            rbm.U.value = numpy_project_onto_ball(rbm.U.value.T).T
+            print 'saving samples', jj, 'epoch', jj/(epoch_size/batchsize),
+            print 'l2(U)', l2(rbm.U.value),
+            print 'l2(W)', l2(rbm.W.value),
+            print 'U min max', rbm.U.value.min(), rbm.U.value.max(),
+            print 'W min max', rbm.W.value.min(), rbm.W.value.max(),
+            print 'a min max', rbm.a.value.min(), rbm.a.value.max(),
+            print 'b min max', rbm.b.value.min(), rbm.b.value.max(),
+            print 'c min max', rbm.c.value.min(), rbm.c.value.max(),
 
-        if ((jj < 10)
-            or (jj < 100 and 0==jj%10)
-            or (jj < 1000 and 0==jj%100)
-            or (jj < 10000 and 0==jj%1000)):
-            print 'saving samples', jj, 'epoch', jj/(epoch_size/batchsize), l2_of_Ugrad
-            print 'neg particles', sampler.positions[0].value.min(), sampler.positions[0].value.max()
-            image_tiling.save_tiled_raster_images(
-                image_tiling.tile_raster_images(sampler.positions[0].value, (R,C)),
-                "sample_%06i.png"%jj)
-            image_tiling.save_tiled_raster_images(
-                image_tiling.tile_raster_images(rbm.U.value.T, (R,C)),
-                "U_%06i.png"%jj)
-            image_tiling.save_tiled_raster_images(
-                image_tiling.tile_raster_images(rbm.W.value.T, (R,C)),
-                "W_%06i.png"%jj)
+            print 'parts min', sampler.positions[0].value.min(),
+            print 'max', sampler.positions[0].value.max(),
+            print 'HMC step', sampler.stepsize,
+            print 'arate', sampler.avg_acceptance_rate
+
+        sampler.simulate()
+
+        l2_of_Ugrad = learn_fn(jj,
+                lr/max(1, jj/(20*epoch_size/batchsize)),
+                effective_l1_penalty)
+        if print_jj:
+            print 'l2(gU)', float(l2_of_Ugrad[0]),
+            print 'FE+', float(l2_of_Ugrad[1]),
+            print 'FE+[0]', float(l2_of_Ugrad[2]),
+            print 'FE+[1]', float(l2_of_Ugrad[3]),
+            print 'FE+[2]', float(l2_of_Ugrad[4]),
+            print 'FE+[3]', float(l2_of_Ugrad[5]),
+
+        if jj == no_l1_epochs * epoch_size/batchsize:
+            print "Activating L1 weight decay"
+            effective_l1_penalty = 1e-3
+
+        if 0:
+            rbm.U.value = numpy_project_onto_ball(rbm.U.value.T).T
+        else:
+            # weird normalization technique...
+            # It constrains all the columns of the matrix to have the same length
+            # But the matrix itself is re-scaled to have an arbitrary absolute size.
+            U = rbm.U.value
+            U_norms = np.sqrt((U*U).sum(axis=0))
+            assert len(U_norms) == n_F
+            normVF = .95 * normVF + .05 * np.mean(U_norms)
+            rbm.U.value = rbm.U.value * normVF/U_norms
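
Note on the sgd_updates call above: the helper itself is not shown in this patch, and the list passed as lr gives each of the five mcRBM parameters (U, W, a, b, c) its own learning-rate scale. A minimal sketch of what such a helper computes, assuming it simply pairs parameters, gradients, and per-parameter rates into Theano update expressions (this function body is illustrative, not pylearn's actual implementation):

    # hypothetical stand-in for the sgd_updates helper used above
    def sgd_updates(params, grads, lr):
        # accept either one shared rate or one rate per parameter
        if not isinstance(lr, (list, tuple)):
            lr = [lr] * len(params)
        # one gradient-descent update pair (shared_var, new_value) per parameter,
        # in the format theano.function's `updates` argument accepts
        return [(p, p - l * g) for p, l, g in zip(params, lr, grads)]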
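
Note on the column-norm trick at the end of the training loop: it rescales every column of U to a common length, while letting that common length drift slowly toward the average column norm via the running average normVF. Isolated as a small numpy routine (a sketch using the patch's .95/.05 smoothing; the name smooth_column_norms and its arguments are illustrative, not part of pylearn):

    import numpy as np

    # hypothetical helper isolating the normVF update from the driver loop
    def smooth_column_norms(U, normVF, alpha=0.05):
        U_norms = np.sqrt((U * U).sum(axis=0))            # per-column L2 norms
        normVF = (1 - alpha) * normVF + alpha * U_norms.mean()
        return U * normVF / U_norms, normVF               # every column now has norm normVF

    # usage: carry normVF across iterations, as the driver does
    U = np.random.randn(105, 256)                         # stand-in for rbm.U.value
    normVF = 1.0
    U, normVF = smooth_column_norms(U, normVF)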