diff pylearn/algorithms/mcRBM.py @ 1000:d4a14c6c36e0

mcRBM - post code-review #1 with Guillaume
author James Bergstra <bergstrj@iro.umontreal.ca>
date Tue, 24 Aug 2010 19:24:54 -0400
parents c6d08a760960
children 075c193afd1b
--- a/pylearn/algorithms/mcRBM.py	Tue Aug 24 17:01:09 2010 -0400
+++ b/pylearn/algorithms/mcRBM.py	Tue Aug 24 19:24:54 2010 -0400
@@ -190,8 +190,7 @@
 #    + 0.5 \sum_i v_i^2
 #    - \sum_i a_i v_i 
 
-import sys
-import logging
+import sys, os, logging
 import numpy as np
 import numpy
 
@@ -201,21 +200,54 @@
 floatX = theano.config.floatX
 
 import pylearn
+#TODO: clean up the HMC_sampler code
+#TODO: think of naming convention for acronyms + suffix?
 from pylearn.sampling.hmc import HMC_sampler
 from pylearn.io import image_tiling
 from pylearn.gd.sgd import sgd_updates
+import pylearn.dataset_ops.image_patches
 
-#TODO: This should be in the datasets folder
-import pylearn.datasets.config
-import pylearn.dataset_ops.image_patches
-from pylearn.dataset_ops.protocol import TensorFnDataset
-from pylearn.dataset_ops.memo import memo
-import pylearn
-import scipy.io
-import os
+###########################################
+#
+# Candidates for factoring
+#
+###########################################
+
+#TODO: Document, move to pylearn's math lib
+def l1(X):
+    return abs(X).sum()
+
+#TODO: Document, move to pylearn's math lib
+def l2(X):
+    return TT.sqrt((X**2).sum())
+
+#TODO: Document, move to pylearn's math lib
+def contrastive_cost(free_energy_fn, pos_v, neg_v):
+    return (free_energy_fn(pos_v) - free_energy_fn(neg_v)).sum()
 
+#TODO: Typical use of contrastive_cost is to later call tensor.grad, but in that case we want
+#      to block the gradient from going through neg_v
+def contrastive_grad(free_energy_fn, pos_v, neg_v, params, other_cost=0):
+    """
+    :param pos_v: positive-phase sample of visible units
+    :param neg_v: negative-phase sample of visible units
+    """
+    # block the gradient through neg_v
+    cost = contrastive_cost(free_energy_fn, pos_v, neg_v)
+    if other_cost:
+        cost = cost + other_cost
+    return theano.tensor.grad(cost,
+            wrt=params,
+            consider_constant=[neg_v])
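A minimal usage sketch of contrastive_grad (the quadratic free energy and the
parameter b below are hypothetical stand-ins, not the mcRBM energy; only
contrastive_grad itself comes from this module):

    import numpy
    import theano
    import theano.tensor as TT
    from pylearn.algorithms.mcRBM import contrastive_grad

    b = theano.shared(numpy.zeros(5), name='b')
    free_energy = lambda v: 0.5 * ((v - b) ** 2).sum(axis=1)

    pos_v = TT.matrix('pos_v')  # data (positive phase)
    neg_v = TT.matrix('neg_v')  # model samples (negative phase)

    # consider_constant=[neg_v] inside contrastive_grad keeps the gradient
    # from flowing back into whatever produced the negative samples.
    grads = contrastive_grad(free_energy, pos_v, neg_v, params=[b])
    learn_fn = theano.function([pos_v, neg_v], grads,
            updates=[(b, b - 0.01 * grads[0])])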
 
-#TODO: This should be in the nnet part of the library
+###########################################
+#
+# Expressions that are mcRBM-specific
+#
+###########################################
+
+#TODO: make a global function to initialize parameters
+
 def hidden_cov_units_preactivation_given_v(rbm, v, small=0.5):
     """Return argument to the sigmoid that would give mean of covariance hid units
 
@@ -248,24 +280,6 @@
     """
     return sum(free_energy_terms_given_v(rbm,v))
 
-def contrastive_gradient(rbm, pos_v, neg_v, U_l1_penalty=0, W_l1_penalty=0):
-    """Return a list of gradient expressions for the rbm parameters
-
-    :param pos_v: positive-phase sample of visible units
-    :param neg_v: negative-phase sample of visible units
-    :param U_l1_penalty: a scalar-valued multiplier on the L1 penalty on U
-    :param W_l1_penalty: a scalar-valued multiplier on the L1 penalty on W
-    """
-    U, W, a, b, c = rbm
-    pos_FE = free_energy_given_v(rbm, pos_v)
-    neg_FE = free_energy_given_v(rbm, neg_v)
-    c0 = (pos_FE - neg_FE).sum()
-    c1 = abs(U).sum()*U_l1_penalty 
-    c2 = abs(W).sum()*W_l1_penalty 
-    cost = c0 + c1 + c2
-    rval = theano.tensor.grad(cost, list(rbm))
-    return rval
-
 def expected_h_g_given_v(rbm, v):
     """Returns tuple (`h`, `g`) of theano expression conditional expectations in an mcRBM.
 
@@ -291,7 +305,6 @@
     except AttributeError:
         return W.shape[0]
 
-
 def sampler(rbm, n_particles, n_visible=None, rng=7823748):
     """Return an `HMC_sampler` that will draw samples from the distribution over visible
     units specified by this RBM.
@@ -313,6 +326,12 @@
         seed=int(rng.randint(2**30)))
     return rval
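A sketch of how the sampler is driven, mirroring the training loop that this
changeset moves out of __main__ (MeanCovRBM and sampler are defined in this
module; the dimensions are the MAR settings from the removed loop):

    from pylearn.algorithms.mcRBM import MeanCovRBM, sampler

    rbm = MeanCovRBM.new_from_dims(n_I=105, n_K=256, n_J=100)
    smplr = sampler(rbm, n_particles=128)
    for _ in xrange(10):
        smplr.simulate()           # one round of HMC moves
    neg_v = smplr.positions[0]     # shared variable holding the particles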
 
+#############################
+#
+# Convenient data container
+#
+#############################
+
 class MeanCovRBM(object):
     """Container for mcRBM parameters
 
@@ -380,140 +399,12 @@
             d[key] = shared(d[key], name=key)
         self.__init__(**d)
 
-if __name__ == '__main__':
-
-    dataset='MAR'
-    if dataset == 'MAR':
-        n_vis=105
-        n_patches=10240
-    else:
-        R,C= 16,16 # the size of image patches
-        n_vis=R*C
-        n_patches=100000
-
-    n_train_iters=5000
-
-    n_burnin_steps=10000
-
-    l1_penalty=1e-3
-    no_l1_epochs = 10
-    effective_l1_penalty=0.0
-
-    epoch_size=n_patches
-    batchsize = 128
-    lr = 0.075 / batchsize
-    s_lr = TT.scalar()
-    s_l1_penalty=TT.scalar()
-    n_K=256
-    n_J=100
 
-    rbm = MeanCovRBM.new_from_dims(n_I=n_vis, n_K=n_K, n_J=n_J) 
-
-    smplr = sampler(rbm, n_particles=batchsize)
-
-    def l2(X):
-        return numpy.sqrt((X**2).sum())
-    if dataset == 'MAR':
-        tile = pylearn.dataset_ops.image_patches.save_filters_of_ranzato_hinton_2010
-    else:
-        def tile(X, fname):
-            _img = image_tiling.tile_raster_images(X,
-                    img_shape=(R,C),
-                    min_dynamic_range=1e-2)
-            image_tiling.save_tiled_raster_images(_img, fname)
+#TODO: factor the normalization of U into a global function
 
-    batch_idx = TT.iscalar()
-
-    if dataset == 'MAR':
-        train_batch = pylearn.dataset_ops.image_patches.ranzato_hinton_2010_op(batch_idx * batchsize + np.arange(batchsize))
-    else:
-        train_batch = pylearn.dataset_ops.image_patches.image_patches(
-                s_idx = (batch_idx * batchsize + np.arange(batchsize)),
-                dims = (n_patches,R,C),
-                center=True,
-                unitvar=True,
-                dtype=floatX,
-                rasterized=True)
-
-    imgs_fn = function([batch_idx], outputs=train_batch)
 
-    grads = contrastive_gradient(rbm,
-            pos_v=train_batch, 
-            neg_v=smplr.positions[0],
-            U_l1_penalty=s_l1_penalty,
-            W_l1_penalty=s_l1_penalty)
-    sgd_ups = sgd_updates(
-                rbm.params,
-                grads,
-                stepsizes=[2*s_lr, .2*s_lr, .02*s_lr, .1*s_lr, .02*s_lr ])
-    learn_fn = function([batch_idx, s_lr, s_l1_penalty], 
-            outputs=[ 
-                grads[0].norm(2),
-                (sgd_ups[0][1] - sgd_ups[0][0]).norm(2),
-                (sgd_ups[1][1] - sgd_ups[1][0]).norm(2),
-                ],
-            updates = sgd_ups)
-
-    print "Learning..."
-    normVF=1
-    last_epoch = -1
-    for jj in xrange(n_train_iters):
-        epoch = jj*batchsize / epoch_size
-
-        print_jj = epoch != last_epoch
-        last_epoch = epoch
-
-        if epoch > 10:
-            break
-
-        if print_jj:
-            tile(imgs_fn(jj), "imgs_%06i.png"%jj)
-            tile(smplr.positions[0].value, "sample_%06i.png"%jj)
-            tile(rbm.U.value.T, "U_%06i.png"%jj)
-            tile(rbm.W.value.T, "W_%06i.png"%jj)
-
-            print 'saving samples', jj, 'epoch', jj/(epoch_size/batchsize)
-
-            print 'l2(U)', l2(rbm.U.value),
-            print 'l2(W)', l2(rbm.W.value)
+#TODO: put the learning loop as a global function or class, so that someone could load and *TRAIN* an mcRBM!!!
 
-            print 'U min max', rbm.U.value.min(), rbm.U.value.max(),
-            print 'W min max', rbm.W.value.min(), rbm.W.value.max(),
-            print 'a min max', rbm.a.value.min(), rbm.a.value.max(),
-            print 'b min max', rbm.b.value.min(), rbm.b.value.max(),
-            print 'c min max', rbm.c.value.min(), rbm.c.value.max()
-
-            print 'parts min', smplr.positions[0].value.min(), 
-            print 'max',smplr.positions[0].value.max(),
-            print 'HMC step', smplr.stepsize,
-            print 'arate', smplr.avg_acceptance_rate
-
-        smplr.simulate()
-
-        l2_of_Ugrad = learn_fn(jj, 
-                lr/max(1, jj/(20*epoch_size/batchsize)),
-                effective_l1_penalty)
-
-        if print_jj:
-            print 'l2(U_grad)', float(l2_of_Ugrad[0]),
-            print 'l2(U_inc)', float(l2_of_Ugrad[1]),
-            print 'l2(W_inc)', float(l2_of_Ugrad[2]),
-            #print 'FE+', float(l2_of_Ugrad[2]),
-            #print 'FE+[0]', float(l2_of_Ugrad[3]),
-            #print 'FE+[1]', float(l2_of_Ugrad[4]),
-            #print 'FE+[2]', float(l2_of_Ugrad[5]),
-            #print 'FE+[3]', float(l2_of_Ugrad[6])
-
-        if jj == no_l1_epochs * epoch_size/batchsize:
-            print "Activating L1 weight decay"
-            effective_l1_penalty = 1e-3
-
-        # weird normalization technique...
-        # it constrains all the columns of the matrix to have the same length,
-        # but the matrix itself is re-scaled to have an arbitrary absolute size.
-        U = rbm.U.value
-        U_norms = np.sqrt((U*U).sum(axis=0))
-        assert len(U_norms) == n_K
-        normVF = .95 * normVF + .05 * np.mean(U_norms)
-        rbm.U.value = rbm.U.value * normVF/U_norms
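One way the U normalization above might be factored into a global function,
per the TODO earlier in this changeset (normalize_U is a hypothetical name;
alpha=0.05 reproduces the .95/.05 running average from the removed loop):

    import numpy as np

    def normalize_U(rbm, normVF, alpha=0.05):
        # constrain all columns of U to a common length, while letting that
        # common length drift as a running average of the mean column norm
        U = rbm.U.value
        U_norms = np.sqrt((U * U).sum(axis=0))
        normVF = (1 - alpha) * normVF + alpha * np.mean(U_norms)
        rbm.U.value = U * (normVF / U_norms)
        return normVF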
-
+if __name__ == '__main__':
+    import pylearn.algorithms.tests.test_mcRBM
+    pylearn.algorithms.tests.test_mcRBM.test_reproduce_ranzato_hinton_2010(as_unittest=True)