Mercurial > pylearn
diff pylearn/algorithms/mcRBM.py @ 1000:d4a14c6c36e0
mcRBM - post code-review #1 with Guillaume
author | James Bergstra <bergstrj@iro.umontreal.ca> |
---|---|
date | Tue, 24 Aug 2010 19:24:54 -0400 |
parents | c6d08a760960 |
children | 075c193afd1b |
line wrap: on
line diff
--- a/pylearn/algorithms/mcRBM.py Tue Aug 24 17:01:09 2010 -0400 +++ b/pylearn/algorithms/mcRBM.py Tue Aug 24 19:24:54 2010 -0400 @@ -190,8 +190,7 @@ # + 0.5 \sum_i v_i^2 # - \sum_i a_i v_i -import sys -import logging +import sys, os, logging import numpy as np import numpy @@ -201,21 +200,54 @@ floatX = theano.config.floatX import pylearn +#TODO: clean up the HMC_sampler code +#TODO: think of naming convention for acronyms + suffix? from pylearn.sampling.hmc import HMC_sampler from pylearn.io import image_tiling from pylearn.gd.sgd import sgd_updates +import pylearn.dataset_ops.image_patches -#TODO: This should be in the datasets folder -import pylearn.datasets.config -import pylearn.dataset_ops.image_patches -from pylearn.dataset_ops.protocol import TensorFnDataset -from pylearn.dataset_ops.memo import memo -import pylearn -import scipy.io -import os +########################################### +# +# Candidates for factoring +# +########################################### + +#TODO: Document, move to pylearn's math lib +def l1(X): + return abs(X).sum() + +#TODO: Document, move to pylearn's math lib +def l2(X): + return TT.sqrt((X**2).sum()) + +#TODO: Document, move to pylearn's math lib +def contrastive_cost(free_energy_fn, pos_v, neg_v): + return (free_energy_fn(pos_v) - free_energy_fn(neg_v)).sum() +#TODO: Typical use of contrastive_cost is to later use tensor.grad, but in that case we want to +# block gradient going through neg_v +def contrastive_grad(free_energy_fn, pos_v, neg_v, params, other_cost=0): + """ + :param pos_v: positive-phase sample of visible units + :param neg_v: negative-phase sample of visible units + """ + #block the grad through neg_v + cost=contrastive_cost(free_energy_fn, pos_v, neg_v) + if other_cost: + cost = cost + other_cost + return theano.tensor.grad(cost, + wrt=params, + consider_constant=[neg_v]) -#TODO: This should be in the nnet part of the library +########################################### +# +# Expressions that are mcRBM-specific +# +########################################### + +#TODO: make global function to initialize parameter + def hidden_cov_units_preactivation_given_v(rbm, v, small=0.5): """Return argument to the sigmoid that would give mean of covariance hid units @@ -248,24 +280,6 @@ """ return sum(free_energy_terms_given_v(rbm,v)) -def contrastive_gradient(rbm, pos_v, neg_v, U_l1_penalty=0, W_l1_penalty=0): - """Return a list of gradient expressions for the rbm parameters - - :param pos_v: positive-phase sample of visible units - :param neg_v: negative-phase sample of visible units - :param U_l1_penalty: a scalar-valued multiplier on the L1 penalty on U - :param W_l1_penalty: a scalar-valued multiplier on the L1 penalty on W - """ - U, W, a, b, c = rbm - pos_FE = free_energy_given_v(rbm, pos_v) - neg_FE = free_energy_given_v(rbm, neg_v) - c0 = (pos_FE - neg_FE).sum() - c1 = abs(U).sum()*U_l1_penalty - c2 = abs(W).sum()*W_l1_penalty - cost = c0 + c1 + c2 - rval = theano.tensor.grad(cost, list(rbm)) - return rval - def expected_h_g_given_v(rbm, v): """Returns tuple (`h`, `g`) of theano expression conditional expectations in an mcRBM. @@ -291,7 +305,6 @@ except AttributeError: return W.shape[0] - def sampler(rbm, n_particles, n_visible=None, rng=7823748): """Return an `HMC_sampler` that will draw samples from the distribution over visible units specified by this RBM. @@ -313,6 +326,12 @@ seed=int(rng.randint(2**30))) return rval +############################# +# +# Convenient data container +# +############################# + class MeanCovRBM(object): """Container for mcRBM parameters @@ -380,140 +399,12 @@ d[key] = shared(d[key], name=key) self.__init__(**d) -if __name__ == '__main__': - - dataset='MAR' - if dataset == 'MAR': - n_vis=105 - n_patches=10240 - else: - R,C= 16,16 # the size of image patches - n_vis=R*C - n_patches=100000 - - n_train_iters=5000 - - n_burnin_steps=10000 - - l1_penalty=1e-3 - no_l1_epochs = 10 - effective_l1_penalty=0.0 - - epoch_size=n_patches - batchsize = 128 - lr = 0.075 / batchsize - s_lr = TT.scalar() - s_l1_penalty=TT.scalar() - n_K=256 - n_J=100 - rbm = MeanCovRBM.new_from_dims(n_I=n_vis, n_K=n_K, n_J=n_J) - - smplr = sampler(rbm, n_particles=batchsize) - - def l2(X): - return numpy.sqrt((X**2).sum()) - if dataset == 'MAR': - tile = pylearn.dataset_ops.image_patches.save_filters_of_ranzato_hinton_2010 - else: - def tile(X, fname): - _img = image_tiling.tile_raster_images(X, - img_shape=(R,C), - min_dynamic_range=1e-2) - image_tiling.save_tiled_raster_images(_img, fname) +#TODO: put the normalization of U as a global function - batch_idx = TT.iscalar() - - if dataset == 'MAR': - train_batch = pylearn.dataset_ops.image_patches.ranzato_hinton_2010_op(batch_idx * batchsize + np.arange(batchsize)) - else: - train_batch = pylearn.dataset_ops.image_patches.image_patches( - s_idx = (batch_idx * batchsize + np.arange(batchsize)), - dims = (n_patches,R,C), - center=True, - unitvar=True, - dtype=floatX, - rasterized=True) - - imgs_fn = function([batch_idx], outputs=train_batch) - grads = contrastive_gradient(rbm, - pos_v=train_batch, - neg_v=smplr.positions[0], - U_l1_penalty=s_l1_penalty, - W_l1_penalty=s_l1_penalty) - sgd_ups = sgd_updates( - rbm.params, - grads, - stepsizes=[2*s_lr, .2*s_lr, .02*s_lr, .1*s_lr, .02*s_lr ]) - learn_fn = function([batch_idx, s_lr, s_l1_penalty], - outputs=[ - grads[0].norm(2), - (sgd_ups[0][1] - sgd_ups[0][0]).norm(2), - (sgd_ups[1][1] - sgd_ups[1][0]).norm(2), - ], - updates = sgd_ups) - - print "Learning..." - normVF=1 - last_epoch = -1 - for jj in xrange(n_train_iters): - epoch = jj*batchsize / epoch_size - - print_jj = epoch != last_epoch - last_epoch = epoch - - if epoch > 10: - break - - if print_jj: - tile(imgs_fn(jj), "imgs_%06i.png"%jj) - tile(smplr.positions[0].value, "sample_%06i.png"%jj) - tile(rbm.U.value.T, "U_%06i.png"%jj) - tile(rbm.W.value.T, "W_%06i.png"%jj) - - print 'saving samples', jj, 'epoch', jj/(epoch_size/batchsize) - - print 'l2(U)', l2(rbm.U.value), - print 'l2(W)', l2(rbm.W.value) +#TODO: put the learning loop as a global function or class, so that someone could load and *TRAIN* an mcRBM!!! - print 'U min max', rbm.U.value.min(), rbm.U.value.max(), - print 'W min max', rbm.W.value.min(), rbm.W.value.max(), - print 'a min max', rbm.a.value.min(), rbm.a.value.max(), - print 'b min max', rbm.b.value.min(), rbm.b.value.max(), - print 'c min max', rbm.c.value.min(), rbm.c.value.max() - - print 'parts min', smplr.positions[0].value.min(), - print 'max',smplr.positions[0].value.max(), - print 'HMC step', smplr.stepsize, - print 'arate', smplr.avg_acceptance_rate - - smplr.simulate() - - l2_of_Ugrad = learn_fn(jj, - lr/max(1, jj/(20*epoch_size/batchsize)), - effective_l1_penalty) - - if print_jj: - print 'l2(U_grad)', float(l2_of_Ugrad[0]), - print 'l2(U_inc)', float(l2_of_Ugrad[1]), - print 'l2(W_inc)', float(l2_of_Ugrad[2]), - #print 'FE+', float(l2_of_Ugrad[2]), - #print 'FE+[0]', float(l2_of_Ugrad[3]), - #print 'FE+[1]', float(l2_of_Ugrad[4]), - #print 'FE+[2]', float(l2_of_Ugrad[5]), - #print 'FE+[3]', float(l2_of_Ugrad[6]) - - if jj == no_l1_epochs * epoch_size/batchsize: - print "Activating L1 weight decay" - effective_l1_penalty = 1e-3 - - # weird normalization technique... - # It constrains all the columns of the matrix to have the same length - # But the matrix itself is re-scaled to have an arbitrary abslute size. - U = rbm.U.value - U_norms = np.sqrt((U*U).sum(axis=0)) - assert len(U_norms) == n_K - normVF = .95 * normVF + .05 * np.mean(U_norms) - rbm.U.value = rbm.U.value * normVF/U_norms - +if __name__ == '__main__': + import pylearn.algorithms.tests.test_mcRBM + pylearn.algorithms.tests.test_mcRBM.test_reproduce_ranzato_hinton_2010(as_unittest=True)