pylearn changeset 992:30b7c4defb6c
mcRBM - it works and committing it is taking forever... lets try this approach
author    James Bergstra <bergstrj@iro.umontreal.ca>
date      Tue, 24 Aug 2010 14:52:09 -0400
parents   d68828c98c38
children  88107ec01ce8
files     pylearn/algorithms/mcRBM.py
diffstat  1 files changed, 68 insertions(+), 58 deletions(-)
--- a/pylearn/algorithms/mcRBM.py	Tue Aug 24 14:12:53 2010 -0400
+++ b/pylearn/algorithms/mcRBM.py	Tue Aug 24 14:52:09 2010 -0400
@@ -256,24 +256,26 @@
     unit_v = v / (TT.sqrt(TT.mean(v**2, axis=1)+small)).dimshuffle(0,'x') # adjust row norm
     return b - 0.5 * dot(unit_v, U)**2
 
-def free_energy_given_v(rbm, v):
-    """Returns theano expression for free energy of visible vector `v` in an mcRBM
-
-    An mcRBM is parametrized
-    by `U`, `W`, `b`, `c`.
-    See module-level documentation for explanations of the `U`, `W`, `b` and `c` parameters.
+def free_energy_terms_given_v(rbm, v):
+    """Returns theano expression for the terms that are added to form the free energy of
+    visible vector `v` in an mcRBM.
 
-
-    The free energy of v is what we need for learning and hybrid Monte-carlo negative-phase
-    sampling.
-
+    1. Free energy related to covariance hiddens
+    2. Free energy related to mean hiddens
+    3. Free energy related to L2-Norm of `v`
+    4. Free energy related to projection of `v` onto biases `a`
     """
     U, W, a, b, c = rbm
     t0 = -TT.sum(TT.nnet.softplus(hidden_cov_units_preactivation_given_v(rbm, v)),axis=1)
     t1 = -TT.sum(TT.nnet.softplus(c + dot(v,W)), axis=1)
     t2 = 0.5 * TT.sum(v**2, axis=1)
     t3 = -TT.dot(v, a)
-    return t0 + t1 + t2 + t3, (t0, t1, t2, t3)
+    return [t0, t1, t2, t3]
+
+def free_energy_given_v(rbm, v):
+    """Returns theano expression for free energy of visible vector `v` in an mcRBM
+    """
+    return sum(free_energy_terms_given_v(rbm,v))
 
 def contrastive_gradient(rbm, pos_v, neg_v, U_l1_penalty=0, W_l1_penalty=0):
     """Return a list of gradient expressions for the rbm parameters
@@ -363,20 +365,22 @@
                 np.random.RandomState(seed^20893).randn(
                     n_particles,
                     self.n_visible ))],
-            energy_fn = lambda p : self.free_energy_given_v(p[0]),
+            energy_fn = lambda p : free_energy_given_v(self.params, p[0]),
             seed=seed)
 
     def free_energy_given_v(self, v, extra=False):
+        assert 0
         rval = free_energy_given_v(self.params, v)
         if extra:
             return rval
         else:
             return rval[0]
 
-    def contrastive_gradient(self, *args, **kwargs)
+    def contrastive_gradient(self, *args, **kwargs):
        """Return a list of gradient expressions for self.params
 
-        See `contrastive_gradient` for parameters.
+        :param pos_v: positive-phase sample of visible units
+        :param neg_v: negative-phase sample of visible units
        """
        return contrastive_gradient(self.params, *args, **kwargs)
 
@@ -394,7 +398,7 @@
 
     R,C= 16,16 # the size of image patches
     n_patches=100000
 
-    n_train_iters=30000
+    n_train_iters=5000
 
     n_burnin_steps=10000
 
@@ -402,7 +406,7 @@
     no_l1_epochs = 10
     effective_l1_penalty=0.0
 
-    epoch_size=50000
+    epoch_size=n_patches
     batchsize = 128
     lr = 0.075 / batchsize
     s_lr = TT.scalar()
@@ -420,7 +424,7 @@
     sampler = rbm.hmc_sampler(n_particles=batchsize)
 
     def l2(X):
-        return (X**2).sum()
+        return numpy.sqrt((X**2).sum())
     def tile(X, fname):
         if dataset == 'MAR':
             X = np.dot(X, demodata['invpcatransf'].T)
@@ -450,7 +454,13 @@
 
     batch_idx = TT.iscalar()
 
-    if 0:
+    if dataset == 'MAR':
+        op = TensorFnDataset(floatX,
+                bcast=(False,),
+                fn=load_mcRBM_demo_patches,
+                single_shape=(105,))
+        train_batch = op((batch_idx * batchsize + np.arange(batchsize))%n_patches)
+    else:
         from pylearn.dataset_ops import image_patches
         train_batch = image_patches.image_patches(
                 s_idx = (batch_idx * batchsize + np.arange(batchsize)),
@@ -459,12 +469,6 @@
                 unitvar=True,
                 dtype=floatX,
                 rasterized=True)
-    else:
-        op = TensorFnDataset(floatX,
-                bcast=(False,),
-                fn=load_mcRBM_demo_patches,
-                single_shape=(105,))
-        train_batch = op((batch_idx * batchsize + np.arange(batchsize))%n_patches)
 
     imgs_fn = function([batch_idx], outputs=train_batch)
 
@@ -473,31 +477,36 @@
             neg_v=sampler.positions[0],
             U_l1_penalty=s_l1_penalty,
             W_l1_penalty=s_l1_penalty)
+    sgd_ups = sgd_updates(
+            rbm.params,
+            grads,
+            lr=[2*s_lr, .2*s_lr, .02*s_lr, .1*s_lr, .02*s_lr ])
     learn_fn = function([batch_idx, s_lr, s_l1_penalty],
             outputs=[
                 grads[0].norm(2),
-                rbm.free_energy_given_v(train_batch).sum(),
-                rbm.free_energy_given_v(train_batch,extra=1)[1][0].sum(),
-                rbm.free_energy_given_v(train_batch,extra=1)[1][1].sum(),
-                rbm.free_energy_given_v(train_batch,extra=1)[1][2].sum(),
-                rbm.free_energy_given_v(train_batch,extra=1)[1][3].sum(),
+                (sgd_ups[0][1] - sgd_ups[0][0]).norm(2),
+                (sgd_ups[1][1] - sgd_ups[1][0]).norm(2),
                 ],
-            updates = sgd_updates(
-                rbm.params,
-                grads,
-                lr=[2*s_lr, .2*s_lr, .02*s_lr, .1*s_lr, .02*s_lr ]))
-    theano.printing.pydotprint(learn_fn, 'learn_fn.png')
+            updates = sgd_ups)
+            #rbm.free_energy_given_v(train_batch).sum(),
+            #rbm.free_energy_given_v(train_batch,extra=1)[1][0].sum(),
+            #rbm.free_energy_given_v(train_batch,extra=1)[1][1].sum(),
+            #rbm.free_energy_given_v(train_batch,extra=1)[1][2].sum(),
+            #rbm.free_energy_given_v(train_batch,extra=1)[1][3].sum(),
+    theano.printing.pydotprint(function([batch_idx, s_l1_penalty], grads[0]), 'grads0.png')
 
     print "Learning..."
 
     normVF=1
+    last_epoch = -1
     for jj in xrange(n_train_iters):
+        epoch = jj*batchsize / epoch_size
 
-        print_jj = ((1 and jj < 100)
-                or (0 and jj < 100 and 0==jj%10)
-                or (jj < 1000 and 0==jj%100)
-                or (1 and jj < 10000 and 0==jj%1000))
+        print_jj = epoch != last_epoch
+        last_epoch = epoch
+        if epoch > 10:
+            break
 
         if print_jj:
             tile(imgs_fn(jj), "imgs_%06i.png"%jj)
@@ -505,14 +514,16 @@
             tile(rbm.U.value.T, "U_%06i.png"%jj)
             tile(rbm.W.value.T, "W_%06i.png"%jj)
 
-            print 'saving samples', jj, 'epoch', jj/(epoch_size/batchsize),
+            print 'saving samples', jj, 'epoch', jj/(epoch_size/batchsize)
+
             print 'l2(U)', l2(rbm.U.value),
-            print 'l2(W)', l2(rbm.W.value),
+            print 'l2(W)', l2(rbm.W.value)
+
             print 'U min max', rbm.U.value.min(), rbm.U.value.max(),
             print 'W min max', rbm.W.value.min(), rbm.W.value.max(),
             print 'a min max', rbm.a.value.min(), rbm.a.value.max(),
            print 'b min max', rbm.b.value.min(), rbm.b.value.max(),
-            print 'c min max', rbm.c.value.min(), rbm.c.value.max(),
+            print 'c min max', rbm.c.value.min(), rbm.c.value.max()
 
             print 'parts min', sampler.positions[0].value.min(),
             print 'max',sampler.positions[0].value.max(),
@@ -526,28 +537,27 @@
                 effective_l1_penalty)
 
         if print_jj:
-            print 'l2(gU)', float(l2_of_Ugrad[0]),
-            print 'FE+', float(l2_of_Ugrad[1]),
-            print 'FE+[0]', float(l2_of_Ugrad[2]),
-            print 'FE+[1]', float(l2_of_Ugrad[3]),
-            print 'FE+[2]', float(l2_of_Ugrad[4]),
-            print 'FE+[3]', float(l2_of_Ugrad[5]),
+            print 'l2(U_grad)', float(l2_of_Ugrad[0]),
+            print 'l2(U_inc)', float(l2_of_Ugrad[1]),
+            print 'l2(W_inc)', float(l2_of_Ugrad[2]),
+            #print 'FE+', float(l2_of_Ugrad[2]),
+            #print 'FE+[0]', float(l2_of_Ugrad[3]),
+            #print 'FE+[1]', float(l2_of_Ugrad[4]),
+            #print 'FE+[2]', float(l2_of_Ugrad[5]),
+            #print 'FE+[3]', float(l2_of_Ugrad[6])
 
         if jj == no_l1_epochs * epoch_size/batchsize:
             print "Activating L1 weight decay"
             effective_l1_penalty = 1e-3
 
-        if 0:
-            rbm.U.value = numpy_project_onto_ball(rbm.U.value.T).T
-        else:
-            # weird normalization technique...
-            # It constrains all the columns of the matrix to have the same length
-            # But the matrix itself is re-scaled to have an arbitrary abslute size.
-            U = rbm.U.value
-            U_norms = np.sqrt((U*U).sum(axis=0))
-            assert len(U_norms) == n_F
-            normVF = .95 * normVF + .05 * np.mean(U_norms)
-            rbm.U.value = rbm.U.value * normVF/U_norms
+        # weird normalization technique...
+        # It constrains all the columns of the matrix to have the same length
+        # But the matrix itself is re-scaled to have an arbitrary abslute size.
+        U = rbm.U.value
+        U_norms = np.sqrt((U*U).sum(axis=0))
+        assert len(U_norms) == n_F
+        normVF = .95 * normVF + .05 * np.mean(U_norms)
+        rbm.U.value = rbm.U.value * normVF/U_norms
 #
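
For reference, the refactor above splits the mcRBM free energy into four additive terms and recovers the scalar free energy by summing them. Below is a minimal numpy sketch of that decomposition, written from the expressions visible in the diff; it is not the Theano code from the changeset, and the `small` constant, toy shapes, and helper names are assumptions for illustration (only the parameter ordering `(U, W, a, b, c)` and the term formulas come from the diff).

    import numpy as np

    def softplus(x):
        # numerically stable log(1 + exp(x))
        return np.logaddexp(0.0, x)

    def free_energy_terms(params, v, small=0.5):
        # numpy restatement of t0..t3 from free_energy_terms_given_v;
        # `small` is assumed here, the real module defines its own value
        U, W, a, b, c = params
        unit_v = v / np.sqrt(np.mean(v ** 2, axis=1, keepdims=True) + small)  # adjust row norm
        cov_pre = b - 0.5 * np.dot(unit_v, U) ** 2       # covariance-unit pre-activations
        t0 = -softplus(cov_pre).sum(axis=1)              # 1. covariance hiddens
        t1 = -softplus(c + np.dot(v, W)).sum(axis=1)     # 2. mean hiddens
        t2 = 0.5 * (v ** 2).sum(axis=1)                  # 3. L2 norm of v
        t3 = -np.dot(v, a)                               # 4. projection onto biases a
        return [t0, t1, t2, t3]

    def free_energy(params, v):
        # the scalar free energy is just the sum of the four terms
        return sum(free_energy_terms(params, v))

    if __name__ == "__main__":
        rng = np.random.RandomState(0)
        n_vis, n_fac, n_hid, n_batch = 16, 8, 4, 5       # toy sizes, not the demo's
        params = (rng.randn(n_vis, n_fac), rng.randn(n_vis, n_hid),
                  rng.randn(n_vis), rng.randn(n_fac), rng.randn(n_hid))
        v = rng.randn(n_batch, n_vis)
        fe = free_energy(params, v)
        assert fe.shape == (n_batch,)   # one free-energy value per visible vector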
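The new `sgd_ups = sgd_updates(...)` value is built once, then used both as the `updates` argument of the compiled function and to report `(new - old).norm(2)` for the first two parameters, so it is evidently a list of `(param, new_value)` pairs with one learning rate per parameter. The following is a plausible sketch of such a helper under that reading; it is an assumption, not the pylearn implementation.

    def sgd_updates(params, grads, lr):
        # one (shared_variable, new_value) pair per parameter, the structure a
        # theano function's `updates` argument accepts; `lr` may be a scalar or
        # a per-parameter list, as in lr=[2*s_lr, .2*s_lr, ...] above
        if not isinstance(lr, (list, tuple)):
            lr = [lr] * len(params)
        return [(p, p - step * g) for p, g, step in zip(params, grads, lr)]

With that structure, `(sgd_ups[0][1] - sgd_ups[0][0]).norm(2)` is the L2 norm of the step about to be applied to `U`, and `sgd_ups[1]` the same for `W`; these replace the free-energy sums in the monitoring outputs.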
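The block that replaces the old `if 0: ... numpy_project_onto_ball` branch keeps every column of `U` at a common length while letting that shared length track a running average of the observed column norms. A standalone numpy sketch of the same update is below, with `alpha = 0.05` matching the `.95 / .05` smoothing in the diff; the function name is illustrative.

    import numpy as np

    def renormalize_U_columns(U, normVF, alpha=0.05):
        # force all columns of U to share one length; the shared length normVF
        # drifts toward the mean observed column norm rather than being fixed
        U_norms = np.sqrt((U * U).sum(axis=0))
        normVF = (1.0 - alpha) * normVF + alpha * U_norms.mean()
        return U * (normVF / U_norms), normVF

In the training loop this runs on every iteration, starting from `normVF = 1`.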