# HG changeset patch
# User James Bergstra
# Date 1282593931 14400
# Node ID 2a53384d97429977f89a2f82543c29a867d48f2f
# Parent  ab4bc97ca060c3fbabfa6315a76467ff122e992a
mcRBM - hacks to driver

diff -r ab4bc97ca060 -r 2a53384d9742 pylearn/algorithms/mcRBM.py
--- a/pylearn/algorithms/mcRBM.py	Mon Aug 23 16:04:31 2010 -0400
+++ b/pylearn/algorithms/mcRBM.py	Mon Aug 23 16:05:31 2010 -0400
@@ -332,14 +332,28 @@
     print >> sys.stderr, "TODO: use P matrix (aka FH matrix)"
 
-    R,C= 8,8 # the size of image patches
+    dataset='MAR'
+    if dataset == 'MAR':
+        R,C= 21,5
+        n_patches=10240
+        demodata = scipy.io.loadmat('/u/bergstrj/cvs/articles/2010/spike_slab_RBM/src/marcaurelio/training_colorpatches_16x16_demo.mat')
+    else:
+        R,C= 16,16 # the size of image patches
+        n_patches=100000
+
+    n_train_iters=30000
+
+    n_burnin_steps=10000
+
 
     l1_penalty=1e-3
     no_l1_epochs = 10
+    effective_l1_penalty=0.0
 
     epoch_size=50000
     batchsize = 128
     lr = 0.075 / batchsize
     s_lr = TT.scalar()
+    s_l1_penalty=TT.scalar()
     n_K=256
     n_F=256
     n_J=100
@@ -350,56 +364,137 @@
             n_F=n_F,
             )
 
-    sampler = rbm.hmc_sampler(n_particles=100)
+    sampler = rbm.hmc_sampler(n_particles=batchsize)
 
-    from pylearn.dataset_ops import image_patches
+    def l2(X):
+        return (X**2).sum()
+    def tile(X, fname):
+        if dataset == 'MAR':
+            X = np.dot(X, demodata['invpcatransf'].T)
+            R=16
+            C=16
+            #X = X.reshape((X.shape[0], 3, 16, 16)).transpose([0,2,3,1]).copy()
+            X = (X[:,:256], X[:,256:512], X[:,512:], None)
+        _img = image_tiling.tile_raster_images(X,
+                img_shape=(R,C),
+                min_dynamic_range=1e-2)
+        image_tiling.save_tiled_raster_images(_img, fname)
+    #print "Burning in..."
+    #for burnin in xrange(n_burnin_steps):
+    #    sampler.simulate()
+
+    if 0:
+        print "Just SAMPLING..."
+        for jj in xrange(n_burnin_steps):
+            if 0 == jj % 100:
+                tile(sampler.positions[0].value, "sampler_%06i.png"%jj)
+                tile(numpy.random.randn(100, 105), "random_%06i.png"%jj)
+            print "burning in... ", jj
+            sys.stdout.flush()
+            sampler.simulate()
+
+        sys.exit()
 
     batch_idx = TT.iscalar()
 
-    train_batch = image_patches.image_patches(
-            s_idx = (batch_idx * batchsize + np.arange(batchsize)),
-            dims = (1000,R,C),
-            dtype=floatX,
-            rasterized=True)
-    grads = rbm.contrastive_gradient(pos_v=train_batch, neg_v=sampler.positions[0])
+    if 0:
+        from pylearn.dataset_ops import image_patches
+        train_batch = image_patches.image_patches(
+                s_idx = (batch_idx * batchsize + np.arange(batchsize)),
+                dims = (n_patches,R,C),
+                center=True,
+                unitvar=True,
+                dtype=floatX,
+                rasterized=True)
+    else:
+        op = TensorFnDataset(floatX,
+                bcast=(False,),
+                fn=load_mcRBM_demo_patches,
+                single_shape=(105,))
+        train_batch = op((batch_idx * batchsize + np.arange(batchsize))%n_patches)
 
-    learn_fn = function([batch_idx, s_lr],
+    imgs_fn = function([batch_idx], outputs=train_batch)
+
+    grads = rbm.contrastive_gradient(
+            pos_v=train_batch,
+            neg_v=sampler.positions[0],
+            U_l1_penalty=s_l1_penalty,
+            W_l1_penalty=s_l1_penalty)
+
+    learn_fn = function([batch_idx, s_lr, s_l1_penalty],
             outputs=[
                 grads[0].norm(2),
-                rbm.U.norm(2)
+                rbm.free_energy_given_v(train_batch).sum(),
+                rbm.free_energy_given_v(train_batch,extra=1)[1][0].sum(),
+                rbm.free_energy_given_v(train_batch,extra=1)[1][1].sum(),
+                rbm.free_energy_given_v(train_batch,extra=1)[1][2].sum(),
+                rbm.free_energy_given_v(train_batch,extra=1)[1][3].sum(),
                 ],
             updates = sgd_updates(
                 rbm.params,
                 grads,
                 lr=[2*s_lr, .2*s_lr, .02*s_lr, .1*s_lr, .02*s_lr ]))
+    theano.printing.pydotprint(learn_fn, 'learn_fn.png')
 
-    for jj in xrange(10000):
-        sampler.simulate()
-        l2_of_Ugrad = learn_fn(jj, lr/max(1, jj/(20*epoch_size/batchsize)))
+    print "Learning..."
+    normVF=1
+    for jj in xrange(n_train_iters):
+
+        print_jj = ((1 and jj < 100)
+                or (0 and jj < 100 and 0==jj%10)
+                or (jj < 1000 and 0==jj%100)
+                or (1 and jj < 10000 and 0==jj%1000))
+
 
-        if jj > no_l1_epochs * epoch_size/batchsize:
-            rbm.U.value -= l1_penalty * np.sign(rbm.U.value)
-            rbm.W.value -= l1_penalty * np.sign(rbm.W.value)
+        if print_jj:
+            tile(imgs_fn(jj), "imgs_%06i.png"%jj)
+            tile(sampler.positions[0].value, "sample_%06i.png"%jj)
+            tile(rbm.U.value.T, "U_%06i.png"%jj)
+            tile(rbm.W.value.T, "W_%06i.png"%jj)
 
-        if jj % 5 == 0:
-            rbm.U.value = numpy_project_onto_ball(rbm.U.value.T).T
+            print 'saving samples', jj, 'epoch', jj/(epoch_size/batchsize),
+            print 'l2(U)', l2(rbm.U.value),
+            print 'l2(W)', l2(rbm.W.value),
+            print 'U min max', rbm.U.value.min(), rbm.U.value.max(),
+            print 'W min max', rbm.W.value.min(), rbm.W.value.max(),
+            print 'a min max', rbm.a.value.min(), rbm.a.value.max(),
+            print 'b min max', rbm.b.value.min(), rbm.b.value.max(),
+            print 'c min max', rbm.c.value.min(), rbm.c.value.max(),
 
-        if ((jj < 10)
-            or (jj < 100 and 0==jj%10)
-            or (jj < 1000 and 0==jj%100)
-            or (jj < 10000 and 0==jj%1000)):
-            print 'saving samples', jj, 'epoch', jj/(epoch_size/batchsize), l2_of_Ugrad
-            print 'neg particles', sampler.positions[0].value.min(), sampler.positions[0].value.max()
-            image_tiling.save_tiled_raster_images(
-                image_tiling.tile_raster_images(sampler.positions[0].value, (R,C)),
-                "sample_%06i.png"%jj)
-            image_tiling.save_tiled_raster_images(
-                image_tiling.tile_raster_images(rbm.U.value.T, (R,C)),
-                "U_%06i.png"%jj)
-            image_tiling.save_tiled_raster_images(
-                image_tiling.tile_raster_images(rbm.W.value.T, (R,C)),
-                "W_%06i.png"%jj)
+            print 'parts min', sampler.positions[0].value.min(),
+            print 'max', sampler.positions[0].value.max(),
+            print 'HMC step', sampler.stepsize,
+            print 'arate', sampler.avg_acceptance_rate
+
+        sampler.simulate()
+
+        l2_of_Ugrad = learn_fn(jj,
+                lr/max(1, jj/(20*epoch_size/batchsize)),
+                effective_l1_penalty)
+        if print_jj:
+            print 'l2(gU)', float(l2_of_Ugrad[0]),
+            print 'FE+', float(l2_of_Ugrad[1]),
+            print 'FE+[0]', float(l2_of_Ugrad[2]),
+            print 'FE+[1]', float(l2_of_Ugrad[3]),
+            print 'FE+[2]', float(l2_of_Ugrad[4]),
+            print 'FE+[3]', float(l2_of_Ugrad[5]),
+
+        if jj == no_l1_epochs * epoch_size/batchsize:
+            print "Activating L1 weight decay"
+            effective_l1_penalty = 1e-3
+
+        if 0:
+            rbm.U.value = numpy_project_onto_ball(rbm.U.value.T).T
+        else:
+            # weird normalization technique...
+            # It constrains all the columns of the matrix to have the same length
+            # But the matrix itself is re-scaled to have an arbitrary absolute size.
+            U = rbm.U.value
+            U_norms = np.sqrt((U*U).sum(axis=0))
+            assert len(U_norms) == n_F
+            normVF = .95 * normVF + .05 * np.mean(U_norms)
+            rbm.U.value = rbm.U.value * normVF/U_norms
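
Note on the sgd_updates call above: the helper itself is not shown in this patch, and the list passed as lr gives each of the five mcRBM parameters (U, W, a, b, c) its own learning-rate scale. A minimal sketch of what such a helper computes, assuming it simply pairs parameters, gradients, and per-parameter rates into Theano update expressions (this function body is illustrative, not pylearn's actual implementation):

    # hypothetical stand-in for the sgd_updates helper used above
    def sgd_updates(params, grads, lr):
        # accept either one shared rate or one rate per parameter
        if not isinstance(lr, (list, tuple)):
            lr = [lr] * len(params)
        # one gradient-descent update pair (shared_var, new_value) per parameter,
        # in the format theano.function's `updates` argument accepts
        return [(p, p - l * g) for p, l, g in zip(params, lr, grads)]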
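
Note on the column-norm trick at the end of the training loop: it rescales every column of U to a common length, while letting that common length drift slowly toward the average column norm via the running average normVF. Isolated as a small numpy routine (a sketch using the patch's .95/.05 smoothing; the name smooth_column_norms and its arguments are illustrative, not part of pylearn):

    import numpy as np

    # hypothetical helper isolating the normVF update from the driver loop
    def smooth_column_norms(U, normVF, alpha=0.05):
        U_norms = np.sqrt((U * U).sum(axis=0))            # per-column L2 norms
        normVF = (1 - alpha) * normVF + alpha * U_norms.mean()
        return U * normVF / U_norms, normVF               # every column now has norm normVF

    # usage: carry normVF across iterations, as the driver does
    U = np.random.randn(105, 256)                         # stand-in for rbm.U.value
    normVF = 1.0
    U, normVF = smooth_column_norms(U, normVF)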