# HG changeset patch
# User James Bergstra
# Date 1282676081 14400
# Node ID 610f563fb24a16b81c6fad0d10534fc7047ae0b1
# Parent  88107ec01ce89684b6a99b17487cba4d62fa1497
mcRBM - removed old train_mcRBM cut-and-paste

diff -r 88107ec01ce8 -r 610f563fb24a pylearn/algorithms/mcRBM.py
--- a/pylearn/algorithms/mcRBM.py	Tue Aug 24 14:54:25 2010 -0400
+++ b/pylearn/algorithms/mcRBM.py	Tue Aug 24 14:54:41 2010 -0400
@@ -563,312 +563,3 @@
         normVF = .95 * normVF + .05 * np.mean(U_norms)
         rbm.U.value = rbm.U.value * normVF/U_norms
-
-#
-#
-# Marc'Aurelio Ranzato's code
-#
-######################################################################
-# compute the value of the free energy at a given input
-# F = - sum log(1+exp(- .5 FH (VF data/norm(data))^2 + bias_cov)) +...
-#     - sum log(1+exp(w_mean data + bias_mean)) + ...
-#     - bias_vis data + 0.5 data^2
-# NOTE: FH is constrained to be positive
-# (in the paper the sign is negative but the sign in front of it is also flipped)
-def compute_energy_mcRBM(data,normdata,vel,energy,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,t1,t2,t6,feat,featsq,feat_mean,length,lengthsq,normcoeff,small,num_vis):
-    # normalize input data vectors
-    data.mult(data, target = t6) # DxP (nr input dims x nr samples)
-    t6.sum(axis = 0, target = lengthsq) # 1xP
-    lengthsq.mult(0.5, target = energy) # energy of quadratic regularization term
-    lengthsq.mult(1./num_vis) # normalize by number of components (like std)
-
-    lengthsq.add(small) # small prevents division by 0
-    # energy_j = \sum_i 0.5 data_ij ^2
-    # lengthsq_j = 1/ (\sum_i data_ij ^2 + small)
-    cmt.sqrt(lengthsq, target = length)
-    # length_j = sqrt(lengthsq_j)
-    length.reciprocal(target = normcoeff) # 1xP
-    # normcoef_j = 1/sqrt(lengthsq_j)
-    data.mult_by_row(normcoeff, target = normdata) # normalized data
-    # normdata is like data, but cols have unit L2 norm
-
-    ## potential
-    # covariance contribution
-    cmt.dot(VF.T, normdata, target = feat) # HxP (nr factors x nr samples)
-    feat.mult(feat, target = featsq) # HxP
-
-    # featsq is the squared cosines (VF with data)
-    cmt.dot(FH.T,featsq, target = t1) # OxP (nr cov hiddens x nr samples)
-    t1.mult(-0.5)
-    t1.add_col_vec(bias_cov) # OxP
-    cmt.exp(t1) # OxP
-    t1.add(1, target = t2) # OxP
-    cmt.log(t2)
-    t2.mult(-1)
-    energy.add_sums(t2, axis=0)
-    # mean contribution
-    cmt.dot(w_mean.T, data, target = feat_mean) # HxP (nr mean hiddens x nr samples)
-    feat_mean.add_col_vec(bias_mean) # HxP
-    cmt.exp(feat_mean)
-    feat_mean.add(1)
-    cmt.log(feat_mean)
-    feat_mean.mult(-1)
-    energy.add_sums(feat_mean, axis=0)
-    # visible bias term
-    data.mult_by_col(bias_vis, target = t6)
-    t6.mult(-1) # DxP
-    energy.add_sums(t6, axis=0) # 1xP
-    # kinetic
-    vel.mult(vel, target = t6)
-    energy.add_sums(t6, axis = 0, mult = .5)
-
-######################################################
-# mcRBM trainer: sweeps over the training set.
-# For each batch of samples compute derivatives to update the parameters
-# at the training samples and at the negative samples drawn calling HMC sampler.
-def train_mcRBM():
-
-    config = ConfigParser()
-    config.read('input_configuration')
-
-    verbose = config.getint('VERBOSITY','verbose')
-
-    num_epochs = config.getint('MAIN_PARAMETER_SETTING','num_epochs')
-    batch_size = config.getint('MAIN_PARAMETER_SETTING','batch_size')
-    startFH = config.getint('MAIN_PARAMETER_SETTING','startFH')
-    startwd = config.getint('MAIN_PARAMETER_SETTING','startwd')
-    doPCD = config.getint('MAIN_PARAMETER_SETTING','doPCD')
-
-    # model parameters
-    num_fac = config.getint('MODEL_PARAMETER_SETTING','num_fac')
-    num_hid_cov = config.getint('MODEL_PARAMETER_SETTING','num_hid_cov')
-    num_hid_mean = config.getint('MODEL_PARAMETER_SETTING','num_hid_mean')
-    apply_mask = config.getint('MODEL_PARAMETER_SETTING','apply_mask')
-
-    # load data
-    data_file_name = config.get('DATA','data_file_name')
-    d = loadmat(data_file_name) # input in the format PxD (P vectorized samples with D dimensions)
-    totnumcases = d["whitendata"].shape[0]
-    d = d["whitendata"][0:floor(totnumcases/batch_size)*batch_size,:].copy()
-    totnumcases = d.shape[0]
-    num_vis = d.shape[1]
-    num_batches = int(totnumcases/batch_size)
-    dev_dat = cmt.CUDAMatrix(d.T) # VxP
-
-    # training parameters
-    epsilon = config.getfloat('OPTIMIZER_PARAMETERS','epsilon')
-    epsilonVF = 2*epsilon
-    epsilonFH = 0.02*epsilon
-    epsilonb = 0.02*epsilon
-    epsilonw_mean = 0.2*epsilon
-    epsilonb_mean = 0.1*epsilon
-    weightcost_final = config.getfloat('OPTIMIZER_PARAMETERS','weightcost_final')
-
-    # HMC setting
-    hmc_step_nr = config.getint('HMC_PARAMETERS','hmc_step_nr')
-    hmc_step = 0.01
-    hmc_target_ave_rej = config.getfloat('HMC_PARAMETERS','hmc_target_ave_rej')
-    hmc_ave_rej = hmc_target_ave_rej
-
-    # initialize weights
-    VF = cmt.CUDAMatrix(np.array(0.02 * np.random.randn(num_vis, num_fac), dtype=np.float32, order='F')) # VxH
-    if apply_mask == 0:
-        FH = cmt.CUDAMatrix( np.array( np.eye(num_fac,num_hid_cov), dtype=np.float32, order='F') ) # HxO
-    else:
-        dd = loadmat('your_FHinit_mask_file.mat') # see CVPR2010paper_material/topo2D_3x3_stride2_576filt.mat for an example
-        FH = cmt.CUDAMatrix( np.array( dd["FH"], dtype=np.float32, order='F') )
-    bias_cov = cmt.CUDAMatrix( np.array(2.0*np.ones((num_hid_cov, 1)), dtype=np.float32, order='F') )
-    bias_vis = cmt.CUDAMatrix( np.array(np.zeros((num_vis, 1)), dtype=np.float32, order='F') )
-    w_mean = cmt.CUDAMatrix( np.array( 0.05 * np.random.randn(num_vis, num_hid_mean), dtype=np.float32, order='F') ) # VxH
-    bias_mean = cmt.CUDAMatrix( np.array( -2.0*np.ones((num_hid_mean,1)), dtype=np.float32, order='F') )
-
-    # initialize variables to store derivatives
-    VFinc = cmt.CUDAMatrix( np.array(np.zeros((num_vis, num_fac)), dtype=np.float32, order='F'))
-    FHinc = cmt.CUDAMatrix( np.array(np.zeros((num_fac, num_hid_cov)), dtype=np.float32, order='F'))
-    bias_covinc = cmt.CUDAMatrix( np.array(np.zeros((num_hid_cov, 1)), dtype=np.float32, order='F'))
-    bias_visinc = cmt.CUDAMatrix( np.array(np.zeros((num_vis, 1)), dtype=np.float32, order='F'))
-    w_meaninc = cmt.CUDAMatrix( np.array(np.zeros((num_vis, num_hid_mean)), dtype=np.float32, order='F'))
-    bias_meaninc = cmt.CUDAMatrix( np.array(np.zeros((num_hid_mean, 1)), dtype=np.float32, order='F'))
-
-    # initialize temporary storage
-    data = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP
-    normdata = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP
-    negdataini = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP
-    feat = cmt.CUDAMatrix( np.array(np.empty((num_fac, batch_size)), dtype=np.float32, order='F'))
-    featsq = cmt.CUDAMatrix( np.array(np.empty((num_fac, batch_size)), dtype=np.float32, order='F'))
-    negdata = cmt.CUDAMatrix( np.array(np.random.randn(num_vis, batch_size), dtype=np.float32, order='F'))
-    old_energy = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F'))
-    new_energy = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F'))
-    gradient = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP
-    normgradient = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP
-    thresh = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F'))
-    feat_mean = cmt.CUDAMatrix( np.array(np.empty((num_hid_mean, batch_size)), dtype=np.float32, order='F'))
-    vel = cmt.CUDAMatrix( np.array(np.random.randn(num_vis, batch_size), dtype=np.float32, order='F'))
-    length = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F')) # 1xP
-    lengthsq = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F')) # 1xP
-    normcoeff = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F')) # 1xP
-    if apply_mask==1: # this used to constrain very large FH matrices only allowing to change values in a neighborhood
-        dd = loadmat('your_FHinit_mask_file.mat')
-        mask = cmt.CUDAMatrix( np.array(dd["mask"], dtype=np.float32, order='F'))
-    normVF = 1
-    small = 0.5
-
-    # other temporary vars
-    t1 = cmt.CUDAMatrix( np.array(np.empty((num_hid_cov, batch_size)), dtype=np.float32, order='F'))
-    t2 = cmt.CUDAMatrix( np.array(np.empty((num_hid_cov, batch_size)), dtype=np.float32, order='F'))
-    t3 = cmt.CUDAMatrix( np.array(np.empty((num_fac, batch_size)), dtype=np.float32, order='F'))
-    t4 = cmt.CUDAMatrix( np.array(np.empty((1,batch_size)), dtype=np.float32, order='F'))
-    t5 = cmt.CUDAMatrix( np.array(np.empty((1,1)), dtype=np.float32, order='F'))
-    t6 = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F'))
-    t7 = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F'))
-    t8 = cmt.CUDAMatrix( np.array(np.empty((num_vis, num_fac)), dtype=np.float32, order='F'))
-    t9 = cmt.CUDAMatrix( np.array(np.zeros((num_fac, num_hid_cov)), dtype=np.float32, order='F'))
-    t10 = cmt.CUDAMatrix( np.array(np.empty((1,num_fac)), dtype=np.float32, order='F'))
-    t11 = cmt.CUDAMatrix( np.array(np.empty((1,num_hid_cov)), dtype=np.float32, order='F'))
-
-    # start training
-    for epoch in range(num_epochs):
-
-        print "Epoch " + str(epoch + 1)
-
-        # anneal learning rates
-        epsilonVFc = epsilonVF/max(1,epoch/20)
-        epsilonFHc = epsilonFH/max(1,epoch/20)
-        epsilonbc = epsilonb/max(1,epoch/20)
-        epsilonw_meanc = epsilonw_mean/max(1,epoch/20)
-        epsilonb_meanc = epsilonb_mean/max(1,epoch/20)
-        weightcost = weightcost_final
-
-        if epoch <= startFH:
-            epsilonFHc = 0
-        if epoch <= startwd:
-            weightcost = 0
-
-        for batch in range(num_batches):
-
-            # get current minibatch
-            data = dev_dat.slice(batch*batch_size,(batch + 1)*batch_size) # DxP (nr dims x nr samples)
-
-            # normalize input data
-            data.mult(data, target = t6) # DxP
-            t6.sum(axis = 0, target = lengthsq) # 1xP
-            lengthsq.mult(1./num_vis) # normalize by number of components (like std)
-            lengthsq.add(small) # small avoids division by 0
-            cmt.sqrt(lengthsq, target = length)
-            length.reciprocal(target = normcoeff) # 1xP
-            data.mult_by_row(normcoeff, target = normdata) # normalized data
-            ## compute positive sample derivatives
-            # covariance part
-            cmt.dot(VF.T, normdata, target = feat) # HxP (nr facs x nr samples)
-            feat.mult(feat, target = featsq) # HxP
-            cmt.dot(FH.T,featsq, target = t1) # OxP (nr cov hiddens x nr samples)
-            t1.mult(-0.5)
-            t1.add_col_vec(bias_cov) # OxP
-            t1.apply_sigmoid(target = t2) # OxP
-            cmt.dot(featsq, t2.T, target = FHinc) # HxO
-            cmt.dot(FH,t2, target = t3) # HxP
-            t3.mult(feat)
-            cmt.dot(normdata, t3.T, target = VFinc) # VxH
-            t2.sum(axis = 1, target = bias_covinc)
-            bias_covinc.mult(-1)
-            # visible bias
-            data.sum(axis = 1, target = bias_visinc)
-            bias_visinc.mult(-1)
-            # mean part
-            cmt.dot(w_mean.T, data, target = feat_mean) # HxP (nr mean hiddens x nr samples)
-            feat_mean.add_col_vec(bias_mean) # HxP
-            feat_mean.apply_sigmoid() # HxP
-            feat_mean.mult(-1)
-            cmt.dot(data, feat_mean.T, target = w_meaninc)
-            feat_mean.sum(axis = 1, target = bias_meaninc)
-
-            # HMC sampling: draw an approximate sample from the model
-            if doPCD == 0: # CD-1 (set negative data to current training samples)
-                hmc_step, hmc_ave_rej = draw_HMC_samples(data,negdata,normdata,vel,gradient,normgradient,new_energy,old_energy,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,hmc_step,hmc_step_nr,hmc_ave_rej,hmc_target_ave_rej,t1,t2,t3,t4,t5,t6,t7,thresh,feat,featsq,batch_size,feat_mean,length,lengthsq,normcoeff,small,num_vis)
-            else: # PCD-1 (use previous negative data as starting point for chain)
-                negdataini.assign(negdata)
-                hmc_step, hmc_ave_rej = draw_HMC_samples(negdataini,negdata,normdata,vel,gradient,normgradient,new_energy,old_energy,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,hmc_step,hmc_step_nr,hmc_ave_rej,hmc_target_ave_rej,t1,t2,t3,t4,t5,t6,t7,thresh,feat,featsq,batch_size,feat_mean,length,lengthsq,normcoeff,small,num_vis)
-
-            # compute derivatives at the negative samples
-            # normalize input data
-            negdata.mult(negdata, target = t6) # DxP
-            t6.sum(axis = 0, target = lengthsq) # 1xP
-            lengthsq.mult(1./num_vis) # normalize by number of components (like std)
-            lengthsq.add(small)
-            cmt.sqrt(lengthsq, target = length)
-            length.reciprocal(target = normcoeff) # 1xP
-            negdata.mult_by_row(normcoeff, target = normdata) # normalized data
-            # covariance part
-            cmt.dot(VF.T, normdata, target = feat) # HxP
-            feat.mult(feat, target = featsq) # HxP
-            cmt.dot(FH.T,featsq, target = t1) # OxP
-            t1.mult(-0.5)
-            t1.add_col_vec(bias_cov) # OxP
-            t1.apply_sigmoid(target = t2) # OxP
-            FHinc.subtract_dot(featsq, t2.T) # HxO
-            FHinc.mult(0.5)
-            cmt.dot(FH,t2, target = t3) # HxP
-            t3.mult(feat)
-            VFinc.subtract_dot(normdata, t3.T) # VxH
-            bias_covinc.add_sums(t2, axis = 1)
-            # visible bias
-            bias_visinc.add_sums(negdata, axis = 1)
-            # mean part
-            cmt.dot(w_mean.T, negdata, target = feat_mean) # HxP
-            feat_mean.add_col_vec(bias_mean) # HxP
-            feat_mean.apply_sigmoid() # HxP
-            w_meaninc.add_dot(negdata, feat_mean.T)
-            bias_meaninc.add_sums(feat_mean, axis = 1)
-
-            # update parameters
-            VFinc.add_mult(VF.sign(), weightcost) # L1 regularization
-            VF.add_mult(VFinc, -epsilonVFc/batch_size)
-            # normalize columns of VF: normalize by running average of their norm
-            VF.mult(VF, target = t8)
-            t8.sum(axis = 0, target = t10)
-            cmt.sqrt(t10)
-            t10.sum(axis=1,target = t5)
-            t5.copy_to_host()
-            normVF = .95*normVF + (.05/num_fac) * t5.numpy_array[0,0] # estimate norm
-            t10.reciprocal()
-            VF.mult_by_row(t10)
-            VF.mult(normVF)
-            bias_cov.add_mult(bias_covinc, -epsilonbc/batch_size)
-            bias_vis.add_mult(bias_visinc, -epsilonbc/batch_size)
-
-            if epoch > startFH:
-                FHinc.add_mult(FH.sign(), weightcost) # L1 regularization
-                FH.add_mult(FHinc, -epsilonFHc/batch_size) # update
-                # set to 0 negative entries in FH
-                FH.greater_than(0, target = t9)
-                FH.mult(t9)
-                if apply_mask==1:
-                    FH.mult(mask)
-                # normalize columns of FH: L1 norm set to 1 in each column
-                FH.sum(axis = 0, target = t11)
-                t11.reciprocal()
-                FH.mult_by_row(t11)
-            w_meaninc.add_mult(w_mean.sign(),weightcost)
-            w_mean.add_mult(w_meaninc, -epsilonw_meanc/batch_size)
-            bias_mean.add_mult(bias_meaninc, -epsilonb_meanc/batch_size)
-
-            if verbose == 1:
-                print "VF: " + '%3.2e' % VF.euclid_norm() + ", DVF: " + '%3.2e' % (VFinc.euclid_norm()*(epsilonVFc/batch_size)) + ", FH: " + '%3.2e' % FH.euclid_norm() + ", DFH: " + '%3.2e' % (FHinc.euclid_norm()*(epsilonFHc/batch_size)) + ", bias_cov: " + '%3.2e' % bias_cov.euclid_norm() + ", Dbias_cov: " + '%3.2e' % (bias_covinc.euclid_norm()*(epsilonbc/batch_size)) + ", bias_vis: " + '%3.2e' % bias_vis.euclid_norm() + ", Dbias_vis: " + '%3.2e' % (bias_visinc.euclid_norm()*(epsilonbc/batch_size)) + ", wm: " + '%3.2e' % w_mean.euclid_norm() + ", Dwm: " + '%3.2e' % (w_meaninc.euclid_norm()*(epsilonw_meanc/batch_size)) + ", bm: " + '%3.2e' % bias_mean.euclid_norm() + ", Dbm: " + '%3.2e' % (bias_meaninc.euclid_norm()*(epsilonb_meanc/batch_size)) + ", step: " + '%3.2e' % hmc_step + ", rej: " + '%3.2e' % hmc_ave_rej
-                sys.stdout.flush()
-        # back-up every once in a while
-        if np.mod(epoch,10) == 0:
-            VF.copy_to_host()
-            FH.copy_to_host()
-            bias_cov.copy_to_host()
-            w_mean.copy_to_host()
-            bias_mean.copy_to_host()
-            bias_vis.copy_to_host()
-            savemat("ws_temp", {'VF':VF.numpy_array,'FH':FH.numpy_array,'bias_cov': bias_cov.numpy_array, 'bias_vis': bias_vis.numpy_array,'w_mean': w_mean.numpy_array, 'bias_mean': bias_mean.numpy_array, 'epoch':epoch})
-    # final back-up
-    VF.copy_to_host()
-    FH.copy_to_host()
-    bias_cov.copy_to_host()
-    bias_vis.copy_to_host()
-    w_mean.copy_to_host()
-    bias_mean.copy_to_host()
-    savemat("ws_fac" + str(num_fac) + "_cov" + str(num_hid_cov) + "_mean" + str(num_hid_mean), {'VF':VF.numpy_array,'FH':FH.numpy_array,'bias_cov': bias_cov.numpy_array, 'bias_vis': bias_vis.numpy_array, 'w_mean': w_mean.numpy_array, 'bias_mean': bias_mean.numpy_array, 'epoch':epoch})
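
Note (reference only, not part of the changeset): the free energy that the removed compute_energy_mcRBM evaluates on the GPU can be sketched in plain NumPy. This is a rough sketch under the shape conventions of the removed code (data DxP, VF DxF, FH FxO, bias_cov Ox1, w_mean DxM, bias_mean Mx1, bias_vis Dx1, small = 0.5); the function name free_energy_mcRBM is hypothetical, and the kinetic 0.5*vel**2 term, which belongs to the HMC sampler rather than the model, is omitted.

import numpy as np

def free_energy_mcRBM(data, VF, FH, bias_cov, bias_vis, w_mean, bias_mean, small=0.5):
    # hypothetical NumPy sketch of the per-sample free energy in the removed comment block
    num_vis = data.shape[0]
    # normalize each column as the cudamat code does: divide by sqrt(mean square + small)
    lengthsq = (data ** 2).sum(axis=0) / num_vis + small       # shape (P,)
    normdata = data / np.sqrt(lengthsq)                        # DxP
    # covariance (factored) term: -sum log(1 + exp(-.5 FH^T (VF^T normdata)^2 + bias_cov))
    featsq = np.dot(VF.T, normdata) ** 2                       # FxP
    t = -0.5 * np.dot(FH.T, featsq) + bias_cov                 # OxP
    energy = -np.logaddexp(0, t).sum(axis=0)
    # mean term: -sum log(1 + exp(w_mean^T data + bias_mean))
    m = np.dot(w_mean.T, data) + bias_mean                     # MxP
    energy -= np.logaddexp(0, m).sum(axis=0)
    # visible bias and quadratic containment terms
    energy -= (bias_vis * data).sum(axis=0)
    energy += 0.5 * (data ** 2).sum(axis=0)
    return energy                                              # shape (P,), one value per sample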