changeset 994:610f563fb24a

mcRBM - removed old train_mcRBM cut-and-paste
author James Bergstra <bergstrj@iro.umontreal.ca>
date Tue, 24 Aug 2010 14:54:41 -0400
parents 88107ec01ce8
children 68ca3ea34e72
files pylearn/algorithms/mcRBM.py
diffstat 1 files changed, 0 insertions(+), 309 deletions(-)
--- a/pylearn/algorithms/mcRBM.py	Tue Aug 24 14:54:25 2010 -0400
+++ b/pylearn/algorithms/mcRBM.py	Tue Aug 24 14:54:41 2010 -0400
@@ -563,312 +563,3 @@
         normVF = .95 * normVF + .05 * np.mean(U_norms)
         rbm.U.value = rbm.U.value * normVF/U_norms
 
-
-#
-#
-# Marc'Aurelio Ranzato's code
-#
-######################################################################
-# compute the value of the free energy at a given input
-# F = - sum log(1+exp(- .5 FH (VF data/norm(data))^2 + bias_cov)) +...
-#     - sum log(1+exp(w_mean data + bias_mean)) + ...
-#     - bias_vis data + 0.5 data^2
-# NOTE: FH is constrained to be positive
-# (in the paper FH appears with a negative sign, but the sign in front of it is flipped as well)
-def compute_energy_mcRBM(data,normdata,vel,energy,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,t1,t2,t6,feat,featsq,feat_mean,length,lengthsq,normcoeff,small,num_vis):
-    # normalize input data vectors
-    data.mult(data, target = t6) # DxP (nr input dims x nr samples)
-    t6.sum(axis = 0, target = lengthsq) # 1xP
-    lengthsq.mult(0.5, target = energy) # energy of quadratic regularization term   
-    lengthsq.mult(1./num_vis) # normalize by number of components (like std)    
-
-    lengthsq.add(small) # small prevents division by 0
-    # energy_j = \sum_i 0.5 data_ij ^2
-    # lengthsq_j = (\sum_i data_ij ^2) / num_vis + small
-    cmt.sqrt(lengthsq, target = length) 
-    # length_j = sqrt(lengthsq_j)
-    length.reciprocal(target = normcoeff) # 1xP
-    # normcoef_j = 1/sqrt(lengthsq_j)
-    data.mult_by_row(normcoeff, target = normdata) # normalized data    
-    # normdata is data with each column rescaled so its mean squared value is ~1
-
-    ## potential
-    # covariance contribution
-    cmt.dot(VF.T, normdata, target = feat) # HxP (nr factors x nr samples)
-    feat.mult(feat, target = featsq)   # HxP
-
-    # featsq is the squared cosines (VF with data)
-    cmt.dot(FH.T,featsq, target = t1) # OxP (nr cov hiddens x nr samples)
-    t1.mult(-0.5)
-    t1.add_col_vec(bias_cov) # OxP
-    cmt.exp(t1) # OxP
-    t1.add(1, target = t2) # OxP
-    cmt.log(t2)
-    t2.mult(-1)
-    energy.add_sums(t2, axis=0)
-    # mean contribution
-    cmt.dot(w_mean.T, data, target = feat_mean) # HxP (nr mean hiddens x nr samples)
-    feat_mean.add_col_vec(bias_mean) # HxP
-    cmt.exp(feat_mean) 
-    feat_mean.add(1)
-    cmt.log(feat_mean)
-    feat_mean.mult(-1)
-    energy.add_sums(feat_mean,  axis=0)
-    # visible bias term
-    data.mult_by_col(bias_vis, target = t6)
-    t6.mult(-1) # DxP
-    energy.add_sums(t6,  axis=0) # 1xP
-    # kinetic
-    vel.mult(vel, target = t6)
-    energy.add_sums(t6, axis = 0, mult = .5)
-
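For reference, the per-sample energy computed above (including the HMC kinetic term) can be written compactly in NumPy. This is a hedged sketch under the same D x P column layout, not Ranzato's cudamat code:

    import numpy as np

    def free_energy_np(data, vel, VF, FH, bias_cov, bias_vis, w_mean, bias_mean,
                       small=0.5):
        # data, vel: D x P; returns one energy value per column (sample)
        num_vis = data.shape[0]
        energy = 0.5 * (data ** 2).sum(axis=0)              # quadratic term
        lengthsq = (data ** 2).sum(axis=0) / num_vis + small
        normdata = data / np.sqrt(lengthsq)                 # per-column rescaling
        featsq = np.dot(VF.T, normdata) ** 2                # num_fac x P
        t1 = bias_cov - 0.5 * np.dot(FH.T, featsq)          # num_hid_cov x P
        energy -= np.log1p(np.exp(t1)).sum(axis=0)          # covariance term
        fm = np.dot(w_mean.T, data) + bias_mean             # num_hid_mean x P
        energy -= np.log1p(np.exp(fm)).sum(axis=0)          # mean term
        energy -= (bias_vis * data).sum(axis=0)              # visible bias term
        energy += 0.5 * (vel ** 2).sum(axis=0)               # kinetic term
        return energy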
-######################################################
-# mcRBM trainer: sweeps over the training set.
-# For each batch of samples, compute the parameter derivatives at the training
-# samples and at negative samples drawn by calling the HMC sampler.
-def train_mcRBM():
-
-    config = ConfigParser()
-    config.read('input_configuration')
-
-    verbose = config.getint('VERBOSITY','verbose')
-
-    num_epochs = config.getint('MAIN_PARAMETER_SETTING','num_epochs')
-    batch_size = config.getint('MAIN_PARAMETER_SETTING','batch_size')
-    startFH = config.getint('MAIN_PARAMETER_SETTING','startFH')
-    startwd = config.getint('MAIN_PARAMETER_SETTING','startwd')
-    doPCD = config.getint('MAIN_PARAMETER_SETTING','doPCD')
-
-    # model parameters
-    num_fac = config.getint('MODEL_PARAMETER_SETTING','num_fac')
-    num_hid_cov =  config.getint('MODEL_PARAMETER_SETTING','num_hid_cov')
-    num_hid_mean =  config.getint('MODEL_PARAMETER_SETTING','num_hid_mean')
-    apply_mask =  config.getint('MODEL_PARAMETER_SETTING','apply_mask')
-
-    # load data
-    data_file_name =  config.get('DATA','data_file_name')
-    d = loadmat(data_file_name) # input in the format PxD (P vectorized samples with D dimensions)
-    totnumcases = d["whitendata"].shape[0]
-    d = d["whitendata"][0:floor(totnumcases/batch_size)*batch_size,:].copy() 
-    totnumcases = d.shape[0]
-    num_vis =  d.shape[1]
-    num_batches = int(totnumcases/batch_size)
-    dev_dat = cmt.CUDAMatrix(d.T) # VxP 
-
-    # training parameters
-    epsilon = config.getfloat('OPTIMIZER_PARAMETERS','epsilon')
-    epsilonVF = 2*epsilon
-    epsilonFH = 0.02*epsilon
-    epsilonb = 0.02*epsilon
-    epsilonw_mean = 0.2*epsilon
-    epsilonb_mean = 0.1*epsilon
-    weightcost_final =  config.getfloat('OPTIMIZER_PARAMETERS','weightcost_final')
-
-    # HMC setting
-    hmc_step_nr = config.getint('HMC_PARAMETERS','hmc_step_nr')
-    hmc_step =  0.01
-    hmc_target_ave_rej =  config.getfloat('HMC_PARAMETERS','hmc_target_ave_rej')
-    hmc_ave_rej =  hmc_target_ave_rej
-
-    # initialize weights
-    VF = cmt.CUDAMatrix(np.array(0.02 * np.random.randn(num_vis, num_fac), dtype=np.float32, order='F')) # VxH
-    if apply_mask == 0:
-        FH = cmt.CUDAMatrix( np.array( np.eye(num_fac,num_hid_cov), dtype=np.float32, order='F')  ) # HxO
-    else:
-        dd = loadmat('your_FHinit_mask_file.mat') # see CVPR2010paper_material/topo2D_3x3_stride2_576filt.mat for an example
-        FH = cmt.CUDAMatrix( np.array( dd["FH"], dtype=np.float32, order='F')  )
-    bias_cov = cmt.CUDAMatrix( np.array(2.0*np.ones((num_hid_cov, 1)), dtype=np.float32, order='F') )
-    bias_vis = cmt.CUDAMatrix( np.array(np.zeros((num_vis, 1)), dtype=np.float32, order='F') )
-    w_mean = cmt.CUDAMatrix( np.array( 0.05 * np.random.randn(num_vis, num_hid_mean), dtype=np.float32, order='F') ) # VxH
-    bias_mean = cmt.CUDAMatrix( np.array( -2.0*np.ones((num_hid_mean,1)), dtype=np.float32, order='F') )
-
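For orientation, these are the shapes the rest of the trainer assumes; a sketch of checks one could add (cudamat matrices expose a .shape tuple):

    assert VF.shape        == (num_vis, num_fac)
    assert FH.shape        == (num_fac, num_hid_cov)
    assert bias_cov.shape  == (num_hid_cov, 1)
    assert bias_vis.shape  == (num_vis, 1)
    assert w_mean.shape    == (num_vis, num_hid_mean)
    assert bias_mean.shape == (num_hid_mean, 1)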
-    # initialize variables to store derivatives 
-    VFinc = cmt.CUDAMatrix( np.array(np.zeros((num_vis, num_fac)), dtype=np.float32, order='F'))
-    FHinc = cmt.CUDAMatrix( np.array(np.zeros((num_fac, num_hid_cov)), dtype=np.float32, order='F'))
-    bias_covinc = cmt.CUDAMatrix( np.array(np.zeros((num_hid_cov, 1)), dtype=np.float32, order='F'))
-    bias_visinc = cmt.CUDAMatrix( np.array(np.zeros((num_vis, 1)), dtype=np.float32, order='F'))
-    w_meaninc = cmt.CUDAMatrix( np.array(np.zeros((num_vis, num_hid_mean)), dtype=np.float32, order='F'))
-    bias_meaninc = cmt.CUDAMatrix( np.array(np.zeros((num_hid_mean, 1)), dtype=np.float32, order='F'))
-
-    # initialize temporary storage
-    data = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP
-    normdata = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP
-    negdataini = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP
-    feat = cmt.CUDAMatrix( np.array(np.empty((num_fac, batch_size)), dtype=np.float32, order='F'))
-    featsq = cmt.CUDAMatrix( np.array(np.empty((num_fac, batch_size)), dtype=np.float32, order='F'))
-    negdata = cmt.CUDAMatrix( np.array(np.random.randn(num_vis, batch_size), dtype=np.float32, order='F'))
-    old_energy = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F'))
-    new_energy = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F'))
-    gradient = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP
-    normgradient = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F')) # VxP
-    thresh = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F'))
-    feat_mean = cmt.CUDAMatrix( np.array(np.empty((num_hid_mean, batch_size)), dtype=np.float32, order='F'))
-    vel = cmt.CUDAMatrix( np.array(np.random.randn(num_vis, batch_size), dtype=np.float32, order='F'))
-    length = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F')) # 1xP
-    lengthsq = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F')) # 1xP
-    normcoeff = cmt.CUDAMatrix( np.array(np.zeros((1, batch_size)), dtype=np.float32, order='F')) # 1xP
-    if apply_mask==1: # used to constrain very large FH matrices by only allowing values within a neighborhood to change
-        dd = loadmat('your_FHinit_mask_file.mat') 
-        mask = cmt.CUDAMatrix( np.array(dd["mask"], dtype=np.float32, order='F'))
-    normVF = 1    
-    small = 0.5
-    
-    # other temporary vars
-    t1 = cmt.CUDAMatrix( np.array(np.empty((num_hid_cov, batch_size)), dtype=np.float32, order='F'))
-    t2 = cmt.CUDAMatrix( np.array(np.empty((num_hid_cov, batch_size)), dtype=np.float32, order='F'))
-    t3 = cmt.CUDAMatrix( np.array(np.empty((num_fac, batch_size)), dtype=np.float32, order='F'))
-    t4 = cmt.CUDAMatrix( np.array(np.empty((1,batch_size)), dtype=np.float32, order='F'))
-    t5 = cmt.CUDAMatrix( np.array(np.empty((1,1)), dtype=np.float32, order='F'))
-    t6 = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F'))
-    t7 = cmt.CUDAMatrix( np.array(np.empty((num_vis, batch_size)), dtype=np.float32, order='F'))
-    t8 = cmt.CUDAMatrix( np.array(np.empty((num_vis, num_fac)), dtype=np.float32, order='F'))
-    t9 = cmt.CUDAMatrix( np.array(np.zeros((num_fac, num_hid_cov)), dtype=np.float32, order='F'))
-    t10 = cmt.CUDAMatrix( np.array(np.empty((1,num_fac)), dtype=np.float32, order='F'))
-    t11 = cmt.CUDAMatrix( np.array(np.empty((1,num_hid_cov)), dtype=np.float32, order='F'))
-
-    # start training
-    for epoch in range(num_epochs):
-
-        print "Epoch " + str(epoch + 1)
-    
-        # anneal learning rates
-        epsilonVFc    = epsilonVF/max(1,epoch/20)
-        epsilonFHc    = epsilonFH/max(1,epoch/20)
-        epsilonbc    = epsilonb/max(1,epoch/20)
-        epsilonw_meanc = epsilonw_mean/max(1,epoch/20)
-        epsilonb_meanc = epsilonb_mean/max(1,epoch/20)
-        weightcost = weightcost_final
-
-        if epoch <= startFH:
-            epsilonFHc = 0 
-        if epoch <= startwd:	
-            weightcost = 0
-
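Note that the annealing above relies on integer division (Python 2), so the rates drop in discrete steps every 20 epochs rather than continuously; the equivalent rule, written out:

    def annealed(eps, epoch, step=20):
        # eps for epochs 0-39, eps/2 for 40-59, eps/3 for 60-79, ...
        return eps / max(1, epoch // step)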
-        for batch in range(num_batches):
-
-            # get current minibatch
-            data = dev_dat.slice(batch*batch_size,(batch + 1)*batch_size) # DxP (nr dims x nr samples)
-
-            # normalize input data
-            data.mult(data, target = t6) # DxP
-            t6.sum(axis = 0, target = lengthsq) # 1xP
-            lengthsq.mult(1./num_vis) # normalize by number of components (like std)
-            lengthsq.add(small) # small avoids division by 0
-            cmt.sqrt(lengthsq, target = length)
-            length.reciprocal(target = normcoeff) # 1xP
-            data.mult_by_row(normcoeff, target = normdata) # normalized data 
-            ## compute positive sample derivatives
-            # covariance part
-            cmt.dot(VF.T, normdata, target = feat) # HxP (nr facs x nr samples)
-            feat.mult(feat, target = featsq)   # HxP
-            cmt.dot(FH.T,featsq, target = t1) # OxP (nr cov hiddens x nr samples)
-            t1.mult(-0.5)
-            t1.add_col_vec(bias_cov) # OxP
-            t1.apply_sigmoid(target = t2) # OxP
-            cmt.dot(featsq, t2.T, target = FHinc) # HxO
-            cmt.dot(FH,t2, target = t3) # HxP
-            t3.mult(feat)
-            cmt.dot(normdata, t3.T, target = VFinc) # VxH
-            t2.sum(axis = 1, target = bias_covinc)
-            bias_covinc.mult(-1)  
-            # visible bias
-            data.sum(axis = 1, target = bias_visinc)
-            bias_visinc.mult(-1)
-            # mean part
-            cmt.dot(w_mean.T, data, target = feat_mean) # HxP (nr mean hiddens x nr samples)
-            feat_mean.add_col_vec(bias_mean) # HxP
-            feat_mean.apply_sigmoid() # HxP
-            feat_mean.mult(-1)
-            cmt.dot(data, feat_mean.T, target = w_meaninc)
-            feat_mean.sum(axis = 1, target = bias_meaninc)
-            
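The block above accumulates the positive-phase statistics of the gradient, evaluated at the data. Written out in NumPy under the same names, it is roughly the following (a sketch, not the cudamat code). The negative-phase block further down adds the same statistics evaluated at negdata with the opposite sign (and rescales FHinc by 0.5):

    import numpy as np

    def positive_stats(data, normdata, feat, featsq, FH, w_mean, bias_cov, bias_mean):
        sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
        hc = sigmoid(bias_cov - 0.5 * np.dot(FH.T, featsq))   # covariance hidden probs
        hm = sigmoid(np.dot(w_mean.T, data) + bias_mean)      # mean hidden probs
        return {
            'FHinc':         np.dot(featsq, hc.T),
            'VFinc':         np.dot(normdata, (np.dot(FH, hc) * feat).T),
            'bias_covinc':  -hc.sum(axis=1, keepdims=True),
            'bias_visinc':  -data.sum(axis=1, keepdims=True),
            'w_meaninc':    -np.dot(data, hm.T),
            'bias_meaninc': -hm.sum(axis=1, keepdims=True),
        }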
-            # HMC sampling: draw an approximate sample from the model
-            if doPCD == 0: # CD-1 (set negative data to current training samples)
-                hmc_step, hmc_ave_rej = draw_HMC_samples(data,negdata,normdata,vel,gradient,normgradient,new_energy,old_energy,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,hmc_step,hmc_step_nr,hmc_ave_rej,hmc_target_ave_rej,t1,t2,t3,t4,t5,t6,t7,thresh,feat,featsq,batch_size,feat_mean,length,lengthsq,normcoeff,small,num_vis)
-            else: # PCD-1 (use previous negative data as starting point for chain)
-                negdataini.assign(negdata)
-                hmc_step, hmc_ave_rej = draw_HMC_samples(negdataini,negdata,normdata,vel,gradient,normgradient,new_energy,old_energy,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,hmc_step,hmc_step_nr,hmc_ave_rej,hmc_target_ave_rej,t1,t2,t3,t4,t5,t6,t7,thresh,feat,featsq,batch_size,feat_mean,length,lengthsq,normcoeff,small,num_vis)
-                
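draw_HMC_samples (defined elsewhere in this file) roughly runs hmc_step_nr leapfrog steps of Hamiltonian dynamics on the free energy, applies a Metropolis accept/reject, and adapts hmc_step toward the target rejection rate. A generic leapfrog sketch, with grad_energy as a hypothetical gradient function:

    def leapfrog(x, vel, grad_energy, step, n_steps):
        # integrate dx/dt = vel, dvel/dt = -dE/dx with the standard leapfrog scheme
        vel = vel - 0.5 * step * grad_energy(x)
        for _ in range(n_steps - 1):
            x = x + step * vel
            vel = vel - step * grad_energy(x)
        x = x + step * vel
        vel = vel - 0.5 * step * grad_energy(x)
        return x, vel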
-            # compute derivatives at the negative samples
-            # normalize input data
-            negdata.mult(negdata, target = t6) # DxP
-            t6.sum(axis = 0, target = lengthsq) # 1xP
-            lengthsq.mult(1./num_vis) # normalize by number of components (like std)
-            lengthsq.add(small)
-            cmt.sqrt(lengthsq, target = length)
-            length.reciprocal(target = normcoeff) # 1xP
-            negdata.mult_by_row(normcoeff, target = normdata) # normalized data 
-            # covariance part
-            cmt.dot(VF.T, normdata, target = feat) # HxP 
-            feat.mult(feat, target = featsq)   # HxP
-            cmt.dot(FH.T,featsq, target = t1) # OxP
-            t1.mult(-0.5)
-            t1.add_col_vec(bias_cov) # OxP
-            t1.apply_sigmoid(target = t2) # OxP
-            FHinc.subtract_dot(featsq, t2.T) # HxO
-            FHinc.mult(0.5)
-            cmt.dot(FH,t2, target = t3) # HxP
-            t3.mult(feat)
-            VFinc.subtract_dot(normdata, t3.T) # VxH
-            bias_covinc.add_sums(t2, axis = 1)
-            # visible bias
-            bias_visinc.add_sums(negdata, axis = 1)
-            # mean part
-            cmt.dot(w_mean.T, negdata, target = feat_mean) # HxP 
-            feat_mean.add_col_vec(bias_mean) # HxP
-            feat_mean.apply_sigmoid() # HxP
-            w_meaninc.add_dot(negdata, feat_mean.T)
-            bias_meaninc.add_sums(feat_mean, axis = 1)
-
-            # update parameters
-            VFinc.add_mult(VF.sign(), weightcost) # L1 regularization
-            VF.add_mult(VFinc, -epsilonVFc/batch_size)
-            # normalize columns of VF: normalize by running average of their norm 
-            VF.mult(VF, target = t8)
-            t8.sum(axis = 0, target = t10)
-            cmt.sqrt(t10)
-            t10.sum(axis=1,target = t5)
-            t5.copy_to_host()
-            normVF = .95*normVF + (.05/num_fac) * t5.numpy_array[0,0] # estimate norm
-            t10.reciprocal()
-            VF.mult_by_row(t10) 
-            VF.mult(normVF) 
-            bias_cov.add_mult(bias_covinc, -epsilonbc/batch_size)
-            bias_vis.add_mult(bias_visinc, -epsilonbc/batch_size)
-
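The VF update above keeps every column (filter) at a common L2 norm tracked by a running average, the same scheme used for rbm.U in the retained context lines at the top of this hunk. In NumPy the renormalisation is roughly (assuming numpy imported as np and VF as a dense array):

    col_norms = np.sqrt((VF ** 2).sum(axis=0))         # one norm per factor/column
    normVF = 0.95 * normVF + 0.05 * col_norms.mean()   # running-average target norm
    VF = VF / col_norms * normVF                       # rescale every column to it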
-            if epoch > startFH:
-                FHinc.add_mult(FH.sign(), weightcost) # L1 regularization
-                FH.add_mult(FHinc, -epsilonFHc/batch_size) # update
-                # set negative entries in FH to 0
-                FH.greater_than(0, target = t9)
-                FH.mult(t9)
-                if apply_mask==1:
-                    FH.mult(mask)
-                # normalize columns of FH: set the L1 norm of each column to 1
-                FH.sum(axis = 0, target = t11)
-                t11.reciprocal()
-                FH.mult_by_row(t11)
-            w_meaninc.add_mult(w_mean.sign(),weightcost)
-            w_mean.add_mult(w_meaninc, -epsilonw_meanc/batch_size)
-            bias_mean.add_mult(bias_meaninc, -epsilonb_meanc/batch_size)
-
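The FH step above enforces two constraints after each update: entries stay non-negative and every column has unit L1 norm. The same step in NumPy, as a sketch:

    FHinc = FHinc + weightcost * np.sign(FH)                       # L1 penalty term
    FH = np.maximum(FH - (epsilonFHc / batch_size) * FHinc, 0.0)   # step, clip at 0
    if apply_mask == 1:
        FH = FH * mask                                             # optional topographic mask
    FH = FH / FH.sum(axis=0)                                       # columns sum to 1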
-        if verbose == 1:
-            print "VF: " + '%3.2e' % VF.euclid_norm() + ", DVF: " + '%3.2e' % (VFinc.euclid_norm()*(epsilonVFc/batch_size)) + ", FH: " + '%3.2e' % FH.euclid_norm() + ", DFH: " + '%3.2e' % (FHinc.euclid_norm()*(epsilonFHc/batch_size)) + ", bias_cov: " + '%3.2e' % bias_cov.euclid_norm() + ", Dbias_cov: " + '%3.2e' % (bias_covinc.euclid_norm()*(epsilonbc/batch_size)) + ", bias_vis: " + '%3.2e' % bias_vis.euclid_norm() + ", Dbias_vis: " + '%3.2e' % (bias_visinc.euclid_norm()*(epsilonbc/batch_size)) + ", wm: " + '%3.2e' % w_mean.euclid_norm() + ", Dwm: " + '%3.2e' % (w_meaninc.euclid_norm()*(epsilonw_meanc/batch_size)) + ", bm: " + '%3.2e' % bias_mean.euclid_norm() + ", Dbm: " + '%3.2e' % (bias_meaninc.euclid_norm()*(epsilonb_meanc/batch_size)) + ", step: " + '%3.2e' % hmc_step  +  ", rej: " + '%3.2e' % hmc_ave_rej 
-            sys.stdout.flush()
-        # back-up every once in a while 
-        if np.mod(epoch,10) == 0:
-            VF.copy_to_host()
-            FH.copy_to_host()
-            bias_cov.copy_to_host()
-            w_mean.copy_to_host()
-            bias_mean.copy_to_host()
-            bias_vis.copy_to_host()
-            savemat("ws_temp", {'VF':VF.numpy_array,'FH':FH.numpy_array,'bias_cov': bias_cov.numpy_array, 'bias_vis': bias_vis.numpy_array,'w_mean': w_mean.numpy_array, 'bias_mean': bias_mean.numpy_array, 'epoch':epoch})    
-    # final back-up
-    VF.copy_to_host()
-    FH.copy_to_host()
-    bias_cov.copy_to_host()
-    bias_vis.copy_to_host()
-    w_mean.copy_to_host()
-    bias_mean.copy_to_host()
-    savemat("ws_fac" + str(num_fac) + "_cov" + str(num_hid_cov) + "_mean" + str(num_hid_mean), {'VF':VF.numpy_array,'FH':FH.numpy_array,'bias_cov': bias_cov.numpy_array, 'bias_vis': bias_vis.numpy_array, 'w_mean': w_mean.numpy_array, 'bias_mean': bias_mean.numpy_array, 'epoch':epoch})
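
For completeness, the removed trainer read every hyper-parameter from an 'input_configuration' file via ConfigParser. A minimal file with all the keys the code reads would look like the following; the values are placeholders, not the settings from the paper:

    [VERBOSITY]
    verbose = 1

    [MAIN_PARAMETER_SETTING]
    num_epochs = 100
    batch_size = 128
    startFH = 2
    startwd = 2
    doPCD = 0

    [MODEL_PARAMETER_SETTING]
    num_fac = 256
    num_hid_cov = 256
    num_hid_mean = 100
    apply_mask = 0

    [DATA]
    data_file_name = your_whitened_data.mat

    [OPTIMIZER_PARAMETERS]
    epsilon = 0.075
    weightcost_final = 0.001

    [HMC_PARAMETERS]
    hmc_step_nr = 20
    hmc_target_ave_rej = 0.1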