# view pylearn/sandbox/train_mcRBM.py @ 1524:9d21919e2332
#
# autopep8
# author Frederic Bastien <nouiz@nouiz.org>
# date Fri, 02 Nov 2012 13:02:18 -0400
# parents d4a14c6c36e0
# children
# line wrap: on
# line source

"""
This is a copy of mcRBM training that James modified to print out more information, visualize
filters, etc.  Once mcRBM is stable, it can be deleted.
"""
# mcRBM training
# Refer to Ranzato and Hinton CVPR 2010 "Modeling Pixel Means and Covariances Using Factorized Third-Order BMs"
#
# Marc'Aurelio Ranzato
# 28 July 2010

import sys
import numpy as np
import cudamat as cmt
from scipy.io import loadmat, savemat
#import gpu_lock # put your locking system package here, if any
from ConfigParser import *

# Dict returned by loadmat() for the data file; assigned in train_mcRBM()
# and read by tile() for its 'invpcatransf' entry.
demodata = None

from pylearn.io import image_tiling
def tile(X, fname):
    """Render the rows of X as a grid of 16x16 tiles and save it to fname.

    X is first multiplied by the transpose of demodata['invpcatransf'].
    The result is split column-wise into three planes (columns 0-255,
    256-511 and 512 onwards) which, together with a fourth None plane, are
    handed to image_tiling.tile_raster_images.
    """
    n_rows, n_cols = 16, 16
    plane = n_rows * n_cols  # 256 columns per plane
    projected = np.dot(X, demodata['invpcatransf'].T)
    planes = (
        projected[:, :plane],
        projected[:, plane:2 * plane],
        projected[:, 2 * plane:],
        None,
    )
    tiled = image_tiling.tile_raster_images(
        planes,
        img_shape=(n_rows, n_cols),
        min_dynamic_range=1e-2)
    image_tiling.save_tiled_raster_images(tiled, fname)

def save_imshow(X, fname):
    """Save matrix X to fname as an 8-bit single-channel ('L' mode) image.

    X is rescaled with image_tiling.scale_to_unit_interval, multiplied by
    255 and cast to uint8 before being converted to an image.
    """
    unit_scaled = image_tiling.scale_to_unit_interval(X)
    pixels = (unit_scaled * 255).astype('uint8')
    picture = image_tiling.Image.fromarray(pixels, 'L')
    picture.save(fname)

######################################################################
# compute the value of the free energy at a given input
# F = - sum log(1+exp(- .5 FH (VF data/norm(data))^2 + bias_cov)) +...
#     - sum log(1+exp(w_mean data + bias_mean)) + ...
#     - bias_vis data + 0.5 data^2
# NOTE: FH is constrained to be positive 
# (in the paper the sign is negative but the sign in front of it is also flipped)
def compute_energy_mcRBM(data,normdata,vel,energy,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,t1,t2,t6,feat,featsq,feat_mean,length,lengthsq,normcoeff,small,num_vis):
    """Write the per-sample total energy of (data, vel) into `energy`.

    The value accumulated in `energy` is the sum of: 0.5 * sum(data^2), the
    covariance-hidden term, the mean-hidden term, the visible-bias term and
    the kinetic term 0.5 * sum(vel^2).  Nothing is returned; the result is
    left in `energy`.

    The buffers normdata, t1, t2, t6, feat, featsq, feat_mean, length,
    lengthsq and normcoeff are used as scratch space and are overwritten.
    NOTE(review): all matrix arguments are presumably cudamat CUDAMatrix
    objects with the shapes given in the inline comments -- confirm against
    the caller.

    `small` is added to the mean squared component before the square root;
    `num_vis` is the divisor used to turn the squared length into that mean.
    """
    # normalize input data vectors
    data.mult(data, target = t6) # DxP (nr input dims x nr samples)
    t6.sum(axis = 0, target = lengthsq) # 1xP
    lengthsq.mult(0.5, target = energy) # energy of quadratic regularization term
    lengthsq.mult(1./num_vis) # normalize by number of components (like std)
    lengthsq.add(small) # small prevents division by 0
    cmt.sqrt(lengthsq, target = length)
    length.reciprocal(target = normcoeff) # 1xP
    data.mult_by_row(normcoeff, target = normdata) # normalized data
    ## potential
    # covariance contribution
    cmt.dot(VF.T, normdata, target = feat) # HxP (nr factors x nr samples)
    feat.mult(feat, target = featsq)   # HxP
    cmt.dot(FH.T,featsq, target = t1) # OxP (nr cov hiddens x nr samples)
    t1.mult(-0.5)
    t1.add_col_vec(bias_cov) # OxP
    # t2 = -log(1 + exp(t1)), then summed over hidden units into energy
    cmt.exp(t1) # OxP
    t1.add(1, target = t2) # OxP
    cmt.log(t2)
    t2.mult(-1)
    energy.add_sums(t2, axis=0)
    # mean contribution
    cmt.dot(w_mean.T, data, target = feat_mean) # HxP (nr mean hiddens x nr samples)
    feat_mean.add_col_vec(bias_mean) # HxP
    # feat_mean = -log(1 + exp(feat_mean)), computed in place
    cmt.exp(feat_mean)
    feat_mean.add(1)
    cmt.log(feat_mean)
    feat_mean.mult(-1)
    energy.add_sums(feat_mean,  axis=0)
    # visible bias term
    data.mult_by_col(bias_vis, target = t6)
    t6.mult(-1) # DxP
    energy.add_sums(t6,  axis=0) # 1xP
    # kinetic
    vel.mult(vel, target = t6)
    energy.add_sums(t6, axis = 0, mult = .5)

#################################################################
# compute the derivative of the free energy at a given input
def compute_gradient_mcRBM(data,normdata,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,t1,t2,t3,t4,t6,feat,featsq,feat_mean,gradient,normgradient,length,lengthsq,normcoeff,small,num_vis):
    """Write the derivative of the free energy with respect to `data` into `gradient`.

    The result combines the covariance term (back-propagated through the
    length normalization of the input), the quadratic term (`data` itself),
    the visible bias (subtracted) and the mean-hidden term (subtracted).
    Nothing is returned; the result is left in `gradient`.

    The buffers normdata, t1, t2, t3, t4, t6, feat, featsq, feat_mean,
    normgradient, length, lengthsq and normcoeff are used as scratch space
    and are overwritten.
    NOTE(review): all matrix arguments are presumably cudamat CUDAMatrix
    objects with the shapes given in the inline comments -- confirm against
    the caller.
    """
    # normalize input data
    data.mult(data, target = t6) # DxP
    t6.sum(axis = 0, target = lengthsq) # 1xP
    lengthsq.mult(1./num_vis) # normalize by number of components (like std)
    lengthsq.add(small)
    cmt.sqrt(lengthsq, target = length)
    length.reciprocal(target = normcoeff) # 1xP
    data.mult_by_row(normcoeff, target = normdata) # normalized data
    cmt.dot(VF.T, normdata, target = feat) # HxP
    feat.mult(feat, target = featsq)   # HxP
    cmt.dot(FH.T,featsq, target = t1) # OxP
    t1.mult(-.5)
    t1.add_col_vec(bias_cov) # OxP
    t1.apply_sigmoid(target = t2) # OxP
    cmt.dot(FH,t2, target = t3) # HxP
    t3.mult(feat)
    # gradient with respect to the *normalized* data
    cmt.dot(VF, t3, target = normgradient) # VxP
    # final bprop through normalization
    # normcoeff is reused here: it becomes 1 / (length * lengthsq)
    length.mult(lengthsq, target = normcoeff)
    normcoeff.reciprocal() # 1xP
    normgradient.mult(data, target = gradient) # VxP
    gradient.sum(axis = 0, target = t4) # 1xP
    t4.mult(-1./num_vis)
    data.mult_by_row(t4, target = gradient)
    normgradient.mult_by_row(lengthsq, target = t6)
    gradient.add(t6)
    gradient.mult_by_row(normcoeff)
    # add quadratic term gradient
    gradient.add(data)
    # add visible bias term
    gradient.add_col_mult(bias_vis, -1)
    # add MEAN contribution to gradient
    cmt.dot(w_mean.T, data, target = feat_mean) # HxP
    feat_mean.add_col_vec(bias_mean) # HxP
    feat_mean.apply_sigmoid() # HxP
    gradient.subtract_dot(w_mean,feat_mean) # VxP

############################################################
# Hybrid Monte Carlo sampler
def draw_HMC_samples(data,negdata,normdata,vel,gradient,normgradient,new_energy,old_energy,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,hmc_step,hmc_step_nr,hmc_ave_rej,hmc_target_ave_rej,t1,t2,t3,t4,t5,t6,t7,thresh,feat,featsq,batch_size,feat_mean,length,lengthsq,normcoeff,small,num_vis):
    """Run one Hybrid Monte Carlo trajectory starting from `data`.

    `negdata` is overwritten with the end point of a leap-frog trajectory of
    `hmc_step_nr` steps; columns whose proposal is rejected are reset to the
    corresponding column of `data`.  `vel`, `gradient`, both energy buffers,
    `thresh` and the scratch buffers t1..t7 (plus those used by the energy
    and gradient routines) are overwritten.

    Returns the pair (hmc_step, hmc_ave_rej): the running rejection rate
    updated as 0.9*old + 0.1*current, and the step size grown by 1% (capped
    at 0.25) when that rate is below `hmc_target_ave_rej`, otherwise shrunk
    by 1% (floored at .001).
    """
    # Argument bundles shared by every energy / gradient evaluation below.
    model_and_scratch = (VF, FH, bias_cov, bias_vis, w_mean, bias_mean,
                         t1, t2, t6, feat, featsq, feat_mean,
                         length, lengthsq, normcoeff, small, num_vis)
    gradient_args = (negdata, normdata, VF, FH, bias_cov, bias_vis,
                     w_mean, bias_mean, t1, t2, t3, t4, t6,
                     feat, featsq, feat_mean, gradient, normgradient,
                     length, lengthsq, normcoeff, small, num_vis)

    # fresh momentum, chain starts at the supplied data
    vel.fill_with_randn()
    negdata.assign(data)
    compute_energy_mcRBM(negdata, normdata, vel, old_energy,
                         *model_and_scratch)
    compute_gradient_mcRBM(*gradient_args)

    # opening half step for the velocity, full step for the position
    vel.add_mult(gradient, -0.5 * hmc_step)
    negdata.add_mult(vel, hmc_step)

    # intermediate full leap-frog steps
    for _ in range(hmc_step_nr - 1):
        compute_gradient_mcRBM(*gradient_args)
        vel.add_mult(gradient, -hmc_step)
        negdata.add_mult(vel, hmc_step)

    # closing half step for the velocity
    compute_gradient_mcRBM(*gradient_args)
    vel.add_mult(gradient, -0.5 * hmc_step)

    # energy at the proposed state
    compute_energy_mcRBM(negdata, normdata, vel, new_energy,
                         *model_and_scratch)

    # rejection: compare uniform draws against exp(old_energy - new_energy)
    old_energy.subtract(new_energy, target=thresh)
    cmt.exp(thresh)
    t4.fill_with_rand()
    t4.less_than(thresh)
    # flip the indicator so that 1 marks a rejected column
    t4.mult(-1)
    t4.add(1)
    t4.sum(axis=1, target=t5)
    t5.copy_to_host()
    rej = t5.numpy_array[0, 0]/batch_size

    # rejected columns of negdata are replaced by the matching data columns
    data.mult_by_row(t4, target=t6)
    negdata.mult_by_row(t4, target=t7)
    negdata.subtract(t7)
    negdata.add(t6)

    # adapt the step size towards the target rejection rate
    smoothed_rej = 0.9*hmc_ave_rej + 0.1*rej
    if smoothed_rej < hmc_target_ave_rej:
        adapted_step = min(hmc_step*1.01, 0.25)
    else:
        adapted_step = max(hmc_step*0.99, .001)
    return adapted_step, smoothed_rej


######################################################
# mcRBM trainer: sweeps over the training set.
# For each batch of samples compute derivatives to update the parameters
# at the training samples and at the negative samples drawn calling HMC sampler.
def train_mcRBM():
    """Train an mcRBM on the GPU, driven by the 'input_configuration' file.

    Configuration (read with ConfigParser from 'input_configuration'):
      VERBOSITY:               verbose
      MAIN_PARAMETER_SETTING:  num_epochs, batch_size, startFH, startwd, doPCD
      MODEL_PARAMETER_SETTING: num_fac, num_hid_cov, num_hid_mean, apply_mask
      DATA:                    data_file_name (.mat file with 'whitendata';
                               tile() also needs its 'invpcatransf' entry)
      OPTIMIZER_PARAMETERS:    epsilon, weightcost_final
      HMC_PARAMETERS:          hmc_step_nr, hmc_target_ave_rej

    For every minibatch the routine computes parameter derivatives at the
    training samples and at negative samples drawn with draw_HMC_samples
    (started from the data when doPCD == 0, from the previous negative
    samples otherwise), then updates the parameters.

    Side effects: sets the module global `demodata`; writes
    "100_whitened_data.png", a "ws_temp" checkpoint plus VF/w_mean/FH images
    after every epoch, and a final "ws_fac<..>_cov<..>_mean<..>" file.
    Returns None.
    """
    global demodata

    def to_gpu(values):
        # float32, Fortran-ordered copy of `values` uploaded as a CUDAMatrix
        return cmt.CUDAMatrix(np.array(values, dtype=np.float32, order='F'))

    config = ConfigParser()
    config.read('input_configuration')

    verbose = config.getint('VERBOSITY','verbose')

    num_epochs = config.getint('MAIN_PARAMETER_SETTING','num_epochs')
    batch_size = config.getint('MAIN_PARAMETER_SETTING','batch_size')
    startFH = config.getint('MAIN_PARAMETER_SETTING','startFH')
    startwd = config.getint('MAIN_PARAMETER_SETTING','startwd')
    doPCD = config.getint('MAIN_PARAMETER_SETTING','doPCD')

    # model parameters
    num_fac = config.getint('MODEL_PARAMETER_SETTING','num_fac')
    num_hid_cov = config.getint('MODEL_PARAMETER_SETTING','num_hid_cov')
    num_hid_mean = config.getint('MODEL_PARAMETER_SETTING','num_hid_mean')
    apply_mask = config.getint('MODEL_PARAMETER_SETTING','apply_mask')

    # load data
    data_file_name = config.get('DATA','data_file_name')
    d = loadmat(data_file_name) # input in the format PxD (P vectorized samples with D dimensions)
    demodata = d
    totnumcases = d["whitendata"].shape[0]
    # keep only whole minibatches; the slice bound must be an integer
    usable_cases = (totnumcases // batch_size) * batch_size
    d = d["whitendata"][0:usable_cases,:].copy()
    totnumcases = d.shape[0]
    num_vis = d.shape[1]
    num_batches = totnumcases // batch_size
    dev_dat = cmt.CUDAMatrix(d.T) # VxP

    tile(d[:100], "100_whitened_data.png")

    # training parameters
    epsilon = config.getfloat('OPTIMIZER_PARAMETERS','epsilon')
    epsilonVF = 2*epsilon
    epsilonFH = 0.02*epsilon
    epsilonb = 0.02*epsilon
    epsilonw_mean = 0.2*epsilon
    epsilonb_mean = 0.1*epsilon
    weightcost_final = config.getfloat('OPTIMIZER_PARAMETERS','weightcost_final')

    # HMC setting
    hmc_step_nr = config.getint('HMC_PARAMETERS','hmc_step_nr')
    hmc_step = 0.01
    hmc_target_ave_rej = config.getfloat('HMC_PARAMETERS','hmc_target_ave_rej')
    hmc_ave_rej = hmc_target_ave_rej

    # initialize weights
    VF = to_gpu(0.02 * np.random.randn(num_vis, num_fac)) # VxH
    if apply_mask == 0:
        FH = to_gpu(np.eye(num_fac,num_hid_cov)) # HxO
    else:
        dd = loadmat('your_FHinit_mask_file.mat') # see CVPR2010paper_material/topo2D_3x3_stride2_576filt.mat for an example
        FH = to_gpu(dd["FH"])
    bias_cov = to_gpu(2.0*np.ones((num_hid_cov, 1)))
    bias_vis = to_gpu(np.zeros((num_vis, 1)))
    w_mean = to_gpu(0.05 * np.random.randn(num_vis, num_hid_mean)) # VxH
    bias_mean = to_gpu(-2.0*np.ones((num_hid_mean,1)))

    # initialize variables to store derivatives
    VFinc = to_gpu(np.zeros((num_vis, num_fac)))
    FHinc = to_gpu(np.zeros((num_fac, num_hid_cov)))
    bias_covinc = to_gpu(np.zeros((num_hid_cov, 1)))
    bias_visinc = to_gpu(np.zeros((num_vis, 1)))
    w_meaninc = to_gpu(np.zeros((num_vis, num_hid_mean)))
    bias_meaninc = to_gpu(np.zeros((num_hid_mean, 1)))

    # initialize temporary storage
    # NOTE: `data` is rebound to a slice of dev_dat for every minibatch below
    data = to_gpu(np.empty((num_vis, batch_size))) # VxP
    normdata = to_gpu(np.empty((num_vis, batch_size))) # VxP
    negdataini = to_gpu(np.empty((num_vis, batch_size))) # VxP
    feat = to_gpu(np.empty((num_fac, batch_size)))
    featsq = to_gpu(np.empty((num_fac, batch_size)))
    negdata = to_gpu(np.random.randn(num_vis, batch_size))
    old_energy = to_gpu(np.zeros((1, batch_size)))
    new_energy = to_gpu(np.zeros((1, batch_size)))
    gradient = to_gpu(np.empty((num_vis, batch_size))) # VxP
    normgradient = to_gpu(np.empty((num_vis, batch_size))) # VxP
    thresh = to_gpu(np.zeros((1, batch_size)))
    feat_mean = to_gpu(np.empty((num_hid_mean, batch_size)))
    vel = to_gpu(np.random.randn(num_vis, batch_size))
    length = to_gpu(np.zeros((1, batch_size))) # 1xP
    lengthsq = to_gpu(np.zeros((1, batch_size))) # 1xP
    normcoeff = to_gpu(np.zeros((1, batch_size))) # 1xP
    if apply_mask == 1: # this used to constrain very large FH matrices only allowing to change values in a neighborhood
        # `dd` was already loaded above because apply_mask != 0
        mask = to_gpu(dd["mask"])
    normVF = 1
    small = 0.5

    # other temporary vars
    t1 = to_gpu(np.empty((num_hid_cov, batch_size)))
    t2 = to_gpu(np.empty((num_hid_cov, batch_size)))
    t3 = to_gpu(np.empty((num_fac, batch_size)))
    t4 = to_gpu(np.empty((1,batch_size)))
    t5 = to_gpu(np.empty((1,1)))
    t6 = to_gpu(np.empty((num_vis, batch_size)))
    t7 = to_gpu(np.empty((num_vis, batch_size)))
    t8 = to_gpu(np.empty((num_vis, num_fac)))
    t9 = to_gpu(np.zeros((num_fac, num_hid_cov)))
    t10 = to_gpu(np.empty((1,num_fac)))
    t11 = to_gpu(np.empty((1,num_hid_cov)))

    def print_stuff():
        # One-line report of parameter norms, scaled increment norms and the
        # HMC statistics; reads the current epoch's annealed learning rates.
        print("VF: " + '%3.2e' % VF.euclid_norm()
              + ", DVF: " + '%3.2e' % (VFinc.euclid_norm()*(epsilonVFc/batch_size))
              + ", VF_inc: " + '%3.2e' % (VFinc.euclid_norm())
              + ", FH: " + '%3.2e' % FH.euclid_norm()
              + ", DFH: " + '%3.2e' % (FHinc.euclid_norm()*(epsilonFHc/batch_size))
              + ", bias_cov: " + '%3.2e' % bias_cov.euclid_norm()
              + ", Dbias_cov: " + '%3.2e' % (bias_covinc.euclid_norm()*(epsilonbc/batch_size))
              + ", bias_vis: " + '%3.2e' % bias_vis.euclid_norm()
              + ", Dbias_vis: " + '%3.2e' % (bias_visinc.euclid_norm()*(epsilonbc/batch_size))
              + ", wm: " + '%3.2e' % w_mean.euclid_norm()
              + ", Dwm: " + '%3.2e' % (w_meaninc.euclid_norm()*(epsilonw_meanc/batch_size))
              + ", bm: " + '%3.2e' % bias_mean.euclid_norm()
              + ", Dbm: " + '%3.2e' % (bias_meaninc.euclid_norm()*(epsilonb_meanc/batch_size))
              + ", step: " + '%3.2e' % hmc_step
              + ", rej: " + '%3.2e' % hmc_ave_rej)
        sys.stdout.flush()

    def save_stuff():
        # Checkpoint all parameters to "ws_temp" and dump filter images for
        # the current epoch.
        VF.copy_to_host()
        FH.copy_to_host()
        bias_cov.copy_to_host()
        w_mean.copy_to_host()
        bias_mean.copy_to_host()
        bias_vis.copy_to_host()
        savemat("ws_temp", {
            'VF':VF.numpy_array,
            'FH':FH.numpy_array,
            'bias_cov': bias_cov.numpy_array,
            'bias_vis': bias_vis.numpy_array,
            'w_mean': w_mean.numpy_array,
            'bias_mean': bias_mean.numpy_array,
            'epoch':epoch})

        tile(VF.numpy_array.T, 'VF_%000i.png'%epoch)
        tile(w_mean.numpy_array.T, 'w_mean_%000i.png'%epoch)
        save_imshow(FH.numpy_array, 'FH_%000i.png'%epoch)

    # start training
    for epoch in range(num_epochs):

        # anneal learning rates (integer division: the divisor stays 1 for
        # the first 40 epochs, then grows by one every 20 epochs)
        anneal = max(1, epoch // 20)
        epsilonVFc = epsilonVF/anneal
        epsilonFHc = epsilonFH/anneal
        epsilonbc = epsilonb/anneal
        epsilonw_meanc = epsilonw_mean/anneal
        epsilonb_meanc = epsilonb_mean/anneal
        weightcost = weightcost_final

        if epoch <= startFH:
            epsilonFHc = 0
        if epoch <= startwd:
            weightcost = 0

        print("Epoch " + str(epoch + 1) + ' num_batches ' + str(num_batches))
        if epoch == 0:
            print_stuff()

        for batch in range(num_batches):

            # get current minibatch
            data = dev_dat.slice(batch*batch_size,(batch + 1)*batch_size) # DxP (nr dims x nr samples)

            # normalize input data
            data.mult(data, target = t6) # DxP
            t6.sum(axis = 0, target = lengthsq) # 1xP
            lengthsq.mult(1./num_vis) # normalize by number of components (like std)
            lengthsq.add(small) # small avoids division by 0
            cmt.sqrt(lengthsq, target = length)
            length.reciprocal(target = normcoeff) # 1xP
            data.mult_by_row(normcoeff, target = normdata) # normalized data
            ## compute positive sample derivatives
            # covariance part
            cmt.dot(VF.T, normdata, target = feat) # HxP (nr facs x nr samples)
            feat.mult(feat, target = featsq)   # HxP
            cmt.dot(FH.T,featsq, target = t1) # OxP (nr cov hiddens x nr samples)
            t1.mult(-0.5)
            t1.add_col_vec(bias_cov) # OxP
            t1.apply_sigmoid(target = t2) # OxP
            cmt.dot(featsq, t2.T, target = FHinc) # HxO
            cmt.dot(FH,t2, target = t3) # HxP
            t3.mult(feat)
            cmt.dot(normdata, t3.T, target = VFinc) # VxH
            t2.sum(axis = 1, target = bias_covinc)
            bias_covinc.mult(-1)
            # visible bias
            data.sum(axis = 1, target = bias_visinc)
            bias_visinc.mult(-1)
            # mean part
            cmt.dot(w_mean.T, data, target = feat_mean) # HxP (nr mean hiddens x nr samples)
            feat_mean.add_col_vec(bias_mean) # HxP
            feat_mean.apply_sigmoid() # HxP
            feat_mean.mult(-1)
            cmt.dot(data, feat_mean.T, target = w_meaninc)
            feat_mean.sum(axis = 1, target = bias_meaninc)

            # HMC sampling: draw an approximate sample from the model
            if doPCD == 0: # CD-1 (set negative data to current training samples)
                hmc_step, hmc_ave_rej = draw_HMC_samples(data,negdata,normdata,vel,gradient,normgradient,new_energy,old_energy,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,hmc_step,hmc_step_nr,hmc_ave_rej,hmc_target_ave_rej,t1,t2,t3,t4,t5,t6,t7,thresh,feat,featsq,batch_size,feat_mean,length,lengthsq,normcoeff,small,num_vis)
            else: # PCD-1 (use previous negative data as starting point for chain)
                negdataini.assign(negdata)
                hmc_step, hmc_ave_rej = draw_HMC_samples(negdataini,negdata,normdata,vel,gradient,normgradient,new_energy,old_energy,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,hmc_step,hmc_step_nr,hmc_ave_rej,hmc_target_ave_rej,t1,t2,t3,t4,t5,t6,t7,thresh,feat,featsq,batch_size,feat_mean,length,lengthsq,normcoeff,small,num_vis)

            # compute derivatives at the negative samples
            # normalize input data
            negdata.mult(negdata, target = t6) # DxP
            t6.sum(axis = 0, target = lengthsq) # 1xP
            lengthsq.mult(1./num_vis) # normalize by number of components (like std)
            lengthsq.add(small)
            cmt.sqrt(lengthsq, target = length)
            length.reciprocal(target = normcoeff) # 1xP
            negdata.mult_by_row(normcoeff, target = normdata) # normalized data
            # covariance part
            cmt.dot(VF.T, normdata, target = feat) # HxP
            feat.mult(feat, target = featsq)   # HxP
            cmt.dot(FH.T,featsq, target = t1) # OxP
            t1.mult(-0.5)
            t1.add_col_vec(bias_cov) # OxP
            t1.apply_sigmoid(target = t2) # OxP
            FHinc.subtract_dot(featsq, t2.T) # HxO
            FHinc.mult(0.5)
            cmt.dot(FH,t2, target = t3) # HxP
            t3.mult(feat)
            VFinc.subtract_dot(normdata, t3.T) # VxH
            bias_covinc.add_sums(t2, axis = 1)
            # visible bias
            bias_visinc.add_sums(negdata, axis = 1)
            # mean part
            cmt.dot(w_mean.T, negdata, target = feat_mean) # HxP
            feat_mean.add_col_vec(bias_mean) # HxP
            feat_mean.apply_sigmoid() # HxP
            w_meaninc.add_dot(negdata, feat_mean.T)
            bias_meaninc.add_sums(feat_mean, axis = 1)

            # update parameters
            VFinc.add_mult(VF.sign(), weightcost) # L1 regularization
            VF.add_mult(VFinc, -epsilonVFc/batch_size)
            # normalize columns of VF: normalize by running average of their norm
            VF.mult(VF, target = t8)
            t8.sum(axis = 0, target = t10)
            cmt.sqrt(t10)
            t10.sum(axis=1,target = t5)
            t5.copy_to_host()
            normVF = .95*normVF + (.05/num_fac) * t5.numpy_array[0,0] # estimate norm
            t10.reciprocal()
            VF.mult_by_row(t10)
            VF.mult(normVF)
            bias_cov.add_mult(bias_covinc, -epsilonbc/batch_size)
            bias_vis.add_mult(bias_visinc, -epsilonbc/batch_size)

            if epoch > startFH:
                FHinc.add_mult(FH.sign(), weightcost) # L1 regularization
                FH.add_mult(FHinc, -epsilonFHc/batch_size) # update
                # set to 0 negative entries in FH
                FH.greater_than(0, target = t9)
                FH.mult(t9)
                if apply_mask == 1:
                    FH.mult(mask)
                # normalize columns of FH: L1 norm set to 1 in each column
                FH.sum(axis = 0, target = t11)
                t11.reciprocal()
                FH.mult_by_row(t11)
            w_meaninc.add_mult(w_mean.sign(),weightcost)
            w_mean.add_mult(w_meaninc, -epsilonw_meanc/batch_size)
            bias_mean.add_mult(bias_meaninc, -epsilonb_meanc/batch_size)

        if verbose == 1:
            print_stuff()
        # back-up every once in a while (a period of 1 means every epoch)
        if np.mod(epoch,1) == 0:
            save_stuff()
    # final back-up
    VF.copy_to_host()
    FH.copy_to_host()
    bias_cov.copy_to_host()
    bias_vis.copy_to_host()
    w_mean.copy_to_host()
    bias_mean.copy_to_host()
    savemat("ws_fac" + str(num_fac) + "_cov" + str(num_hid_cov) + "_mean" + str(num_hid_mean), {'VF':VF.numpy_array,'FH':FH.numpy_array,'bias_cov': bias_cov.numpy_array, 'bias_vis': bias_vis.numpy_array, 'w_mean': w_mean.numpy_array, 'bias_mean': bias_mean.numpy_array, 'epoch':epoch})



###################################
# main
if __name__ == "__main__":
    # initialize CUDA
    #cmt.cuda_set_device(gpu_lock.obtain_lock_id()) # uncomment if you have a locking system or desire to choose the GPU board number
    cmt.cublas_init()
    try:
        cmt.CUDAMatrix.init_random(1)
        train_mcRBM()
    finally:
        # shut cuBLAS down even when training aborts with an exception
        cmt.cublas_shutdown()