diff deep/crbm/crbm.py @ 377:0b7e64e8e93f

branch merge
author Arnaud Bergeron <abergeron@gmail.com>
date Sun, 25 Apr 2010 17:12:03 -0400
parents 8d116d4a7593
children
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/crbm/crbm.py	Sun Apr 25 17:12:03 2010 -0400
@@ -0,0 +1,374 @@
+import sys
+import os, os.path
+
+import numpy
+
+import theano
+
+USING_GPU = "gpu" in theano.config.device
+
+import theano.tensor as T
+from theano.tensor.nnet import conv, sigmoid
+
+if not USING_GPU:
+    from theano.tensor.shared_randomstreams import RandomStreams
+else:
+    from theano.sandbox.rng_mrg import MRG_RandomStreams
+
+_PRINT_GRAPHS = True
+
+def _init_conv_biases(num_filters, varname, rng=numpy.random):
+    b_shp = (num_filters,)
+    b = theano.shared( numpy.asarray(
+                rng.uniform(low=-.5, high=.5, size=b_shp),
+                dtype=theano.config.floatX), name=varname)
+    return b
+
+def _init_conv_weights(conv_params, varname, rng=numpy.random):
+    cp = conv_params
+
+    # initialize shared variable for weights.
+    w_shp = cp.as_conv2d_shape_tuple()
+    w_bound = numpy.sqrt(cp.num_input_planes *
+                    cp.height_filters * cp.width_filters)
+    W = theano.shared( numpy.asarray(
+                rng.uniform(
+                    low=-1.0 / w_bound,
+                    high=1.0 / w_bound,
+                    size=w_shp),
+                dtype=theano.config.floatX), name=varname)
+
+    return W
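+
+# Worked example (illustrative values only): with num_input_planes=1 and
+# 12x12 filters, w_bound = sqrt(1*12*12) = 12, so each weight is drawn
+# uniformly from [-1/12, 1/12].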
+
+# Shape of W for conv2d
+class ConvolutionParams:
+    def __init__(self, num_filters, num_input_planes, height_filters, width_filters):
+        self.num_filters = num_filters
+        self.num_input_planes = num_input_planes
+        self.height_filters = height_filters
+        self.width_filters = width_filters
+
+    def as_conv2d_shape_tuple(self):
+        cp = self
+        return (cp.num_filters, cp.num_input_planes,
+                    cp.height_filters, cp.width_filters)
+
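+# Illustrative example (hypothetical sizes, not from any experiment):
+# ConvolutionParams(num_filters=16, num_input_planes=1,
+#                   height_filters=12, width_filters=12)
+# yields as_conv2d_shape_tuple() == (16, 1, 12, 12), i.e. the weight
+# tensor is indexed as (filter, input plane, row, col).
+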
+class CRBM:
+    def __init__(self, minibatch_size, image_size, conv_params,
+                 learning_rate, sparsity_lambda, sparsity_p):
+        '''
+        Parameters
+        ----------
+        minibatch_size
+            number of examples per minibatch
+        image_size
+            (height, width) of the input images
+        conv_params
+            ConvolutionParams instance describing the filters
+        learning_rate
+            step size used for the CD and sparsity updates
+        sparsity_lambda
+            scaling factor applied (together with the learning rate)
+            to the sparsity update
+        sparsity_p
+            target mean activation for the hidden units
+        '''
+        self.minibatch_size = minibatch_size
+        self.image_size = image_size
+        self.conv_params = conv_params
+
+        '''
+        Dimensions:
+        0- minibatch
+        1- plane/color
+        2- y (rows)
+        3- x (cols)
+        '''
+        self.x = T.tensor4('x')
+        self.h = T.tensor4('h')
+
+        self.lr = theano.shared(numpy.asarray(learning_rate,
+                                    dtype=theano.config.floatX))
+        self.sparsity_lambda = theano.shared(
+                numpy.asarray(sparsity_lambda,
+                    dtype=theano.config.floatX))
+        self.sparsity_p = theano.shared(
+                numpy.asarray(sparsity_p,
+                    dtype=theano.config.floatX))
+
+        self.numpy_rng = numpy.random.RandomState(1234)
+
+        if not USING_GPU:
+            self.theano_rng = RandomStreams(self.numpy_rng.randint(2**30))
+        else:
+            self.theano_rng = MRG_RandomStreams(234, use_cuda=True)
+
+        self._init_params()
+        self._init_functions()
+
+    def _get_visibles_shape(self):
+        imsz = self.image_size
+        return (self.minibatch_size,
+                    self.conv_params.num_input_planes,
+                    imsz[0], imsz[1])
+
+    def _get_hiddens_shape(self):
+        cp = self.conv_params
+        imsz = self.image_size
+        hf, wf = cp.height_filters, cp.width_filters
+        return (self.minibatch_size, cp.num_filters, 
+                    imsz[0] - hf + 1, imsz[1] - wf + 1)
+
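+    # Shape sketch for _get_hiddens_shape (illustrative numbers only):
+    # for 32x32 images and 12x12 filters each feature map is
+    # (32-12+1) x (32-12+1) = 21x21, so the hiddens tensor is
+    # (minibatch_size, num_filters, 21, 21).
+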
+    def _init_params(self):
+        cp = self.conv_params
+
+        self.W = _init_conv_weights(cp, 'W')
+        self.b_h = _init_conv_biases(cp.num_filters, 'b_h')
+        '''
+        Lee09 mentions "all visible units share a single bias c"
+        but for upper layers it's pretty clear we need one
+        per plane, by symmetry
+        '''
+        self.b_x = _init_conv_biases(cp.num_input_planes, 'b_x')
+
+        self.params = [self.W, self.b_h, self.b_x]
+
+        # flip filters horizontally and vertically
+        W_flipped = self.W[:, :, ::-1, ::-1]
+        # also have to invert the filters/num_planes
+        self.W_tilde = W_flipped.dimshuffle(1,0,2,3)
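+        # e.g. if W is (num_filters, num_input_planes, fh, fw), W_tilde is
+        # (num_input_planes, num_filters, fh, fw): the same filters,
+        # flipped, as seen from the visible side for propdown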
+
+    '''
+    I_up and I_down follow the notation used in the
+    Lee et al. 2009 CRBM paper.
+    '''
+    def _I_up(self, visibles_mb):
+        '''
+        The output of the convolution is a set of feature maps of size
+        image_size - filter_size + 1.
+        The dimshuffle serves to broadcast b_h so that it
+        corresponds to the output planes.
+        '''
+        fshp = self.conv_params.as_conv2d_shape_tuple()
+        return conv.conv2d(visibles_mb, self.W,
+                    filter_shape=fshp) + \
+                    self.b_h.dimshuffle('x',0,'x','x')
+
+    def _I_down(self, hiddens_mb):
+        '''
+        Notice border_mode='full': we want to get back the original
+        size, so the output is of size
+        feature_map_size + filter_size - 1.
+        The dimshuffle serves to broadcast b_x so that
+        it corresponds to the output planes.
+        '''
+        fshp = list(self.conv_params.as_conv2d_shape_tuple())
+        # num_filters and num_planes swapped
+        fshp[0], fshp[1] = fshp[1], fshp[0]
+        return conv.conv2d(hiddens_mb, self.W_tilde, 
+                    border_mode='full',filter_shape=tuple(fshp)) + \
+                self.b_x.dimshuffle('x',0,'x','x')
+
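+    # Size check for _I_down (illustrative numbers only): a 21x21 feature
+    # map convolved in 'full' mode with a 12x12 filter gives back
+    # 21+12-1 = 32 rows and columns, i.e. the original image size.
+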
+    def _mean_free_energy(self, visibles_mb):
+        '''
+        visibles_mb is mb_size x num_planes x h x w.
+
+        We want to match the summed input planes (second dimension;
+        the first is the minibatch index) to their respective
+        visible bias terms.
+        The dimshuffle isn't really necessary,
+        but I put it there for clarity.
+        '''
+        vbias_term = \
+            self.b_x.dimshuffle('x',0) * \
+            T.sum(visibles_mb,axis=[2,3])
+        # now sum the per-plane terms to get one free energy
+        # contribution per element of the minibatch
+        vbias_term = - T.sum(vbias_term, axis=1)
+
+        '''
+        Here it's a bit more complex; a few points:
+
+        - The usual free energy, in the fully connected case,
+          is a sum over all hiddens. We do the same thing here,
+          but each unit has limited connectivity and the weights
+          are reused, so we first do the convolutions (with I_up),
+          which give us what would normally be W.x + b_h for each
+          hidden. We then take log(1 + exp(.)) elementwise for each
+          hidden, and sum over all hiddens of one example of the
+          minibatch.
+
+        - Notice that we reuse the same b_h everywhere instead of
+          using one bias per hidden, so the broadcasting of b_h
+          done in I_up is all right.
+
+        That sum is over all hiddens, i.e. all filters (planes of
+        hiddens), x and y. In the end we get one free energy
+        contribution per example of the minibatch.
+        '''
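+        # In formula form (hiddens indexed by filter g and position (i,j);
+        # this just restates the docstring above):
+        #   F(x) = - sum_k b_x[k] * sum_{i,j} x[k,i,j]
+        #          - sum_{g,i,j} log(1 + exp(I_up(x)[g,i,j]))
+        # and we return the mean of F over the minibatch.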
+        softplused = T.log(1.0+T.exp(self._I_up(visibles_mb)))
+        # h_sz = self._get_hiddens_shape()
+        # this simplifies the sum
+        # num_hiddens = h_sz[1] * h_sz[2] * h_sz[3]
+        # reshaped = T.reshape(softplused, 
+        #       (self.minibatch_size, num_hiddens))
+
+        # this is because the 0,1,1,1 sum pattern is not 
+        # implemented on gpu, but the 1,0,1,1 pattern is
+        dimshuffled = softplused.dimshuffle(1,0,2,3)
+        xh_and_hbias_term = - T.sum(dimshuffled, axis=[0,2,3])
+
+        '''
+        Both xh_and_hbias_term and vbias_term end up with one
+        free energy contribution per example of the minibatch,
+        so we take the mean over the minibatch.
+        '''
+        return T.mean(vbias_term + xh_and_hbias_term)
+
+    def _init_functions(self):
+        # propup
+        # b_h is broadcasted keeping in mind we want it to
+        # correspond to each new plane (corresponding to filters)
+        I_up = self._I_up(self.x)
+        # expected values for the distributions for each hidden
+        E_h_given_x = sigmoid(I_up) 
+        # might be needed if we ever want a version where we
+        # take expectations instead of samples for CD learning
+        self.E_h_given_x_func = theano.function([self.x], E_h_given_x)
+
+        if _PRINT_GRAPHS:
+            print "----------------------\nE_h_given_x_func"
+            theano.printing.debugprint(self.E_h_given_x_func)
+
+        h_sample_given_x = \
+            self.theano_rng.binomial( \
+                            size = self._get_hiddens_shape(),
+                            n = 1, 
+                            p = E_h_given_x, 
+                            dtype = theano.config.floatX)
+
+        self.h_sample_given_x_func = \
+            theano.function([self.x],
+                    h_sample_given_x)
+
+        if _PRINT_GRAPHS:
+            print "----------------------\nh_sample_given_x_func"
+            theano.printing.debugprint(self.h_sample_given_x_func)
+
+        # propdown
+        I_down = self._I_down(self.h)
+        E_x_given_h = sigmoid(I_down)
+        self.E_x_given_h_func = theano.function([self.h], E_x_given_h)
+
+        if _PRINT_GRAPHS:
+            print "----------------------\nE_x_given_h_func"
+            theano.printing.debugprint(self.E_x_given_h_func)
+
+        x_sample_given_h = \
+            self.theano_rng.binomial( \
+                            size = self._get_visibles_shape(),
+                            n = 1, 
+                            p = E_x_given_h, 
+                            dtype = theano.config.floatX)
+
+        self.x_sample_given_h_func = \
+            theano.function([self.h], 
+                    x_sample_given_h)
+
+        if _PRINT_GRAPHS:
+            print "----------------------\nx_sample_given_h_func"
+            theano.printing.debugprint(self.x_sample_given_h_func)
+
+        ##############################################
+        # cd update done by grad of free energy
+        
+        x_tilde = T.tensor4('x_tilde') 
+        cd_update_cost = self._mean_free_energy(self.x) - \
+                            self._mean_free_energy(x_tilde)
+
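+        # This is the usual CD trick: the intractable part of the
+        # negative log-likelihood gradient is approximated by
+        #   d/dtheta [ F(x) - F(x_tilde) ],
+        # where x_tilde is a reconstruction of x (obtained here from one
+        # up-down Gibbs step; see CD_step below).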
+        cd_grad = T.grad(cd_update_cost, self.params)
+        # This is NLL minimization so we use a -
+        cd_updates = {self.W: self.W - self.lr * cd_grad[0],
+                    self.b_h: self.b_h - self.lr * cd_grad[1],
+                    self.b_x: self.b_x - self.lr * cd_grad[2]}
+
+        cd_returned = [cd_update_cost,
+                        cd_grad[0], cd_grad[1], cd_grad[2],
+                        self.lr * cd_grad[0],
+                        self.lr * cd_grad[1],
+                        self.lr * cd_grad[2]]
+        self.cd_return_desc = \
+            ['cd_update_cost',
+                'cd_grad_W', 'cd_grad_b_h', 'cd_grad_b_x',
+                'lr_times_cd_grad_W',
+                'lr_times_cd_grad_b_h',
+                'lr_times_cd_grad_b_x']
+    
+        self.cd_update_function = \
+                theano.function([self.x, x_tilde], 
+                    cd_returned, updates=cd_updates)
+
+        if _PRINT_GRAPHS:
+            print "----------------------\ncd_update_function"
+            theano.printing.debugprint(self.cd_update_function)
+
+        ##############
+        # sparsity update, based on grad for b_h only
+
+        '''
+        This mean returns an array of shape
+        (num_filters, feature_map_height, feature_map_width),
+        i.e. the mean activation of each hidden unit over the
+        minibatch.
+        '''
+        mean_expected_activation = T.mean(E_h_given_x, axis=0)
+        # sparsity_p is broadcasted everywhere
+        sparsity_update_cost = \
+                T.sqr(self.sparsity_p - mean_expected_activation)
+        sparsity_update_cost = \
+            T.sum(T.sum(T.sum( \
+                sparsity_update_cost, axis=2), axis=1), axis=0)
+        sparsity_grad = T.grad(sparsity_update_cost, [self.W, self.b_h])
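+        # In other words, the penalty is the sum over all hidden units of
+        #   (sparsity_p - mean_over_minibatch(E[h|x]))^2
+        # and only W and b_h are pushed toward that target.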
+
+        sparsity_returned = \
+            [sparsity_update_cost,
+             sparsity_grad[0], sparsity_grad[1],
+             self.sparsity_lambda * self.lr * sparsity_grad[0],
+             self.sparsity_lambda * self.lr * sparsity_grad[1]]
+        self.sparsity_return_desc = \
+            ['sparsity_update_cost',
+                'sparsity_grad_W',
+                'sparsity_grad_b_h',
+                'lambda_lr_times_sparsity_grad_W',
+                'lambda_lr_times_sparsity_grad_b_h']
+
+        # gradient _descent_ so we use a -
+        sparsity_update = \
+            {self.b_h: self.b_h - \
+                self.sparsity_lambda * self.lr * sparsity_grad[1],
+            self.W: self.W - \
+                self.sparsity_lambda * self.lr * sparsity_grad[0]}
+        self.sparsity_update_function = \
+            theano.function([self.x], 
+                sparsity_returned, updates=sparsity_update)
+
+        if _PRINT_GRAPHS:
+            print "----------------------\nsparsity_update_function"
+            theano.printing.debugprint(self.sparsity_update_function)
+
+    def CD_step(self, x):
+        h1 = self.h_sample_given_x_func(x)
+        x2 = self.x_sample_given_h_func(h1)
+        return self.cd_update_function(x, x2)
+
+    def sparsity_step(self, x):
+        return self.sparsity_update_function(x)
+
+    # these two also operate on minibatches
+
+    def random_gibbs_samples(self, num_updown_steps):
+        start_x = self.numpy_rng.rand(*self._get_visibles_shape())
+        return self.gibbs_samples_from(start_x, num_updown_steps)
+
+    def gibbs_samples_from(self, start_x, num_updown_steps):
+        x_sample = start_x
+        for i in xrange(num_updown_steps):
+            h_sample = self.h_sample_given_x_func(x_sample)
+            x_sample = self.x_sample_given_h_func(h_sample)
+        return x_sample
+
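+
+# Minimal usage sketch: the sizes and hyperparameters below are made-up
+# illustrative values, and a real run would loop over an actual dataset
+# instead of a single random minibatch.
+if __name__ == '__main__':
+    cp = ConvolutionParams(num_filters=4, num_input_planes=1,
+                           height_filters=5, width_filters=5)
+    crbm = CRBM(minibatch_size=2, image_size=(16, 16), conv_params=cp,
+                learning_rate=0.01, sparsity_lambda=0.1, sparsity_p=0.05)
+    # one random minibatch with the visibles shape expected by the model
+    x = numpy.asarray(crbm.numpy_rng.rand(2, 1, 16, 16),
+                      dtype=theano.config.floatX)
+    # one CD-1 step followed by one sparsity step
+    print 'cd_update_cost:', crbm.CD_step(x)[0]
+    print 'sparsity_update_cost:', crbm.sparsity_step(x)[0]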
+