changeset 337:8d116d4a7593

Added convolutional RBM code (à la Lee et al. 2009), imported from my working directory elsewhere. Seems to work for one layer. No subsampling yet.
author fsavard
date Fri, 16 Apr 2010 16:05:55 -0400
parents a79db7cee035
children fca22114bb23
files deep/crbm/crbm.py deep/crbm/mnist_crbm.py
diffstat 2 files changed, 589 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/crbm/crbm.py	Fri Apr 16 16:05:55 2010 -0400
@@ -0,0 +1,374 @@
+import sys
+import os, os.path
+
+import numpy
+
+import theano
+
+USING_GPU = "gpu" in theano.config.device
+
+import theano.tensor as T
+from theano.tensor.nnet import conv, sigmoid
+
+if not USING_GPU:
+    from theano.tensor.shared_randomstreams import RandomStreams
+else:
+    from theano.sandbox.rng_mrg import MRG_RandomStreams
+
+_PRINT_GRAPHS = True
+
+def _init_conv_biases(num_filters, varname, rng=numpy.random):
+    b_shp = (num_filters,)
+    b = theano.shared( numpy.asarray(
+                rng.uniform(low=-.5, high=.5, size=b_shp),
+                dtype=theano.config.floatX), name=varname)
+    return b
+
+def _init_conv_weights(conv_params, varname, rng=numpy.random):
+    cp = conv_params
+
+    # initialize shared variable for weights.
+    w_shp = conv_params.as_conv2d_shape_tuple()
+    w_bound =  numpy.sqrt(cp.num_input_planes * \
+                    cp.height_filters * cp.width_filters)
+    W = theano.shared( numpy.asarray(
+                rng.uniform(
+                    low=-1.0 / w_bound,
+                    high=1.0 / w_bound,
+                    size=w_shp),
+                dtype=theano.config.floatX), name=varname)
+
+    return W
+
+# Holds the parameters defining the shape of W for conv2d
+class ConvolutionParams:
+    def __init__(self, num_filters, num_input_planes, height_filters, width_filters):
+        self.num_filters = num_filters
+        self.num_input_planes = num_input_planes
+        self.height_filters = height_filters
+        self.width_filters = width_filters
+
+    def as_conv2d_shape_tuple(self):
+        cp = self
+        return (cp.num_filters, cp.num_input_planes,
+                    cp.height_filters, cp.width_filters)
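+
+# Illustrative sketch (values assumed from mnist_crbm.py): with
+# ConvolutionParams(num_filters=40, num_input_planes=1,
+#                   height_filters=12, width_filters=12),
+# as_conv2d_shape_tuple() returns (40, 1, 12, 12), which is the
+# filter_shape tuple conv2d expects for W.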
+
+class CRBM:
+    def __init__(self, minibatch_size, image_size, conv_params,
+                 learning_rate, sparsity_lambda, sparsity_p):
+        '''
+        Parameters
+        ----------
+        image_size
+            height, width
+        '''
+        self.minibatch_size = minibatch_size
+        self.image_size = image_size
+        self.conv_params = conv_params
+
+        '''
+        Dimensions:
+        0- minibatch
+        1- plane/color
+        2- y (rows)
+        3- x (cols)
+        '''
+        self.x = T.tensor4('x')
+        self.h = T.tensor4('h')
+
+        self.lr = theano.shared(numpy.asarray(learning_rate,
+                                    dtype=theano.config.floatX))
+        self.sparsity_lambda = \
+                theano.shared( \
+                    numpy.asarray( \
+                        sparsity_lambda, 
+                        dtype=theano.config.floatX))
+        self.sparsity_p = \
+                theano.shared( \
+                    numpy.asarray(sparsity_p, \
+                            dtype=theano.config.floatX))
+
+        self.numpy_rng = numpy.random.RandomState(1234)
+
+        if not USING_GPU:
+            self.theano_rng = RandomStreams(self.numpy_rng.randint(2**30))
+        else:
+            self.theano_rng = MRG_RandomStreams(234, use_cuda=True)
+
+        self._init_params()
+        self._init_functions()
+
+    def _get_visibles_shape(self):
+        imsz = self.image_size
+        return (self.minibatch_size,
+                    self.conv_params.num_input_planes,
+                    imsz[0], imsz[1])
+
+    def _get_hiddens_shape(self):
+        cp = self.conv_params
+        imsz = self.image_size
+        # height first, then width, matching the (rows, cols)
+        # convention used for image_size
+        hf, wf = cp.height_filters, cp.width_filters
+        return (self.minibatch_size, cp.num_filters,
+                    imsz[0] - hf + 1, imsz[1] - wf + 1)
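+
+    # Shape sketch (values assumed from mnist_crbm.py): a 28x28
+    # image with 12x12 filters gives 'valid'-mode feature maps
+    # of size 28 - 12 + 1 = 17, so the hiddens shape is
+    # (minibatch_size, 40, 17, 17).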
+
+    def _init_params(self):
+        cp = self.conv_params
+
+        self.W = _init_conv_weights(cp, 'W')
+        self.b_h = _init_conv_biases(cp.num_filters, 'b_h')
+        '''
+        Lee09 mentions "all visible units share a single bias c"
+        but for upper layers it's pretty clear we need one
+        per plane, by symmetry
+        '''
+        self.b_x = _init_conv_biases(cp.num_input_planes, 'b_x')
+
+        self.params = [self.W, self.b_h, self.b_x]
+
+        # flip filters horizontally and vertically
+        W_flipped = self.W[:, :, ::-1, ::-1]
+        # also have to swap the filters/num_planes dimensions
+        self.W_tilde = W_flipped.dimshuffle(1,0,2,3)
+
+    '''
+    I_up and I_down are named after the symbols used in the
+    Lee 2009 CRBM paper.
+    '''
+    def _I_up(self, visibles_mb):
+        '''
+        The output of the conv is feature maps of size
+                image_size - filter_size + 1.
+        The dimshuffle serves to broadcast b_h so that it
+        corresponds to output planes.
+        '''
+        fshp = self.conv_params.as_conv2d_shape_tuple()
+        return conv.conv2d(visibles_mb, self.W,
+                    filter_shape=fshp) + \
+                    self.b_h.dimshuffle('x',0,'x','x')
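+
+    # Shape sketch for _I_up (assumed MNIST setup): visibles of
+    # shape (mb, 1, 28, 28) convolved with W of shape
+    # (40, 1, 12, 12) in 'valid' mode give (mb, 40, 17, 17);
+    # the ('x',0,'x','x') dimshuffle lets b_h broadcast over
+    # dimensions 0, 2 and 3.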
+
+    def _I_down(self, hiddens_mb):
+        '''
+        Notice border_mode='full': we want to get back the
+        original size, i.e. feature_map_size + filter_size - 1.
+        The dimshuffle serves to broadcast b_x so that
+        it corresponds to output planes.
+        '''
+        fshp = list(self.conv_params.as_conv2d_shape_tuple())
+        # num_filters and num_planes swapped
+        fshp[0], fshp[1] = fshp[1], fshp[0]
+        return conv.conv2d(hiddens_mb, self.W_tilde, 
+                    border_mode='full',filter_shape=tuple(fshp)) + \
+                self.b_x.dimshuffle('x',0,'x','x')
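+
+    # Shape sketch for _I_down (assumed MNIST setup): hiddens of
+    # shape (mb, 40, 17, 17) convolved with W_tilde of shape
+    # (1, 40, 12, 12) in 'full' mode give
+    # (mb, 1, 17+12-1, 17+12-1) = (mb, 1, 28, 28), the original
+    # visible shape.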
+
+    def _mean_free_energy(self, visibles_mb):
+        '''
+        visibles_mb is mb_size x num_planes x h x w
+
+        we want to match the summed input planes
+            (second dimension, first is mb index)
+        to respective bias terms for the visibles
+        The dimshuffle isn't really necessary, 
+            but I put it there for clarity.
+        '''
+        vbias_term = \
+            self.b_x.dimshuffle('x',0) * \
+            T.sum(visibles_mb,axis=[2,3])
+        # now sum over term per planes, get one free energy 
+        # contribution per element of minibatch
+        vbias_term = - T.sum(vbias_term, axis=1)
+
+        '''
+        Here it's a bit more complex, a few points:
+        - The usual free energy, in the fully connected case,
+          is a sum over all hiddens. We do the same thing here,
+          but each unit has limited connectivity and there's
+          weight reuse. Therefore we only need to first do the
+          convolutions (with I_up), which gives us what would
+          normally be the Wx+b_h for each hidden. Once we have
+          this, we take log(1+exp(.)) of that input elementwise
+          for each hidden, then sum over all hiddens in one
+          example of the minibatch.
+        - Notice that we reuse the same b_h everywhere instead
+          of using one b per hidden, so the broadcasting for
+          b_h done in I_up is all right.
+
+        That sum is over all hiddens, so over all filters
+        (planes of hiddens), x and y. In the end we get one
+        free energy contribution per example of the minibatch.
+        '''
+        softplused = T.log(1.0+T.exp(self._I_up(visibles_mb)))
+        # h_sz = self._get_hiddens_shape()
+        # this simplifies the sum
+        # num_hiddens = h_sz[1] * h_sz[2] * h_sz[3]
+        # reshaped = T.reshape(softplused, 
+        #       (self.minibatch_size, num_hiddens))
+
+        # this is because the 0,1,1,1 sum pattern is not 
+        # implemented on gpu, but the 1,0,1,1 pattern is
+        dimshuffled = softplused.dimshuffle(1,0,2,3)
+        xh_and_hbias_term = - T.sum(dimshuffled, axis=[0,2,3])
+
+        '''
+        both bias_term and vbias_term end up with one
+        contributor to free energy per minibatch
+        so we mean over minibatches
+        '''
+        return T.mean(vbias_term + xh_and_hbias_term)
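+
+    # In formula form, for one example x of the minibatch (with
+    # sums over visible planes c, pixels (i,j) and hidden units
+    # (k,i,j)):
+    #   F(x) = - sum_c b_x[c] * sum_{i,j} x[c,i,j]
+    #          - sum_{k,i,j} log(1 + exp(I_up(x)[k,i,j]))
+    # and the method above returns the mean of F over the
+    # minibatch.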
+
+    def _init_functions(self):
+        # propup
+        # b_h is broadcasted keeping in mind we want it to
+        # correspond to each new plane (corresponding to filters)
+        I_up = self._I_up(self.x)
+        # expected values for the distributions for each hidden
+        E_h_given_x = sigmoid(I_up) 
+        # might be needed if we ever want a version where we
+        # take expectations instead of samples for CD learning
+        self.E_h_given_x_func = theano.function([self.x], E_h_given_x)
+
+        if _PRINT_GRAPHS:
+            print "----------------------\nE_h_given_x_func"
+            theano.printing.debugprint(self.E_h_given_x_func)
+
+        h_sample_given_x = \
+            self.theano_rng.binomial( \
+                            size = self._get_hiddens_shape(),
+                            n = 1, 
+                            p = E_h_given_x, 
+                            dtype = theano.config.floatX)
+
+        self.h_sample_given_x_func = \
+            theano.function([self.x],
+                    h_sample_given_x)
+
+        if _PRINT_GRAPHS:
+            print "----------------------\nh_sample_given_x_func"
+            theano.printing.debugprint(self.h_sample_given_x_func)
+
+        # propdown
+        I_down = self._I_down(self.h)
+        E_x_given_h = sigmoid(I_down)
+        self.E_x_given_h_func = theano.function([self.h], E_x_given_h)
+
+        if _PRINT_GRAPHS:
+            print "----------------------\nE_x_given_h_func"
+            theano.printing.debugprint(self.E_x_given_h_func)
+
+        x_sample_given_h = \
+            self.theano_rng.binomial( \
+                            size = self._get_visibles_shape(),
+                            n = 1, 
+                            p = E_x_given_h, 
+                            dtype = theano.config.floatX)
+
+        self.x_sample_given_h_func = \
+            theano.function([self.h], 
+                    x_sample_given_h)
+
+        if _PRINT_GRAPHS:
+            print "----------------------\nx_sample_given_h_func"
+            theano.printing.debugprint(self.x_sample_given_h_func)
+
+        ##############################################
+        # cd update done by grad of free energy
+        
+        x_tilde = T.tensor4('x_tilde') 
+        cd_update_cost = self._mean_free_energy(self.x) - \
+                            self._mean_free_energy(x_tilde)
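+
+        # CD approximates the negative log-likelihood gradient as
+        #   dNLL/dtheta ~= dF(x)/dtheta - dF(x_tilde)/dtheta
+        # where x_tilde is the Gibbs reconstruction of x, so we
+        # can let theano differentiate the cost above directly.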
+
+        cd_grad = T.grad(cd_update_cost, self.params)
+        # This is NLL minimization so we use a -
+        cd_updates = {self.W: self.W - self.lr * cd_grad[0],
+                    self.b_h: self.b_h - self.lr * cd_grad[1],
+                    self.b_x: self.b_x - self.lr * cd_grad[2]}
+
+        cd_returned = [cd_update_cost,
+                        cd_grad[0], cd_grad[1], cd_grad[2],
+                        self.lr * cd_grad[0],
+                        self.lr * cd_grad[1],
+                        self.lr * cd_grad[2]]
+        self.cd_return_desc = \
+            ['cd_update_cost',
+                'cd_grad_W', 'cd_grad_b_h', 'cd_grad_b_x',
+                'lr_times_cd_grad_W',
+                'lr_times_cd_grad_b_h',
+                'lr_times_cd_grad_b_x']
+    
+        self.cd_update_function = \
+                theano.function([self.x, x_tilde], 
+                    cd_returned, updates=cd_updates)
+
+        if _PRINT_GRAPHS:
+            print "----------------------\ncd_update_function"
+            theano.printing.debugprint(self.cd_update_function)
+
+        ##############
+        # sparsity update, based on grad for b_h only
+
+        '''
+        This mean returns an array of shape
+            (num_filters, feature_map_height, feature_map_width),
+        i.e. the mean over the minibatch of each hidden unit's
+        expected activation.
+        '''
+        mean_expected_activation = T.mean(E_h_given_x, axis=0)
+        # sparsity_p is broadcasted everywhere
+        sparsity_update_cost = \
+                T.sqr(self.sparsity_p - mean_expected_activation)
+        sparsity_update_cost = \
+            T.sum(T.sum(T.sum( \
+                sparsity_update_cost, axis=2), axis=1), axis=0)
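+
+        # i.e. the penalty is sum_{k,i,j} (p - q[k,i,j])^2 where
+        # q[k,i,j] is the minibatch-mean expected activation of
+        # hidden unit (k,i,j); the update below nudges each
+        # unit's mean activation towards sparsity_p.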
+        sparsity_grad = T.grad(sparsity_update_cost, [self.W, self.b_h])
+
+        sparsity_returned = \
+            [sparsity_update_cost,
+             sparsity_grad[0], sparsity_grad[1],
+             self.sparsity_lambda * self.lr * sparsity_grad[0],
+             self.sparsity_lambda * self.lr * sparsity_grad[1]]
+        self.sparsity_return_desc = \
+            ['sparsity_update_cost',
+                'sparsity_grad_W',
+                'sparsity_grad_b_h',
+                'lambda_lr_times_sparsity_grad_W',
+                'lambda_lr_times_sparsity_grad_b_h']
+
+        # gradient _descent_ so we use a -
+        sparsity_update = \
+            {self.b_h: self.b_h - \
+                self.sparsity_lambda * self.lr * sparsity_grad[1],
+            self.W: self.W - \
+                self.sparsity_lambda * self.lr * sparsity_grad[0]}
+        self.sparsity_update_function = \
+            theano.function([self.x], 
+                sparsity_returned, updates=sparsity_update)
+
+        if _PRINT_GRAPHS:
+            print "----------------------\nsparsity_update_function"
+            theano.printing.debugprint(self.sparsity_update_function)
+
+    def CD_step(self, x):
+        h1 = self.h_sample_given_x_func(x)
+        x2 = self.x_sample_given_h_func(h1)
+        return self.cd_update_function(x, x2)
+
+    def sparsity_step(self, x):
+        return self.sparsity_update_function(x)
+
+    # these two also operate on minibatches
+
+    def random_gibbs_samples(self, num_updown_steps):
+        start_x = self.numpy_rng.rand(*self._get_visibles_shape())
+        return self.gibbs_samples_from(start_x, num_updown_steps)
+
+    def gibbs_samples_from(self, start_x, num_updown_steps):
+        x_sample = start_x
+        for i in xrange(num_updown_steps):
+            h_sample = self.h_sample_given_x_func(x_sample)
+            x_sample = self.x_sample_given_h_func(h_sample)
+        return x_sample
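+
+# Minimal smoke test, meant only as a usage sketch (assumes a
+# working theano install; the small shapes below are arbitrary):
+if __name__ == '__main__':
+    _cp = ConvolutionParams(num_filters=4, num_input_planes=1,
+                            height_filters=5, width_filters=5)
+    _crbm = CRBM(minibatch_size=2, image_size=(12, 12),
+                 conv_params=_cp, learning_rate=0.01,
+                 sparsity_lambda=1.0, sparsity_p=0.05)
+    _x = numpy.random.rand(2, 1, 12, 12).astype(theano.config.floatX)
+    # one CD step plus one sparsity step, as in mnist_crbm.py
+    print "CD cost on random data:", _crbm.CD_step(_x)[0]
+    _crbm.sparsity_step(_x)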
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/crbm/mnist_crbm.py	Fri Apr 16 16:05:55 2010 -0400
@@ -0,0 +1,215 @@
+#!/usr/bin/python
+
+import sys
+import os, os.path
+
+import numpy as N
+
+import theano
+import theano.tensor as T
+
+from crbm import CRBM, ConvolutionParams
+
+from pylearn.datasets import MNIST
+from pylearn.io.image_tiling import tile_raster_images
+
+import Image
+
+from pylearn.io.seriestables import *
+import tables
+
+IMAGE_OUTPUT_DIR = 'img/'
+if not os.path.isdir(IMAGE_OUTPUT_DIR):
+    os.makedirs(IMAGE_OUTPUT_DIR)
+
+REDUCE_EVERY = 100
+
+def filename_from_time(suffix):
+    import datetime
+    return str(datetime.datetime.now()) + suffix + ".png"
+
+# Just a shortcut for a common case where we need a few
+# related Error (float) series
+def get_accumulator_series_array( \
+                hdf5_file, group_name, series_names, 
+                reduce_every,
+                index_names=('epoch','minibatch'),
+                stdout_too=True,
+                skip_hdf5_append=False):
+    all_series = []
+
+    hdf5_file.createGroup('/', group_name)
+
+    other_targets = []
+    if stdout_too:
+        other_targets = [StdoutAppendTarget()]
+
+    for sn in series_names:
+        series_base = \
+            ErrorSeries(error_name=sn,
+                table_name=sn,
+                hdf5_file=hdf5_file,
+                hdf5_group='/'+group_name,
+                index_names=index_names,
+                other_targets=other_targets,
+                skip_hdf5_append=skip_hdf5_append)
+
+        all_series.append( \
+            AccumulatorSeriesWrapper( \
+                    base_series=series_base,
+                    reduce_every=reduce_every))
+
+    ret_wrapper = SeriesArrayWrapper(all_series)
+
+    return ret_wrapper
+
+class MnistCrbm(object):
+    def __init__(self):
+        self.mnist = MNIST.full()  # or MNIST.first_10k() for quicker runs
+
+        self.cp = ConvolutionParams( \
+                    num_filters=40,
+                    num_input_planes=1,
+                    height_filters=12,
+                    width_filters=12)
+
+        self.image_size = (28,28)
+
+        self.minibatch_size = 10
+
+        self.lr = 0.01
+        self.sparsity_lambda = 1.0
+        # target mean activation; 40 * 0.05 = 2, so on average
+        # about 2 of the 40 filters active for any given pixel
+        self.sparsity_p = 0.05
+
+        self.crbm = CRBM( \
+                    minibatch_size=self.minibatch_size,
+                    image_size=self.image_size,
+                    conv_params=self.cp,
+                    learning_rate=self.lr,
+                    sparsity_lambda=self.sparsity_lambda,
+                    sparsity_p=self.sparsity_p)
+        
+        self.num_epochs = 10
+
+        self.init_series()
+ 
+    def init_series(self):
+
+        series = {}
+
+        basedir = os.getcwd()
+
+        h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w")
+
+        cd_series_names = self.crbm.cd_return_desc
+        series['cd'] = \
+            get_accumulator_series_array( \
+                h5f, 'cd', cd_series_names,
+                REDUCE_EVERY,
+                stdout_too=True)
+
+        sparsity_series_names = self.crbm.sparsity_return_desc
+        series['sparsity'] = \
+            get_accumulator_series_array( \
+                h5f, 'sparsity', sparsity_series_names,
+                REDUCE_EVERY,
+                stdout_too=True)
+
+        # create the names for each table based on the position
+        # of each param in the params array
+        params_stdout = StdoutAppendTarget("\n------\nParams")
+        series['params'] = SharedParamsStatisticsWrapper(
+                            new_group_name="params",
+                            base_group="/",
+                            arrays_names=['W','b_h','b_x'],
+                            hdf5_file=h5f,
+                            index_names=('epoch','minibatch'),
+                            other_targets=[params_stdout])
+
+        self.series = series
+
+    def train(self):
+        num_minibatches = len(self.mnist.train.x) / self.minibatch_size
+
+        print_every = 1000
+        visualize_every = 5000
+        gibbs_steps_from_random = 1000
+
+        for epoch in xrange(self.num_epochs):
+            for mb_index in xrange(num_minibatches):
+                # stride by minibatch_size; otherwise successive
+                # minibatches would overlap on all but one example
+                start = mb_index * self.minibatch_size
+                mb_x = self.mnist.train.x \
+                         [start : start + self.minibatch_size]
+                mb_x = mb_x.reshape((self.minibatch_size, 1, 28, 28))
+
+                #E_h = crbm.E_h_given_x_func(mb_x)
+                #print "Shape of E_h", E_h.shape
+
+                cd_return = self.crbm.CD_step(mb_x)
+                sp_return = self.crbm.sparsity_step(mb_x)
+
+                self.series['cd'].append( \
+                        (epoch, mb_index), cd_return)
+                self.series['sparsity'].append( \
+                        (epoch, mb_index), sp_return)
+
+                total_idx = epoch*num_minibatches + mb_index
+
+                if (total_idx+1) % REDUCE_EVERY == 0:
+                    self.series['params'].append( \
+                        (epoch, mb_index), self.crbm.params)
+
+                if total_idx % visualize_every == 0:
+                    self.visualize_gibbs_result(\
+                        mb_x, gibbs_steps_from_random)
+                    self.visualize_gibbs_result(mb_x, 1)
+                    self.visualize_filters()
+    
+    def visualize_gibbs_result(self, start_x, gibbs_steps):
+        # Run minibatch_size chains for gibbs_steps
+        x_samples = None
+        if start_x is not None:
+            x_samples = self.crbm.gibbs_samples_from(start_x, gibbs_steps)
+        else:
+            x_samples = self.crbm.random_gibbs_samples(gibbs_steps)
+        x_samples = x_samples.reshape((self.minibatch_size, 28*28))
+ 
+        tile = tile_raster_images(x_samples, self.image_size,
+                    (1, self.minibatch_size), output_pixel_vals=True)
+
+        filepath = os.path.join(IMAGE_OUTPUT_DIR,
+                    filename_from_time("gibbs"))
+        img = Image.fromarray(tile)
+        img.save(filepath)
+
+        print "Result of running Gibbs", \
+                gibbs_steps, "times outputed to", filepath
+
+    def visualize_filters(self):
+        cp = self.cp
+
+        # filter size
+        fsz = (cp.height_filters, cp.width_filters)
+        tile_shape = (cp.num_filters, cp.num_input_planes)
+
+        filters_flattened = self.crbm.W.value.reshape(
+                                (tile_shape[0]*tile_shape[1],
+                                fsz[0]*fsz[1]))
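+
+        # e.g. with the assumed setup W.value has shape
+        # (40, 1, 12, 12), flattening to (40, 144): one row of
+        # 144 pixel values per filter, which is the layout
+        # tile_raster_images expects.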
+
+        tile = tile_raster_images(filters_flattened, fsz, 
+                                    tile_shape, output_pixel_vals=True)
+
+        filepath = os.path.join(IMAGE_OUTPUT_DIR,
+                        filename_from_time("filters"))
+        img = Image.fromarray(tile)
+        img.save(filepath)
+
+        print "Filters (as images) outputed to", filepath
+        print "b_h is", self.crbm.b_h.value
+
+
+
+
+if __name__ == '__main__':
+    mc = MnistCrbm()
+    mc.train()
+