ift6266: changeset 337:8d116d4a7593
Added convolutional RBM (ala Lee09) code, imported from my working dir elsewhere. Seems to work for one layer. No subsampling yet.
author   | fsavard
date     | Fri, 16 Apr 2010 16:05:55 -0400
parents  | a79db7cee035
children | fca22114bb23
files    | deep/crbm/crbm.py, deep/crbm/mnist_crbm.py
diffstat | 2 files changed, 589 insertions(+), 0 deletions(-)
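For orientation, here is a minimal sketch (not part of the changeset) of how the CRBM class defined in deep/crbm/crbm.py below is meant to be driven. It assumes both files are importable and that Theano is installed; the shapes and hyperparameters are arbitrary stand-ins for the MNIST setup used in mnist_crbm.py.

# Sketch only: build a small CRBM and run one CD-1 update plus one sparsity
# update on a random minibatch, mirroring what mnist_crbm.py does with MNIST.
# Input layout follows the (minibatch, planes, rows, cols) convention below.
import numpy
import theano

from crbm import CRBM, ConvolutionParams

cp = ConvolutionParams(num_filters=4, num_input_planes=1,
                       height_filters=5, width_filters=5)
crbm = CRBM(minibatch_size=2, image_size=(28, 28), conv_params=cp,
            learning_rate=0.01, sparsity_lambda=1.0, sparsity_p=0.05)

# fake "images" in place of MNIST digits, dtype matched to Theano's floatX
x = numpy.random.rand(2, 1, 28, 28).astype(theano.config.floatX)

cd_stats = crbm.CD_step(x)        # one up-down Gibbs pass + gradient step
sp_stats = crbm.sparsity_step(x)  # push mean hidden activation toward 0.05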
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/crbm/crbm.py   Fri Apr 16 16:05:55 2010 -0400
@@ -0,0 +1,374 @@

import sys
import os, os.path

import numpy

import theano

USING_GPU = "gpu" in theano.config.device

import theano.tensor as T
from theano.tensor.nnet import conv, sigmoid

if not USING_GPU:
    from theano.tensor.shared_randomstreams import RandomStreams
else:
    from theano.sandbox.rng_mrg import MRG_RandomStreams

_PRINT_GRAPHS = True

def _init_conv_biases(num_filters, varname, rng=numpy.random):
    b_shp = (num_filters,)
    b = theano.shared(numpy.asarray(
            rng.uniform(low=-.5, high=.5, size=b_shp),
            dtype=theano.config.floatX), name=varname)
    return b

def _init_conv_weights(conv_params, varname, rng=numpy.random):
    cp = conv_params

    # initialize shared variable for weights.
    w_shp = conv_params.as_conv2d_shape_tuple()
    w_bound = numpy.sqrt(cp.num_input_planes * \
                    cp.height_filters * cp.width_filters)
    W = theano.shared(numpy.asarray(
            rng.uniform(
                low=-1.0 / w_bound,
                high=1.0 / w_bound,
                size=w_shp),
            dtype=theano.config.floatX), name=varname)

    return W

# Shape of W for conv2d
class ConvolutionParams:
    def __init__(self, num_filters, num_input_planes, height_filters, width_filters):
        self.num_filters = num_filters
        self.num_input_planes = num_input_planes
        self.height_filters = height_filters
        self.width_filters = width_filters

    def as_conv2d_shape_tuple(self):
        cp = self
        return (cp.num_filters, cp.num_input_planes,
                cp.height_filters, cp.width_filters)

class CRBM:
    def __init__(self, minibatch_size, image_size, conv_params,
                    learning_rate, sparsity_lambda, sparsity_p):
        '''
        Parameters
        ----------
        image_size
            height, width
        '''
        self.minibatch_size = minibatch_size
        self.image_size = image_size
        self.conv_params = conv_params

        '''
        Dimensions:
        0- minibatch
        1- plane/color
        2- y (rows)
        3- x (cols)
        '''
        self.x = T.tensor4('x')
        self.h = T.tensor4('h')

        self.lr = theano.shared(numpy.asarray(learning_rate,
                                    dtype=theano.config.floatX))
        self.sparsity_lambda = \
                theano.shared( \
                    numpy.asarray( \
                        sparsity_lambda,
                        dtype=theano.config.floatX))
        self.sparsity_p = \
                theano.shared( \
                    numpy.asarray(sparsity_p, \
                        dtype=theano.config.floatX))

        self.numpy_rng = numpy.random.RandomState(1234)

        if not USING_GPU:
            self.theano_rng = RandomStreams(self.numpy_rng.randint(2**30))
        else:
            self.theano_rng = MRG_RandomStreams(234, use_cuda=True)

        self._init_params()
        self._init_functions()

    def _get_visibles_shape(self):
        imsz = self.image_size
        return (self.minibatch_size,
                self.conv_params.num_input_planes,
                imsz[0], imsz[1])

    def _get_hiddens_shape(self):
        cp = self.conv_params
        imsz = self.image_size
        hf, wf = cp.height_filters, cp.width_filters
        return (self.minibatch_size, cp.num_filters,
                imsz[0] - hf + 1, imsz[1] - wf + 1)

    def _init_params(self):
        cp = self.conv_params

        self.W = _init_conv_weights(cp, 'W')
        self.b_h = _init_conv_biases(cp.num_filters, 'b_h')
        '''
        Lee09 mentions "all visible units share a single bias c",
        but for upper layers it's pretty clear we need one
        per plane, by symmetry
        '''
        self.b_x = _init_conv_biases(cp.num_input_planes, 'b_x')

        self.params = [self.W, self.b_h, self.b_x]

        # flip filters horizontally and vertically
        W_flipped = self.W[:, :, ::-1, ::-1]
        # also have to invert the filters/num_planes
        self.W_tilde = W_flipped.dimshuffle(1,0,2,3)

    '''
    I_up and I_down come from the symbol used in the
    Lee 2009 CRBM paper
    '''
    def _I_up(self, visibles_mb):
        '''
        output of conv is feature maps of size
            image_size - filter_size + 1
        The dimshuffle serves to broadcast b_h so that it
        corresponds to output planes
        '''
        fshp = self.conv_params.as_conv2d_shape_tuple()
        return conv.conv2d(visibles_mb, self.W,
                    filter_shape=fshp) + \
                    self.b_h.dimshuffle('x',0,'x','x')

    def _I_down(self, hiddens_mb):
        '''
        notice border_mode='full'... we want to get
        back the original size,
        so we get feature_map_size + filter_size - 1
        The dimshuffle serves to broadcast b_x so that
        it corresponds to output planes
        '''
        fshp = list(self.conv_params.as_conv2d_shape_tuple())
        # num_filters and num_planes swapped
        fshp[0], fshp[1] = fshp[1], fshp[0]
        return conv.conv2d(hiddens_mb, self.W_tilde,
                    border_mode='full', filter_shape=tuple(fshp)) + \
                    self.b_x.dimshuffle('x',0,'x','x')

    def _mean_free_energy(self, visibles_mb):
        '''
        visibles_mb is mb_size x num_planes x h x w

        we want to match the summed input planes
            (second dimension, first is mb index)
        to respective bias terms for the visibles
        The dimshuffle isn't really necessary,
            but I put it there for clarity.
        '''
        vbias_term = \
            self.b_x.dimshuffle('x',0) * \
            T.sum(visibles_mb, axis=[2,3])
        # now sum over terms per plane, get one free energy
        # contribution per element of minibatch
        vbias_term = - T.sum(vbias_term, axis=1)

        '''
        Here it's a bit more complex, a few points:
        - The usual free energy, in the fully connected case,
            is a sum over all hiddens.
          We do the same thing here, but each unit has limited
            connectivity and there's weight reuse.
          Therefore we only need to first do the convolutions
            (with I_up), which gives us
          what would normally be the Wx+b_h for each hidden.
          Once we have this,
          we take the log(1+exp(sum for this hidden)) elemwise
            for each hidden,
          then we sum over all hiddens in one example of the minibatch.

        - Notice that we reuse the same b_h everywhere instead of
            using one b per hidden,
          so the broadcasting for b_h done in I_up is all right.

        That sum is over all hiddens, so all filters
            (planes of hiddens), x, and y.
        In the end we get one free energy contribution per
            example of the minibatch.
        '''
        softplused = T.log(1.0 + T.exp(self._I_up(visibles_mb)))
        # h_sz = self._get_hiddens_shape()
        # this simplifies the sum
        # num_hiddens = h_sz[1] * h_sz[2] * h_sz[3]
        # reshaped = T.reshape(softplused,
        #       (self.minibatch_size, num_hiddens))

        # this is because the 0,1,1,1 sum pattern is not
        # implemented on gpu, but the 1,0,1,1 pattern is
        dimshuffled = softplused.dimshuffle(1,0,2,3)
        xh_and_hbias_term = - T.sum(dimshuffled, axis=[0,2,3])

        '''
        both xh_and_hbias_term and vbias_term end up with one
        contribution to free energy per minibatch example,
        so we mean over minibatches
        '''
        return T.mean(vbias_term + xh_and_hbias_term)

    def _init_functions(self):
        # propup
        # b_h is broadcasted keeping in mind we want it to
        # correspond to each new plane (corresponding to filters)
        I_up = self._I_up(self.x)
        # expected values for the distributions for each hidden
        E_h_given_x = sigmoid(I_up)
        # might be needed if we ever want a version where we
        # take expectations instead of samples for CD learning
        self.E_h_given_x_func = theano.function([self.x], E_h_given_x)

        if _PRINT_GRAPHS:
            print "----------------------\nE_h_given_x_func"
            theano.printing.debugprint(self.E_h_given_x_func)

        h_sample_given_x = \
            self.theano_rng.binomial( \
                            size = self._get_hiddens_shape(),
                            n = 1,
                            p = E_h_given_x,
                            dtype = theano.config.floatX)

        self.h_sample_given_x_func = \
            theano.function([self.x],
                    h_sample_given_x)

        if _PRINT_GRAPHS:
            print "----------------------\nh_sample_given_x_func"
            theano.printing.debugprint(self.h_sample_given_x_func)

        # propdown
        I_down = self._I_down(self.h)
        E_x_given_h = sigmoid(I_down)
        self.E_x_given_h_func = theano.function([self.h], E_x_given_h)

        if _PRINT_GRAPHS:
            print "----------------------\nE_x_given_h_func"
            theano.printing.debugprint(self.E_x_given_h_func)

        x_sample_given_h = \
            self.theano_rng.binomial( \
                            size = self._get_visibles_shape(),
                            n = 1,
                            p = E_x_given_h,
                            dtype = theano.config.floatX)

        self.x_sample_given_h_func = \
            theano.function([self.h],
                    x_sample_given_h)

        if _PRINT_GRAPHS:
            print "----------------------\nx_sample_given_h_func"
            theano.printing.debugprint(self.x_sample_given_h_func)

        ##############################################
        # cd update done by grad of free energy

        x_tilde = T.tensor4('x_tilde')
        cd_update_cost = self._mean_free_energy(self.x) - \
                            self._mean_free_energy(x_tilde)

        cd_grad = T.grad(cd_update_cost, self.params)
        # This is NLL minimization so we use a -
        cd_updates = {self.W: self.W - self.lr * cd_grad[0],
                    self.b_h: self.b_h - self.lr * cd_grad[1],
                    self.b_x: self.b_x - self.lr * cd_grad[2]}

        cd_returned = [cd_update_cost,
                        cd_grad[0], cd_grad[1], cd_grad[2],
                        self.lr * cd_grad[0],
                        self.lr * cd_grad[1],
                        self.lr * cd_grad[2]]
        self.cd_return_desc = \
            ['cd_update_cost',
             'cd_grad_W', 'cd_grad_b_h', 'cd_grad_b_x',
             'lr_times_cd_grad_W',
             'lr_times_cd_grad_b_h',
             'lr_times_cd_grad_b_x']

        self.cd_update_function = \
                theano.function([self.x, x_tilde],
                    cd_returned, updates=cd_updates)

        if _PRINT_GRAPHS:
            print "----------------------\ncd_update_function"
            theano.printing.debugprint(self.cd_update_function)

        ##############
        # sparsity update, based on grad for b_h only

        '''
        This mean returns an array of shape
        (num_hiddens_planes, feature_map_height, feature_map_width)
        (so it's a mean over each unit's activation)
        '''
        mean_expected_activation = T.mean(E_h_given_x, axis=0)
        # sparsity_p is broadcasted everywhere
        sparsity_update_cost = \
                T.sqr(self.sparsity_p - mean_expected_activation)
        sparsity_update_cost = \
            T.sum(T.sum(T.sum( \
                sparsity_update_cost, axis=2), axis=1), axis=0)
        sparsity_grad = T.grad(sparsity_update_cost, [self.W, self.b_h])

        sparsity_returned = \
            [sparsity_update_cost,
             sparsity_grad[0], sparsity_grad[1],
             self.sparsity_lambda * self.lr * sparsity_grad[0],
             self.sparsity_lambda * self.lr * sparsity_grad[1]]
        self.sparsity_return_desc = \
            ['sparsity_update_cost',
             'sparsity_grad_W',
             'sparsity_grad_b_h',
             'lambda_lr_times_sparsity_grad_W',
             'lambda_lr_times_sparsity_grad_b_h']

        # gradient _descent_ so we use a -
        sparsity_update = \
            {self.b_h: self.b_h - \
                self.sparsity_lambda * self.lr * sparsity_grad[1],
             self.W: self.W - \
                self.sparsity_lambda * self.lr * sparsity_grad[0]}
        self.sparsity_update_function = \
            theano.function([self.x],
                sparsity_returned, updates=sparsity_update)

        if _PRINT_GRAPHS:
            print "----------------------\nsparsity_update_function"
            theano.printing.debugprint(self.sparsity_update_function)

    def CD_step(self, x):
        h1 = self.h_sample_given_x_func(x)
        x2 = self.x_sample_given_h_func(h1)
        return self.cd_update_function(x, x2)

    def sparsity_step(self, x):
        return self.sparsity_update_function(x)

    # these two also operate on minibatches

    def random_gibbs_samples(self, num_updown_steps):
        start_x = self.numpy_rng.rand(*self._get_visibles_shape())
        return self.gibbs_samples_from(start_x, num_updown_steps)

    def gibbs_samples_from(self, start_x, num_updown_steps):
        x_sample = start_x
        for i in xrange(num_updown_steps):
            h_sample = self.h_sample_given_x_func(x_sample)
            x_sample = self.x_sample_given_h_func(h_sample)
        return x_sample
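As a quick check of the shape arithmetic the _I_up/_I_down docstrings above rely on, here is a standalone sketch (not part of the changeset) using the 28x28 images and 12x12 filters that mnist_crbm.py below passes in:

# Feature-map sizes implied by the 'valid' conv in _I_up and the 'full'
# conv in _I_down, with the MNIST numbers from mnist_crbm.py.
image_h, image_w = 28, 28
filter_h, filter_w = 12, 12

# 'valid' convolution: hidden feature maps shrink by filter_size - 1
hidden_h = image_h - filter_h + 1   # 17
hidden_w = image_w - filter_w + 1   # 17

# 'full' convolution on the way down: reconstruction grows back by filter_size - 1
recon_h = hidden_h + filter_h - 1   # 28, matches the visible height
recon_w = hidden_w + filter_w - 1   # 28, matches the visible width

assert (recon_h, recon_w) == (image_h, image_w)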
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/deep/crbm/mnist_crbm.py   Fri Apr 16 16:05:55 2010 -0400
@@ -0,0 +1,215 @@

#!/usr/bin/python

import sys
import os, os.path

import numpy as N

import theano
import theano.tensor as T

from crbm import CRBM, ConvolutionParams

from pylearn.datasets import MNIST
from pylearn.io.image_tiling import tile_raster_images

import Image

from pylearn.io.seriestables import *
import tables

IMAGE_OUTPUT_DIR = 'img/'

REDUCE_EVERY = 100

def filename_from_time(suffix):
    import datetime
    return str(datetime.datetime.now()) + suffix + ".png"

# Just a shortcut for a common case where we need a few
# related Error (float) series
def get_accumulator_series_array( \
                hdf5_file, group_name, series_names,
                reduce_every,
                index_names=('epoch','minibatch'),
                stdout_too=True,
                skip_hdf5_append=False):
    all_series = []

    hdf5_file.createGroup('/', group_name)

    other_targets = []
    if stdout_too:
        other_targets = [StdoutAppendTarget()]

    for sn in series_names:
        series_base = \
            ErrorSeries(error_name=sn,
                table_name=sn,
                hdf5_file=hdf5_file,
                hdf5_group='/'+group_name,
                index_names=index_names,
                other_targets=other_targets,
                skip_hdf5_append=skip_hdf5_append)

        all_series.append( \
            AccumulatorSeriesWrapper( \
                    base_series=series_base,
                    reduce_every=reduce_every))

    ret_wrapper = SeriesArrayWrapper(all_series)

    return ret_wrapper

class MnistCrbm(object):
    def __init__(self):
        self.mnist = MNIST.full()  # first_10k()

        self.cp = ConvolutionParams( \
                    num_filters=40,
                    num_input_planes=1,
                    height_filters=12,
                    width_filters=12)

        self.image_size = (28,28)

        self.minibatch_size = 10

        self.lr = 0.01
        self.sparsity_lambda = 1.0
        # roughly 2/num_filters, so only a couple of filters active at a time:
        # 40 * 0.05 = ~2 filters active for any given pixel
        self.sparsity_p = 0.05

        self.crbm = CRBM( \
                    minibatch_size=self.minibatch_size,
                    image_size=self.image_size,
                    conv_params=self.cp,
                    learning_rate=self.lr,
                    sparsity_lambda=self.sparsity_lambda,
                    sparsity_p=self.sparsity_p)

        self.num_epochs = 10

        self.init_series()

    def init_series(self):

        series = {}

        basedir = os.getcwd()

        h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w")

        cd_series_names = self.crbm.cd_return_desc
        series['cd'] = \
            get_accumulator_series_array( \
                h5f, 'cd', cd_series_names,
                REDUCE_EVERY,
                stdout_too=True)

        sparsity_series_names = self.crbm.sparsity_return_desc
        series['sparsity'] = \
            get_accumulator_series_array( \
                h5f, 'sparsity', sparsity_series_names,
                REDUCE_EVERY,
                stdout_too=True)

        # so first we create the names for each table, based on
        # position of each param in the array
        params_stdout = StdoutAppendTarget("\n------\nParams")
        series['params'] = SharedParamsStatisticsWrapper(
                            new_group_name="params",
                            base_group="/",
                            arrays_names=['W','b_h','b_x'],
                            hdf5_file=h5f,
                            index_names=('epoch','minibatch'),
                            other_targets=[params_stdout])

        self.series = series

    def train(self):
        num_minibatches = len(self.mnist.train.x) / self.minibatch_size

        print_every = 1000
        visualize_every = 5000
        gibbs_steps_from_random = 1000

        for epoch in xrange(self.num_epochs):
            for mb_index in xrange(num_minibatches):
                mb_x = self.mnist.train.x \
                         [mb_index*self.minibatch_size : \
                          (mb_index+1)*self.minibatch_size]
                mb_x = mb_x.reshape((self.minibatch_size, 1, 28, 28))

                #E_h = crbm.E_h_given_x_func(mb_x)
                #print "Shape of E_h", E_h.shape

                cd_return = self.crbm.CD_step(mb_x)
                sp_return = self.crbm.sparsity_step(mb_x)

                self.series['cd'].append( \
                        (epoch, mb_index), cd_return)
                self.series['sparsity'].append( \
                        (epoch, mb_index), sp_return)

                total_idx = epoch*num_minibatches + mb_index

                if (total_idx+1) % REDUCE_EVERY == 0:
                    self.series['params'].append( \
                        (epoch, mb_index), self.crbm.params)

                if total_idx % visualize_every == 0:
                    self.visualize_gibbs_result(\
                        mb_x, gibbs_steps_from_random)
                    self.visualize_gibbs_result(mb_x, 1)
                    self.visualize_filters()

    def visualize_gibbs_result(self, start_x, gibbs_steps):
        # Run minibatch_size chains for gibbs_steps
        x_samples = None
        if start_x is not None:
            x_samples = self.crbm.gibbs_samples_from(start_x, gibbs_steps)
        else:
            x_samples = self.crbm.random_gibbs_samples(gibbs_steps)
        x_samples = x_samples.reshape((self.minibatch_size, 28*28))

        tile = tile_raster_images(x_samples, self.image_size,
                    (1, self.minibatch_size), output_pixel_vals=True)

        filepath = os.path.join(IMAGE_OUTPUT_DIR,
                    filename_from_time("gibbs"))
        img = Image.fromarray(tile)
        img.save(filepath)

        print "Result of running Gibbs", \
                gibbs_steps, "times output to", filepath

    def visualize_filters(self):
        cp = self.cp

        # filter size
        fsz = (cp.height_filters, cp.width_filters)
        tile_shape = (cp.num_filters, cp.num_input_planes)

        filters_flattened = self.crbm.W.value.reshape(
                                (tile_shape[0]*tile_shape[1],
                                fsz[0]*fsz[1]))

        tile = tile_raster_images(filters_flattened, fsz,
                                    tile_shape, output_pixel_vals=True)

        filepath = os.path.join(IMAGE_OUTPUT_DIR,
                        filename_from_time("filters"))
        img = Image.fromarray(tile)
        img.save(filepath)

        print "Filters (as images) output to", filepath
        print "b_h is", self.crbm.b_h.value


if __name__ == '__main__':
    mc = MnistCrbm()
    mc.train()
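In crbm.py the CD update is written as the gradient of a difference of mean free energies (cd_update_cost) rather than as explicit positive-minus-negative statistics. The following small fully connected numpy sketch (my own illustration under the standard binary-RBM free energy, not code from the changeset) shows why the two forms give the same weight update; the convolutional version above uses the same trick with weight sharing.

# For a binary RBM, F(v) = -b.v - sum_j log(1 + exp(W_j.v + c_j)),
# so dF/dW_j = -sigmoid(W_j.v + c_j) * v.  Hence
#   -lr * d(F(x) - F(x_tilde))/dW = lr * (E[h|x] x^T - E[h|x_tilde] x_tilde^T),
# which is the usual CD-1 weight update.
import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def grad_W_free_energy(v, W, c):
    # gradient of the free energy with respect to W, for one visible vector
    return -np.outer(sigmoid(np.dot(W, v) + c), v)

rng = np.random.RandomState(0)
W = rng.randn(3, 5) * 0.1          # (num_hidden, num_visible)
c = rng.randn(3) * 0.1             # hidden biases
x = (rng.rand(5) > 0.5).astype(float)        # data sample
x_tilde = (rng.rand(5) > 0.5).astype(float)  # "negative" sample from the chain

lr = 0.01
# update as the gradient of the free-energy difference, as in cd_update_cost
update_via_free_energy = -lr * (grad_W_free_energy(x, W, c)
                                - grad_W_free_energy(x_tilde, W, c))
# classic CD-1 positive-minus-negative statistics
update_via_statistics = lr * (np.outer(sigmoid(np.dot(W, x) + c), x)
                              - np.outer(sigmoid(np.dot(W, x_tilde) + c), x_tilde))

assert np.allclose(update_via_free_energy, update_via_statistics)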