# HG changeset patch # User Dumitru Erhan # Date 1264091203 18000 # Node ID fda5f787baa63270fa9e7cded01700db1a544397 commit initial diff -r 000000000000 -r fda5f787baa6 code_tutoriel/convolutional_mlp.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/convolutional_mlp.py Thu Jan 21 11:26:43 2010 -0500 @@ -0,0 +1,230 @@ + +""" +This tutorial introduces the LeNet5 neural network architecture using Theano. LeNet5 is a +convolutional neural network, good for classifying images. This tutorial shows how to build the +architecture, and comes with all the hyper-parameters you need to reproduce the paper's MNIST +results. + +The best results are obtained after X iterations of the main program loop, which takes *** +minutes on my workstation (an Intel Core i7, circa July 2009), and *** minutes on my GPU (an +NVIDIA GTX 285 graphics processor). + +This implementation simplifies the model in the following ways: + + - LeNetConvPool doesn't implement location-specific gain and bias parameters + + - LeNetConvPool doesn't implement pooling by average, it implements pooling by max. + + - Digit classification is implemented with a logistic regression rather than an RBF network + + - LeNet5 was not fully-connected convolutions at second layer + +References: + + - Y. LeCun, L. Bottou, Y. Bengio and P. Haffner: Gradient-Based Learning Applied to Document + Recognition, Proceedings of the IEEE, 86(11):2278-2324, November 1998. + http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf + + +""" +import numpy +from theano.compile.sandbox import shared, pfunc +from theano import tensor +from pylearn.shared.layers import LogisticRegression, SigmoidalLayer +import theano.sandbox.softsign +import pylearn.datasets.MNIST + + +try: + # this tells theano to use the GPU if possible + from theano.sandbox.cuda import use + use() +except Exception, e: + print('Warning: Attempt to use GPU resulted in error "%s"' % str(e)) + +class LeNetConvPool(object): + """WRITEME + + Math of what the layer does, and what symbolic variables are created by the class (w, b, + output). + + """ + + #TODO: implement biases & scales properly. There are supposed to be more parameters. + # - one bias & scale per filter + # - one bias & scale per downsample feature location (a 2d bias) + # - more? + + def __init__(self, rng, input, n_examples, n_imgs, img_shape, n_filters, filter_shape=(5,5), + poolsize=(2,2)): + """ + Allocate a LeNetConvPool layer with shared variable internal parameters. + + :param rng: a random number generator used to initialize weights + + :param input: symbolic images. Shape: (n_examples, n_imgs, img_shape[0], img_shape[1]) + + :param n_examples: input's shape[0] at runtime + + :param n_imgs: input's shape[1] at runtime + + :param img_shape: input's shape[2:4] at runtime + + :param n_filters: the number of filters to apply to the image. + + :param filter_shape: the size of the filters to apply + :type filter_shape: pair (rows, cols) + + :param poolsize: the downsampling (pooling) factor + :type poolsize: pair (rows, cols) + """ + + #TODO: make a simpler convolution constructor!! + # - make dx and dy optional + # - why do we have to pass shapes? (Can we make them optional at least?) + conv_op = ConvOp((n_imgs,)+img_shape, filter_shape, n_filters, n_examples, + dx=1, dy=1, output_mode='valid') + + # - why is poolsize an op parameter here? + # - can we just have a maxpool function that creates this Op internally? 
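+        # note (added): ConvOp and DownsampleFactorMax are used below but never imported at
+        # the top of this file; in the Theano of this vintage they are assumed to live in
+        # theano.tensor.nnet.conv and theano.tensor.signal.downsample respectively (the exact
+        # module paths may differ between Theano versions, so treat these as assumptions).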
+        ds_op = DownsampleFactorMax(poolsize, ignore_border=True)
+
+        # the filter tensor that we will apply is a 4D tensor
+        w_shp = (n_filters, n_imgs) + filter_shape
+
+        # the bias we add is a 1D tensor
+        b_shp = (n_filters,)
+
+        self.w = shared(
+                numpy.asarray(
+                    rng.uniform(
+                        low=-1.0 / numpy.sqrt(filter_shape[0] * filter_shape[1] * n_imgs),
+                        high=1.0 / numpy.sqrt(filter_shape[0] * filter_shape[1] * n_imgs),
+                        size=w_shp),
+                    dtype=input.dtype))
+        self.b = shared(
+                numpy.asarray(
+                    rng.uniform(low=-.0, high=0., size=(n_filters,)),
+                    dtype=input.dtype))
+
+        self.input = input
+        conv_out = conv_op(input, self.w)
+        self.output = tensor.tanh(ds_op(conv_out) + self.b.dimshuffle('x', 0, 'x', 'x'))
+        self.params = [self.w, self.b]
+
+class SigmoidalLayer(object):
+    def __init__(self, rng, input, n_in, n_out):
+        """
+        :param rng: a random number generator used to initialize the weights
+        :param input: a symbolic tensor of shape (n_examples, n_in)
+        :param n_in: dimensionality of the input
+        :param n_out: number of hidden units
+        """
+        self.input = input
+        self.w = shared(
+                numpy.asarray(
+                    rng.uniform(low=-2 / numpy.sqrt(n_in), high=2 / numpy.sqrt(n_in),
+                        size=(n_in, n_out)), dtype=input.dtype))
+        self.b = shared(numpy.asarray(numpy.zeros(n_out), dtype=input.dtype))
+        self.output = tensor.tanh(tensor.dot(input, self.w) + self.b)
+        self.params = [self.w, self.b]
+
+class LogisticRegression(object):
+    """WRITEME"""
+
+    def __init__(self, input, n_in, n_out):
+        self.w = shared(numpy.zeros((n_in, n_out), dtype=input.dtype))
+        self.b = shared(numpy.zeros((n_out,), dtype=input.dtype))
+        self.l1 = abs(self.w).sum()
+        self.l2_sqr = (self.w ** 2).sum()
+        self.output = tensor.nnet.softmax(tensor.dot(input, self.w) + self.b)
+        self.argmax = tensor.argmax(self.output, axis=1)
+        self.params = [self.w, self.b]
+
+    def nll(self, target):
+        """Return the negative log-likelihood of the prediction of this model under a given
+        target distribution.  Passing symbolic integers here means 1-hot.
+        WRITEME
+        """
+        return tensor.nnet.categorical_crossentropy(self.output, target)
+
+    def errors(self, target):
+        """Return a vector of 0s and 1s, with a 1 on every line that was mis-classified.
+        """
+        if target.ndim != self.argmax.ndim:
+            raise TypeError('target should have the same shape as self.argmax',
+                    ('target', target.type, 'argmax', self.argmax.type))
+        if target.dtype.startswith('int'):
+            return tensor.neq(self.argmax, target)
+        else:
+            raise NotImplementedError()
+
+def evaluate_lenet5(batch_size=30, n_iter=1000):
+    rng = numpy.random.RandomState(23455)
+
+    mnist = pylearn.datasets.MNIST.train_valid_test()
+
+    ishape = (28, 28)     # this is the size of MNIST images
+
+    # allocate symbolic variables for the data
+    x = tensor.fmatrix()  # the data is presented as rasterized images
+    y = tensor.lvector()  # the labels are presented as a 1D vector of [long int] labels
+
+    # construct the first convolutional pooling layer
+    layer0 = LeNetConvPool(rng, input=x.reshape((batch_size, 1, 28, 28)), n_examples=batch_size,
+            n_imgs=1, img_shape=ishape,
+            n_filters=6, filter_shape=(5, 5),
+            poolsize=(2, 2))
+
+    # construct the second convolutional pooling layer
+    layer1 = LeNetConvPool(rng, input=layer0.output, n_examples=batch_size,
+            n_imgs=6, img_shape=(12, 12),
+            n_filters=16, filter_shape=(5, 5),
+            poolsize=(2, 2))
+
+    # construct a fully-connected sigmoidal layer
+    layer2 = SigmoidalLayer(rng, input=layer1.output.flatten(2), n_in=16 * 4 * 4, n_out=128)  # 128 ?
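+    # note (added): shape bookkeeping for the two conv/pool layers above, assuming 'valid'
+    # convolutions and 2x2 max-pooling: 28x28 inputs give 6 maps of (28-5+1)/2 = 12x12 after
+    # layer0, and 16 maps of (12-5+1)/2 = 4x4 after layer1, so the flattened input to layer2
+    # has 16*4*4 = 256 units.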
+
+    # classify the values of the fully-connected sigmoidal layer
+    layer3 = LogisticRegression(input=layer2.output, n_in=128, n_out=10)
+
+    # the cost we minimize during training is the NLL of the model
+    cost = layer3.nll(y).mean()
+
+    # create a function to compute the mistakes that are made by the model
+    test_model = pfunc([x, y], layer3.errors(y))
+
+    # create a list of all model parameters to be fit by gradient descent
+    params = layer3.params + layer2.params + layer1.params + layer0.params
+    learning_rate = numpy.asarray(0.01, dtype='float32')
+
+    # train_model is a function that updates the model parameters by SGD
+    train_model = pfunc([x, y], cost,
+            updates=[(p, p - learning_rate * gp)
+                     for p, gp in zip(params, tensor.grad(cost, params))])
+
+    # IS IT MORE SIMPLE TO USE A MINIMIZER OR THE DIRECT CODE?
+
+    best_valid_score = float('inf')
+    for i in xrange(n_iter):
+        for j in xrange(len(mnist.train.x) / batch_size):
+            cost_ij = train_model(
+                    mnist.train.x[j * batch_size:(j + 1) * batch_size],
+                    mnist.train.y[j * batch_size:(j + 1) * batch_size])
+            #if 0 == j % 100:
+            #    print('epoch %i:%i, training error %f' % (i, j * batch_size, cost_ij))
+        valid_score = numpy.mean([test_model(
+                mnist.valid.x[j * batch_size:(j + 1) * batch_size],
+                mnist.valid.y[j * batch_size:(j + 1) * batch_size])
+                for j in xrange(len(mnist.valid.x) / batch_size)])
+        print('epoch %i, validation error %f' % (i, valid_score))
+        if valid_score < best_valid_score:
+            best_valid_score = valid_score
+            test_score = numpy.mean([test_model(
+                    mnist.test.x[j * batch_size:(j + 1) * batch_size],
+                    mnist.test.y[j * batch_size:(j + 1) * batch_size])
+                    for j in xrange(len(mnist.test.x) / batch_size)])
+            print('epoch %i, test error of best model %f' % (i, test_score))
+
+if __name__ == '__main__':
+    evaluate_lenet5()
+
diff -r 000000000000 -r fda5f787baa6 code_tutoriel/dae.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/code_tutoriel/dae.py Thu Jan 21 11:26:43 2010 -0500
@@ -0,0 +1,240 @@
+"""
+ This tutorial introduces denoising auto-encoders using Theano.
+
+ Denoising autoencoders can be used as building blocks for deep networks.
+ They are based on auto-encoders such as the ones used in Bengio et al. 2007.
+ An autoencoder takes an input x and first maps it to a hidden representation
+ y = f_{\theta}(x) = s(Wx + b), parameterized by \theta = {W, b}.  The resulting
+ latent representation y is then mapped back to a "reconstructed" vector
+ z \in [0,1]^d in input space, z = g_{\theta'}(y) = s(W'y + b').  The weight
+ matrix W' can optionally be constrained such that W' = W^T, in which case the
+ autoencoder is said to have tied weights.  The network is trained so as to
+ minimize the reconstruction error (the error between x and z).
+
+ For the denoising autoencoder, during training, x is first corrupted into
+ \tilde{x}, a partially destroyed version of x obtained by means of a
+ stochastic mapping.  Afterwards y is computed as before (using \tilde{x}),
+ y = s(W\tilde{x} + b), and z as s(W'y + b').  The reconstruction error is now
+ measured between z and the uncorrupted input x, and is computed as the
+ cross-entropy:
+      - \sum_{k=1}^d [x_k \log z_k + (1 - x_k) \log(1 - z_k)]
+
+ For X iterations of the main program loop it takes *** minutes on an
+ Intel Core i7 and *** minutes on a GPU (NVIDIA GTX 285 graphics processor).
+
+
+ References:
+   - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and
+     Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103,
+     2008
+   - Y. Bengio, P. Lamblin, D. Popovici, H.
Larochelle: Greedy Layer-Wise + Training of Deep Networks, Advances in Neural Information Processing + Systems 19, 2007 + +""" + +import numpy +from theano import tensor +from theano.compile.sandbox import shared, pfunc +from theano.compile.sandbox.shared_randomstreams import RandomStreams +from theano.tensor import nnet +import pylearn.datasets.MNIST + + +try: + #this tells theano to use the GPU if possible + from theano.sandbox.cuda import use + use() +except Exception,e: + print ('Warning: Attempt to use GPU resulted in error "%s"'%str(e)) + + +def load_mnist_batches(batch_size): + """ + We should remove the dependency on pylearn.datasets.MNIST .. and maybe + provide a pickled version of the dataset.. + """ + mnist = pylearn.datasets.MNIST.train_valid_test() + train_batches = [(mnist.train.x[i:i+batch_size],mnist.train.y[i:i+batch_size]) + for i in xrange(0, len(mnist.train.x), batch_size)] + valid_batches = [(mnist.valid.x[i:i+batch_size], mnist.valid.y[i:i+batch_size]) + for i in xrange(0, len(mnist.valid.x), batch_size)] + test_batches = [(mnist.test.x[i:i+batch_size], mnist.test.y[i:i+batch_size]) + for i in xrange(0, len(mnist.test.x), batch_size)] + return train_batches, valid_batches, test_batches + + + + +class DAE(): + """Denoising Auto-Encoder class + + A denoising autoencoders tried to reconstruct the input from a corrupted + version of it by projecting it first in a latent space and reprojecting + it in the input space. Please refer to Vincent et al.,2008 for more + details. If x is the input then equation (1) computes a partially destroyed + version of x by means of a stochastic mapping q_D. Equation (2) computes + the projection of the input into the latent space. Equation (3) computes + the reconstruction of the input, while equation (4) computes the + reconstruction error. + + .. latex-eqn: + \tilde{x} ~ q_D(\tilde{x}|x) (1) + y = s(W \tilde{x} + b) (2) + x = s(W' y + b') (3) + L(x,z) = -sum_{k=1}^d [x_k \log z_k + (1-x_k) \log( 1-z_k)] (4) + + Tricks and thumbrules for DAE + - learning rate should be used in a logarithmic scale ... + """ + + def __init__(self, n_visible= 784, n_hidden= 500, lr= 1e-1, input= None): + """ + Initialize the DAE class by specifying the number of visible units (the + dimension d of the input ), the number of hidden units ( the dimension + d' of the latent or hidden space ), a initial value for the learning rate + and by giving a symbolic description of the input. Such a symbolic + description is of no importance for the simple DAE and therefore can be + ignored. This feature is useful when stacking DAEs, since the input of + intermediate layers can be symbolically described in terms of the hidden + units of the previous layer. See the tutorial on SDAE for more details. 
+ + :param n_visible: number of visible units + :param n_hidden: number of hidden units + :param lr: a initial value for the learning rate + :param input: a symbolic description of the input or None + """ + self.n_visible = n_visible + self.n_hidden = n_hidden + + # create a Theano random generator that gives symbolic random values + theano_rng = RandomStreams( seed = 1234 ) + # create a numpy random generator + numpy_rng = numpy.random.RandomState( seed = 52432 ) + + + # initial values for weights and biases + # note : W' was written as W_prime and b' as b_prime + initial_W = numpy_rng.uniform(size = (n_visible, n_hidden)) + # transform W such that all values are between -.01 and .01 + initial_W = (initial_W*2.0 - 1.0)*.01 + initial_b = numpy.zeros(n_hidden) + initial_W_prime = numpy_rng.uniform(size = (n_hidden, n_visible)) + # transform W_prime such that all values are between -.01 and .01 + initial_W_prime = (initial_W_prime*2.0 - 1.0)*.01 + initial_b_prime= numpy.zeros(n_visible) + + + # theano shared variables for weights and biases + self.W = shared(value = initial_W , name = "W") + self.b = shared(value = initial_b , name = "b") + self.W_prime = shared(value = initial_W_prime, name = "W'") + self.b_prime = shared(value = initial_b_prime, name = "b'") + + # theano shared variable for the learning rate + self.lr = shared(value = lr , name = "learning_rate") + + # if no input is given generate a variable representing the input + if input == None : + # we use a matrix because we expect a minibatch of several examples, + # each example being a row + x = tensor.dmatrix(name = 'input') + else: + x = input + # Equation (1) + # note : first argument of theano.rng.binomial is the shape(size) of + # random numbers that it should produce + # second argument is the number of trials + # third argument is the probability of success of any trial + # + # this will produce an array of 0s and 1s where 1 has a + # probability of 0.9 and 0 if 0.1 + tilde_x = theano_rng.binomial( x.shape, 1, 0.9) * x + # Equation (2) + # note : y is stored as an attribute of the class so that it can be + # used later when stacking DAEs. + self.y = nnet.sigmoid(tensor.dot(tilde_x, self.W ) + self.b) + # Equation (3) + z = nnet.sigmoid(tensor.dot(self.y, self.W_prime) + self.b_prime) + # Equation (4) + L = - tensor.sum( x*tensor.log(z) + (1-x)*tensor.log(1-z), axis=1 ) + # note : L is now a vector, where each element is the cross-entropy cost + # of the reconstruction of the corresponding example of the + # minibatch. 
We need to sum all these to get the cost of the + # minibatch + cost = tensor.sum(L) + # parameters with respect to whom we need to compute the gradient + self.params = [ self.W, self.b, self.W_prime, self.b_prime] + # use theano automatic differentiation to get the gradients + gW, gb, gW_prime, gb_prime = tensor.grad(cost, self.params) + # update the parameters in the direction of the gradient using the + # learning rate + updated_W = self.W - gW * self.lr + updated_b = self.b - gb * self.lr + updated_W_prime = self.W_prime - gW_prime * self.lr + updated_b_prime = self.b_prime - gb_prime * self.lr + + # defining the function that evaluate the symbolic description of + # one update step + self.update = pfunc(params = [x], outputs = cost, updates = + { self.W : updated_W, + self.b : updated_b, + self.W_prime : updated_W_prime, + self.b_prime : updated_b_prime } ) + self.get_cost = pfunc(params = [x], outputs = cost) + + + + + + + + + + + +def train_DAE_mnist(): + """ + Trains a DAE on the MNIST dataset (http://yann.lecun.com/exdb/mnist) + """ + + # load dataset as batches + train_batches,valid_batches,test_batches=load_mnist_batches(batch_size=16) + + # Create a denoising auto-encoders with 28*28 = 784 input units, and 500 + # units in the hidden layer (latent layer); Learning rate is set to 1e-1 + dae = DAE( n_visible = 784, n_hidden = 500, lr = 1e-2) + + # Number of iterations (epochs) to run + n_iter = 30 + best_valid_score = float('inf') + test_score = float('inf') + for i in xrange(n_iter): + # train once over the dataset + for x,y in train_batches: + cost = dae.update(x) + + # compute validation error + valid_cost = 0. + for x,y in valid_batches: + valid_cost = valid_cost + dae.get_cost(x) + valid_cost = valid_cost / len(valid_batches) + print('epoch %i, validation reconstruction error %f '%(i,valid_cost)) + + if valid_cost < best_valid_score : + best_valid_score = valid_cost + # compute test error !? + test_score = 0. + for x,y in test_batches: + test_score = test_score + dae.get_cost(x) + test_score = test_score / len(test_batches) + print('epoch %i, test error of best model %f' % (i, test_score)) + + print('Optimization done. Best validation score %f, test performance %f' % + (best_valid_score, test_score)) + + + +if __name__ == "__main__": + train_DAE_mnist() + diff -r 000000000000 -r fda5f787baa6 code_tutoriel/dbn.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/dbn.py Thu Jan 21 11:26:43 2010 -0500 @@ -0,0 +1,24 @@ +import numpy +import theano +import theano.tensor as T + +from deeplearning import rbm + +class DBN(): + + def __init__(self, vsize=None, hsizes=[], lr=None, bsize=10, seed=123): + assert vsize and hsizes and lr + + input = T.dmatrix('global_input') + + self.layers = [] + for hsize in hsizes: + r = rbm.RBM(input=input, vsize=vsize, hsize=hsize, bsize=bsize, + lr=lr, seed=seed) + self.layers.append(r) + + # configure inputs for subsequent layer + input = self.layers[-1].hid + vsize = hsize + + diff -r 000000000000 -r fda5f787baa6 code_tutoriel/logistic_cg.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/logistic_cg.py Thu Jan 21 11:26:43 2010 -0500 @@ -0,0 +1,282 @@ +""" +This tutorial introduces logistic regression using Theano and conjugate +gradient descent. + +Logistic regression is a probabilistic, linear classifier. It is parametrized +by a weight matrix :math:`W` and a bias vector :math:`b`. 
Classification is +done by projecting data points onto a set of hyperplanes, the distance to +which is used to determine a class membership probability. + +Mathematically, this can be written as: + +.. math:: + P(Y=i|x, W,b) &= softmax_i(W x + b) \\ + &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}} + + +The output of the model or prediction is then done by taking the argmax of +the vector whose i'th element is P(Y=i|x). + +.. math:: + + y_{pred} = argmax_i P(Y=i|x,W,b) + + +This tutorial presents a stochastic gradient descent optimization method +suitable for large datasets, and a conjugate gradient optimization method +that is suitable for smaller datasets. + + +References: + + - textbooks: "Pattern Recognition and Machine Learning" - + Christopher M. Bishop, section 4.3.2 + + +""" +__docformat__ = 'restructedtext en' + + +import numpy, cPickle, gzip + +import time + +import theano +import theano.tensor as T +import theano.tensor.nnet + + +class LogisticRegression(object): + """Multi-class Logistic Regression Class + + The logistic regression is fully described by a weight matrix :math:`W` + and bias vector :math:`b`. Classification is done by projecting data + points onto a set of hyperplanes, the distance to which is used to + determine a class membership probability. + """ + + + + + def __init__(self, input, n_in, n_out): + """ Initialize the parameters of the logistic regression + + :param input: symbolic variable that describes the input of the + architecture ( one minibatch) + + :param n_in: number of input units, the dimension of the space in + which the datapoint lies + + :param n_out: number of output units, the dimension of the space in + which the target lies + + """ + + # initialize theta = (W,b) with 0s; W gets the shape (n_in, n_out), + # while b is a vector of n_out elements, making theta a vector of + # n_in*n_out + n_out elements + self.theta = theano.shared( value = numpy.zeros(n_in*n_out+n_out) ) + # W is represented by the fisr n_in*n_out elements of theta + self.W = self.theta[0:n_in*n_out].reshape((n_in,n_out)) + # b is the rest (last n_out elements) + self.b = self.theta[n_in*n_out:n_in*n_out+n_out] + + + # compute vector of class-membership probabilities in symbolic form + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred=T.argmax(self.p_y_given_x, axis=1) + + + + + + def negative_log_likelihood(self, y): + """Return the negative log-likelihood of the prediction of this model + under a given target distribution. + + .. 
math:: + + \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = + \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ + \ell (\theta=\{W,b\}, \mathcal{D}) + + + :param y: corresponds to a vector that gives for each example the + :correct label + """ + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + + + + + def errors(self, y): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch + """ + + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + + + + + + + +def cg_optimization_mnist( n_iter=50 ): + """Demonstrate conjugate gradient optimization of a log-linear model + + This is demonstrated on MNIST. + + :param n_iter: number of iterations ot run the optimizer + + """ + + # Load the dataset + f = gzip.open('mnist.pkl.gz','rb') + train_set, valid_set, test_set = cPickle.load(f) + f.close() + + # make minibatches of size 20 + batch_size = 20 # sized of the minibatch + + # Dealing with the training set + # get the list of training images (x) and their labels (y) + (train_set_x, train_set_y) = train_set + # initialize the list of training minibatches with empty list + train_batches = [] + for i in xrange(0, len(train_set_x), batch_size): + # add to the list of minibatches the minibatch starting at + # position i, ending at position i+batch_size + # a minibatch is a pair ; the first element of the pair is a list + # of datapoints, the second element is the list of corresponding + # labels + train_batches = train_batches + \ + [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])] + + # Dealing with the validation set + (valid_set_x, valid_set_y) = valid_set + # initialize the list of validation minibatches + valid_batches = [] + for i in xrange(0, len(valid_set_x), batch_size): + valid_batches = valid_batches + \ + [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])] + + # Dealing with the testing set + (test_set_x, test_set_y) = test_set + # initialize the list of testing minibatches + test_batches = [] + for i in xrange(0, len(test_set_x), batch_size): + test_batches = test_batches + \ + [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])] + + + ishape = (28,28) # this is the size of MNIST images + n_in = 28*28 # number of input units + n_out = 10 # number of output units + # allocate symbolic variables for the data + x = T.fmatrix() # the data is presented as rasterized images + y = T.lvector() # the labels are presented as 1D vector of + # [long int] labels + + + # construct the logistic regression class + classifier = LogisticRegression( \ + input=x.reshape((batch_size,28*28)), n_in=28*28, n_out=10) + + # the cost we minimize during training is the negative log likelihood of + # the model in symbolic format + cost = classifier.negative_log_likelihood(y).mean() + + # compile a theano function that computes the mistakes that are made by + # the model on a minibatch + test_model = theano.function([x,y], classifier.errors(y)) + # compile a theano function that returns the gradient of the minibatch + # with respect to theta + batch_grad = theano.function([x, 
y], T.grad(cost, classifier.theta)) + # compile a thenao function that returns the cost of a minibatch + batch_cost = theano.function([x, y], cost) + + # creates a function that computes the average cost on the training set + def train_fn(theta_value): + classifier.theta.value = theta_value + cost = 0. + for x,y in train_batches : + cost += batch_cost(x,y) + return cost / len(train_batches) + + # creates a function that computes the average gradient of cost with + # respect to theta + def train_fn_grad(theta_value): + classifier.theta.value = theta_value + grad = numpy.zeros(n_in * n_out + n_out) + for x,y in train_batches: + grad += batch_grad(x,y) + return grad/ len(train_batches) + + + + validation_scores = [float('inf'), 0] + + # creates the validation function + def callback(theta_value): + classifier.theta.value = theta_value + #compute the validation loss + this_validation_loss = 0. + for x,y in valid_batches: + this_validation_loss += test_model(x,y) + + this_validation_loss /= len(valid_batches) + + print('validation error %f %%' % (this_validation_loss*100.,)) + + # check if it is better then best validation score got until now + if this_validation_loss < validation_scores[0]: + # if so, replace the old one, and compute the score on the + # testing dataset + validation_scores[0] = this_validation_loss + test_score = 0. + for x,y in test_batches: + test_score += test_model(x,y) + validation_scores[1] = test_score / len(test_batches) + + # using scipy conjugate gradient optimizer + import scipy.optimize + print ("Optimizing using scipy.optimize.fmin_cg...") + start_time = time.clock() + best_w_b = scipy.optimize.fmin_cg( + f=train_fn, + x0=numpy.zeros((n_in+1)*n_out, dtype=x.dtype), + fprime=train_fn_grad, + callback=callback, + disp=0, + maxiter=n_iter) + end_time = time.clock() + print(('Optimization complete with best validation score of %f %%, with ' + 'test performance %f %%') % + (validation_scores[0]*100., validation_scores[1]*100.)) + + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + + + + + + + +if __name__ == '__main__': + cg_optimization_mnist() + diff -r 000000000000 -r fda5f787baa6 code_tutoriel/logistic_sgd.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/logistic_sgd.py Thu Jan 21 11:26:43 2010 -0500 @@ -0,0 +1,303 @@ +""" +This tutorial introduces logistic regression using Theano and stochastic +gradient descent. + +Logistic regression is a probabilistic, linear classifier. It is parametrized +by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is +done by projecting data points onto a set of hyperplanes, the distance to +which is used to determine a class membership probability. + +Mathematically, this can be written as: + +.. math:: + P(Y=i|x, W,b) &= softmax_i(W x + b) \\ + &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}} + + +The output of the model or prediction is then done by taking the argmax of +the vector whose i'th element is P(Y=i|x). + +.. math:: + + y_{pred} = argmax_i P(Y=i|x,W,b) + + +This tutorial presents a stochastic gradient descent optimization method +suitable for large datasets, and a conjugate gradient optimization method +that is suitable for smaller datasets. + + +References: + + - textbooks: "Pattern Recognition and Machine Learning" - + Christopher M. 
Bishop, section 4.3.2 + + +""" +__docformat__ = 'restructedtext en' + + +import numpy, cPickle, gzip + +import time + +import theano +import theano.tensor as T + +import theano.tensor.nnet + + +class LogisticRegression(object): + """Multi-class Logistic Regression Class + + The logistic regression is fully described by a weight matrix :math:`W` + and bias vector :math:`b`. Classification is done by projecting data + points onto a set of hyperplanes, the distance to which is used to + determine a class membership probability. + """ + + + + + def __init__(self, input, n_in, n_out): + """ Initialize the parameters of the logistic regression + + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + + :param n_out: number of output units, the dimension of the space in + which the labels lie + + """ + + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) + self.W = theano.shared( value=numpy.zeros((n_in,n_out), + dtype = theano.config.floatX) ) + # initialize the baises b as a vector of n_out 0s + self.b = theano.shared( value=numpy.zeros((n_out,), + dtype = theano.config.floatX) ) + + + # compute vector of class-membership probabilities in symbolic form + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred=T.argmax(self.p_y_given_x, axis=1) + + + + + + def negative_log_likelihood(self, y): + """Return the mean of the negative log-likelihood of the prediction + of this model under a given target distribution. + + .. math:: + + \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = + \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ + \ell (\theta=\{W,b\}, \mathcal{D}) + + + :param y: corresponds to a vector that gives for each example the + :correct label + + Note: we use the mean instead of the sum so that + the learning rate is less dependent on the batch size + """ + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + + + + + def errors(self, y): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch ; zero one + loss over the size of the minibatch + """ + + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + + + + + +def sgd_optimization_mnist( learning_rate=0.01, n_iter=100): + """ + Demonstrate stochastic gradient descent optimization of a log-linear + model + + This is demonstrated on MNIST. 
+ + :param learning_rate: learning rate used (factor for the stochastic + gradient + + :param n_iter: number of iterations ot run the optimizer + + """ + + # Load the dataset + f = gzip.open('mnist.pkl.gz','rb') + train_set, valid_set, test_set = cPickle.load(f) + f.close() + + # make minibatches of size 20 + batch_size = 20 # sized of the minibatch + + # Dealing with the training set + # get the list of training images (x) and their labels (y) + (train_set_x, train_set_y) = train_set + # initialize the list of training minibatches with empty list + train_batches = [] + for i in xrange(0, len(train_set_x), batch_size): + # add to the list of minibatches the minibatch starting at + # position i, ending at position i+batch_size + # a minibatch is a pair ; the first element of the pair is a list + # of datapoints, the second element is the list of corresponding + # labels + train_batches = train_batches + \ + [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])] + + # Dealing with the validation set + (valid_set_x, valid_set_y) = valid_set + # initialize the list of validation minibatches + valid_batches = [] + for i in xrange(0, len(valid_set_x), batch_size): + valid_batches = valid_batches + \ + [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])] + + # Dealing with the testing set + (test_set_x, test_set_y) = test_set + # initialize the list of testing minibatches + test_batches = [] + for i in xrange(0, len(test_set_x), batch_size): + test_batches = test_batches + \ + [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])] + + + ishape = (28,28) # this is the size of MNIST images + + # allocate symbolic variables for the data + x = T.fmatrix() # the data is presented as rasterized images + y = T.lvector() # the labels are presented as 1D vector of + # [long int] labels + + # construct the logistic regression class + classifier = LogisticRegression( \ + input=x.reshape((batch_size,28*28)), n_in=28*28, n_out=10) + + # the cost we minimize during training is the negative log likelihood of + # the model in symbolic format + cost = classifier.negative_log_likelihood(y) + + # compiling a Theano function that computes the mistakes that are made by + # the model on a minibatch + test_model = theano.function([x,y], classifier.errors(y)) + + # compute the gradient of cost with respect to theta = (W,b) + g_W = T.grad(cost, classifier.W) + g_b = T.grad(cost, classifier.b) + + # specify how to update the parameters of the model as a dictionary + updates ={classifier.W: classifier.W - learning_rate*g_W,\ + classifier.b: classifier.b - learning_rate*g_b} + + # compiling a Theano function `train_model` that returns the cost, but in + # the same time updates the parameter of the model based on the rules + # defined in `updates` + train_model = theano.function([x, y], cost, updates = updates ) + + n_minibatches = len(train_batches) # number of minibatchers + + # early-stopping parameters + patience = 5000 # look as this many examples regardless + patience_increase = 2 # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = n_minibatches # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + best_params = None + best_validation_loss = float('inf') + test_score = 0. 
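+    # note (added): summary of the early-stopping scheme used below: training runs for at
+    # most `patience` minibatch updates; whenever the validation error improves by more than
+    # the relative `improvement_threshold`, `patience` is extended to
+    # `iter * patience_increase`, so optimization keeps going as long as it keeps finding
+    # significantly better models and stops soon after progress stalls.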
+ start_time = time.clock() + # have a maximum of `n_iter` iterations through the entire dataset + for iter in xrange(n_iter* n_minibatches): + + # get epoch and minibatch index + epoch = iter / n_minibatches + minibatch_index = iter % n_minibatches + + # get the minibatches corresponding to `iter` modulo + # `len(train_batches)` + x,y = train_batches[ minibatch_index ] + cost_ij = train_model(x,y) + + if (iter+1) % validation_frequency == 0: + # compute zero-one loss on validation set + this_validation_loss = 0. + for x,y in valid_batches: + # sum up the errors for each minibatch + this_validation_loss += test_model(x,y) + # get the average by dividing with the number of minibatches + this_validation_loss /= len(valid_batches) + + print('epoch %i, minibatch %i/%i, validation error %f %%' % \ + (epoch, minibatch_index+1,n_minibatches, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, iter * patience_increase) + + best_validation_loss = this_validation_loss + # test it on the test set + + test_score = 0. + for x,y in test_batches: + test_score += test_model(x,y) + test_score /= len(test_batches) + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % \ + (epoch, minibatch_index+1, n_minibatches,test_score*100.)) + + if patience <= iter : + break + + end_time = time.clock() + print(('Optimization complete with best validation score of %f %%,' + 'with test performance %f %%') % + (best_validation_loss * 100., test_score*100.)) + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + + + + + + + +if __name__ == '__main__': + sgd_optimization_mnist() + diff -r 000000000000 -r fda5f787baa6 code_tutoriel/mlp.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/mlp.py Thu Jan 21 11:26:43 2010 -0500 @@ -0,0 +1,331 @@ +""" +This tutorial introduces the multilayer perceptron using Theano. + + A multilayer perceptron is a logistic regressor where +instead of feeding the input to the logistic regression you insert a +intermidiate layer, called the hidden layer, that has a nonlinear +activation function (usually tanh or sigmoid) . One can use many such +hidden layers making the architecture deep. The tutorial will also tackle +the problem of MNIST digit classification. + +.. math:: + + f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))), + +References: + + - textbooks: "Pattern Recognition and Machine Learning" - + Christopher M. Bishop, section 5 + +TODO: recommended preprocessing, lr ranges, regularization ranges (explain + to do lr first, then add regularization) + +""" +__docformat__ = 'restructedtext en' + + +import numpy, cPickle, gzip + + +import theano +import theano.tensor as T + +import time + +import theano.tensor.nnet + +class MLP(object): + """Multi-Layer Perceptron Class + + A multilayer perceptron is a feedforward artificial neural network model + that has one layer or more of hidden units and nonlinear activations. + Intermidiate layers usually have as activation function thanh or the + sigmoid function while the top layer is a softamx layer. 
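+
+    For the single-hidden-layer network built below, G is the softmax function and
+    s is tanh, so the class computes
+    f(x) = softmax(b^{(2)} + W^{(2)} tanh(b^{(1)} + W^{(1)} x)),
+    matching the formula given in the module docstring above.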
+ """ + + + + def __init__(self, input, n_in, n_hidden, n_out): + """Initialize the parameters for the multilayer perceptron + + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + + :param n_hidden: number of hidden units + + :param n_out: number of output units, the dimension of the space in + which the labels lie + + """ + + # initialize the parameters theta = (W1,b1,W2,b2) ; note that this + # example contains only one hidden layer, but one can have as many + # layers as he/she wishes, making the network deeper. The only + # problem making the network deep this way is during learning, + # backpropagation being unable to move the network from the starting + # point towards; this is where pre-training helps, giving a good + # starting point for backpropagation, but more about this in the + # other tutorials + + # `W1` is initialized with `W1_values` which is uniformely sampled + # from -1/sqrt(n_in) and 1/sqrt(n_in) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + W1_values = numpy.asarray( numpy.random.uniform( \ + low = -numpy.sqrt(6./(n_in+n_hidden)), high = numpy.sqrt(6./(n_in+n_hidden)), \ + size = (n_in, n_hidden)), dtype = theano.config.floatX) + # `W2` is initialized with `W2_values` which is uniformely sampled + # from -1/sqrt(n_hidden) and 1/sqrt(n_hidden) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + W2_values = numpy.asarray( numpy.random.uniform( + low = numpy.sqrt(6./(n_hidden+n_out)), high= numpy.sqrt(6./(n_hidden+n_out)),\ + size= (n_hidden, n_out)), dtype = theano.config.floatX) + + self.W1 = theano.shared( value = W1_values ) + self.b1 = theano.shared( value = numpy.zeros((n_hidden,), + dtype= theano.config.floatX)) + self.W2 = theano.shared( value = W2_values ) + self.b2 = theano.shared( value = numpy.zeros((n_out,), + dtype= theano.config.floatX)) + + # symbolic expression computing the values of the hidden layer + self.hidden = T.tanh(T.dot(input, self.W1)+ self.b1) + + # symbolic expression computing the values of the top layer + self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden, self.W2)+self.b2) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred = T.argmax( self.p_y_given_x, axis =1) + + # L1 norm ; one regularization option is to enforce L1 norm to + # be small + self.L1 = abs(self.W1).sum() + abs(self.W2).sum() + + # square of L2 norm ; one regularization option is to enforce + # square of L2 norm to be small + self.L2_sqr = (self.W1**2).sum() + (self.W2**2).sum() + + + + def negative_log_likelihood(self, y): + """Return the mean of the negative log-likelihood of the prediction + of this model under a given target distribution. + + .. 
math:: + + \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = + \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ + \ell (\theta=\{W,b\}, \mathcal{D}) + + + :param y: corresponds to a vector that gives for each example the + :correct label + """ + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + + + + def errors(self, y): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch + """ + + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + + + +def sgd_optimization_mnist( learning_rate=0.01, L1_reg = 0.00, \ + L2_reg = 0.0001, n_iter=100): + """ + Demonstrate stochastic gradient descent optimization for a multilayer + perceptron + + This is demonstrated on MNIST. + + :param learning_rate: learning rate used (factor for the stochastic + gradient + + :param n_iter: number of iterations ot run the optimizer + + :param L1_reg: L1-norm's weight when added to the cost (see + regularization) + + :param L2_reg: L2-norm's weight when added to the cost (see + regularization) + """ + + # Load the dataset + f = gzip.open('mnist.pkl.gz','rb') + train_set, valid_set, test_set = cPickle.load(f) + f.close() + + # make minibatches of size 20 + batch_size = 20 # sized of the minibatch + + # Dealing with the training set + # get the list of training images (x) and their labels (y) + (train_set_x, train_set_y) = train_set + # initialize the list of training minibatches with empty list + train_batches = [] + for i in xrange(0, len(train_set_x), batch_size): + # add to the list of minibatches the minibatch starting at + # position i, ending at position i+batch_size + # a minibatch is a pair ; the first element of the pair is a list + # of datapoints, the second element is the list of corresponding + # labels + train_batches = train_batches + \ + [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])] + + # Dealing with the validation set + (valid_set_x, valid_set_y) = valid_set + # initialize the list of validation minibatches + valid_batches = [] + for i in xrange(0, len(valid_set_x), batch_size): + valid_batches = valid_batches + \ + [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])] + + # Dealing with the testing set + (test_set_x, test_set_y) = test_set + # initialize the list of testing minibatches + test_batches = [] + for i in xrange(0, len(test_set_x), batch_size): + test_batches = test_batches + \ + [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])] + + + ishape = (28,28) # this is the size of MNIST images + + # allocate symbolic variables for the data + x = T.fmatrix() # the data is presented as rasterized images + y = T.lvector() # the labels are presented as 1D vector of + # [long int] labels + + # construct the logistic regression class + classifier = MLP( input=x.reshape((batch_size,28*28)),\ + n_in=28*28, n_hidden = 500, n_out=10) + + # the cost we minimize during training is the negative log likelihood of + # the model plus the regularization terms (L1 and L2); cost is expressed + # here symbolically + cost = classifier.negative_log_likelihood(y) \ + + 
L1_reg * classifier.L1 \ + + L2_reg * classifier.L2_sqr + + # compiling a theano function that computes the mistakes that are made by + # the model on a minibatch + test_model = theano.function([x,y], classifier.errors(y)) + + # compute the gradient of cost with respect to theta = (W1, b1, W2, b2) + g_W1 = T.grad(cost, classifier.W1) + g_b1 = T.grad(cost, classifier.b1) + g_W2 = T.grad(cost, classifier.W2) + g_b2 = T.grad(cost, classifier.b2) + + # specify how to update the parameters of the model as a dictionary + updates = \ + { classifier.W1: classifier.W1 - learning_rate*g_W1 \ + , classifier.b1: classifier.b1 - learning_rate*g_b1 \ + , classifier.W2: classifier.W2 - learning_rate*g_W2 \ + , classifier.b2: classifier.b2 - learning_rate*g_b2 } + + # compiling a theano function `train_model` that returns the cost, but in + # the same time updates the parameter of the model based on the rules + # defined in `updates` + train_model = theano.function([x, y], cost, updates = updates ) + n_minibatches = len(train_batches) + + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2 # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = n_minibatches # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + + best_params = None + best_validation_loss = float('inf') + test_score = 0. + start_time = time.clock() + # have a maximum of `n_iter` iterations through the entire dataset + for iter in xrange(n_iter* n_minibatches): + + # get epoch and minibatch index + epoch = iter / n_minibatches + minibatch_index = iter % n_minibatches + + # get the minibatches corresponding to `iter` modulo + # `len(train_batches)` + x,y = train_batches[ minibatch_index ] + cost_ij = train_model(x,y) + + if (iter+1) % validation_frequency == 0: + # compute zero-one loss on validation set + this_validation_loss = 0. + for x,y in valid_batches: + # sum up the errors for each minibatch + this_validation_loss += test_model(x,y) + # get the average by dividing with the number of minibatches + this_validation_loss /= len(valid_batches) + + print('epoch %i, minibatch %i/%i, validation error %f %%' % \ + (epoch, minibatch_index+1, n_minibatches, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, iter * patience_increase) + + best_validation_loss = this_validation_loss + # test it on the test set + + test_score = 0. 
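+            # note (added): the test error is only re-evaluated when the validation error
+            # improves, so the score reported at the end of training belongs to the model
+            # that was best on the validation set rather than to the final parameters.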
+ for x,y in test_batches: + test_score += test_model(x,y) + test_score /= len(test_batches) + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, n_minibatches, + test_score*100.)) + + if patience <= iter : + break + + end_time = time.clock() + print(('Optimization complete with best validation score of %f %%,' + 'with test performance %f %%') % + (best_validation_loss * 100., test_score*100.)) + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + + + + + + +if __name__ == '__main__': + sgd_optimization_mnist() + diff -r 000000000000 -r fda5f787baa6 code_tutoriel/rbm.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/rbm.py Thu Jan 21 11:26:43 2010 -0500 @@ -0,0 +1,133 @@ +import numpy +import theano +import theano.tensor as T + +from theano.compile.sandbox.sharedvalue import shared +from theano.compile.sandbox.pfunc import pfunc +from theano.compile.sandbox.shared_randomstreams import RandomStreams +from theano.tensor.nnet import sigmoid + +class A(): + + @execute + def propup(); + # do symbolic prop + self.hid = T.dot( + +class RBM(): + + def __init__(self, input=None, vsize=None, hsize=None, bsize=10, lr=1e-1, seed=123): + """ + RBM constructor. Defines the parameters of the model along with + basic operations for inferring hidden from visible (and vice-versa), as well + as for performing CD updates. + param input: None for standalone RBMs or symbolic variable if RBM is + part of a larger graph. + param vsize: number of visible units + param hsize: number of hidden units + param bsize: size of minibatch + param lr: unsupervised learning rate + param seed: seed for random number generator + """ + assert vsize and hsize + + self.vsize = vsize + self.hsize = hsize + self.lr = shared(lr, 'lr') + + # setup theano random number generator + self.random = RandomStreams(seed) + + #### INITIALIZATION #### + + # initialize input layer for standalone RBM or layer0 of DBN + self.input = input if input else T.dmatrix('input') + # initialize biases + self.b = shared(numpy.zeros(vsize), 'b') + self.c = shared(numpy.zeros(hsize), 'c') + # initialize random weights + rngseed = numpy.random.RandomState(seed).randint(2**30) + rng = numpy.random.RandomState(rngseed) + ubound = 1./numpy.sqrt(max(self.vsize,self.hsize)) + self.w = shared(rng.uniform(low=-ubound, high=ubound, size=(hsize,vsize)), 'w') + + + #### POSITIVE AND NEGATIVE PHASE #### + + # define graph for positive phase + ph, ph_s = self.def_propup(self.input) + # function which computes p(h|v=x) and ~ p(h|v=x) + self.pos_phase = pfunc([self.input], [ph, ph_s]) + + # define graph for negative phase + nv, nv_s = self.def_propdown(ph_s) + nh, nh_s = self.def_propup(nv_s) + # function which computes p(v|h=ph_s), ~ p(v|h=ph_s) and p(h|v=nv_s) + self.neg_phase = pfunc([ph_s], [nv, nv_s, nh, nh_s]) + + # calculate CD gradients for each parameter + db = T.mean(self.input, axis=0) - T.mean(nv, axis=0) + dc = T.mean(ph, axis=0) - T.mean(nh, axis=0) + dwp = T.dot(ph.T, self.input)/nv.shape[0] + dwn = T.dot(nh.T, nv)/nv.shape[0] + dw = dwp - dwn + + # define dictionary of stochastic gradient update equations + updates = {self.b: self.b - self.lr * db, + self.c: self.c - self.lr * dc, + self.w: self.w - self.lr * dw} + + # define private function, which performs one step in direction of CD gradient + self.cd_step = pfunc([self.input, ph, nv, nh], [], updates=updates) + + + def def_propup(self, vis): + """ Symbolic definition of p(hid|vis) """ + hid_activation = T.dot(vis, 
self.w.T) + self.c + hid = sigmoid(hid_activation) + hid_sample = self.random.binomial(T.shape(hid), 1, hid)*1.0 + return hid, hid_sample + + def def_propdown(self, hid): + """ Symbolic definition of p(vis|hid) """ + vis_activation = T.dot(hid, self.w) + self.b + vis = sigmoid(vis_activation) + vis_sample = self.random.binomial(T.shape(vis), 1, vis)*1.0 + return vis, vis_sample + + def cd(self, x, k=1): + """ Performs actual CD update """ + ph, ph_s = self.pos_phase(x) + + nh_s = ph_s + for ki in range(k): + nv, nv_s, nh, nh_s = self.neg_phase(nh_s) + + self.cd_step(x, ph, nv_s, nh) + + + +import os +from pylearn.datasets import MNIST + +if __name__ == '__main__': + + bsize = 10 + + # initialize dataset + dataset = MNIST.first_1k() + # initialize RBM with 784 visible units and 500 hidden units + r = RBM(vsize=784, hsize=500, bsize=bsize, lr=0.1) + + # for a fixed number of epochs ... + for e in range(10): + + print '@epoch %i ' % e + + # iterate over all training set mini-batches + for i in range(len(dataset.train.x)/bsize): + + rng = range(i*bsize,(i+1)*bsize) # index range of subsequent mini-batch + x = dataset.train.x[rng] # next mini-batch + r.cd(x) # perform cd update +
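+            # note (added): minimal optional monitoring sketch, assuming the pos_phase /
+            # neg_phase functions defined above; it recomputes one positive and one negative
+            # phase just to report a reconstruction error, so it roughly doubles the work on
+            # the monitored minibatches, and it was not part of the original tutorial.
+            if i % 50 == 0:
+                ph, ph_s = r.pos_phase(x)               # p(h|v=x) and a sample from it
+                nv, nv_s, nh, nh_s = r.neg_phase(ph_s)  # reconstruction p(v|h) and resampled h
+                recon_err = numpy.mean((x - nv) ** 2)   # mean squared reconstruction error
+                print 'epoch %i, minibatch %i, reconstruction MSE %f' % (e, i, recon_err)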