# HG changeset patch # User Dumitru Erhan # Date 1264390413 18000 # Node ID 0fda55a7de99c9b08ce0f36acd56b9f00f2d857c # Parent fda5f787baa63270fa9e7cded01700db1a544397 removing files that are not ready yet diff -r fda5f787baa6 -r 0fda55a7de99 code_tutoriel/convolutional_mlp.py --- a/code_tutoriel/convolutional_mlp.py Thu Jan 21 11:26:43 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,230 +0,0 @@ - -""" -This tutorial introduces the LeNet5 neural network architecture using Theano. LeNet5 is a -convolutional neural network, good for classifying images. This tutorial shows how to build the -architecture, and comes with all the hyper-parameters you need to reproduce the paper's MNIST -results. - -The best results are obtained after X iterations of the main program loop, which takes *** -minutes on my workstation (an Intel Core i7, circa July 2009), and *** minutes on my GPU (an -NVIDIA GTX 285 graphics processor). - -This implementation simplifies the model in the following ways: - - - LeNetConvPool doesn't implement location-specific gain and bias parameters - - - LeNetConvPool doesn't implement pooling by average, it implements pooling by max. - - - Digit classification is implemented with a logistic regression rather than an RBF network - - - LeNet5 was not fully-connected convolutions at second layer - -References: - - - Y. LeCun, L. Bottou, Y. Bengio and P. Haffner: Gradient-Based Learning Applied to Document - Recognition, Proceedings of the IEEE, 86(11):2278-2324, November 1998. - http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf - - -""" -import numpy -from theano.compile.sandbox import shared, pfunc -from theano import tensor -from pylearn.shared.layers import LogisticRegression, SigmoidalLayer -import theano.sandbox.softsign -import pylearn.datasets.MNIST - - -try: - # this tells theano to use the GPU if possible - from theano.sandbox.cuda import use - use() -except Exception, e: - print('Warning: Attempt to use GPU resulted in error "%s"' % str(e)) - -class LeNetConvPool(object): - """WRITEME - - Math of what the layer does, and what symbolic variables are created by the class (w, b, - output). - - """ - - #TODO: implement biases & scales properly. There are supposed to be more parameters. - # - one bias & scale per filter - # - one bias & scale per downsample feature location (a 2d bias) - # - more? - - def __init__(self, rng, input, n_examples, n_imgs, img_shape, n_filters, filter_shape=(5,5), - poolsize=(2,2)): - """ - Allocate a LeNetConvPool layer with shared variable internal parameters. - - :param rng: a random number generator used to initialize weights - - :param input: symbolic images. Shape: (n_examples, n_imgs, img_shape[0], img_shape[1]) - - :param n_examples: input's shape[0] at runtime - - :param n_imgs: input's shape[1] at runtime - - :param img_shape: input's shape[2:4] at runtime - - :param n_filters: the number of filters to apply to the image. - - :param filter_shape: the size of the filters to apply - :type filter_shape: pair (rows, cols) - - :param poolsize: the downsampling (pooling) factor - :type poolsize: pair (rows, cols) - """ - - #TODO: make a simpler convolution constructor!! - # - make dx and dy optional - # - why do we have to pass shapes? (Can we make them optional at least?) - conv_op = ConvOp((n_imgs,)+img_shape, filter_shape, n_filters, n_examples, - dx=1, dy=1, output_mode='valid') - - # - why is poolsize an op parameter here? - # - can we just have a maxpool function that creates this Op internally? - ds_op = DownsampleFactorMax(poolsize, ignore_border=True) - - # the filter tensor that we will apply is a 4D tensor - w_shp = (n_filters, n_imgs) + filter_shape - - # the bias we add is a 1D tensor - b_shp = (n_filters,) - - self.w = shared( - numpy.asarray( - rng.uniform( - low=-1.0 / numpy.sqrt(filter_shape[0] * filter_shape[1] * n_imgs), - high=1.0 / numpy.sqrt(filter_shape[0] * filter_shape[1] * n_imgs), - size=w_shp), - dtype=input.dtype)) - self.b = shared( - numpy.asarray( - rng.uniform(low=-.0, high=0., size=(n_filters,)), - dtype=input.dtype)) - - self.input = input - conv_out = conv_op(input, self.w) - self.output = tensor.tanh(ds_op(conv_out) + b.dimshuffle('x', 0, 'x', 'x')) - self.params = [self.w, self.b] - -class SigmoidalLayer(object): - def __init__(self, input, n_in, n_out): - """ - :param input: a symbolic tensor of shape (n_examples, n_in) - :param w: a symbolic weight matrix of shape (n_in, n_out) - :param b: symbolic bias terms of shape (n_out,) - :param squash: an squashing function - """ - self.input = input - self.w = shared( - numpy.asarray( - rng.uniform(low=-2/numpy.sqrt(n_in), high=2/numpy.sqrt(n_in), - size=(n_in, n_out)), dtype=input.dtype)) - self.b = shared(numpy.asarray(numpy.zeros(n_out), dtype=input.dtype)) - self.output = tensor.tanh(tensor.dot(input, self.w) + self.b) - self.params = [self.w, self.b] - -class LogisticRegression(object): - """WRITEME""" - - def __init__(self, input, n_in, n_out): - self.w = shared(numpy.zeros((n_in, n_out), dtype=input.dtype)) - self.b = shared(numpy.zeros((n_out,), dtype=input.dtype)) - self.l1=abs(self.w).sum() - self.l2_sqr = (self.w**2).sum() - self.output=nnet.softmax(theano.dot(input, self.w)+self.b) - self.argmax=theano.tensor.argmax(self.output, axis=1) - self.params = [self.w, self.b] - - def nll(self, target): - """Return the negative log-likelihood of the prediction of this model under a given - target distribution. Passing symbolic integers here means 1-hot. - WRITEME - """ - return nnet.categorical_crossentropy(self.output, target) - - def errors(self, target): - """Return a vector of 0s and 1s, with 1s on every line that was mis-classified. - """ - if target.ndim != self.argmax.ndim: - raise TypeError('target should have the same shape as self.argmax', ('target', target.type, - 'argmax', self.argmax.type)) - if target.dtype.startswith('int'): - return theano.tensor.neq(self.argmax, target) - else: - raise NotImplementedError() - -def evaluate_lenet5(batch_size=30, n_iter=1000): - rng = numpy.random.RandomState(23455) - - mnist = pylearn.datasets.MNIST.train_valid_test() - - ishape=(28,28) #this is the size of MNIST images - - # allocate symbolic variables for the data - x = tensor.fmatrix() # the data is presented as rasterized images - y = tensor.lvector() # the labels are presented as 1D vector of [long int] labels - - # construct the first convolutional pooling layer - layer0 = LeNetConvPool.new(rng, input=x.reshape((batch_size,1,28,28)), n_examples=batch_size, - n_imgs=1, img_shape=ishape, - n_filters=6, filter_shape=(5,5), - poolsize=(2,2)) - - # construct the second convolutional pooling layer - layer1 = LeNetConvPool.new(rng, input=layer0.output, n_examples=batch_size, - n_imgs=6, img_shape=(12,12), - n_filters=16, filter_shape=(5,5), - poolsize=(2,2)) - - # construct a fully-connected sigmoidal layer - layer2 = SigmoidalLayer.new(rng, input=layer1.output.flatten(2), n_in=16*16, n_out=128) # 128 ? - - # classify the values of the fully-connected sigmoidal layer - layer3 = LogisticRegression.new(input=layer2.output, n_in=128, n_out=10) - - # the cost we minimize during training is the NLL of the model - cost = layer3.nll(y).mean() - - # create a function to compute the mistakes that are made by the model - test_model = pfunc([x,y], layer3.errors(y)) - - # create a list of all model parameters to be fit by gradient descent - params = layer3.params+ layer2.params+ layer1.params + layer0.params - learning_rate = numpy.asarray(0.01, dtype='float32') - - # train_model is a function that updates the model parameters by SGD - train_model = pfunc([x, y], cost, - updates=[(p, p - learning_rate*gp) for p,gp in zip(params, tensor.grad(cost, params))]) - - # IS IT MORE SIMPLE TO USE A MINIMIZER OR THE DIRECT CODE? - - best_valid_score = float('inf') - for i in xrange(n_iter): - for j in xrange(len(mnist.train.x)/batch_size): - cost_ij = train_model( - mnist.train.x[j*batch_size:(j+1)*batch_size], - mnist.train.y[j*batch_size:(j+1)*batch_size]) - #if 0 == j % 100: - #print('epoch %i:%i, training error %f' % (i, j*batch_size, cost_ij)) - valid_score = numpy.mean([test_model( - mnist.valid.x[j*batch_size:(j+1)*batch_size], - mnist.valid.y[j*batch_size:(j+1)*batch_size]) - for j in xrange(len(mnist.valid.x)/batch_size)]) - print('epoch %i, validation error %f' % (i, valid_score)) - if valid_score < best_valid_score: - best_valid_score = valid_score - test_score = numpy.mean([test_model( - mnist.test.x[j*batch_size:(j+1)*batch_size], - mnist.test.y[j*batch_size:(j+1)*batch_size]) - for j in xrange(len(mnist.test.x)/batch_size)]) - print('epoch %i, test error of best model %f' % (i, test_score)) - -if __name__ == '__main__': - evaluate_lenet5() - diff -r fda5f787baa6 -r 0fda55a7de99 code_tutoriel/dae.py --- a/code_tutoriel/dae.py Thu Jan 21 11:26:43 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,240 +0,0 @@ -""" - This tutorial introduces denoising auto-encoders using Theano. - - Denoising autoencoders can be used as building blocks for deep networks. - They are based on auto-encoders as the ones used in Bengio et al. 2007. - An autoencoder takes an input x and first maps it to a hidden representation - y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting - latent representation y is then mapped back to a "reconstructed" vector - z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight - matrix W' can optionally be constrained such that W' = W^T, in which case - the autoencoder is said to have tied weights. The network is trained such - that to minimize the reconstruction error (the error between x and z). - - For the denosing autoencoder, during training, first x is corrupted into - \tilde{x}, where \tilde{x} is a partially destroyed version of x by means - of a stochastic mapping. Afterwards y is computed as before (using - \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction - error is now measured between z and the uncorrupted input x, which is - computed as the cross-entropy : - - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)] - - For X iteration of the main program loop it takes *** minutes on an - Intel Core i7 and *** minutes on GPU (NVIDIA GTX 285 graphics processor). - - - References : - - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and - Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103, - 2008 - - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise - Training of Deep Networks, Advances in Neural Information Processing - Systems 19, 2007 - -""" - -import numpy -from theano import tensor -from theano.compile.sandbox import shared, pfunc -from theano.compile.sandbox.shared_randomstreams import RandomStreams -from theano.tensor import nnet -import pylearn.datasets.MNIST - - -try: - #this tells theano to use the GPU if possible - from theano.sandbox.cuda import use - use() -except Exception,e: - print ('Warning: Attempt to use GPU resulted in error "%s"'%str(e)) - - -def load_mnist_batches(batch_size): - """ - We should remove the dependency on pylearn.datasets.MNIST .. and maybe - provide a pickled version of the dataset.. - """ - mnist = pylearn.datasets.MNIST.train_valid_test() - train_batches = [(mnist.train.x[i:i+batch_size],mnist.train.y[i:i+batch_size]) - for i in xrange(0, len(mnist.train.x), batch_size)] - valid_batches = [(mnist.valid.x[i:i+batch_size], mnist.valid.y[i:i+batch_size]) - for i in xrange(0, len(mnist.valid.x), batch_size)] - test_batches = [(mnist.test.x[i:i+batch_size], mnist.test.y[i:i+batch_size]) - for i in xrange(0, len(mnist.test.x), batch_size)] - return train_batches, valid_batches, test_batches - - - - -class DAE(): - """Denoising Auto-Encoder class - - A denoising autoencoders tried to reconstruct the input from a corrupted - version of it by projecting it first in a latent space and reprojecting - it in the input space. Please refer to Vincent et al.,2008 for more - details. If x is the input then equation (1) computes a partially destroyed - version of x by means of a stochastic mapping q_D. Equation (2) computes - the projection of the input into the latent space. Equation (3) computes - the reconstruction of the input, while equation (4) computes the - reconstruction error. - - .. latex-eqn: - \tilde{x} ~ q_D(\tilde{x}|x) (1) - y = s(W \tilde{x} + b) (2) - x = s(W' y + b') (3) - L(x,z) = -sum_{k=1}^d [x_k \log z_k + (1-x_k) \log( 1-z_k)] (4) - - Tricks and thumbrules for DAE - - learning rate should be used in a logarithmic scale ... - """ - - def __init__(self, n_visible= 784, n_hidden= 500, lr= 1e-1, input= None): - """ - Initialize the DAE class by specifying the number of visible units (the - dimension d of the input ), the number of hidden units ( the dimension - d' of the latent or hidden space ), a initial value for the learning rate - and by giving a symbolic description of the input. Such a symbolic - description is of no importance for the simple DAE and therefore can be - ignored. This feature is useful when stacking DAEs, since the input of - intermediate layers can be symbolically described in terms of the hidden - units of the previous layer. See the tutorial on SDAE for more details. - - :param n_visible: number of visible units - :param n_hidden: number of hidden units - :param lr: a initial value for the learning rate - :param input: a symbolic description of the input or None - """ - self.n_visible = n_visible - self.n_hidden = n_hidden - - # create a Theano random generator that gives symbolic random values - theano_rng = RandomStreams( seed = 1234 ) - # create a numpy random generator - numpy_rng = numpy.random.RandomState( seed = 52432 ) - - - # initial values for weights and biases - # note : W' was written as W_prime and b' as b_prime - initial_W = numpy_rng.uniform(size = (n_visible, n_hidden)) - # transform W such that all values are between -.01 and .01 - initial_W = (initial_W*2.0 - 1.0)*.01 - initial_b = numpy.zeros(n_hidden) - initial_W_prime = numpy_rng.uniform(size = (n_hidden, n_visible)) - # transform W_prime such that all values are between -.01 and .01 - initial_W_prime = (initial_W_prime*2.0 - 1.0)*.01 - initial_b_prime= numpy.zeros(n_visible) - - - # theano shared variables for weights and biases - self.W = shared(value = initial_W , name = "W") - self.b = shared(value = initial_b , name = "b") - self.W_prime = shared(value = initial_W_prime, name = "W'") - self.b_prime = shared(value = initial_b_prime, name = "b'") - - # theano shared variable for the learning rate - self.lr = shared(value = lr , name = "learning_rate") - - # if no input is given generate a variable representing the input - if input == None : - # we use a matrix because we expect a minibatch of several examples, - # each example being a row - x = tensor.dmatrix(name = 'input') - else: - x = input - # Equation (1) - # note : first argument of theano.rng.binomial is the shape(size) of - # random numbers that it should produce - # second argument is the number of trials - # third argument is the probability of success of any trial - # - # this will produce an array of 0s and 1s where 1 has a - # probability of 0.9 and 0 if 0.1 - tilde_x = theano_rng.binomial( x.shape, 1, 0.9) * x - # Equation (2) - # note : y is stored as an attribute of the class so that it can be - # used later when stacking DAEs. - self.y = nnet.sigmoid(tensor.dot(tilde_x, self.W ) + self.b) - # Equation (3) - z = nnet.sigmoid(tensor.dot(self.y, self.W_prime) + self.b_prime) - # Equation (4) - L = - tensor.sum( x*tensor.log(z) + (1-x)*tensor.log(1-z), axis=1 ) - # note : L is now a vector, where each element is the cross-entropy cost - # of the reconstruction of the corresponding example of the - # minibatch. We need to sum all these to get the cost of the - # minibatch - cost = tensor.sum(L) - # parameters with respect to whom we need to compute the gradient - self.params = [ self.W, self.b, self.W_prime, self.b_prime] - # use theano automatic differentiation to get the gradients - gW, gb, gW_prime, gb_prime = tensor.grad(cost, self.params) - # update the parameters in the direction of the gradient using the - # learning rate - updated_W = self.W - gW * self.lr - updated_b = self.b - gb * self.lr - updated_W_prime = self.W_prime - gW_prime * self.lr - updated_b_prime = self.b_prime - gb_prime * self.lr - - # defining the function that evaluate the symbolic description of - # one update step - self.update = pfunc(params = [x], outputs = cost, updates = - { self.W : updated_W, - self.b : updated_b, - self.W_prime : updated_W_prime, - self.b_prime : updated_b_prime } ) - self.get_cost = pfunc(params = [x], outputs = cost) - - - - - - - - - - - -def train_DAE_mnist(): - """ - Trains a DAE on the MNIST dataset (http://yann.lecun.com/exdb/mnist) - """ - - # load dataset as batches - train_batches,valid_batches,test_batches=load_mnist_batches(batch_size=16) - - # Create a denoising auto-encoders with 28*28 = 784 input units, and 500 - # units in the hidden layer (latent layer); Learning rate is set to 1e-1 - dae = DAE( n_visible = 784, n_hidden = 500, lr = 1e-2) - - # Number of iterations (epochs) to run - n_iter = 30 - best_valid_score = float('inf') - test_score = float('inf') - for i in xrange(n_iter): - # train once over the dataset - for x,y in train_batches: - cost = dae.update(x) - - # compute validation error - valid_cost = 0. - for x,y in valid_batches: - valid_cost = valid_cost + dae.get_cost(x) - valid_cost = valid_cost / len(valid_batches) - print('epoch %i, validation reconstruction error %f '%(i,valid_cost)) - - if valid_cost < best_valid_score : - best_valid_score = valid_cost - # compute test error !? - test_score = 0. - for x,y in test_batches: - test_score = test_score + dae.get_cost(x) - test_score = test_score / len(test_batches) - print('epoch %i, test error of best model %f' % (i, test_score)) - - print('Optimization done. Best validation score %f, test performance %f' % - (best_valid_score, test_score)) - - - -if __name__ == "__main__": - train_DAE_mnist() - diff -r fda5f787baa6 -r 0fda55a7de99 code_tutoriel/dbn.py --- a/code_tutoriel/dbn.py Thu Jan 21 11:26:43 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ -import numpy -import theano -import theano.tensor as T - -from deeplearning import rbm - -class DBN(): - - def __init__(self, vsize=None, hsizes=[], lr=None, bsize=10, seed=123): - assert vsize and hsizes and lr - - input = T.dmatrix('global_input') - - self.layers = [] - for hsize in hsizes: - r = rbm.RBM(input=input, vsize=vsize, hsize=hsize, bsize=bsize, - lr=lr, seed=seed) - self.layers.append(r) - - # configure inputs for subsequent layer - input = self.layers[-1].hid - vsize = hsize - - diff -r fda5f787baa6 -r 0fda55a7de99 code_tutoriel/rbm.py --- a/code_tutoriel/rbm.py Thu Jan 21 11:26:43 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,133 +0,0 @@ -import numpy -import theano -import theano.tensor as T - -from theano.compile.sandbox.sharedvalue import shared -from theano.compile.sandbox.pfunc import pfunc -from theano.compile.sandbox.shared_randomstreams import RandomStreams -from theano.tensor.nnet import sigmoid - -class A(): - - @execute - def propup(); - # do symbolic prop - self.hid = T.dot( - -class RBM(): - - def __init__(self, input=None, vsize=None, hsize=None, bsize=10, lr=1e-1, seed=123): - """ - RBM constructor. Defines the parameters of the model along with - basic operations for inferring hidden from visible (and vice-versa), as well - as for performing CD updates. - param input: None for standalone RBMs or symbolic variable if RBM is - part of a larger graph. - param vsize: number of visible units - param hsize: number of hidden units - param bsize: size of minibatch - param lr: unsupervised learning rate - param seed: seed for random number generator - """ - assert vsize and hsize - - self.vsize = vsize - self.hsize = hsize - self.lr = shared(lr, 'lr') - - # setup theano random number generator - self.random = RandomStreams(seed) - - #### INITIALIZATION #### - - # initialize input layer for standalone RBM or layer0 of DBN - self.input = input if input else T.dmatrix('input') - # initialize biases - self.b = shared(numpy.zeros(vsize), 'b') - self.c = shared(numpy.zeros(hsize), 'c') - # initialize random weights - rngseed = numpy.random.RandomState(seed).randint(2**30) - rng = numpy.random.RandomState(rngseed) - ubound = 1./numpy.sqrt(max(self.vsize,self.hsize)) - self.w = shared(rng.uniform(low=-ubound, high=ubound, size=(hsize,vsize)), 'w') - - - #### POSITIVE AND NEGATIVE PHASE #### - - # define graph for positive phase - ph, ph_s = self.def_propup(self.input) - # function which computes p(h|v=x) and ~ p(h|v=x) - self.pos_phase = pfunc([self.input], [ph, ph_s]) - - # define graph for negative phase - nv, nv_s = self.def_propdown(ph_s) - nh, nh_s = self.def_propup(nv_s) - # function which computes p(v|h=ph_s), ~ p(v|h=ph_s) and p(h|v=nv_s) - self.neg_phase = pfunc([ph_s], [nv, nv_s, nh, nh_s]) - - # calculate CD gradients for each parameter - db = T.mean(self.input, axis=0) - T.mean(nv, axis=0) - dc = T.mean(ph, axis=0) - T.mean(nh, axis=0) - dwp = T.dot(ph.T, self.input)/nv.shape[0] - dwn = T.dot(nh.T, nv)/nv.shape[0] - dw = dwp - dwn - - # define dictionary of stochastic gradient update equations - updates = {self.b: self.b - self.lr * db, - self.c: self.c - self.lr * dc, - self.w: self.w - self.lr * dw} - - # define private function, which performs one step in direction of CD gradient - self.cd_step = pfunc([self.input, ph, nv, nh], [], updates=updates) - - - def def_propup(self, vis): - """ Symbolic definition of p(hid|vis) """ - hid_activation = T.dot(vis, self.w.T) + self.c - hid = sigmoid(hid_activation) - hid_sample = self.random.binomial(T.shape(hid), 1, hid)*1.0 - return hid, hid_sample - - def def_propdown(self, hid): - """ Symbolic definition of p(vis|hid) """ - vis_activation = T.dot(hid, self.w) + self.b - vis = sigmoid(vis_activation) - vis_sample = self.random.binomial(T.shape(vis), 1, vis)*1.0 - return vis, vis_sample - - def cd(self, x, k=1): - """ Performs actual CD update """ - ph, ph_s = self.pos_phase(x) - - nh_s = ph_s - for ki in range(k): - nv, nv_s, nh, nh_s = self.neg_phase(nh_s) - - self.cd_step(x, ph, nv_s, nh) - - - -import os -from pylearn.datasets import MNIST - -if __name__ == '__main__': - - bsize = 10 - - # initialize dataset - dataset = MNIST.first_1k() - # initialize RBM with 784 visible units and 500 hidden units - r = RBM(vsize=784, hsize=500, bsize=bsize, lr=0.1) - - # for a fixed number of epochs ... - for e in range(10): - - print '@epoch %i ' % e - - # iterate over all training set mini-batches - for i in range(len(dataset.train.x)/bsize): - - rng = range(i*bsize,(i+1)*bsize) # index range of subsequent mini-batch - x = dataset.train.x[rng] # next mini-batch - r.cd(x) # perform cd update -