# HG changeset patch # User Arnaud Bergeron # Date 1269898964 14400 # Node ID a9af079892ce2a36c218387aa18c540b89412183 # Parent a222af1d0598c750240eac8c89c5970360ec77e4# Parent 6f606b359df3881f839ca2786b2c372da8c93dde branch merge diff -r 6f606b359df3 -r a9af079892ce .hgignore --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.hgignore Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,6 @@ +# use glob syntax. +syntax: glob + +*.pyc +*~ + diff -r 6f606b359df3 -r a9af079892ce __init__.py diff -r 6f606b359df3 -r a9af079892ce baseline/__init__.py diff -r 6f606b359df3 -r a9af079892ce baseline/conv_mlp/__init__.py diff -r 6f606b359df3 -r a9af079892ce baseline/conv_mlp/convolutional_mlp.conf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/baseline/conv_mlp/convolutional_mlp.conf Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,7 @@ +learning_rate=0.01 +n_iter=1 +batch_size=20 +n_kern0=20 +n_kern1=50 +filter_shape=5 +n_layer=3 \ No newline at end of file diff -r 6f606b359df3 -r a9af079892ce baseline/conv_mlp/convolutional_mlp.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/baseline/conv_mlp/convolutional_mlp.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,479 @@ +""" +This tutorial introduces the LeNet5 neural network architecture using Theano. LeNet5 is a +convolutional neural network, good for classifying images. This tutorial shows how to build the +architecture, and comes with all the hyper-parameters you need to reproduce the paper's MNIST +results. + +The best results are obtained after X iterations of the main program loop, which takes *** +minutes on my workstation (an Intel Core i7, circa July 2009), and *** minutes on my GPU (an +NVIDIA GTX 285 graphics processor). + +This implementation simplifies the model in the following ways: + + - LeNetConvPool doesn't implement location-specific gain and bias parameters + - LeNetConvPool doesn't implement pooling by average, it implements pooling by max. + - Digit classification is implemented with a logistic regression rather than an RBF network + - LeNet5 was not fully-connected convolutions at second layer + +References: + - Y. LeCun, L. Bottou, Y. Bengio and P. Haffner: Gradient-Based Learning Applied to Document + Recognition, Proceedings of the IEEE, 86(11):2278-2324, November 1998. + http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf +""" + +import numpy, theano, cPickle, gzip, time +import theano.tensor as T +import theano.sandbox.softsign +import sys +import pylearn.datasets.MNIST +from pylearn.io import filetensor as ft +from theano.sandbox import conv, downsample + +from ift6266 import datasets +import theano,pylearn.version,ift6266 + +class LeNetConvPoolLayer(object): + + def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2,2)): + """ + Allocate a LeNetConvPoolLayer with shared variable internal parameters. 
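+
+ The layer convolves its input with the filter bank W, max-pools the result
+ over non-overlapping `poolsize` windows, adds a per-feature-map bias and
+ applies tanh. Illustrative shapes: with image_shape=(20,1,32,32),
+ filter_shape=(20,1,5,5) and poolsize=(2,2), the convolution yields a
+ (20,20,28,28) tensor and max-pooling reduces it to (20,20,14,14).
+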
+ :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + :type input: theano.tensor.dtensor4 + :param input: symbolic image tensor, of shape image_shape + :type filter_shape: tuple or list of length 4 + :param filter_shape: (number of filters, num input feature maps, + filter height,filter width) + :type image_shape: tuple or list of length 4 + :param image_shape: (batch size, num input feature maps, + image height, image width) + :type poolsize: tuple or list of length 2 + :param poolsize: the downsampling (pooling) factor (#rows,#cols) + """ + assert image_shape[1]==filter_shape[1] + self.input = input + + # initialize weight values: the fan-in of each hidden neuron is + # restricted by the size of the receptive fields. + fan_in = numpy.prod(filter_shape[1:]) + W_values = numpy.asarray( rng.uniform( \ + low = -numpy.sqrt(3./fan_in), \ + high = numpy.sqrt(3./fan_in), \ + size = filter_shape), dtype = theano.config.floatX) + self.W = theano.shared(value = W_values) + + # the bias is a 1D tensor -- one bias per output feature map + b_values = numpy.zeros((filter_shape[0],), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + # convolve input feature maps with filters + conv_out = conv.conv2d(input, self.W, + filter_shape=filter_shape, image_shape=image_shape) + + # downsample each feature map individually, using maxpooling + pooled_out = downsample.max_pool2D(conv_out, poolsize, ignore_border=True) + + # add the bias term. Since the bias is a vector (1D array), we first + # reshape it to a tensor of shape (1,n_filters,1,1). Each bias will thus + # be broadcasted across mini-batches and feature map width & height + self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) + + # store parameters of this layer + self.params = [self.W, self.b] + + +class SigmoidalLayer(object): + def __init__(self, rng, input, n_in, n_out): + """ + Typical hidden layer of a MLP: units are fully-connected and have + sigmoidal activation function. Weight matrix W is of shape (n_in,n_out) + and the bias vector b is of shape (n_out,). + + Hidden unit activation is given by: sigmoid(dot(input,W) + b) + + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + :type input: theano.tensor.dmatrix + :param input: a symbolic tensor of shape (n_examples, n_in) + :type n_in: int + :param n_in: dimensionality of input + :type n_out: int + :param n_out: number of hidden units + """ + self.input = input + + W_values = numpy.asarray( rng.uniform( \ + low = -numpy.sqrt(6./(n_in+n_out)), \ + high = numpy.sqrt(6./(n_in+n_out)), \ + size = (n_in, n_out)), dtype = theano.config.floatX) + self.W = theano.shared(value = W_values) + + b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + self.output = T.tanh(T.dot(input, self.W) + self.b) + self.params = [self.W, self.b] + + +class LogisticRegression(object): + """Multi-class Logistic Regression Class + + The logistic regression is fully described by a weight matrix :math:`W` + and bias vector :math:`b`. Classification is done by projecting data + points onto a set of hyperplanes, the distance to which is used to + determine a class membership probability. 
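+
+ Concretely, the layer computes p(y=i | x, W, b) = softmax_i(W x + b) and
+ predicts y_pred = argmax_i p(y=i | x, W, b), as built symbolically in
+ __init__ below.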
+ """ + + def __init__(self, input, n_in, n_out): + """ Initialize the parameters of the logistic regression + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + :type n_in: int + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + :type n_out: int + :param n_out: number of output units, the dimension of the space in + which the labels lie + """ + + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) + self.W = theano.shared( value=numpy.zeros((n_in,n_out), + dtype = theano.config.floatX) ) + # initialize the baises b as a vector of n_out 0s + self.b = theano.shared( value=numpy.zeros((n_out,), + dtype = theano.config.floatX) ) + # compute vector of class-membership probabilities in symbolic form + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred=T.argmax(self.p_y_given_x, axis=1) + + # list of parameters for this layer + self.params = [self.W, self.b] + + def negative_log_likelihood(self, y): + """Return the mean of the negative log-likelihood of the prediction + of this model under a given target distribution. + :param y: corresponds to a vector that gives for each example the + correct label + Note: we use the mean instead of the sum so that + the learning rate is less dependent on the batch size + """ + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + def errors(self, y): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch ; zero one + loss over the size of the minibatch + """ + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + + +def evaluate_lenet5(learning_rate=0.1, n_iter=200, batch_size=20, n_kern0=20, n_kern1=50, n_layer=3, filter_shape0=5, filter_shape1=5, sigmoide_size=500, dataset='mnist.pkl.gz'): + rng = numpy.random.RandomState(23455) + + print 'Before load dataset' + dataset=datasets.nist_digits + train_batches= dataset.train(batch_size) + valid_batches=dataset.valid(batch_size) + test_batches=dataset.test(batch_size) + #print valid_batches.shape + #print test_batches.shape + print 'After load dataset' + + ishape = (32,32) # this is the size of NIST images + n_kern2=80 + n_kern3=100 + if n_layer==4: + filter_shape1=3 + filter_shape2=3 + if n_layer==5: + filter_shape0=4 + filter_shape1=2 + filter_shape2=2 + filter_shape3=2 + + + # allocate symbolic variables for the data + x = T.matrix('x') # rasterized images + y = T.lvector() # the labels are presented as 1D vector of [long int] labels + + + ###################### + # BUILD ACTUAL MODEL # + ###################### + + # Reshape matrix of rasterized images of shape (batch_size,28*28) + # to a 4D tensor, compatible with our LeNetConvPoolLayer + layer0_input = x.reshape((batch_size,1,32,32)) + + # Construct the first convolutional pooling layer: + # filtering reduces the image size to (32-5+1,32-5+1)=(28,28) + # maxpooling reduces this further to (28/2,28/2) = (14,14) + # 4D output tensor is thus of shape (20,20,14,14) + layer0 = 
LeNetConvPoolLayer(rng, input=layer0_input, + image_shape=(batch_size,1,32,32), + filter_shape=(n_kern0,1,filter_shape0,filter_shape0), poolsize=(2,2)) + + if(n_layer>2): + + # Construct the second convolutional pooling layer + # filtering reduces the image size to (14-5+1,14-5+1)=(10,10) + # maxpooling reduces this further to (10/2,10/2) = (5,5) + # 4D output tensor is thus of shape (20,50,5,5) + fshape0=(32-filter_shape0+1)/2 + layer1 = LeNetConvPoolLayer(rng, input=layer0.output, + image_shape=(batch_size,n_kern0,fshape0,fshape0), + filter_shape=(n_kern1,n_kern0,filter_shape1,filter_shape1), poolsize=(2,2)) + + else: + + fshape0=(32-filter_shape0+1)/2 + layer1_input = layer0.output.flatten(2) + # construct a fully-connected sigmoidal layer + layer1 = SigmoidalLayer(rng, input=layer1_input,n_in=n_kern0*fshape0*fshape0, n_out=sigmoide_size) + + layer2 = LogisticRegression(input=layer1.output, n_in=sigmoide_size, n_out=10) + cost = layer2.negative_log_likelihood(y) + test_model = theano.function([x,y], layer2.errors(y)) + params = layer2.params+ layer1.params + layer0.params + + + if(n_layer>3): + + fshape0=(32-filter_shape0+1)/2 + fshape1=(fshape0-filter_shape1+1)/2 + layer2 = LeNetConvPoolLayer(rng, input=layer1.output, + image_shape=(batch_size,n_kern1,fshape1,fshape1), + filter_shape=(n_kern2,n_kern1,filter_shape2,filter_shape2), poolsize=(2,2)) + + if(n_layer>4): + + + fshape0=(32-filter_shape0+1)/2 + fshape1=(fshape0-filter_shape1+1)/2 + fshape2=(fshape1-filter_shape2+1)/2 + fshape3=(fshape2-filter_shape3+1)/2 + layer3 = LeNetConvPoolLayer(rng, input=layer2.output, + image_shape=(batch_size,n_kern2,fshape2,fshape2), + filter_shape=(n_kern3,n_kern2,filter_shape3,filter_shape3), poolsize=(2,2)) + + layer4_input = layer3.output.flatten(2) + + layer4 = SigmoidalLayer(rng, input=layer4_input, + n_in=n_kern3*fshape3*fshape3, n_out=sigmoide_size) + + + layer5 = LogisticRegression(input=layer4.output, n_in=sigmoide_size, n_out=10) + + cost = layer5.negative_log_likelihood(y) + + test_model = theano.function([x,y], layer5.errors(y)) + + params = layer5.params+ layer4.params+ layer3.params+ layer2.params+ layer1.params + layer0.params + + elif(n_layer>3): + + fshape0=(32-filter_shape0+1)/2 + fshape1=(fshape0-filter_shape1+1)/2 + fshape2=(fshape1-filter_shape2+1)/2 + layer3_input = layer2.output.flatten(2) + + layer3 = SigmoidalLayer(rng, input=layer3_input, + n_in=n_kern2*fshape2*fshape2, n_out=sigmoide_size) + + + layer4 = LogisticRegression(input=layer3.output, n_in=sigmoide_size, n_out=10) + + cost = layer4.negative_log_likelihood(y) + + test_model = theano.function([x,y], layer4.errors(y)) + + params = layer4.params+ layer3.params+ layer2.params+ layer1.params + layer0.params + + + elif(n_layer>2): + + fshape0=(32-filter_shape0+1)/2 + fshape1=(fshape0-filter_shape1+1)/2 + + # the SigmoidalLayer being fully-connected, it operates on 2D matrices of + # shape (batch_size,num_pixels) (i.e matrix of rasterized images). 
+ # This will generate a matrix of shape (20,32*4*4) = (20,512) + layer2_input = layer1.output.flatten(2) + + # construct a fully-connected sigmoidal layer + layer2 = SigmoidalLayer(rng, input=layer2_input, + n_in=n_kern1*fshape1*fshape1, n_out=sigmoide_size) + + + # classify the values of the fully-connected sigmoidal layer + layer3 = LogisticRegression(input=layer2.output, n_in=sigmoide_size, n_out=10) + + # the cost we minimize during training is the NLL of the model + cost = layer3.negative_log_likelihood(y) + + # create a function to compute the mistakes that are made by the model + test_model = theano.function([x,y], layer3.errors(y)) + + # create a list of all model parameters to be fit by gradient descent + params = layer3.params+ layer2.params+ layer1.params + layer0.params + + + + + + # create a list of gradients for all model parameters + grads = T.grad(cost, params) + + # train_model is a function that updates the model parameters by SGD + # Since this model has many parameters, it would be tedious to manually + # create an update rule for each model parameter. We thus create the updates + # dictionary by automatically looping over all (params[i],grads[i]) pairs. + updates = {} + for param_i, grad_i in zip(params, grads): + updates[param_i] = param_i - learning_rate * grad_i + train_model = theano.function([x, y], cost, updates=updates) + + + ############### + # TRAIN MODEL # + ############### + + #n_minibatches = len(train_batches) + n_minibatches=0 + n_valid=0 + n_test=0 + for x, y in dataset.train(batch_size): + if x.shape[0] == batch_size: + n_minibatches+=1 + n_minibatches*=batch_size + print n_minibatches + + for x, y in dataset.valid(batch_size): + if x.shape[0] == batch_size: + n_valid+=1 + n_valid*=batch_size + print n_valid + + for x, y in dataset.test(batch_size): + if x.shape[0] == batch_size: + n_test+=1 + n_test*=batch_size + print n_test + + + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2 # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = n_minibatches # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + best_params = None + best_validation_loss = float('inf') + best_iter = 0 + test_score = 0. + start_time = time.clock() + + + # have a maximum of `n_iter` iterations through the entire dataset + iter=0 + for epoch in xrange(n_iter): + for x, y in train_batches: + if x.shape[0] != batch_size: + continue + iter+=1 + + # get epoch and minibatch index + #epoch = iter / n_minibatches + minibatch_index = iter % n_minibatches + + if iter %100 == 0: + print 'training @ iter = ', iter + cost_ij = train_model(x,y) + + + # compute zero-one loss on validation set + this_validation_loss = 0. 
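+ # note: test_model returns the *mean* zero-one loss over one minibatch, so
+ # the loop below accumulates a sum of per-batch means; dividing by n_valid
+ # (a count of examples, i.e. n_batches*batch_size) rather than by the number
+ # of minibatches scales the reported validation figure down by batch_size.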
+ for x,y in valid_batches: + if x.shape[0] != batch_size: + continue + # sum up the errors for each minibatch + this_validation_loss += test_model(x,y) + + # get the average by dividing with the number of minibatches + this_validation_loss /= n_valid + print('epoch %i, minibatch %i/%i, validation error %f %%' % \ + (epoch, minibatch_index+1, n_minibatches, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, iter * patience_increase) + + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = iter + + # test it on the test set + test_score = 0. + for x,y in test_batches: + if x.shape[0] != batch_size: + continue + test_score += test_model(x,y) + test_score /= n_test + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, n_minibatches, + test_score*100.)) + + if patience <= iter : + break + + end_time = time.clock() + print('Optimization complete.') + print('Best validation score of %f %% obtained at iteration %i,'\ + 'with test performance %f %%' % + (best_validation_loss * 100., best_iter, test_score*100.)) + print('The code ran for %f minutes' % ((end_time-start_time)/60.)) + + return (best_validation_loss * 100., test_score*100., (end_time-start_time)/60., best_iter) + +if __name__ == '__main__': + evaluate_lenet5() + +def experiment(state, channel): + print 'start experiment' + (best_validation_loss, test_score, minutes_trained, iter) = evaluate_lenet5(state.learning_rate, state.n_iter, state.batch_size, state.n_kern0, state.n_kern1, state.n_layer, state.filter_shape0, state.filter_shape1,state.sigmoide_size) + print 'end experiment' + + pylearn.version.record_versions(state,[theano,ift6266,pylearn]) + + state.best_validation_loss = best_validation_loss + state.test_score = test_score + state.minutes_trained = minutes_trained + state.iter = iter + + return channel.COMPLETE diff -r 6f606b359df3 -r a9af079892ce baseline/deep_mlp/__init__.py diff -r 6f606b359df3 -r a9af079892ce baseline/deep_mlp/deepmlp.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/baseline/deep_mlp/deepmlp.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,310 @@ +# + +import numpy, cPickle, gzip + + +import theano +import theano.tensor as T + +import time + +import theano.tensor.nnet + +class MLP(object): + """Multi-Layer Perceptron Class + + A multilayer perceptron is a feedforward artificial neural network model + that has one layer or more of hidden units and nonlinear activations. + Intermidiate layers usually have as activation function thanh or the + sigmoid function while the top layer is a softamx layer. 
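+
+ Concretely, with n_hidden = [h_1, ..., h_L] the network built below computes
+ f(x) = softmax(W_out * tanh(W_L * ... tanh(W_1 * x + b_1) ... + b_L) + b_out),
+ i.e. a stack of L tanh hidden layers followed by a softmax output layer.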
+ """ + + + + def __init__(self, input, n_in, n_hidden, n_out): + """Initialize the parameters for the multilayer perceptron + + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + + :param n_hidden: List representing the number of units for each + hidden layer + + #:param n_layer: Number of hidden layers + + :param n_out: number of output units, the dimension of the space in + which the labels lie + + """ + + # initialize the parameters theta = (W,b) ; Here W and b are lists + # where W[i] and b[i] represent the parameters and the bias vector + # of the i-th layer. + n_layer=len(n_hidden) + W_values=[] + b_values=[] + self.W=[] + self.b=[] + + # We first initialize the matrix W[0] and b[0] that represent the parameters + # from the input to the first hidden layer + W_values.append(numpy.asarray( numpy.random.uniform( \ + low = -numpy.sqrt(6./(n_in+n_hidden[0])), \ + high = numpy.sqrt(6./(n_in+n_hidden[0])), \ + size = (n_in, n_hidden[0])), dtype = theano.config.floatX)) + self.W.append(theano.shared( value = W_values[0] )) + self.b.append(theano.shared( value = numpy.zeros((n_hidden[0],), + dtype= theano.config.floatX))) + + # We initialize the parameters between all consecutive hidden layers + for i in range(1,n_layer): + # Each `W[i]` is initialized with `W_values[i]` which is uniformely sampled + # from -6./sqrt(n_hidden[i]+n_hidden[i+1]) and 6./sqrt(n_hidden[i]+n_hidden[i+1]) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + W_values.append(numpy.asarray( numpy.random.uniform( \ + low = -numpy.sqrt(6./(n_hidden[i-1]+n_hidden[i])), \ + high = numpy.sqrt(6./(n_hidden[i-1]+n_hidden[i])), \ + size = (n_hidden[i-1], n_hidden[i])), dtype = theano.config.floatX)) + self.W.append(theano.shared( value = W_values[i] )) + self.b.append(theano.shared( value = numpy.zeros((n_hidden[i],), + dtype= theano.config.floatX))) + + # We initialize the matrix W[n_layer] and b[n_layer] that represent + # the parameters from the last hidden layer to the output layer using the + # same uniform sampling. 
+ W_values.append(numpy.asarray( numpy.random.uniform( + low = -numpy.sqrt(6./(n_hidden[n_layer-1]+n_out)), \ + high= numpy.sqrt(6./(n_hidden[n_layer-1]+n_out)),\ + size= (n_hidden[n_layer-1], n_out)), dtype = theano.config.floatX)) + self.W.append(theano.shared( value = W_values[n_layer])) + self.b.append(theano.shared( value = numpy.zeros((n_out,), + dtype= theano.config.floatX))) + + # List of the symbolic expressions computing the values each hidden layer + self.hidden = [] + + # Symbolic expression of the first hidden layer + self.hidden.append(T.tanh(T.dot(input, self.W[0])+ self.b[0])) + for i in range(1,n_layer): + # Symbolic expression of the i-th hidden layer + self.hidden.append(T.tanh(T.dot(self.hidden[i-1], self.W[i])+ self.b[i])) + + # symbolic expression computing the values of the top layer + self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden[n_layer-1], self.W[n_layer])+self.b[n_layer]) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred = T.argmax( self.p_y_given_x, axis =1) + + # L1 norm ; one regularization option is to enforce L1 norm to + # be small + self.L1=abs(self.W[0]).sum() + self.L2_sqr=abs(self.W[0]).sum() + for i in range(1,n_layer+1): + self.L1 += abs(self.W[i]).sum() + # square of L2 norm ; one regularization option is to enforce + # square of L2 norm to be small + for i in range(n_layer+1): + self.L2_sqr += abs(self.W[i]**2).sum() + + def negative_log_likelihood(self, y): + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + def errors(self, y): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch + """ + + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() +def sgd_optimization_mnist( learning_rate=0.01, L1_reg = 0.00, \ + L2_reg = 0.0001, n_iter=100,n_hidden=[200,100,90,80,70]): + """ + Demonstrate stochastic gradient descent optimization for a multilayer + perceptron + + This is demonstrated on MNIST. 
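+
+ For example, assuming mnist.pkl.gz sits in the working directory (as the
+ loading code below expects):
+
+ >>> sgd_optimization_mnist(learning_rate=0.01, n_iter=100, n_hidden=[500, 500])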
+ + :param learning_rate: learning rate used (factor for the stochastic + gradient + + :param L1_reg: L1-norm's weight when added to the cost (see + regularization) + + :param L2_reg: L2-norm's weight when added to the cost (see + regularization) + + :param n_iter: maximal number of iterations ot run the optimizer + + """ + + # Load the dataset + f = gzip.open('mnist.pkl.gz','rb') + train_set, valid_set, test_set = cPickle.load(f) + f.close() + + # make minibatches of size 20 + batch_size = 20 # sized of the minibatch + + # Dealing with the training set + # get the list of training images (x) and their labels (y) + (train_set_x, train_set_y) = train_set + + # initialize the list of training minibatches with empty list + train_batches = [] + for i in xrange(0, len(train_set_x), batch_size): + # add to the list of minibatches the minibatch starting at + # position i, ending at position i+batch_size + # a minibatch is a pair ; the first element of the pair is a list + # of datapoints, the second element is the list of corresponding + # labels + train_batches = train_batches + \ + [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])] + + # Dealing with the validation set + (valid_set_x, valid_set_y) = valid_set + # initialize the list of validation minibatches + valid_batches = [] + for i in xrange(0, len(valid_set_x), batch_size): + valid_batches = valid_batches + \ + [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])] + + # Dealing with the testing set + (test_set_x, test_set_y) = test_set + # initialize the list of testing minibatches + test_batches = [] + for i in xrange(0, len(test_set_x), batch_size): + test_batches = test_batches + \ + [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])] + + + ishape = (28,28) # this is the size of MNIST images + + # allocate symbolic variables for the data + x = T.fmatrix() # the data is presented as rasterized images + y = T.lvector() # the labels are presented as 1D vector of + # [long int] labels + + # construct the logistic regression class + classifier = MLP( input=x.reshape((batch_size,28*28)),\ + n_in=28*28, n_hidden=n_hidden, n_out=10) + + # the cost we minimize during training is the negative log likelihood of + # the model plus the regularization terms (L1 and L2); cost is expressed + # here symbolically + cost = classifier.negative_log_likelihood(y) \ + + L1_reg * classifier.L1 \ + + L2_reg * classifier.L2_sqr + + # compiling a theano function that computes the mistakes that are made by + # the model on a minibatch + test_model = theano.function([x,y], classifier.errors(y)) + g_W=[] + g_b=[] + # compute the gradient of cost with respect to theta = (W1, b1, W2, b2) + for i in range(len(n_hidden)+1): + g_W.append(T.grad(cost, classifier.W[i])) + g_b.append(T.grad(cost, classifier.b[i])) + + + # specify how to update the parameters of the model as a dictionary + updates={} + for i in range(len(n_hidden)+1): + updates[classifier.W[i]]= classifier.W[i] - learning_rate*g_W[i] + updates[classifier.b[i]]= classifier.b[i] - learning_rate*g_b[i] + # compiling a theano function `train_model` that returns the cost, but in + # the same time updates the parameter of the model based on the rules + # defined in `updates` + train_model = theano.function([x, y], cost, updates = updates ) + n_minibatches = len(train_batches) + + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2 # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative 
improvement of this much is + # considered significant + validation_frequency = n_minibatches # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + + best_params = None + best_validation_loss = float('inf') + best_iter = 0 + test_score = 0. + start_time = time.clock() + # have a maximum of `n_iter` iterations through the entire dataset + for iter in xrange(n_iter* n_minibatches): + + # get epoch and minibatch index + epoch = iter / n_minibatches + minibatch_index = iter % n_minibatches + + # get the minibatches corresponding to `iter` modulo + # `len(train_batches)` + x,y = train_batches[ minibatch_index ] + cost_ij = train_model(x,y) + + if (iter+1) % validation_frequency == 0: + # compute zero-one loss on validation set + this_validation_loss = 0. + for x,y in valid_batches: + # sum up the errors for each minibatch + this_validation_loss += test_model(x,y) + # get the average by dividing with the number of minibatches + this_validation_loss /= len(valid_batches) + + print('epoch %i, minibatch %i/%i, validation error %f %%' % \ + (epoch, minibatch_index+1, n_minibatches, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, iter * patience_increase) + + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = iter + + # test it on the test set + test_score = 0. + for x,y in test_batches: + test_score += test_model(x,y) + test_score /= len(test_batches) + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, n_minibatches, + test_score*100.)) + + if patience <= iter : + break + + end_time = time.clock() + print(('Optimization complete. Best validation score of %f %% ' + 'obtained at iteration %i, with test performance %f %%') % + (best_validation_loss * 100., best_iter, test_score*100.)) + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + #test on NIST (you need pylearn and access to NIST to do that) +if __name__ == '__main__': + sgd_optimization_mnist() + diff -r 6f606b359df3 -r a9af079892ce baseline/log_reg/__init__.py diff -r 6f606b359df3 -r a9af079892ce baseline/log_reg/log_reg.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/baseline/log_reg/log_reg.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,329 @@ +""" +This tutorial introduces logistic regression using Theano and stochastic +gradient descent. + +Logistic regression is a probabilistic, linear classifier. It is parametrized +by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is +done by projecting data points onto a set of hyperplanes, the distance to +which is used to determine a class membership probability. + +Mathematically, this can be written as: + +.. math:: + P(Y=i|x, W,b) &= softmax_i(W x + b) \\ + &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}} + + +The output of the model or prediction is then done by taking the argmax of +the vector whose i'th element is P(Y=i|x). + +.. math:: + + y_{pred} = argmax_i P(Y=i|x,W,b) + + +This tutorial presents a stochastic gradient descent optimization method +suitable for large datasets, and a conjugate gradient optimization method +that is suitable for smaller datasets. 
+ + +References: + + - textbooks: "Pattern Recognition and Machine Learning" - + Christopher M. Bishop, section 4.3.2 + +""" +__docformat__ = 'restructedtext en' + +import numpy, time + +import theano +import theano.tensor as T +from ift6266 import datasets + +class LogisticRegression(object): + """Multi-class Logistic Regression Class + + The logistic regression is fully described by a weight matrix :math:`W` + and bias vector :math:`b`. Classification is done by projecting data + points onto a set of hyperplanes, the distance to which is used to + determine a class membership probability. + """ + + + def __init__( self, input, n_in, n_out ): + """ Initialize the parameters of the logistic regression + + :type input: theano.tensor.TensorType + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + + :type n_in: int + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + + :type n_out: int + :param n_out: number of output units, the dimension of the space in + which the labels lie + + """ + + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) + self.W = theano.shared( value = numpy.zeros(( n_in, n_out ), dtype = theano.config.floatX ), + name =' W') + # initialize the baises b as a vector of n_out 0s + self.b = theano.shared( value = numpy.zeros(( n_out, ), dtype = theano.config.floatX ), + name = 'b') + + + # compute vector of class-membership probabilities in symbolic form + self.p_y_given_x = T.nnet.softmax( T.dot( input, self.W ) + self.b ) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred=T.argmax( self.p_y_given_x, axis =1 ) + + # parameters of the model + self.params = [ self.W, self.b ] + + + def negative_log_likelihood( self, y ): + """Return the mean of the negative log-likelihood of the prediction + of this model under a given target distribution. + + .. math:: + + \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = + \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ + \ell (\theta=\{W,b\}, \mathcal{D}) + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label + + Note: we use the mean instead of the sum so that + the learning rate is less dependent on the batch size + """ + # y.shape[0] is (symbolically) the number of rows in y, i.e., number of examples (call it n) in the minibatch + # T.arange(y.shape[0]) is a symbolic vector which will contain [0,1,2,... n-1] + # T.log(self.p_y_given_x) is a matrix of Log-Probabilities (call it LP) with one row per example and one column per class + # LP[T.arange(y.shape[0]),y] is a vector v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., LP[n-1,y[n-1]]] + # and T.mean(LP[T.arange(y.shape[0]),y]) is the mean (across minibatch examples) of the elements in v, + # i.e., the mean log-likelihood across the minibatch. 
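+ # For instance, with a minibatch of 3 examples and y = [2, 0, 5], v is
+ # [LP[0,2], LP[1,0], LP[2,5]] and the returned cost is -(v[0]+v[1]+v[2])/3.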
+ return -T.mean( T.log( self.p_y_given_x )[ T.arange( y.shape[0] ), y ] ) + + def MSE(self, y): + return -T.mean(abs((self.p_t_given_x)[T.arange(y.shape[0]), y]-y)**2) + + def errors( self, y ): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch ; zero one + loss over the size of the minibatch + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label + """ + + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError( 'y should have the same shape as self.y_pred', + ( 'y', target.type, 'y_pred', self.y_pred.type ) ) + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean( T.neq( self.y_pred, y ) ) + else: + raise NotImplementedError() + +#-------------------------------------------------------------------------------------------------------------------- +# MAIN +#-------------------------------------------------------------------------------------------------------------------- + +def log_reg( learning_rate = 0.13, nb_max_examples =1000000, batch_size = 50, \ + dataset=datasets.nist_digits(), image_size = 32 * 32, nb_class = 10, \ + patience = 5000, patience_increase = 2, improvement_threshold = 0.995): + + #28 * 28 = 784 + """ + Demonstrate stochastic gradient descent optimization of a log-linear + model + + This is demonstrated on MNIST. + + :type learning_rate: float + :param learning_rate: learning rate used (factor for the stochastic + gradient) + + :type nb_max_examples: int + :param nb_max_examples: maximal number of epochs to run the optimizer + + :type batch_size: int + :param batch_size: size of the minibatch + + :type dataset: dataset + :param dataset: a dataset instance from ift6266.datasets + + :type image_size: int + :param image_size: size of the input image in pixels (width * height) + + :type nb_class: int + :param nb_class: number of classes + + :type patience: int + :param patience: look as this many examples regardless + + :type patience_increase: int + :param patience_increase: wait this much longer when a new best is found + + :type improvement_threshold: float + :param improvement_threshold: a relative improvement of this much is considered significant + + + """ + #-------------------------------------------------------------------------------------------------------------------- + # Build actual model + #-------------------------------------------------------------------------------------------------------------------- + + print '... 
building the model' + + # allocate symbolic variables for the data + index = T.lscalar( ) # index to a [mini]batch + x = T.matrix('x') # the data is presented as rasterized images + y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + + # construct the logistic regression class + + classifier = LogisticRegression( input = x, n_in = image_size, n_out = nb_class ) + + # the cost we minimize during training is the negative log likelihood of + # the model in symbolic format + cost = classifier.negative_log_likelihood( y ) + + # compiling a Theano function that computes the mistakes that are made by + # the model on a minibatch + test_model = theano.function( inputs = [ x, y ], + outputs = classifier.errors( y )) + + validate_model = theano.function( inputs = [ x, y ], + outputs = classifier.errors( y )) + + # compute the gradient of cost with respect to theta = ( W, b ) + g_W = T.grad( cost = cost, wrt = classifier.W ) + g_b = T.grad( cost = cost, wrt = classifier.b ) + + # specify how to update the parameters of the model as a dictionary + updates = { classifier.W: classifier.W - learning_rate * g_W,\ + classifier.b: classifier.b - learning_rate * g_b} + + # compiling a Theano function `train_model` that returns the cost, but in + # the same time updates the parameter of the model based on the rules + # defined in `updates` + train_model = theano.function( inputs = [ x, y ], + outputs = cost, + updates = updates) + + #-------------------------------------------------------------------------------------------------------------------- + # Train model + #-------------------------------------------------------------------------------------------------------------------- + + print '... training the model' + # early-stopping parameters + patience = 5000 # look as this many examples regardless + patience_increase = 2 # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = patience * 0.5 + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + best_params = None + best_validation_loss = float('inf') + test_score = 0. + start_time = time.clock() + + done_looping = False + n_iters = nb_max_examples / batch_size + epoch = 0 + iter = 0 + + while ( iter < n_iters ) and ( not done_looping ): + + epoch = epoch + 1 + for x, y in dataset.train(batch_size): + + minibatch_avg_cost = train_model( x, y ) + # iteration number + iter += 1 + + if iter % validation_frequency == 0: + # compute zero-one loss on validation set + validation_losses = [ validate_model( xv, yv ) for xv, yv in dataset.valid(batch_size) ] + this_validation_loss = numpy.mean( validation_losses ) + + print('epoch %i, iter %i, validation error %f %%' % \ + ( epoch, iter, this_validation_loss*100. 
) ) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max( patience, iter * patience_increase ) + + best_validation_loss = this_validation_loss + # test it on the test set + + test_losses = [test_model(xt, yt) for xt, yt in dataset.test(batch_size)] + test_score = numpy.mean(test_losses) + + print((' epoch %i, iter %i, test error of best ' + 'model %f %%') % \ + (epoch, iter, test_score*100.)) + + if patience <= iter : + done_looping = True + break + + end_time = time.clock() + print(('Optimization complete with best validation score of %f %%,' + 'with test performance %f %%') % + ( best_validation_loss * 100., test_score * 100.)) + print ('The code ran for %f minutes' % ((end_time-start_time) / 60.)) + + return best_validation_loss, test_score, iter*batch_size, (end_time-start_time) / 60. + +if __name__ == '__main__': + log_reg() + + +def jobman_log_reg(state, channel): + print state + (validation_error, test_error, nb_exemples, time) = log_reg( learning_rate = state.learning_rate, \ + nb_max_examples = state.nb_max_examples, \ + batch_size = state.batch_size,\ + image_size = state.image_size, \ + nb_class = state.nb_class, \ + patience = state.patience, \ + patience_increase = state.patience_increase, \ + improvement_threshold = state.improvement_threshold ) + + + print state + state.validation_error = validation_error + state.test_error = test_error + state.nb_exemples = nb_exemples + state.time = time + return channel.COMPLETE + + + + + + diff -r 6f606b359df3 -r a9af079892ce baseline/mlp/__init__.py diff -r 6f606b359df3 -r a9af079892ce baseline/mlp/mlp_get_error_from_model.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/baseline/mlp/mlp_get_error_from_model.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,151 @@ +__docformat__ = 'restructedtext en' + +import pdb +import numpy as np +import pylab +import time +import pylearn +from pylearn.io import filetensor as ft + +data_path = '/data/lisa/data/nist/by_class/' +test_data = 'all/all_train_data.ft' +test_labels = 'all/all_train_labels.ft' + +def read_test_data(mlp_model): + + + #read the data + h = open(data_path+test_data) + i= open(data_path+test_labels) + raw_test_data = ft.read(h) + raw_test_labels = ft.read(i) + i.close() + h.close() + + #read the model chosen + a=np.load(mlp_model) + W1=a['W1'] + W2=a['W2'] + b1=a['b1'] + b2=a['b2'] + + return (W1,b1,W2,b2,raw_test_data,raw_test_labels) + + + + +def get_total_test_error(everything): + + W1=everything[0] + b1=everything[1] + W2=everything[2] + b2=everything[3] + test_data=everything[4] + test_labels=everything[5] + total_error_count=0 + total_exemple_count=0 + + nb_error_count=0 + nb_exemple_count=0 + + char_error_count=0 + char_exemple_count=0 + + min_error_count=0 + min_exemple_count=0 + + maj_error_count=0 + maj_exemple_count=0 + + for i in range(test_labels.size): + total_exemple_count = total_exemple_count +1 + #get activation for layer 1 + a0=np.dot(np.transpose(W1),np.transpose(test_data[i]/255.0)) + b1 + #add non linear function to layer 1 activation + a0_out=np.tanh(a0) + + #get activation for output layer + a1= np.dot(np.transpose(W2),a0_out) + b2 + #add non linear function for output activation (softmax) + a1_exp = np.exp(a1) + sum_a1=np.sum(a1_exp) + a1_out=a1_exp/sum_a1 + + predicted_class=np.argmax(a1_out) + wanted_class=test_labels[i] + + 
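+ # a1_out now holds the class probabilities recomputed in plain numpy from the
+ # saved weights (W1,b1,W2,b2): a tanh hidden layer followed by a softmax
+ # output. The argmax prediction is compared to the label overall and per
+ # label range (the nb/min/maj/char counters below).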
if(predicted_class!=wanted_class): + total_error_count = total_error_count +1 + + #get grouped based error + #with a priori +# if(wanted_class>9 and wanted_class<35): +# min_exemple_count=min_exemple_count+1 +# predicted_class=np.argmax(a1_out[10:35])+10 +# if(predicted_class!=wanted_class): +# min_error_count=min_error_count+1 +# if(wanted_class<10): +# nb_exemple_count=nb_exemple_count+1 +# predicted_class=np.argmax(a1_out[0:10]) +# if(predicted_class!=wanted_class): +# nb_error_count=nb_error_count+1 +# if(wanted_class>34): +# maj_exemple_count=maj_exemple_count+1 +# predicted_class=np.argmax(a1_out[35:])+35 +# if(predicted_class!=wanted_class): +# maj_error_count=maj_error_count+1 +# +# if(wanted_class>9): +# char_exemple_count=char_exemple_count+1 +# predicted_class=np.argmax(a1_out[10:])+10 +# if(predicted_class!=wanted_class): +# char_error_count=char_error_count+1 + + + + #get grouped based error + #with no a priori + if(wanted_class>9 and wanted_class<35): + min_exemple_count=min_exemple_count+1 + predicted_class=np.argmax(a1_out) + if(predicted_class!=wanted_class): + min_error_count=min_error_count+1 + if(wanted_class<10): + nb_exemple_count=nb_exemple_count+1 + predicted_class=np.argmax(a1_out) + if(predicted_class!=wanted_class): + nb_error_count=nb_error_count+1 + if(wanted_class>34): + maj_exemple_count=maj_exemple_count+1 + predicted_class=np.argmax(a1_out) + if(predicted_class!=wanted_class): + maj_error_count=maj_error_count+1 + + if(wanted_class>9): + char_exemple_count=char_exemple_count+1 + predicted_class=np.argmax(a1_out) + if(predicted_class!=wanted_class): + char_error_count=char_error_count+1 + + + #convert to float + return ( total_exemple_count,nb_exemple_count,char_exemple_count,min_exemple_count,maj_exemple_count,\ + total_error_count,nb_error_count,char_error_count,min_error_count,maj_error_count,\ + total_error_count*100.0/total_exemple_count*1.0,\ + nb_error_count*100.0/nb_exemple_count*1.0,\ + char_error_count*100.0/char_exemple_count*1.0,\ + min_error_count*100.0/min_exemple_count*1.0,\ + maj_error_count*100.0/maj_exemple_count*1.0) + + + + + + + + + + + + + \ No newline at end of file diff -r 6f606b359df3 -r a9af079892ce baseline/mlp/mlp_nist.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/baseline/mlp/mlp_nist.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,474 @@ +""" +This tutorial introduces the multilayer perceptron using Theano. + + A multilayer perceptron is a logistic regressor where +instead of feeding the input to the logistic regression you insert a +intermidiate layer, called the hidden layer, that has a nonlinear +activation function (usually tanh or sigmoid) . One can use many such +hidden layers making the architecture deep. The tutorial will also tackle +the problem of MNIST digit classification. + +.. math:: + + f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))), + +References: + + - textbooks: "Pattern Recognition and Machine Learning" - + Christopher M. 
Bishop, section 5 + +TODO: recommended preprocessing, lr ranges, regularization ranges (explain + to do lr first, then add regularization) + +""" +__docformat__ = 'restructedtext en' + +import pdb +import numpy +import pylab +import theano +import theano.tensor as T +import time +import theano.tensor.nnet +import pylearn +import theano,pylearn.version +from pylearn.io import filetensor as ft + +data_path = '/data/lisa/data/nist/by_class/' + +class MLP(object): + """Multi-Layer Perceptron Class + + A multilayer perceptron is a feedforward artificial neural network model + that has one layer or more of hidden units and nonlinear activations. + Intermidiate layers usually have as activation function thanh or the + sigmoid function while the top layer is a softamx layer. + """ + + + + def __init__(self, input, n_in, n_hidden, n_out,learning_rate): + """Initialize the parameters for the multilayer perceptron + + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + + :param n_hidden: number of hidden units + + :param n_out: number of output units, the dimension of the space in + which the labels lie + + """ + + # initialize the parameters theta = (W1,b1,W2,b2) ; note that this + # example contains only one hidden layer, but one can have as many + # layers as he/she wishes, making the network deeper. The only + # problem making the network deep this way is during learning, + # backpropagation being unable to move the network from the starting + # point towards; this is where pre-training helps, giving a good + # starting point for backpropagation, but more about this in the + # other tutorials + + # `W1` is initialized with `W1_values` which is uniformely sampled + # from -6./sqrt(n_in+n_hidden) and 6./sqrt(n_in+n_hidden) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + W1_values = numpy.asarray( numpy.random.uniform( \ + low = -numpy.sqrt(6./(n_in+n_hidden)), \ + high = numpy.sqrt(6./(n_in+n_hidden)), \ + size = (n_in, n_hidden)), dtype = theano.config.floatX) + # `W2` is initialized with `W2_values` which is uniformely sampled + # from -6./sqrt(n_hidden+n_out) and 6./sqrt(n_hidden+n_out) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + W2_values = numpy.asarray( numpy.random.uniform( + low = -numpy.sqrt(6./(n_hidden+n_out)), \ + high= numpy.sqrt(6./(n_hidden+n_out)),\ + size= (n_hidden, n_out)), dtype = theano.config.floatX) + + self.W1 = theano.shared( value = W1_values ) + self.b1 = theano.shared( value = numpy.zeros((n_hidden,), + dtype= theano.config.floatX)) + self.W2 = theano.shared( value = W2_values ) + self.b2 = theano.shared( value = numpy.zeros((n_out,), + dtype= theano.config.floatX)) + + #include the learning rate in the classifer so + #we can modify it on the fly when we want + lr_value=learning_rate + self.lr=theano.shared(value=lr_value) + # symbolic expression computing the values of the hidden layer + self.hidden = T.tanh(T.dot(input, self.W1)+ self.b1) + + + + # symbolic expression computing the values of the top layer + self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden, self.W2)+self.b2) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred = T.argmax( self.p_y_given_x, axis =1) + self.y_pred_num = T.argmax( self.p_y_given_x[0:9], axis =1) + + + 
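+ # note: p_y_given_x[0:9] slices the first 9 *rows* (examples) of the
+ # minibatch, not the first 10 digit classes; a digits-only prediction would
+ # presumably be T.argmax(self.p_y_given_x[:,0:10], axis=1)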
+ + # L1 norm ; one regularization option is to enforce L1 norm to + # be small + self.L1 = abs(self.W1).sum() + abs(self.W2).sum() + + # square of L2 norm ; one regularization option is to enforce + # square of L2 norm to be small + self.L2_sqr = (self.W1**2).sum() + (self.W2**2).sum() + + + + def negative_log_likelihood(self, y): + """Return the mean of the negative log-likelihood of the prediction + of this model under a given target distribution. + + .. math:: + + \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = + \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ + \ell (\theta=\{W,b\}, \mathcal{D}) + + + :param y: corresponds to a vector that gives for each example the + :correct label + """ + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + + + + def errors(self, y): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch + """ + + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + + +def mlp_full_nist( verbose = False,\ + adaptive_lr = 0,\ + train_data = 'all/all_train_data.ft',\ + train_labels = 'all/all_train_labels.ft',\ + test_data = 'all/all_test_data.ft',\ + test_labels = 'all/all_test_labels.ft',\ + learning_rate=0.01,\ + L1_reg = 0.00,\ + L2_reg = 0.0001,\ + nb_max_exemples=1000000,\ + batch_size=20,\ + nb_hidden = 500,\ + nb_targets = 62, + tau=1e6): + + + configuration = [learning_rate,nb_max_exemples,nb_hidden,adaptive_lr] + + #save initial learning rate if classical adaptive lr is used + initial_lr=learning_rate + + total_validation_error_list = [] + total_train_error_list = [] + learning_rate_list=[] + best_training_error=float('inf'); + + + + + f = open(data_path+train_data) + g= open(data_path+train_labels) + h = open(data_path+test_data) + i= open(data_path+test_labels) + + raw_train_data = ft.read(f) + raw_train_labels = ft.read(g) + raw_test_data = ft.read(h) + raw_test_labels = ft.read(i) + + f.close() + g.close() + i.close() + h.close() + #create a validation set the same size as the test size + #use the end of the training array for this purpose + #discard the last remaining so we get a %batch_size number + test_size=len(raw_test_labels) + test_size = int(test_size/batch_size) + test_size*=batch_size + train_size = len(raw_train_data) + train_size = int(train_size/batch_size) + train_size*=batch_size + validation_size =test_size + offset = train_size-test_size + if verbose == True: + print 'train size = %d' %train_size + print 'test size = %d' %test_size + print 'valid size = %d' %validation_size + print 'offset = %d' %offset + + + train_set = (raw_train_data,raw_train_labels) + train_batches = [] + for i in xrange(0, train_size-test_size, batch_size): + train_batches = train_batches + \ + [(raw_train_data[i:i+batch_size], raw_train_labels[i:i+batch_size])] + + test_batches = [] + for i in xrange(0, test_size, batch_size): + test_batches = test_batches + \ + [(raw_test_data[i:i+batch_size], raw_test_labels[i:i+batch_size])] + + validation_batches = [] + for i in xrange(0, test_size, batch_size): + validation_batches = validation_batches + 
\ + [(raw_train_data[offset+i:offset+i+batch_size], raw_train_labels[offset+i:offset+i+batch_size])] + + + ishape = (32,32) # this is the size of NIST images + + # allocate symbolic variables for the data + x = T.fmatrix() # the data is presented as rasterized images + y = T.lvector() # the labels are presented as 1D vector of + # [long int] labels + + if verbose==True: + print 'finished parsing the data' + # construct the logistic regression class + classifier = MLP( input=x.reshape((batch_size,32*32)),\ + n_in=32*32,\ + n_hidden=nb_hidden,\ + n_out=nb_targets, + learning_rate=learning_rate) + + + + + # the cost we minimize during training is the negative log likelihood of + # the model plus the regularization terms (L1 and L2); cost is expressed + # here symbolically + cost = classifier.negative_log_likelihood(y) \ + + L1_reg * classifier.L1 \ + + L2_reg * classifier.L2_sqr + + # compiling a theano function that computes the mistakes that are made by + # the model on a minibatch + test_model = theano.function([x,y], classifier.errors(y)) + + # compute the gradient of cost with respect to theta = (W1, b1, W2, b2) + g_W1 = T.grad(cost, classifier.W1) + g_b1 = T.grad(cost, classifier.b1) + g_W2 = T.grad(cost, classifier.W2) + g_b2 = T.grad(cost, classifier.b2) + + # specify how to update the parameters of the model as a dictionary + updates = \ + { classifier.W1: classifier.W1 - classifier.lr*g_W1 \ + , classifier.b1: classifier.b1 - classifier.lr*g_b1 \ + , classifier.W2: classifier.W2 - classifier.lr*g_W2 \ + , classifier.b2: classifier.b2 - classifier.lr*g_b2 } + + # compiling a theano function `train_model` that returns the cost, but in + # the same time updates the parameter of the model based on the rules + # defined in `updates` + train_model = theano.function([x, y], cost, updates = updates ) + n_minibatches = len(train_batches) + + + + + + + #conditions for stopping the adaptation: + #1) we have reached nb_max_exemples (this is rounded up to be a multiple of the train size) + #2) validation error is going up twice in a row(probable overfitting) + + # This means we no longer stop on slow convergence as low learning rates stopped + # too fast. + + # no longer relevant + patience =nb_max_exemples/batch_size + patience_increase = 2 # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = n_minibatches/4 + + + + + best_params = None + best_validation_loss = float('inf') + best_iter = 0 + test_score = 0. + start_time = time.clock() + n_iter = nb_max_exemples/batch_size # nb of max times we are allowed to run through all exemples + n_iter = n_iter/n_minibatches + 1 #round up + n_iter=max(1,n_iter) # run at least once on short debug call + time_n=0 #in unit of exemples + + + + if verbose == True: + print 'looping at most %d times through the data set' %n_iter + for iter in xrange(n_iter* n_minibatches): + + # get epoch and minibatch index + epoch = iter / n_minibatches + minibatch_index = iter % n_minibatches + + + if adaptive_lr==2: + classifier.lr.value = tau*initial_lr/(tau+time_n) + + + # get the minibatches corresponding to `iter` modulo + # `len(train_batches)` + x,y = train_batches[ minibatch_index ] + # convert to float + x_float = x/255.0 + cost_ij = train_model(x_float,y) + + if (iter+1) % validation_frequency == 0: + # compute zero-one loss on validation set + + this_validation_loss = 0. 
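+ # as in the training loop above, each raw minibatch is rescaled by /255.0 so
+ # pixel values lie in [0,1] before being passed to test_model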
+ for x,y in validation_batches: + # sum up the errors for each minibatch + x_float = x/255.0 + this_validation_loss += test_model(x_float,y) + # get the average by dividing with the number of minibatches + this_validation_loss /= len(validation_batches) + #save the validation loss + total_validation_error_list.append(this_validation_loss) + + #get the training error rate + this_train_loss=0 + for x,y in train_batches: + # sum up the errors for each minibatch + x_float = x/255.0 + this_train_loss += test_model(x_float,y) + # get the average by dividing with the number of minibatches + this_train_loss /= len(train_batches) + #save the validation loss + total_train_error_list.append(this_train_loss) + if(this_train_loss= best_validation_loss: + #calculate the test error at this point and exit + # test it on the test set + # however, if adaptive_lr is true, try reducing the lr to + # get us out of an oscilliation + if adaptive_lr==1: + classifier.lr.value=classifier.lr.value/2.0 + + test_score = 0. + #cap the patience so we are allowed one more validation error + #calculation before aborting + patience = iter+validation_frequency+1 + for x,y in test_batches: + x_float=x/255.0 + test_score += test_model(x_float,y) + test_score /= len(test_batches) + if verbose == True: + print ' validation error is going up, possibly stopping soon' + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, n_minibatches, + test_score*100.)) + + + + + if iter>patience: + print 'we have diverged' + break + + + time_n= time_n + batch_size + end_time = time.clock() + if verbose == True: + print(('Optimization complete. Best validation score of %f %% ' + 'obtained at iteration %i, with test performance %f %%') % + (best_validation_loss * 100., best_iter, test_score*100.)) + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + print iter + + #save the model and the weights + numpy.savez('model.npy', config=configuration, W1=classifier.W1.value,W2=classifier.W2.value, b1=classifier.b1.value,b2=classifier.b2.value) + numpy.savez('results.npy',config=configuration,total_train_error_list=total_train_error_list,total_validation_error_list=total_validation_error_list,\ + learning_rate_list=learning_rate_list) + + return (best_training_error*100.0,best_validation_loss * 100.,test_score*100.,best_iter*batch_size,(end_time-start_time)/60) + + +if __name__ == '__main__': + mlp_full_mnist() + +def jobman_mlp_full_nist(state,channel): + (train_error,validation_error,test_error,nb_exemples,time)=mlp_full_nist(learning_rate=state.learning_rate,\ + nb_max_exemples=state.nb_max_exemples,\ + nb_hidden=state.nb_hidden,\ + adaptive_lr=state.adaptive_lr,\ + tau=state.tau) + state.train_error=train_error + state.validation_error=validation_error + state.test_error=test_error + state.nb_exemples=nb_exemples + state.time=time + return channel.COMPLETE + + \ No newline at end of file diff -r 6f606b359df3 -r a9af079892ce code_tutoriel/DBN.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/DBN.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,384 @@ +""" +""" +import os + +import numpy, time, cPickle, gzip + +import theano +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +from logistic_sgd import LogisticRegression, load_data +from mlp import HiddenLayer +from rbm import RBM + + + +class DBN(object): + """ + """ + + def __init__(self, numpy_rng, theano_rng = None, n_ins = 784, + hidden_layers_sizes = [500,500], n_outs = 10): 
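+ # illustrative construction (sizes are hypothetical, not taken from this file):
+ # DBN(numpy_rng=numpy.random.RandomState(123), n_ins=28*28,
+ #     hidden_layers_sizes=[1000, 1000, 1000], n_outs=10)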
+ """This class is made to support a variable number of layers. + + :type numpy_rng: numpy.random.RandomState + :param numpy_rng: numpy random number generator used to draw initial + weights + + :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams + :param theano_rng: Theano random generator; if None is given one is + generated based on a seed drawn from `rng` + + :type n_ins: int + :param n_ins: dimension of the input to the DBN + + :type n_layers_sizes: list of ints + :param n_layers_sizes: intermidiate layers size, must contain + at least one value + + :type n_outs: int + :param n_outs: dimension of the output of the network + """ + + self.sigmoid_layers = [] + self.rbm_layers = [] + self.params = [] + self.n_layers = len(hidden_layers_sizes) + + assert self.n_layers > 0 + + if not theano_rng: + theano_rng = RandomStreams(numpy_rng.randint(2**30)) + + # allocate symbolic variables for the data + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + + # The DBN is an MLP, for which all weights of intermidiate layers are shared with a + # different RBM. We will first construct the DBN as a deep multilayer perceptron, and + # when constructing each sigmoidal layer we also construct an RBM that shares weights + # with that layer. During pretraining we will train these RBMs (which will lead + # to chainging the weights of the MLP as well) During finetuning we will finish + # training the DBN by doing stochastic gradient descent on the MLP. + + for i in xrange( self.n_layers ): + # construct the sigmoidal layer + + # the size of the input is either the number of hidden units of the layer below or + # the input size if we are on the first layer + if i == 0 : + input_size = n_ins + else: + input_size = hidden_layers_sizes[i-1] + + # the input to this layer is either the activation of the hidden layer below or the + # input of the DBN if you are on the first layer + if i == 0 : + layer_input = self.x + else: + layer_input = self.sigmoid_layers[-1].output + + sigmoid_layer = HiddenLayer(rng = numpy_rng, + input = layer_input, + n_in = input_size, + n_out = hidden_layers_sizes[i], + activation = T.nnet.sigmoid) + + # add the layer to our list of layers + self.sigmoid_layers.append(sigmoid_layer) + + # its arguably a philosophical question... but we are going to only declare that + # the parameters of the sigmoid_layers are parameters of the DBN. The visible + # biases in the RBM are parameters of those RBMs, but not of the DBN. 
+ self.params.extend(sigmoid_layer.params) + + # Construct an RBM that shared weights with this layer + rbm_layer = RBM(numpy_rng = numpy_rng, theano_rng = theano_rng, + input = layer_input, + n_visible = input_size, + n_hidden = hidden_layers_sizes[i], + W = sigmoid_layer.W, + hbias = sigmoid_layer.b) + self.rbm_layers.append(rbm_layer) + + + # We now need to add a logistic layer on top of the MLP + self.logLayer = LogisticRegression(\ + input = self.sigmoid_layers[-1].output,\ + n_in = hidden_layers_sizes[-1], n_out = n_outs) + self.params.extend(self.logLayer.params) + + # construct a function that implements one step of fine-tuning compute the cost for + # second phase of training, defined as the negative log likelihood + self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) + + # compute the gradients with respect to the model parameters + # symbolic variable that points to the number of errors made on the + # minibatch given by self.x and self.y + self.errors = self.logLayer.errors(self.y) + + def pretraining_functions(self, train_set_x, batch_size): + ''' Generates a list of functions, for performing one step of gradient descent at a + given layer. The function will require as input the minibatch index, and to train an + RBM you just need to iterate, calling the corresponding function on all minibatch + indexes. + + :type train_set_x: theano.tensor.TensorType + :param train_set_x: Shared var. that contains all datapoints used for training the RBM + :type batch_size: int + :param batch_size: size of a [mini]batch + ''' + + # index to a [mini]batch + index = T.lscalar('index') # index to a minibatch + learning_rate = T.scalar('lr') # learning rate to use + + # number of batches + n_batches = train_set_x.value.shape[0] / batch_size + # begining of a batch, given `index` + batch_begin = index * batch_size + # ending of a batch given `index` + batch_end = batch_begin+batch_size + + pretrain_fns = [] + for rbm in self.rbm_layers: + + # get the cost and the updates list + # TODO: change cost function to reconstruction error + cost,updates = rbm.cd(learning_rate, persistent=None) + + # compile the theano function + fn = theano.function(inputs = [index, + theano.Param(learning_rate, default = 0.1)], + outputs = cost, + updates = updates, + givens = {self.x :train_set_x[batch_begin:batch_end]}) + # append `fn` to the list of functions + pretrain_fns.append(fn) + + return pretrain_fns + + + def build_finetune_functions(self, datasets, batch_size, learning_rate): + '''Generates a function `train` that implements one step of finetuning, a function + `validate` that computes the error on a batch from the validation set, and a function + `test` that computes the error on a batch from the testing set + + :type datasets: list of pairs of theano.tensor.TensorType + :param datasets: It is a list that contain all the datasets; the has to contain three + pairs, `train`, `valid`, `test` in this order, where each pair is formed of two Theano + variables, one for the datapoints, the other for the labels + :type batch_size: int + :param batch_size: size of a minibatch + :type learning_rate: float + :param learning_rate: learning rate used during finetune stage + ''' + + (train_set_x, train_set_y) = datasets[0] + (valid_set_x, valid_set_y) = datasets[1] + (test_set_x , test_set_y ) = datasets[2] + + # compute number of minibatches for training, validation and testing + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size + + index = 
T.lscalar('index') # index to a [mini]batch + + # compute the gradients with respect to the model parameters + gparams = T.grad(self.finetune_cost, self.params) + + # compute list of fine-tuning updates + updates = {} + for param, gparam in zip(self.params, gparams): + updates[param] = param - gparam*learning_rate + + train_fn = theano.function(inputs = [index], + outputs = self.finetune_cost, + updates = updates, + givens = { + self.x : train_set_x[index*batch_size:(index+1)*batch_size], + self.y : train_set_y[index*batch_size:(index+1)*batch_size]}) + + test_score_i = theano.function([index], self.errors, + givens = { + self.x: test_set_x[index*batch_size:(index+1)*batch_size], + self.y: test_set_y[index*batch_size:(index+1)*batch_size]}) + + valid_score_i = theano.function([index], self.errors, + givens = { + self.x: valid_set_x[index*batch_size:(index+1)*batch_size], + self.y: valid_set_y[index*batch_size:(index+1)*batch_size]}) + + # Create a function that scans the entire validation set + def valid_score(): + return [valid_score_i(i) for i in xrange(n_valid_batches)] + + # Create a function that scans the entire test set + def test_score(): + return [test_score_i(i) for i in xrange(n_test_batches)] + + return train_fn, valid_score, test_score + + + + + + +def test_DBN( finetune_lr = 0.1, pretraining_epochs = 10, \ + pretrain_lr = 0.1, training_epochs = 1000, \ + dataset='mnist.pkl.gz'): + """ + Demonstrates how to train and test a Deep Belief Network. + + This is demonstrated on MNIST. + + :type learning_rate: float + :param learning_rate: learning rate used in the finetune stage + :type pretraining_epochs: int + :param pretraining_epochs: number of epoch to do pretraining + :type pretrain_lr: float + :param pretrain_lr: learning rate to be used during pre-training + :type n_iter: int + :param n_iter: maximal number of iterations ot run the optimizer + :type dataset: string + :param dataset: path the the pickled dataset + """ + + print 'finetune_lr = ', finetune_lr + print 'pretrain_lr = ', pretrain_lr + + datasets = load_data(dataset) + + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x , test_set_y = datasets[2] + + + batch_size = 20 # size of the minibatch + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + + # numpy random generator + numpy_rng = numpy.random.RandomState(123) + print '... building the model' + # construct the Deep Belief Network + dbn = DBN(numpy_rng = numpy_rng, n_ins = 28*28, + hidden_layers_sizes = [1000,1000,1000], + n_outs = 10) + + + ######################### + # PRETRAINING THE MODEL # + ######################### + print '... getting the pretraining functions' + pretraining_fns = dbn.pretraining_functions( + train_set_x = train_set_x, + batch_size = batch_size ) + + print '... 
pre-training the model' + start_time = time.clock() + ## Pre-train layer-wise + for i in xrange(dbn.n_layers): + # go through pretraining epochs + for epoch in xrange(pretraining_epochs): + # go through the training set + c = [] + for batch_index in xrange(n_train_batches): + c.append(pretraining_fns[i](index = batch_index, + lr = pretrain_lr ) ) + print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),numpy.mean(c) + + end_time = time.clock() + + print ('Pretraining took %f minutes' %((end_time-start_time)/60.)) + + ######################## + # FINETUNING THE MODEL # + ######################## + + # get the training, validation and testing function for the model + print '... getting the finetuning functions' + train_fn, validate_model, test_model = dbn.build_finetune_functions ( + datasets = datasets, batch_size = batch_size, + learning_rate = finetune_lr) + + print '... finetunning the model' + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2. # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(n_train_batches, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + + best_params = None + best_validation_loss = float('inf') + test_score = 0. + start_time = time.clock() + + done_looping = False + epoch = 0 + + while (epoch < training_epochs) and (not done_looping): + epoch = epoch + 1 + for minibatch_index in xrange(n_train_batches): + + minibatch_avg_cost = train_fn(minibatch_index) + iter = epoch * n_train_batches + minibatch_index + + if (iter+1) % validation_frequency == 0: + + validation_losses = validate_model() + this_validation_loss = numpy.mean(validation_losses) + print('epoch %i, minibatch %i/%i, validation error %f %%' % \ + (epoch, minibatch_index+1, n_train_batches, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, iter * patience_increase) + + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = iter + + # test it on the test set + test_losses = test_model() + test_score = numpy.mean(test_losses) + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, n_train_batches, + test_score*100.)) + + + if patience <= iter : + done_looping = True + break + + end_time = time.clock() + print(('Optimization complete with best validation score of %f %%,' + 'with test performance %f %%') % + (best_validation_loss * 100., test_score*100.)) + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + + + + + +if __name__ == '__main__': + pretrain_lr = numpy.float(os.sys.argv[1]) + finetune_lr = numpy.float(os.sys.argv[2]) + test_DBN(pretrain_lr=pretrain_lr, finetune_lr=finetune_lr) diff -r 6f606b359df3 -r a9af079892ce code_tutoriel/SdA.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/SdA.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,441 @@ +""" + This tutorial introduces stacked denoising auto-encoders (SdA) using Theano. + + Denoising autoencoders are the building blocks for SdA. 
+ They are based on auto-encoders as the ones used in Bengio et al. 2007. + An autoencoder takes an input x and first maps it to a hidden representation + y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting + latent representation y is then mapped back to a "reconstructed" vector + z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight + matrix W' can optionally be constrained such that W' = W^T, in which case + the autoencoder is said to have tied weights. The network is trained such + that to minimize the reconstruction error (the error between x and z). + + For the denosing autoencoder, during training, first x is corrupted into + \tilde{x}, where \tilde{x} is a partially destroyed version of x by means + of a stochastic mapping. Afterwards y is computed as before (using + \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction + error is now measured between z and the uncorrupted input x, which is + computed as the cross-entropy : + - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)] + + + References : + - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and + Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103, + 2008 + - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise + Training of Deep Networks, Advances in Neural Information Processing + Systems 19, 2007 + +""" + +import numpy, time, cPickle, gzip + +import theano +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +from logistic_sgd import LogisticRegression, load_data +from mlp import HiddenLayer +from dA import dA + + + +class SdA(object): + """Stacked denoising auto-encoder class (SdA) + + A stacked denoising autoencoder model is obtained by stacking several + dAs. The hidden layer of the dA at layer `i` becomes the input of + the dA at layer `i+1`. The first layer dA gets as input the input of + the SdA, and the hidden layer of the last dA represents the output. + Note that after pretraining, the SdA is dealt with as a normal MLP, + the dAs are only used to initialize the weights. + """ + + def __init__(self, numpy_rng, theano_rng = None, n_ins = 784, + hidden_layers_sizes = [500,500], n_outs = 10, + corruption_levels = [0.1, 0.1]): + """ This class is made to support a variable number of layers. 
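+
+        A minimal construction sketch (illustrative only; the layer sizes,
+        the corruption levels and the `train_set_x` shared variable are
+        assumptions, not values prescribed by this tutorial):
+
+            numpy_rng = numpy.random.RandomState(123)
+            sda = SdA(numpy_rng = numpy_rng, n_ins = 28*28,
+                      hidden_layers_sizes = [1000, 1000, 1000],
+                      n_outs = 10, corruption_levels = [0.2, 0.2, 0.2])
+            # train_set_x: a Theano shared variable holding the rasterized
+            # training images, one example per row
+            pretrain_fns = sda.pretraining_functions(train_set_x = train_set_x,
+                                                     batch_size = 20)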
+ + :type numpy_rng: numpy.random.RandomState + :param numpy_rng: numpy random number generator used to draw initial + weights + + :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams + :param theano_rng: Theano random generator; if None is given one is + generated based on a seed drawn from `rng` + + :type n_ins: int + :param n_ins: dimension of the input to the sdA + + :type n_layers_sizes: list of ints + :param n_layers_sizes: intermidiate layers size, must contain + at least one value + + :type n_outs: int + :param n_outs: dimension of the output of the network + + :type corruption_levels: list of float + :param corruption_levels: amount of corruption to use for each + layer + """ + + self.sigmoid_layers = [] + self.dA_layers = [] + self.params = [] + self.n_layers = len(hidden_layers_sizes) + + assert self.n_layers > 0 + + if not theano_rng: + theano_rng = RandomStreams(numpy_rng.randint(2**30)) + # allocate symbolic variables for the data + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + + # The SdA is an MLP, for which all weights of intermidiate layers + # are shared with a different denoising autoencoders + # We will first construct the SdA as a deep multilayer perceptron, + # and when constructing each sigmoidal layer we also construct a + # denoising autoencoder that shares weights with that layer + # During pretraining we will train these autoencoders (which will + # lead to chainging the weights of the MLP as well) + # During finetunining we will finish training the SdA by doing + # stochastich gradient descent on the MLP + + for i in xrange( self.n_layers ): + # construct the sigmoidal layer + + # the size of the input is either the number of hidden units of + # the layer below or the input size if we are on the first layer + if i == 0 : + input_size = n_ins + else: + input_size = hidden_layers_sizes[i-1] + + # the input to this layer is either the activation of the hidden + # layer below or the input of the SdA if you are on the first + # layer + if i == 0 : + layer_input = self.x + else: + layer_input = self.sigmoid_layers[-1].output + + sigmoid_layer = HiddenLayer(rng = numpy_rng, + input = layer_input, + n_in = input_size, + n_out = hidden_layers_sizes[i], + activation = T.nnet.sigmoid) + # add the layer to our list of layers + self.sigmoid_layers.append(sigmoid_layer) + # its arguably a philosophical question... 
+ # but we are going to only declare that the parameters of the + # sigmoid_layers are parameters of the StackedDAA + # the visible biases in the dA are parameters of those + # dA, but not the SdA + self.params.extend(sigmoid_layer.params) + + # Construct a denoising autoencoder that shared weights with this + # layer + dA_layer = dA(numpy_rng = numpy_rng, theano_rng = theano_rng, input = layer_input, + n_visible = input_size, + n_hidden = hidden_layers_sizes[i], + W = sigmoid_layer.W, bhid = sigmoid_layer.b) + self.dA_layers.append(dA_layer) + + + # We now need to add a logistic layer on top of the MLP + self.logLayer = LogisticRegression(\ + input = self.sigmoid_layers[-1].output,\ + n_in = hidden_layers_sizes[-1], n_out = n_outs) + + self.params.extend(self.logLayer.params) + # construct a function that implements one step of finetunining + + # compute the cost for second phase of training, + # defined as the negative log likelihood + self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) + # compute the gradients with respect to the model parameters + # symbolic variable that points to the number of errors made on the + # minibatch given by self.x and self.y + self.errors = self.logLayer.errors(self.y) + + def pretraining_functions(self, train_set_x, batch_size): + ''' Generates a list of functions, each of them implementing one + step in trainnig the dA corresponding to the layer with same index. + The function will require as input the minibatch index, and to train + a dA you just need to iterate, calling the corresponding function on + all minibatch indexes. + + :type train_set_x: theano.tensor.TensorType + :param train_set_x: Shared variable that contains all datapoints used + for training the dA + + :type batch_size: int + :param batch_size: size of a [mini]batch + + :type learning_rate: float + :param learning_rate: learning rate used during training for any of + the dA layers + ''' + + # index to a [mini]batch + index = T.lscalar('index') # index to a minibatch + corruption_level = T.scalar('corruption') # amount of corruption to use + learning_rate = T.scalar('lr') # learning rate to use + # number of batches + n_batches = train_set_x.value.shape[0] / batch_size + # begining of a batch, given `index` + batch_begin = index * batch_size + # ending of a batch given `index` + batch_end = batch_begin+batch_size + + pretrain_fns = [] + for dA in self.dA_layers: + # get the cost and the updates list + cost,updates = dA.get_cost_updates( corruption_level, learning_rate) + # compile the theano function + fn = theano.function( inputs = [index, + theano.Param(corruption_level, default = 0.2), + theano.Param(learning_rate, default = 0.1)], + outputs = cost, + updates = updates, + givens = {self.x :train_set_x[batch_begin:batch_end]}) + # append `fn` to the list of functions + pretrain_fns.append(fn) + + return pretrain_fns + + + def build_finetune_functions(self, datasets, batch_size, learning_rate): + '''Generates a function `train` that implements one step of + finetuning, a function `validate` that computes the error on + a batch from the validation set, and a function `test` that + computes the error on a batch from the testing set + + :type datasets: list of pairs of theano.tensor.TensorType + :param datasets: It is a list that contain all the datasets; + the has to contain three pairs, `train`, + `valid`, `test` in this order, where each pair + is formed of two Theano variables, one for the + datapoints, the other for the labels + + :type batch_size: int + :param 
batch_size: size of a minibatch + + :type learning_rate: float + :param learning_rate: learning rate used during finetune stage + ''' + + (train_set_x, train_set_y) = datasets[0] + (valid_set_x, valid_set_y) = datasets[1] + (test_set_x , test_set_y ) = datasets[2] + + # compute number of minibatches for training, validation and testing + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size + + index = T.lscalar('index') # index to a [mini]batch + + # compute the gradients with respect to the model parameters + gparams = T.grad(self.finetune_cost, self.params) + + # compute list of fine-tuning updates + updates = {} + for param, gparam in zip(self.params, gparams): + updates[param] = param - gparam*learning_rate + + train_fn = theano.function(inputs = [index], + outputs = self.finetune_cost, + updates = updates, + givens = { + self.x : train_set_x[index*batch_size:(index+1)*batch_size], + self.y : train_set_y[index*batch_size:(index+1)*batch_size]}) + + test_score_i = theano.function([index], self.errors, + givens = { + self.x: test_set_x[index*batch_size:(index+1)*batch_size], + self.y: test_set_y[index*batch_size:(index+1)*batch_size]}) + + valid_score_i = theano.function([index], self.errors, + givens = { + self.x: valid_set_x[index*batch_size:(index+1)*batch_size], + self.y: valid_set_y[index*batch_size:(index+1)*batch_size]}) + + # Create a function that scans the entire validation set + def valid_score(): + return [valid_score_i(i) for i in xrange(n_valid_batches)] + + # Create a function that scans the entire test set + def test_score(): + return [test_score_i(i) for i in xrange(n_test_batches)] + + return train_fn, valid_score, test_score + + + + + + +def test_SdA( finetune_lr = 0.1, pretraining_epochs = 15, \ + pretrain_lr = 0.1, training_epochs = 1000, \ + dataset='mnist.pkl.gz'): + """ + Demonstrates how to train and test a stochastic denoising autoencoder. + + This is demonstrated on MNIST. + + :type learning_rate: float + :param learning_rate: learning rate used in the finetune stage + (factor for the stochastic gradient) + + :type pretraining_epochs: int + :param pretraining_epochs: number of epoch to do pretraining + + :type pretrain_lr: float + :param pretrain_lr: learning rate to be used during pre-training + + :type n_iter: int + :param n_iter: maximal number of iterations ot run the optimizer + + :type dataset: string + :param dataset: path the the pickled dataset + + """ + + datasets = load_data(dataset) + + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x , test_set_y = datasets[2] + + + batch_size = 20 # size of the minibatch + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + + # numpy random generator + numpy_rng = numpy.random.RandomState(123) + print '... building the model' + # construct the stacked denoising autoencoder class + sda = SdA( numpy_rng = numpy_rng, n_ins = 28*28, + hidden_layers_sizes = [1000,1000,1000], + n_outs = 10) + + + ######################### + # PRETRAINING THE MODEL # + ######################### + print '... getting the pretraining functions' + pretraining_fns = sda.pretraining_functions( + train_set_x = train_set_x, + batch_size = batch_size ) + + print '... 
pre-training the model' + start_time = time.clock() + ## Pre-train layer-wise + for i in xrange(sda.n_layers): + # go through pretraining epochs + for epoch in xrange(pretraining_epochs): + # go through the training set + c = [] + for batch_index in xrange(n_train_batches): + c.append( pretraining_fns[i](index = batch_index, + corruption = 0.2, lr = pretrain_lr ) ) + print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),numpy.mean(c) + + end_time = time.clock() + + print ('Pretraining took %f minutes' %((end_time-start_time)/60.)) + + ######################## + # FINETUNING THE MODEL # + ######################## + + # get the training, validation and testing function for the model + print '... getting the finetuning functions' + train_fn, validate_model, test_model = sda.build_finetune_functions ( + datasets = datasets, batch_size = batch_size, + learning_rate = finetune_lr) + + print '... finetunning the model' + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2. # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(n_train_batches, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + + best_params = None + best_validation_loss = float('inf') + test_score = 0. + start_time = time.clock() + + done_looping = False + epoch = 0 + + while (epoch < training_epochs) and (not done_looping): + epoch = epoch + 1 + for minibatch_index in xrange(n_train_batches): + + minibatch_avg_cost = train_fn(minibatch_index) + iter = epoch * n_train_batches + minibatch_index + + if (iter+1) % validation_frequency == 0: + + validation_losses = validate_model() + this_validation_loss = numpy.mean(validation_losses) + print('epoch %i, minibatch %i/%i, validation error %f %%' % \ + (epoch, minibatch_index+1, n_train_batches, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, iter * patience_increase) + + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = iter + + # test it on the test set + test_losses = test_model() + test_score = numpy.mean(test_losses) + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, n_train_batches, + test_score*100.)) + + + if patience <= iter : + done_looping = True + break + + end_time = time.clock() + print(('Optimization complete with best validation score of %f %%,' + 'with test performance %f %%') % + (best_validation_loss * 100., test_score*100.)) + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + + + + + + +if __name__ == '__main__': + test_SdA() + + diff -r 6f606b359df3 -r a9af079892ce code_tutoriel/convolutional_mlp.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/convolutional_mlp.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,292 @@ +""" +This tutorial introduces the LeNet5 neural network architecture using Theano. LeNet5 is a +convolutional neural network, good for classifying images. 
This tutorial shows how to build the +architecture, and comes with all the hyper-parameters you need to reproduce the paper's MNIST +results. + + +This implementation simplifies the model in the following ways: + + - LeNetConvPool doesn't implement location-specific gain and bias parameters + - LeNetConvPool doesn't implement pooling by average, it implements pooling by max. + - Digit classification is implemented with a logistic regression rather than an RBF network + - LeNet5 was not fully-connected convolutions at second layer + +References: + - Y. LeCun, L. Bottou, Y. Bengio and P. Haffner: Gradient-Based Learning Applied to Document + Recognition, Proceedings of the IEEE, 86(11):2278-2324, November 1998. + http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf +""" + +import numpy, time, cPickle, gzip + +import theano +import theano.tensor as T +from theano.tensor.signal import downsample +from theano.tensor.nnet import conv + +from logistic_sgd import LogisticRegression, load_data +from mlp import HiddenLayer + + +class LeNetConvPoolLayer(object): + """Pool Layer of a convolutional network """ + + def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2,2)): + """ + Allocate a LeNetConvPoolLayer with shared variable internal parameters. + + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + + :type input: theano.tensor.dtensor4 + :param input: symbolic image tensor, of shape image_shape + + :type filter_shape: tuple or list of length 4 + :param filter_shape: (number of filters, num input feature maps, + filter height,filter width) + + :type image_shape: tuple or list of length 4 + :param image_shape: (batch size, num input feature maps, + image height, image width) + + :type poolsize: tuple or list of length 2 + :param poolsize: the downsampling (pooling) factor (#rows,#cols) + """ + + assert image_shape[1]==filter_shape[1] + self.input = input + + # initialize weights to temporary values until we know the shape of the output feature + # maps + W_values = numpy.zeros(filter_shape, dtype=theano.config.floatX) + self.W = theano.shared(value = W_values) + + # the bias is a 1D tensor -- one bias per output feature map + b_values = numpy.zeros((filter_shape[0],), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + # convolve input feature maps with filters + conv_out = conv.conv2d(input = input, filters = self.W, + filter_shape=filter_shape, image_shape=image_shape) + + # there are "num input feature maps * filter height * filter width" inputs + # to each hidden unit + fan_in = numpy.prod(filter_shape[1:]) + # each unit in the lower layer receives a gradient from: + # "num output feature maps * filter height * filter width" / pooling size + fan_out = filter_shape[0] * numpy.prod(filter_shape[2:]) / numpy.prod(poolsize) + # replace weight values with random weights + W_bound = numpy.sqrt(6./(fan_in + fan_out)) + self.W.value = numpy.asarray( + rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), + dtype = theano.config.floatX) + + # downsample each feature map individually, using maxpooling + pooled_out = downsample.max_pool2D( input = conv_out, + ds = poolsize, ignore_border=True) + + # add the bias term. Since the bias is a vector (1D array), we first + # reshape it to a tensor of shape (1,n_filters,1,1). 
Each bias will thus + # be broadcasted across mini-batches and feature map width & height + self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) + + # store parameters of this layer + self.params = [self.W, self.b] + + + +def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset='mnist.pkl.gz', nkerns=[20,50]): + """ Demonstrates lenet on MNIST dataset + + :type learning_rate: float + :param learning_rate: learning rate used (factor for the stochastic + gradient) + + :type n_epochs: int + :param n_epochs: maximal number of epochs to run the optimizer + + :type dataset: string + :param dataset: path to the dataset used for training /testing (MNIST here) + + :type nkerns: list of ints + :param nkerns: number of kernels on each layer + """ + + rng = numpy.random.RandomState(23455) + + datasets = load_data(dataset) + + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x , test_set_y = datasets[2] + + + batch_size = 500 # size of the minibatch + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size + + # allocate symbolic variables for the data + index = T.lscalar() # index to a [mini]batch + x = T.matrix('x') # the data is presented as rasterized images + y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + + + ishape = (28,28) # this is the size of MNIST images + + ###################### + # BUILD ACTUAL MODEL # + ###################### + print '... building the model' + + # Reshape matrix of rasterized images of shape (batch_size,28*28) + # to a 4D tensor, compatible with our LeNetConvPoolLayer + layer0_input = x.reshape((batch_size,1,28,28)) + + # Construct the first convolutional pooling layer: + # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) + # maxpooling reduces this further to (24/2,24/2) = (12,12) + # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) + layer0 = LeNetConvPoolLayer(rng, input=layer0_input, + image_shape=(batch_size,1,28,28), + filter_shape=(nkerns[0],1,5,5), poolsize=(2,2)) + + # Construct the second convolutional pooling layer + # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) + # maxpooling reduces this further to (8/2,8/2) = (4,4) + # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) + layer1 = LeNetConvPoolLayer(rng, input=layer0.output, + image_shape=(batch_size,nkerns[0],12,12), + filter_shape=(nkerns[1],nkerns[0],5,5), poolsize=(2,2)) + + # the TanhLayer being fully-connected, it operates on 2D matrices of + # shape (batch_size,num_pixels) (i.e matrix of rasterized images). 
+ # This will generate a matrix of shape (20,32*4*4) = (20,512) + layer2_input = layer1.output.flatten(2) + + # construct a fully-connected sigmoidal layer + layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1]*4*4, + n_out=500, activation = T.tanh) + + # classify the values of the fully-connected sigmoidal layer + layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) + + # the cost we minimize during training is the NLL of the model + cost = layer3.negative_log_likelihood(y) + + # create a function to compute the mistakes that are made by the model + test_model = theano.function([index], layer3.errors(y), + givens = { + x: test_set_x[index*batch_size:(index+1)*batch_size], + y: test_set_y[index*batch_size:(index+1)*batch_size]}) + + validate_model = theano.function([index], layer3.errors(y), + givens = { + x: valid_set_x[index*batch_size:(index+1)*batch_size], + y: valid_set_y[index*batch_size:(index+1)*batch_size]}) + + # create a list of all model parameters to be fit by gradient descent + params = layer3.params+ layer2.params+ layer1.params + layer0.params + + # create a list of gradients for all model parameters + grads = T.grad(cost, params) + + # train_model is a function that updates the model parameters by SGD + # Since this model has many parameters, it would be tedious to manually + # create an update rule for each model parameter. We thus create the updates + # dictionary by automatically looping over all (params[i],grads[i]) pairs. + updates = {} + for param_i, grad_i in zip(params, grads): + updates[param_i] = param_i - learning_rate * grad_i + + train_model = theano.function([index], cost, updates=updates, + givens = { + x: train_set_x[index*batch_size:(index+1)*batch_size], + y: train_set_y[index*batch_size:(index+1)*batch_size]}) + + + ############### + # TRAIN MODEL # + ############### + print '... training' + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2 # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(n_train_batches, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + best_params = None + best_validation_loss = float('inf') + best_iter = 0 + test_score = 0. 
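+
+    # Illustrative aside, not part of the original file: the patience rule that
+    # the training loop below applies inline, written as a hypothetical helper
+    # (the name `update_patience` is invented here) to make the early-stopping
+    # control flow explicit. Patience grows only when the new validation loss
+    # beats the previous best by the relative margin `improvement_threshold`.
+    def update_patience(iter, this_loss, best_loss, patience,
+                        patience_increase = 2, improvement_threshold = 0.995):
+        if this_loss < best_loss * improvement_threshold:
+            patience = max(patience, iter * patience_increase)
+        return patience
+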
+ start_time = time.clock() + + epoch = 0 + done_looping = False + + while (epoch < n_epochs) and (not done_looping): + epoch = epoch + 1 + for minibatch_index in xrange(n_train_batches): + + iter = epoch * n_train_batches + minibatch_index + + if iter %100 == 0: + print 'training @ iter = ', iter + cost_ij = train_model(minibatch_index) + + if (iter+1) % validation_frequency == 0: + + # compute zero-one loss on validation set + validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] + this_validation_loss = numpy.mean(validation_losses) + print('epoch %i, minibatch %i/%i, validation error %f %%' % \ + (epoch, minibatch_index+1, n_train_batches, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, iter * patience_increase) + + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = iter + + # test it on the test set + test_losses = [test_model(i) for i in xrange(n_test_batches)] + test_score = numpy.mean(test_losses) + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, n_train_batches, + test_score*100.)) + + if patience <= iter : + done_looping = False + break + + end_time = time.clock() + print('Optimization complete.') + print('Best validation score of %f %% obtained at iteration %i,'\ + 'with test performance %f %%' % + (best_validation_loss * 100., best_iter, test_score*100.)) + print('The code ran for %f minutes' % ((end_time-start_time)/60.)) + +if __name__ == '__main__': + evaluate_lenet5() + +def experiment(state, channel): + evaluate_lenet5(state.learning_rate, dataset=state.dataset) diff -r 6f606b359df3 -r a9af079892ce code_tutoriel/dA.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/dA.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,330 @@ +""" + This tutorial introduces denoising auto-encoders (dA) using Theano. + + Denoising autoencoders are the building blocks for SdA. + They are based on auto-encoders as the ones used in Bengio et al. 2007. + An autoencoder takes an input x and first maps it to a hidden representation + y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting + latent representation y is then mapped back to a "reconstructed" vector + z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight + matrix W' can optionally be constrained such that W' = W^T, in which case + the autoencoder is said to have tied weights. The network is trained such + that to minimize the reconstruction error (the error between x and z). + + For the denosing autoencoder, during training, first x is corrupted into + \tilde{x}, where \tilde{x} is a partially destroyed version of x by means + of a stochastic mapping. Afterwards y is computed as before (using + \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction + error is now measured between z and the uncorrupted input x, which is + computed as the cross-entropy : + - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)] + + + References : + - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and + Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103, + 2008 + - Y. Bengio, P. Lamblin, D. Popovici, H. 
Larochelle: Greedy Layer-Wise + Training of Deep Networks, Advances in Neural Information Processing + Systems 19, 2007 + +""" + +import numpy, time, cPickle, gzip + +import theano +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +from logistic_sgd import load_data +from utils import tile_raster_images + +import PIL.Image + + +class dA(object): + """Denoising Auto-Encoder class (dA) + + A denoising autoencoders tries to reconstruct the input from a corrupted + version of it by projecting it first in a latent space and reprojecting + it afterwards back in the input space. Please refer to Vincent et al.,2008 + for more details. If x is the input then equation (1) computes a partially + destroyed version of x by means of a stochastic mapping q_D. Equation (2) + computes the projection of the input into the latent space. Equation (3) + computes the reconstruction of the input, while equation (4) computes the + reconstruction error. + + .. math:: + + \tilde{x} ~ q_D(\tilde{x}|x) (1) + + y = s(W \tilde{x} + b) (2) + + x = s(W' y + b') (3) + + L(x,z) = -sum_{k=1}^d [x_k \log z_k + (1-x_k) \log( 1-z_k)] (4) + + """ + + def __init__(self, numpy_rng, theano_rng = None, input = None, n_visible= 784, n_hidden= 500, + W = None, bhid = None, bvis = None): + """ + Initialize the dA class by specifying the number of visible units (the + dimension d of the input ), the number of hidden units ( the dimension + d' of the latent or hidden space ) and the corruption level. The + constructor also receives symbolic variables for the input, weights and + bias. Such a symbolic variables are useful when, for example the input is + the result of some computations, or when weights are shared between the + dA and an MLP layer. When dealing with SdAs this always happens, + the dA on layer 2 gets as input the output of the dA on layer 1, + and the weights of the dA are used in the second stage of training + to construct an MLP. 
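+
+        A standalone usage sketch (illustrative only; the hyper-parameter
+        values are arbitrary), mirroring the calls made in `test_dA` below:
+
+            rng = numpy.random.RandomState(123)
+            theano_rng = RandomStreams(rng.randint(2**30))
+            x = T.matrix('x')
+            da = dA(numpy_rng = rng, theano_rng = theano_rng, input = x,
+                    n_visible = 28*28, n_hidden = 500)
+            cost, updates = da.get_cost_updates(corruption_level = 0.3,
+                                                learning_rate = 0.1)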
+ + :type numpy_rng: numpy.random.RandomState + :param numpy_rng: number random generator used to generate weights + + :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams + :param theano_rng: Theano random generator; if None is given one is generated + based on a seed drawn from `rng` + + :type input: theano.tensor.TensorType + :paran input: a symbolic description of the input or None for standalone + dA + + :type n_visible: int + :param n_visible: number of visible units + + :type n_hidden: int + :param n_hidden: number of hidden units + + :type W: theano.tensor.TensorType + :param W: Theano variable pointing to a set of weights that should be + shared belong the dA and another architecture; if dA should + be standalone set this to None + + :type bhid: theano.tensor.TensorType + :param bhid: Theano variable pointing to a set of biases values (for + hidden units) that should be shared belong dA and another + architecture; if dA should be standalone set this to None + + :type bvis: theano.tensor.TensorType + :param bvis: Theano variable pointing to a set of biases values (for + visible units) that should be shared belong dA and another + architecture; if dA should be standalone set this to None + + + """ + self.n_visible = n_visible + self.n_hidden = n_hidden + + # create a Theano random generator that gives symbolic random values + if not theano_rng : + theano_rng = RandomStreams(rng.randint(2**30)) + + # note : W' was written as `W_prime` and b' as `b_prime` + if not W: + # W is initialized with `initial_W` which is uniformely sampled + # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + initial_W = numpy.asarray( numpy_rng.uniform( + low = -numpy.sqrt(6./(n_hidden+n_visible)), + high = numpy.sqrt(6./(n_hidden+n_visible)), + size = (n_visible, n_hidden)), dtype = theano.config.floatX) + W = theano.shared(value = initial_W, name ='W') + + if not bvis: + bvis = theano.shared(value = numpy.zeros(n_visible, + dtype = theano.config.floatX)) + + if not bhid: + bhid = theano.shared(value = numpy.zeros(n_hidden, + dtype = theano.config.floatX)) + + + self.W = W + # b corresponds to the bias of the hidden + self.b = bhid + # b_prime corresponds to the bias of the visible + self.b_prime = bvis + # tied weights, therefore W_prime is W transpose + self.W_prime = self.W.T + self.theano_rng = theano_rng + # if no input is given, generate a variable representing the input + if input == None : + # we use a matrix because we expect a minibatch of several examples, + # each example being a row + self.x = T.dmatrix(name = 'input') + else: + self.x = input + + self.params = [self.W, self.b, self.b_prime] + + def get_corrupted_input(self, input, corruption_level): + """ This function keeps ``1-corruption_level`` entries of the inputs the same + and zero-out randomly selected subset of size ``coruption_level`` + Note : first argument of theano.rng.binomial is the shape(size) of + random numbers that it should produce + second argument is the number of trials + third argument is the probability of success of any trial + + this will produce an array of 0s and 1s where 1 has a probability of + 1 - ``corruption_level`` and 0 with ``corruption_level`` + """ + return self.theano_rng.binomial( size = input.shape, n = 1, prob = 1 - corruption_level) * input + + + def get_hidden_values(self, input): + """ Computes the values of the hidden layer """ + return 
T.nnet.sigmoid(T.dot(input, self.W) + self.b) + + def get_reconstructed_input(self, hidden ): + """ Computes the reconstructed input given the values of the hidden layer """ + return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime) + + def get_cost_updates(self, corruption_level, learning_rate): + """ This function computes the cost and the updates for one trainng + step of the dA """ + + tilde_x = self.get_corrupted_input(self.x, corruption_level) + y = self.get_hidden_values( tilde_x) + z = self.get_reconstructed_input(y) + # note : we sum over the size of a datapoint; if we are using minibatches, + # L will be a vector, with one entry per example in minibatch + L = - T.sum( self.x*T.log(z) + (1-self.x)*T.log(1-z), axis=1 ) + # note : L is now a vector, where each element is the cross-entropy cost + # of the reconstruction of the corresponding example of the + # minibatch. We need to compute the average of all these to get + # the cost of the minibatch + cost = T.mean(L) + + # compute the gradients of the cost of the `dA` with respect + # to its parameters + gparams = T.grad(cost, self.params) + # generate the list of updates + updates = {} + for param, gparam in zip(self.params, gparams): + updates[param] = param - learning_rate*gparam + + return (cost, updates) + + + + +def test_dA( learning_rate = 0.1, training_epochs = 15, dataset ='mnist.pkl.gz' ): + + """ + This demo is tested on MNIST + + :type learning_rate: float + :param learning_rate: learning rate used for training the DeNosing AutoEncoder + + :type training_epochs: int + :param training_epochs: number of epochs used for training + + :type dataset: string + :param dataset: path to the picked dataset + + """ + datasets = load_data(dataset) + train_set_x, train_set_y = datasets[0] + + batch_size = 20 # size of the minibatch + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + + # allocate symbolic variables for the data + index = T.lscalar() # index to a [mini]batch + x = T.matrix('x') # the data is presented as rasterized images + + #################################### + # BUILDING THE MODEL NO CORRUPTION # + #################################### + + rng = numpy.random.RandomState(123) + theano_rng = RandomStreams( rng.randint(2**30)) + + da = dA(numpy_rng = rng, theano_rng = theano_rng, input = x, + n_visible = 28*28, n_hidden = 500) + + cost, updates = da.get_cost_updates(corruption_level = 0., + learning_rate = learning_rate) + + + train_da = theano.function([index], cost, updates = updates, + givens = {x:train_set_x[index*batch_size:(index+1)*batch_size]}) + + start_time = time.clock() + + ############ + # TRAINING # + ############ + + # go through training epochs + for epoch in xrange(training_epochs): + # go through trainng set + c = [] + for batch_index in xrange(n_train_batches): + c.append(train_da(batch_index)) + + print 'Training epoch %d, cost '%epoch, numpy.mean(c) + + end_time = time.clock() + + training_time = (end_time - start_time) + + print ('Training took %f minutes' %(training_time/60.)) + + image = PIL.Image.fromarray(tile_raster_images( X = da.W.value.T, + img_shape = (28,28),tile_shape = (10,10), + tile_spacing=(1,1))) + image.save('filters_corruption_0.png') + + ##################################### + # BUILDING THE MODEL CORRUPTION 30% # + ##################################### + + rng = numpy.random.RandomState(123) + theano_rng = RandomStreams( rng.randint(2**30)) + + da = dA(numpy_rng = rng, theano_rng = 
theano_rng, input = x, + n_visible = 28*28, n_hidden = 500) + + cost, updates = da.get_cost_updates(corruption_level = 0.3, + learning_rate = learning_rate) + + + train_da = theano.function([index], cost, updates = updates, + givens = {x:train_set_x[index*batch_size:(index+1)*batch_size]}) + + start_time = time.clock() + + ############ + # TRAINING # + ############ + + # go through training epochs + for epoch in xrange(training_epochs): + # go through trainng set + c = [] + for batch_index in xrange(n_train_batches): + c.append(train_da(batch_index)) + + print 'Training epoch %d, cost '%epoch, numpy.mean(c) + + end_time = time.clock() + + training_time = (end_time - start_time) + + print ('Training took %f minutes' %(training_time/60.)) + + image = PIL.Image.fromarray(tile_raster_images( X = da.W.value.T, + img_shape = (28,28),tile_shape = (10,10), + tile_spacing=(1,1))) + image.save('filters_corruption_30.png') + + + +if __name__ == '__main__': + test_dA() diff -r 6f606b359df3 -r a9af079892ce code_tutoriel/deep.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/deep.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,880 @@ +""" +Draft of DBN, DAA, SDAA, RBM tutorial code + +""" +import sys +import numpy +import theano +import time +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams +from theano import shared, function + +import gzip +import cPickle +import pylearn.io.image_tiling +import PIL + +# NNET STUFF + +class LogisticRegression(object): + """Multi-class Logistic Regression Class + + The logistic regression is fully described by a weight matrix :math:`W` + and bias vector :math:`b`. Classification is done by projecting data + points onto a set of hyperplanes, the distance to which is used to + determine a class membership probability. + """ + + def __init__(self, input, n_in, n_out): + """ Initialize the parameters of the logistic regression + :param input: symbolic variable that describes the input of the + architecture (one minibatch) + :type n_in: int + :param n_in: number of input units, the dimension of the space in + which the datapoints lie + :type n_out: int + :param n_out: number of output units, the dimension of the space in + which the labels lie + """ + + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) + self.W = theano.shared( value=numpy.zeros((n_in,n_out), + dtype = theano.config.floatX) ) + # initialize the baises b as a vector of n_out 0s + self.b = theano.shared( value=numpy.zeros((n_out,), + dtype = theano.config.floatX) ) + # compute vector of class-membership probabilities in symbolic form + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred=T.argmax(self.p_y_given_x, axis=1) + + # list of parameters for this layer + self.params = [self.W, self.b] + + def negative_log_likelihood(self, y): + """Return the mean of the negative log-likelihood of the prediction + of this model under a given target distribution. 
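+
+        Written out, the quantity returned below is
+
+        .. math::
+
+            \ell(\theta=\{W,b\}, \mathcal{D}) =
+                -\frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|-1}
+                \log P(Y=y^{(i)} | x^{(i)}, W, b)
+
+        where :math:`\mathcal{D}` is the minibatch and
+        :math:`P(Y=y^{(i)}|x^{(i)},W,b)` is the softmax output
+        ``self.p_y_given_x`` evaluated at the correct class.
+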
+ :param y: corresponds to a vector that gives for each example the + correct label + Note: we use the mean instead of the sum so that + the learning rate is less dependent on the batch size + """ + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + def errors(self, y): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch ; zero one + loss over the size of the minibatch + """ + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + +class SigmoidalLayer(object): + def __init__(self, rng, input, n_in, n_out): + """ + Typical hidden layer of a MLP: units are fully-connected and have + sigmoidal activation function. Weight matrix W is of shape (n_in,n_out) + and the bias vector b is of shape (n_out,). + + Hidden unit activation is given by: sigmoid(dot(input,W) + b) + + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + :type input: theano.tensor.matrix + :param input: a symbolic tensor of shape (n_examples, n_in) + :type n_in: int + :param n_in: dimensionality of input + :type n_out: int + :param n_out: number of hidden units + """ + self.input = input + + W_values = numpy.asarray( rng.uniform( \ + low = -numpy.sqrt(6./(n_in+n_out)), \ + high = numpy.sqrt(6./(n_in+n_out)), \ + size = (n_in, n_out)), dtype = theano.config.floatX) + self.W = theano.shared(value = W_values) + + b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + self.output = T.nnet.sigmoid(T.dot(input, self.W) + self.b) + self.params = [self.W, self.b] + +# PRETRAINING LAYERS + +class RBM(object): + """ + *** WRITE THE ENERGY FUNCTION USE SAME LETTERS AS VARIABLE NAMES IN CODE + """ + + def __init__(self, input=None, n_visible=None, n_hidden=None, + W=None, hbias=None, vbias=None, + numpy_rng=None, theano_rng=None): + """ + RBM constructor. Defines the parameters of the model along with + basic operations for inferring hidden from visible (and vice-versa), + as well as for performing CD updates. + + :param input: None for standalone RBMs or symbolic variable if RBM is + part of a larger graph. + + :param n_visible: number of visible units (necessary when W or vbias is None) + + :param n_hidden: number of hidden units (necessary when W or hbias is None) + + :param W: weights to use for the RBM. None means that a shared variable will be + created with a randomly chosen matrix of size (n_visible, n_hidden). 
+ + :param hbias: *** + + :param vbias: *** + + :param numpy_rng: random number generator (necessary when W is None) + + """ + + params = [] + if W is None: + # choose initial values for weight matrix of RBM + initial_W = numpy.asarray( + numpy_rng.uniform( \ + low=-numpy.sqrt(6./(n_hidden+n_visible)), \ + high=numpy.sqrt(6./(n_hidden+n_visible)), \ + size=(n_visible, n_hidden)), \ + dtype=theano.config.floatX) + W = theano.shared(value=initial_W, name='W') + params.append(W) + + if hbias is None: + # theano shared variables for hidden biases + hbias = theano.shared(value=numpy.zeros(n_hidden, + dtype=theano.config.floatX), name='hbias') + params.append(hbias) + + if vbias is None: + # theano shared variables for visible biases + vbias = theano.shared(value=numpy.zeros(n_visible, + dtype=theano.config.floatX), name='vbias') + params.append(vbias) + + if input is None: + # initialize input layer for standalone RBM or layer0 of DBN + input = T.matrix('input') + + # setup theano random number generator + if theano_rng is None: + theano_rng = RandomStreams(numpy_rng.randint(2**30)) + + self.visible = self.input = input + self.W = W + self.hbias = hbias + self.vbias = vbias + self.theano_rng = theano_rng + self.params = params + self.hidden_mean = T.nnet.sigmoid(T.dot(input, W)+hbias) + self.hidden_sample = theano_rng.binomial(self.hidden_mean.shape, 1, self.hidden_mean) + + def gibbs_k(self, v_sample, k): + ''' This function implements k steps of Gibbs sampling ''' + + # We compute the visible after k steps of Gibbs by iterating + # over ``gibs_1`` for k times; this can be done in Theano using + # the `scan op`. For a more comprehensive description of scan see + # http://deeplearning.net/software/theano/library/scan.html . + + def gibbs_1(v0_sample, W, hbias, vbias): + ''' This function implements one Gibbs step ''' + + # compute the activation of the hidden units given a sample of the + # vissibles + h0_mean = T.nnet.sigmoid(T.dot(v0_sample, W) + hbias) + # get a sample of the hiddens given their activation + h0_sample = self.theano_rng.binomial(h0_mean.shape, 1, h0_mean) + # compute the activation of the visible given the hidden sample + v1_mean = T.nnet.sigmoid(T.dot(h0_sample, W.T) + vbias) + # get a sample of the visible given their activation + v1_act = self.theano_rng.binomial(v1_mean.shape, 1, v1_mean) + return [v1_mean, v1_act] + + + # DEBUGGING TO DO ALL WITHOUT SCAN + if k == 1: + return gibbs_1(v_sample, self.W, self.hbias, self.vbias) + + + # Because we require as output two values, namely the mean field + # approximation of the visible and the sample obtained after k steps, + # scan needs to know the shape of those two outputs. Scan takes + # this information from the variables containing the initial state + # of the outputs. Since we do not need a initial state of ``v_mean`` + # we provide a dummy one used only to get the correct shape + v_mean = T.zeros_like(v_sample) + + # ``outputs_taps`` is an argument of scan which describes at each + # time step what past values of the outputs the function applied + # recursively needs. 
This is given in the form of a dictionary, + # where the keys are outputs indexes, and values are a list of + # of the offsets used by the corresponding outputs + # In our case the function ``gibbs_1`` applied recursively, requires + # at time k the past value k-1 for the first output (index 0) and + # no past value of the second output + outputs_taps = { 0 : [-1], 1 : [] } + + v_means, v_samples = theano.scan( fn = gibbs_1, + sequences = [], + initial_states = [v_sample, v_mean], + non_sequences = [self.W, self.hbias, self.vbias], + outputs_taps = outputs_taps, + n_steps = k) + return v_means[-1], v_samples[-1] + + def free_energy(self, v_sample): + wx_b = T.dot(v_sample, self.W) + self.hbias + vbias_term = T.sum(T.dot(v_sample, self.vbias)) + hidden_term = T.sum(T.log(1+T.exp(wx_b))) + return -hidden_term - vbias_term + + def cd(self, visible = None, persistent = None, steps = 1): + """ + Return a 5-tuple of values related to contrastive divergence: (cost, + end-state of negative-phase chain, gradient on weights, gradient on + hidden bias, gradient on visible bias) + + If visible is None, it defaults to self.input + If persistent is None, it defaults to self.input + + CD aka CD1 - cd() + CD-10 - cd(steps=10) + PCD - cd(persistent=shared(numpy.asarray(initializer))) + PCD-k - cd(persistent=shared(numpy.asarray(initializer)), + steps=10) + """ + if visible is None: + visible = self.input + + if visible is None: + raise TypeError('visible argument is required when self.input is None') + + if steps is None: + steps = self.gibbs_1 + + if persistent is None: + chain_start = visible + else: + chain_start = persistent + + chain_end_mean, chain_end_sample = self.gibbs_k(chain_start, steps) + + #print >> sys.stderr, "WARNING: DEBUGGING with wrong FREE ENERGY" + #free_energy_delta = - self.free_energy(chain_end_sample) + free_energy_delta = self.free_energy(visible) - self.free_energy(chain_end_sample) + + # we will return all of these regardless of what is in self.params + all_params = [self.W, self.hbias, self.vbias] + + gparams = T.grad(free_energy_delta, all_params, + consider_constant = [chain_end_sample]) + + cross_entropy = T.mean(T.sum( + visible*T.log(chain_end_mean) + (1 - visible)*T.log(1-chain_end_mean), + axis = 1)) + + return (cross_entropy, chain_end_sample,) + tuple(gparams) + + def cd_updates(self, lr, visible = None, persistent = None, steps = 1): + """ + Return the learning updates for the RBM parameters that are shared variables. + + Also returns an update for the persistent if it is a shared variable. + + These updates are returned as a dictionary. + + :param lr: [scalar] learning rate for contrastive divergence learning + :param visible: see `cd_grad` + :param persistent: see `cd_grad` + :param steps: see `cd_grad` + + """ + + cross_entropy, chain_end, gW, ghbias, gvbias = self.cd(visible, + persistent, steps) + + updates = {} + if hasattr(self.W, 'value'): + updates[self.W] = self.W - lr * gW + if hasattr(self.hbias, 'value'): + updates[self.hbias] = self.hbias - lr * ghbias + if hasattr(self.vbias, 'value'): + updates[self.vbias] = self.vbias - lr * gvbias + if persistent: + #if persistent is a shared var, then it means we should use + updates[persistent] = chain_end + + return updates + +# DEEP MODELS + +class DBN(object): + """ + *** WHAT IS A DBN? + """ + + def __init__(self, input_len, hidden_layers_sizes, n_classes, rng): + """ This class is made to support a variable number of layers. 
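The graph built by gibbs_k, free_energy and cd_updates above follows the usual CD-1 recipe. A minimal numpy sketch with binary units and toy sizes (all names and dimensions here are illustrative, not taken from the code) looks roughly like this:

import numpy

def sigmoid(a):
    return 1.0 / (1.0 + numpy.exp(-a))

rng = numpy.random.RandomState(0)
n_visible, n_hidden, lr = 6, 4, 0.1
W = 0.01 * rng.randn(n_visible, n_hidden)
hbias = numpy.zeros(n_hidden)
vbias = numpy.zeros(n_visible)
v0 = rng.binomial(1, 0.5, size=(10, n_visible)).astype('float64')  # toy minibatch

# positive phase: infer hiddens from the data
h0_mean = sigmoid(numpy.dot(v0, W) + hbias)
h0_sample = rng.binomial(1, h0_mean)

# negative phase: one Gibbs step (the k == 1 branch of gibbs_k)
v1_mean = sigmoid(numpy.dot(h0_sample, W.T) + vbias)
v1_sample = rng.binomial(1, v1_mean)
h1_mean = sigmoid(numpy.dot(v1_sample, W) + hbias)

# CD-1 update: positive statistics minus negative statistics, batch-averaged
n = float(v0.shape[0])
W = W + lr * (numpy.dot(v0.T, h0_mean) - numpy.dot(v1_sample.T, h1_mean)) / n
hbias = hbias + lr * (h0_mean.mean(axis=0) - h1_mean.mean(axis=0))
vbias = vbias + lr * (v0.mean(axis=0) - v1_sample.mean(axis=0))

In the symbolic version the same update direction comes from differentiating the free-energy difference while treating the chain end as a constant (the consider_constant argument passed to T.grad above).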
+ + :param train_set_x: symbolic variable pointing to the training dataset + + :param train_set_y: symbolic variable pointing to the labels of the + training dataset + + :param input_len: dimension of the input to the sdA + + :param n_layers_sizes: intermidiate layers size, must contain + at least one value + + :param n_classes: dimension of the output of the network + + :param corruption_levels: amount of corruption to use for each + layer + + :param rng: numpy random number generator used to draw initial weights + + :param pretrain_lr: learning rate used during pre-trainnig stage + + :param finetune_lr: learning rate used during finetune stage + """ + + self.sigmoid_layers = [] + self.rbm_layers = [] + self.pretrain_functions = [] + self.params = [] + + theano_rng = RandomStreams(rng.randint(2**30)) + + # allocate symbolic variables for the data + index = T.lscalar() # index to a [mini]batch + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + input = self.x + + # The SdA is an MLP, for which all weights of intermidiate layers + # are shared with a different denoising autoencoders + # We will first construct the SdA as a deep multilayer perceptron, + # and when constructing each sigmoidal layer we also construct a + # denoising autoencoder that shares weights with that layer, and + # compile a training function for that denoising autoencoder + + for n_hid in hidden_layers_sizes: + # construct the sigmoidal layer + + sigmoid_layer = SigmoidalLayer(rng, input, input_len, n_hid) + self.sigmoid_layers.append(sigmoid_layer) + + self.rbm_layers.append(RBM(input=input, + W=sigmoid_layer.W, + hbias=sigmoid_layer.b, + n_visible = input_len, + n_hidden = n_hid, + numpy_rng=rng, + theano_rng=theano_rng)) + + # its arguably a philosophical question... + # but we are going to only declare that the parameters of the + # sigmoid_layers are parameters of the StackedDAA + # the hidden-layer biases in the daa_layers are parameters of those + # daa_layers, but not the StackedDAA + self.params.extend(self.sigmoid_layers[-1].params) + + # get ready for the next loop iteration + input_len = n_hid + input = self.sigmoid_layers[-1].output + + # We now need to add a logistic layer on top of the MLP + self.logistic_regressor = LogisticRegression(input = input, + n_in = input_len, n_out = n_classes) + + self.params.extend(self.logistic_regressor.params) + + def pretraining_functions(self, train_set_x, batch_size, learning_rate, k=1): + if k!=1: + raise NotImplementedError() + index = T.lscalar() # index to a [mini]batch + n_train_batches = train_set_x.value.shape[0] / batch_size + batch_begin = (index % n_train_batches) * batch_size + batch_end = batch_begin+batch_size + + print 'TRAIN_SET X', train_set_x.value.shape + rval = [] + for rbm in self.rbm_layers: + # N.B. 
these cd() samples are independent from the + # samples used for learning + outputs = list(rbm.cd())[0:2] + rval.append(function([index], outputs, + updates = rbm.cd_updates(lr=learning_rate), + givens = {self.x: train_set_x[batch_begin:batch_end]})) + if rbm is self.rbm_layers[0]: + f = rval[-1] + AA=len(outputs) + for i, implicit_out in enumerate(f.maker.env.outputs): #[len(outputs):]: + print 'OUTPUT ', i + theano.printing.debugprint(implicit_out, file=sys.stdout) + + return rval + + def finetune(self, datasets, lr, batch_size): + + # unpack the various datasets + (train_set_x, train_set_y) = datasets[0] + (valid_set_x, valid_set_y) = datasets[1] + (test_set_x, test_set_y) = datasets[2] + + # compute number of minibatches for training, validation and testing + assert train_set_x.value.shape[0] % batch_size == 0 + assert valid_set_x.value.shape[0] % batch_size == 0 + assert test_set_x.value.shape[0] % batch_size == 0 + n_train_batches = train_set_x.value.shape[0] / batch_size + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size + + index = T.lscalar() # index to a [mini]batch + target = self.y + + train_index = index % n_train_batches + + classifier = self.logistic_regressor + cost = classifier.negative_log_likelihood(target) + # compute the gradients with respect to the model parameters + gparams = T.grad(cost, self.params) + + # compute list of fine-tuning updates + updates = [(param, param - gparam*finetune_lr) + for param,gparam in zip(self.params, gparams)] + + train_fn = theano.function([index], cost, + updates = updates, + givens = { + self.x : train_set_x[train_index*batch_size:(train_index+1)*batch_size], + target : train_set_y[train_index*batch_size:(train_index+1)*batch_size]}) + + test_score_i = theano.function([index], classifier.errors(target), + givens = { + self.x: test_set_x[index*batch_size:(index+1)*batch_size], + target: test_set_y[index*batch_size:(index+1)*batch_size]}) + + valid_score_i = theano.function([index], classifier.errors(target), + givens = { + self.x: valid_set_x[index*batch_size:(index+1)*batch_size], + target: valid_set_y[index*batch_size:(index+1)*batch_size]}) + + def test_scores(): + return [test_score_i(i) for i in xrange(n_test_batches)] + + def valid_scores(): + return [valid_score_i(i) for i in xrange(n_valid_batches)] + + return train_fn, valid_scores, test_scores + +def load_mnist(filename): + f = gzip.open(filename,'rb') + train_set, valid_set, test_set = cPickle.load(f) + f.close() + + def shared_dataset(data_xy): + data_x, data_y = data_xy + shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX)) + shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX)) + return shared_x, T.cast(shared_y, 'int32') + + n_train_examples = train_set[0].shape[0] + datasets = shared_dataset(train_set), shared_dataset(valid_set), shared_dataset(test_set) + + return n_train_examples, datasets + +def dbn_main(finetune_lr = 0.01, + pretraining_epochs = 10, + pretrain_lr = 0.1, + training_epochs = 1000, + batch_size = 20, + mnist_file='mnist.pkl.gz'): + """ + Demonstrate stochastic gradient descent optimization for a multilayer perceptron + + This is demonstrated on MNIST. 
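The givens slices compiled by pretraining_functions and finetune all use the same index arithmetic. A small numpy sketch of how an integer index is wrapped and turned into a row slice (hypothetical sizes):

import numpy

# toy data: 10 examples of dimension 3, minibatches of 2
train_set_x = numpy.arange(30).reshape(10, 3)
batch_size = 2
n_train_batches = train_set_x.shape[0] // batch_size

def get_minibatch(index):
    # the same arithmetic as batch_begin/batch_end and train_index above:
    # wrap the index so that any integer selects a valid minibatch
    i = index % n_train_batches
    return train_set_x[i * batch_size:(i + 1) * batch_size]

print(get_minibatch(0))   # rows 0-1
print(get_minibatch(7))   # 7 % 5 == 2, so rows 4-5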
+ + :param learning_rate: learning rate used in the finetune stage + (factor for the stochastic gradient) + + :param pretraining_epochs: number of epoch to do pretraining + + :param pretrain_lr: learning rate to be used during pre-training + + :param n_iter: maximal number of iterations ot run the optimizer + + :param mnist_file: path the the pickled mnist_file + + """ + + n_train_examples, train_valid_test = load_mnist(mnist_file) + + print "Creating a Deep Belief Network" + deep_model = DBN( + input_len=28*28, + hidden_layers_sizes = [500, 150, 100], + n_classes=10, + rng = numpy.random.RandomState()) + + #### + #### Phase 1: Pre-training + #### + print "Pretraining (unsupervised learning) ..." + + pretrain_functions = deep_model.pretraining_functions( + batch_size=batch_size, + train_set_x=train_valid_test[0][0], + learning_rate=pretrain_lr, + ) + + start_time = time.clock() + for layer_idx, pretrain_fn in enumerate(pretrain_functions): + # go through pretraining epochs + print 'Pre-training layer %i'% layer_idx + for i in xrange(pretraining_epochs * n_train_examples / batch_size): + outstuff = pretrain_fn(i) + xe, negsample = outstuff[:2] + print (layer_idx, i, + n_train_examples / batch_size, + float(xe), + 'Wmin', deep_model.rbm_layers[0].W.value.min(), + 'Wmax', deep_model.rbm_layers[0].W.value.max(), + 'vmin', deep_model.rbm_layers[0].vbias.value.min(), + 'vmax', deep_model.rbm_layers[0].vbias.value.max(), + #'x>0.3', (input_i>0.3).sum(), + ) + sys.stdout.flush() + if i % 1000 == 0: + PIL.Image.fromarray( + pylearn.io.image_tiling.tile_raster_images(negsample, (28,28), (10,10), + tile_spacing=(1,1))).save('samples_%i_%i.png'%(layer_idx,i)) + + PIL.Image.fromarray( + pylearn.io.image_tiling.tile_raster_images( + deep_model.rbm_layers[0].W.value.T, + (28,28), (10,10), + tile_spacing=(1,1))).save('filters_%i_%i.png'%(layer_idx,i)) + end_time = time.clock() + print 'Pretraining took %f minutes' %((end_time - start_time)/60.) + + return + + print "Fine tuning (supervised learning) ..." + train_fn, valid_scores, test_scores =\ + deep_model.finetune_functions(train_valid_test[0][0], + learning_rate=finetune_lr, # the learning rate + batch_size = batch_size) # number of examples to use at once + + #### + #### Phase 2: Fine Tuning + #### + + patience = 10000 # look as this many examples regardless + patience_increase = 2. 
# wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(n_train_examples, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + patience_max = n_train_examples * training_epochs + + best_epoch = None + best_epoch_test_score = None + best_epoch_valid_score = float('inf') + start_time = time.clock() + + for i in xrange(patience_max): + if i >= patience: + break + + cost_i = train_fn(i) + + if i % validation_frequency == 0: + validation_i = numpy.mean([score for score in valid_scores()]) + + # if we got the best validation score until now + if validation_i < best_epoch_valid_score: + + # improve patience if loss improvement is good enough + threshold_i = best_epoch_valid_score * improvement_threshold + if validation_i < threshold_i: + patience = max(patience, i * patience_increase) + + # save best validation score and iteration number + best_epoch_valid_score = validation_i + best_epoch = i/validation_i + best_epoch_test_score = numpy.mean( + [score for score in test_scores()]) + + print('epoch %i, validation error %f %%, test error %f %%'%( + i/validation_frequency, validation_i*100., + best_epoch_test_score*100.)) + else: + print('epoch %i, validation error %f %%' % ( + i/validation_frequency, validation_i*100.)) + end_time = time.clock() + + print(('Optimization complete with best validation score of %f %%,' + 'with test performance %f %%') % + (finetune_status['best_validation_loss']*100., + finetune_status['test_score']*100.)) + print ('The code ran for %f minutes' % ((finetune_status['duration'])/60.)) + +def rbm_main(): + rbm = RBM(n_visible=20, n_hidden=30, + numpy_rng = numpy.random.RandomState(34)) + + cd_updates = rbm.cd_updates(lr=0.25) + + print cd_updates + + f = function([rbm.input], [], + updates={rbm.W:cd_updates[rbm.W]}) + + theano.printing.debugprint(f.maker.env.outputs[0], + file=sys.stdout) + + +if __name__ == '__main__': + dbn_main() + #rbm_main() + + +if 0: + class DAA(object): + def __init__(self, n_visible= 784, n_hidden= 500, corruption_level = 0.1,\ + input = None, shared_W = None, shared_b = None): + """ + Initialize the dA class by specifying the number of visible units (the + dimension d of the input ), the number of hidden units ( the dimension + d' of the latent or hidden space ) and the corruption level. The + constructor also receives symbolic variables for the input, weights and + bias. Such a symbolic variables are useful when, for example the input is + the result of some computations, or when weights are shared between the + dA and an MLP layer. When dealing with SdAs this always happens, + the dA on layer 2 gets as input the output of the dA on layer 1, + and the weights of the dA are used in the second stage of training + to construct an MLP. 
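The patience-based early stopping used in dbn_main above, and again in the SGD scripts further down, can be summarized in a stand-alone sketch; random numbers stand in for the real validation errors:

import random

patience = 20                  # look at this many updates regardless
patience_increase = 2.0        # wait this much longer when a new best is found
improvement_threshold = 0.995  # relative improvement considered significant
validation_frequency = 5
best_valid = float('inf')

random.seed(0)
for it in range(1000):
    # ... one minibatch update of the model would happen here ...
    if (it + 1) % validation_frequency == 0:
        valid = random.random()          # stand-in for the validation error
        if valid < best_valid:
            if valid < best_valid * improvement_threshold:
                patience = max(patience, it * patience_increase)
            best_valid = valid
    if patience <= it:
        break

print('stopped after %d updates, best validation %.3f' % (it + 1, best_valid))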
+ + :param n_visible: number of visible units + + :param n_hidden: number of hidden units + + :param input: a symbolic description of the input or None + + :param corruption_level: the corruption mechanism picks up randomly this + fraction of entries of the input and turns them to 0 + + + """ + self.n_visible = n_visible + self.n_hidden = n_hidden + + # create a Theano random generator that gives symbolic random values + theano_rng = RandomStreams() + + if shared_W != None and shared_b != None : + self.W = shared_W + self.b = shared_b + else: + # initial values for weights and biases + # note : W' was written as `W_prime` and b' as `b_prime` + + # W is initialized with `initial_W` which is uniformely sampled + # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + initial_W = numpy.asarray( numpy.random.uniform( \ + low = -numpy.sqrt(6./(n_hidden+n_visible)), \ + high = numpy.sqrt(6./(n_hidden+n_visible)), \ + size = (n_visible, n_hidden)), dtype = theano.config.floatX) + initial_b = numpy.zeros(n_hidden, dtype = theano.config.floatX) + + + # theano shared variables for weights and biases + self.W = theano.shared(value = initial_W, name = "W") + self.b = theano.shared(value = initial_b, name = "b") + + + initial_b_prime= numpy.zeros(n_visible) + # tied weights, therefore W_prime is W transpose + self.W_prime = self.W.T + self.b_prime = theano.shared(value = initial_b_prime, name = "b'") + + # if no input is given, generate a variable representing the input + if input == None : + # we use a matrix because we expect a minibatch of several examples, + # each example being a row + self.x = T.matrix(name = 'input') + else: + self.x = input + # Equation (1) + # keep 90% of the inputs the same and zero-out randomly selected subset of 10% of the inputs + # note : first argument of theano.rng.binomial is the shape(size) of + # random numbers that it should produce + # second argument is the number of trials + # third argument is the probability of success of any trial + # + # this will produce an array of 0s and 1s where 1 has a + # probability of 1 - ``corruption_level`` and 0 with + # ``corruption_level`` + self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level) * self.x + # Equation (2) + # note : y is stored as an attribute of the class so that it can be + # used later when stacking dAs. + self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b) + # Equation (3) + self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime) + # Equation (4) + # note : we sum over the size of a datapoint; if we are using minibatches, + # L will be a vector, with one entry per example in minibatch + self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) + # note : L is now a vector, where each element is the cross-entropy cost + # of the reconstruction of the corresponding example of the + # minibatch. We need to compute the average of all these to get + # the cost of the minibatch + self.cost = T.mean(self.L) + + self.params = [ self.W, self.b, self.b_prime ] + + class StackedDAA(DeepLayerwiseModel): + """Stacked denoising auto-encoder class (SdA) + + A stacked denoising autoencoder model is obtained by stacking several + dAs. The hidden layer of the dA at layer `i` becomes the input of + the dA at layer `i+1`. 
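The corruption, tied-weight reconstruction and cross-entropy cost defined by the DAA class above can be traced in a few lines of numpy (toy shapes and a toy minibatch, chosen only for illustration):

import numpy

def sigmoid(a):
    return 1.0 / (1.0 + numpy.exp(-a))

rng = numpy.random.RandomState(123)
n_visible, n_hidden, corruption_level = 8, 5, 0.1
bound = numpy.sqrt(6.0 / (n_visible + n_hidden))
W = rng.uniform(-bound, bound, size=(n_visible, n_hidden))
b = numpy.zeros(n_hidden)
b_prime = numpy.zeros(n_visible)

x = rng.binomial(1, 0.5, size=(4, n_visible)).astype('float64')  # toy minibatch

# corruption: keep each input with probability 1 - corruption_level
tilde_x = rng.binomial(1, 1.0 - corruption_level, size=x.shape) * x
y = sigmoid(numpy.dot(tilde_x, W) + b)        # hidden code
z = sigmoid(numpy.dot(y, W.T) + b_prime)      # reconstruction, tied weights W' = W.T
# per-example cross-entropy, then the minibatch mean (self.L and self.cost)
L = -numpy.sum(x * numpy.log(z) + (1 - x) * numpy.log(1 - z), axis=1)
print(L.mean())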
The first layer dA gets as input the input of + the SdA, and the hidden layer of the last dA represents the output. + Note that after pretraining, the SdA is dealt with as a normal MLP, + the dAs are only used to initialize the weights. + """ + + def __init__(self, n_ins, hidden_layers_sizes, n_outs, + corruption_levels, rng, ): + """ This class is made to support a variable number of layers. + + :param train_set_x: symbolic variable pointing to the training dataset + + :param train_set_y: symbolic variable pointing to the labels of the + training dataset + + :param n_ins: dimension of the input to the sdA + + :param n_layers_sizes: intermidiate layers size, must contain + at least one value + + :param n_outs: dimension of the output of the network + + :param corruption_levels: amount of corruption to use for each + layer + + :param rng: numpy random number generator used to draw initial weights + + :param pretrain_lr: learning rate used during pre-trainnig stage + + :param finetune_lr: learning rate used during finetune stage + """ + + self.sigmoid_layers = [] + self.daa_layers = [] + self.pretrain_functions = [] + self.params = [] + self.n_layers = len(hidden_layers_sizes) + + if len(hidden_layers_sizes) < 1 : + raiseException (' You must have at least one hidden layer ') + + theano_rng = RandomStreams(rng.randint(2**30)) + + # allocate symbolic variables for the data + index = T.lscalar() # index to a [mini]batch + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + + # The SdA is an MLP, for which all weights of intermidiate layers + # are shared with a different denoising autoencoders + # We will first construct the SdA as a deep multilayer perceptron, + # and when constructing each sigmoidal layer we also construct a + # denoising autoencoder that shares weights with that layer, and + # compile a training function for that denoising autoencoder + + for i in xrange( self.n_layers ): + # construct the sigmoidal layer + + sigmoid_layer = SigmoidalLayer(rng, + self.layers[-1].output if i else self.x, + hidden_layers_sizes[i-1] if i else n_ins, + hidden_layers_sizes[i]) + + daa_layer = DAA(corruption_level = corruption_levels[i], + input = sigmoid_layer.input, + W = sigmoid_layer.W, + b = sigmoid_layer.b) + + # add the layer to the + self.sigmoid_layers.append(sigmoid_layer) + self.daa_layers.append(daa_layer) + + # its arguably a philosophical question... 
+ # but we are going to only declare that the parameters of the + # sigmoid_layers are parameters of the StackedDAA + # the hidden-layer biases in the daa_layers are parameters of those + # daa_layers, but not the StackedDAA + self.params.extend(sigmoid_layer.params) + + # We now need to add a logistic layer on top of the MLP + self.logistic_regressor = LogisticRegression( + input = self.sigmoid_layers[-1].output, + n_in = hidden_layers_sizes[-1], + n_out = n_outs) + + self.params.extend(self.logLayer.params) + + def pretraining_functions(self, train_set_x, batch_size): + + # compiles update functions for each layer, and + # returns them as a list + # + # Construct a function that trains this dA + # compute gradients of layer parameters + gparams = T.grad(dA_layer.cost, dA_layer.params) + # compute the list of updates + updates = {} + for param, gparam in zip(dA_layer.params, gparams): + updates[param] = param - gparam * pretrain_lr + + # create a function that trains the dA + update_fn = theano.function([index], dA_layer.cost, \ + updates = updates, + givens = { + self.x : train_set_x[index*batch_size:(index+1)*batch_size]}) + # collect this function into a list + self.pretrain_functions += [update_fn] + + diff -r 6f606b359df3 -r a9af079892ce code_tutoriel/logistic_cg.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/logistic_cg.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,310 @@ +""" +This tutorial introduces logistic regression using Theano and conjugate +gradient descent. + +Logistic regression is a probabilistic, linear classifier. It is parametrized +by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is +done by projecting data points onto a set of hyperplanes, the distance to +which is used to determine a class membership probability. + +Mathematically, this can be written as: + +.. math:: + P(Y=i|x, W,b) &= softmax_i(W x + b) \\ + &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}} + + +The output of the model or prediction is then done by taking the argmax of +the vector whose i'th element is P(Y=i|x). + +.. math:: + + y_{pred} = argmax_i P(Y=i|x,W,b) + + +This tutorial presents a stochastic gradient descent optimization method +suitable for large datasets, and a conjugate gradient optimization method +that is suitable for smaller datasets. + + +References: + + - textbooks: "Pattern Recognition and Machine Learning" - + Christopher M. Bishop, section 4.3.2 + + +""" +__docformat__ = 'restructedtext en' + + +import numpy, time, cPickle, gzip + +import theano +import theano.tensor as T + + +class LogisticRegression(object): + """Multi-class Logistic Regression Class + + The logistic regression is fully described by a weight matrix :math:`W` + and bias vector :math:`b`. Classification is done by projecting data + points onto a set of hyperplanes, the distance to which is used to + determine a class membership probability. 
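The softmax that defines P(Y=i|x, W, b) in this logistic regression layer can be checked with a short numpy sketch (toy shapes, with the exponentials shifted for numerical stability):

import numpy

def softmax(a):
    # row-wise softmax_i(W x + b): exp / sum of exps
    e = numpy.exp(a - a.max(axis=1)[:, numpy.newaxis])
    return e / e.sum(axis=1)[:, numpy.newaxis]

rng = numpy.random.RandomState(0)
x = rng.randn(2, 3)              # 2 toy inputs of dimension 3
W = rng.randn(3, 4)              # 4 classes
b = numpy.zeros(4)

p_y_given_x = softmax(numpy.dot(x, W) + b)
y_pred = numpy.argmax(p_y_given_x, axis=1)
print(p_y_given_x.sum(axis=1))   # every row sums to 1
print(y_pred)                    # predicted class per example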
+ """ + + + + + def __init__(self, input, n_in, n_out): + """ Initialize the parameters of the logistic regression + + :type input: theano.tensor.TensorType + :param input: symbolic variable that describes the input of the + architecture ( one minibatch) + + :type n_in: int + :param n_in: number of input units, the dimension of the space in + which the datapoint lies + + :type n_out: int + :param n_out: number of output units, the dimension of the space in + which the target lies + + """ + + # initialize theta = (W,b) with 0s; W gets the shape (n_in, n_out), + # while b is a vector of n_out elements, making theta a vector of + # n_in*n_out + n_out elements + self.theta = theano.shared( value = numpy.zeros(n_in*n_out+n_out, dtype = theano.config.floatX) ) + # W is represented by the fisr n_in*n_out elements of theta + self.W = self.theta[0:n_in*n_out].reshape((n_in,n_out)) + # b is the rest (last n_out elements) + self.b = self.theta[n_in*n_out:n_in*n_out+n_out] + + + # compute vector of class-membership probabilities in symbolic form + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred=T.argmax(self.p_y_given_x, axis=1) + + + + + + def negative_log_likelihood(self, y): + """Return the negative log-likelihood of the prediction of this model + under a given target distribution. + + .. math:: + + \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = + \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ + \ell (\theta=\{W,b\}, \mathcal{D}) + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label + """ + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + + + + + def errors(self, y): + """Return a float representing the number of errors in the minibatch + over the total number of examples of the minibatch + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example + the correct label + """ + + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + + + + + + + +def cg_optimization_mnist( n_epochs=50, mnist_pkl_gz='mnist.pkl.gz' ): + """Demonstrate conjugate gradient optimization of a log-linear model + + This is demonstrated on MNIST. + + :type n_epochs: int + :param n_epochs: number of epochs to run the optimizer + + :type mnist_pkl_gz: string + :param mnist_pkl_gz: the path of the mnist training file from + http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz + + """ + ############# + # LOAD DATA # + ############# + print '... loading data' + + # Load the dataset + f = gzip.open(mnist_pkl_gz,'rb') + train_set, valid_set, test_set = cPickle.load(f) + f.close() + + def shared_dataset(data_xy): + """ Function that loads the dataset into shared variables + + The reason we store our dataset in shared variables is to allow + Theano to copy it into the GPU memory (when code is run on GPU). 
+ Since copying data into the GPU is slow, copying a minibatch everytime + is needed (the default behaviour if the data is not in a shared + variable) would lead to a large decrease in performance. + """ + data_x, data_y = data_xy + shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX)) + shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX)) + # When storing data on the GPU it has to be stored as floats + # therefore we will store the labels as ``floatX`` as well + # (``shared_y`` does exactly that). But during our computations + # we need them as ints (we use labels as index, and if they are + # floats it doesn't make sense) therefore instead of returning + # ``shared_y`` we will have to cast it to int. This little hack + # lets ous get around this issue + return shared_x, T.cast(shared_y, 'int32') + + + test_set_x, test_set_y = shared_dataset(test_set) + valid_set_x, valid_set_y = shared_dataset(valid_set) + train_set_x, train_set_y = shared_dataset(train_set) + + batch_size = 600 # size of the minibatch + + n_train_batches = train_set_x.value.shape[0] / batch_size + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size + + + ishape = (28,28) # this is the size of MNIST images + n_in = 28*28 # number of input units + n_out = 10 # number of output units + + + ###################### + # BUILD ACTUAL MODEL # + ###################### + print '... building the model' + + # allocate symbolic variables for the data + minibatch_offset = T.lscalar() # offset to the start of a [mini]batch + x = T.matrix() # the data is presented as rasterized images + y = T.ivector() # the labels are presented as 1D vector of + # [int] labels + + + # construct the logistic regression class + classifier = LogisticRegression( input=x, n_in=28*28, n_out=10) + + # the cost we minimize during training is the negative log likelihood of + # the model in symbolic format + cost = classifier.negative_log_likelihood(y).mean() + + # compile a theano function that computes the mistakes that are made by + # the model on a minibatch + test_model = theano.function([minibatch_offset], classifier.errors(y), + givens={ + x:test_set_x[minibatch_offset:minibatch_offset+batch_size], + y:test_set_y[minibatch_offset:minibatch_offset+batch_size]}) + + validate_model = theano.function([minibatch_offset],classifier.errors(y), + givens={ + x:valid_set_x[minibatch_offset:minibatch_offset+batch_size], + y:valid_set_y[minibatch_offset:minibatch_offset+batch_size]}) + + # compile a thenao function that returns the cost of a minibatch + batch_cost = theano.function([minibatch_offset], cost, + givens= { + x : train_set_x[minibatch_offset:minibatch_offset+batch_size], + y : train_set_y[minibatch_offset:minibatch_offset+batch_size]}) + + + + # compile a theano function that returns the gradient of the minibatch + # with respect to theta + batch_grad = theano.function([minibatch_offset], T.grad(cost,classifier.theta), + givens= { + x : train_set_x[minibatch_offset:minibatch_offset+batch_size], + y : train_set_y[minibatch_offset:minibatch_offset+batch_size]}) + + + # creates a function that computes the average cost on the training set + def train_fn(theta_value): + classifier.theta.value = theta_value + train_losses = [batch_cost(i*batch_size) for i in xrange(n_train_batches)] + return numpy.mean(train_losses) + + # creates a function that computes the average gradient of cost with + # respect to theta + def train_fn_grad(theta_value): + 
classifier.theta.value = theta_value + grad = batch_grad(0) + for i in xrange(1,n_train_batches): + grad += batch_grad(i*batch_size) + return grad/n_train_batches + + + validation_scores = [float('inf'), 0] + + # creates the validation function + def callback(theta_value): + classifier.theta.value = theta_value + #compute the validation loss + validation_losses = [validate_model(i*batch_size) for i in xrange(n_valid_batches)] + this_validation_loss = numpy.mean(validation_losses) + print('validation error %f %%' % (this_validation_loss*100.,)) + + # check if it is better then best validation score got until now + if this_validation_loss < validation_scores[0]: + # if so, replace the old one, and compute the score on the + # testing dataset + validation_scores[0] = this_validation_loss + test_loses = [test_model(i*batch_size) for i in xrange(n_test_batches)] + validation_scores[1] = numpy.mean(test_loses) + + ############### + # TRAIN MODEL # + ############### + + # using scipy conjugate gradient optimizer + import scipy.optimize + print ("Optimizing using scipy.optimize.fmin_cg...") + start_time = time.clock() + best_w_b = scipy.optimize.fmin_cg( + f = train_fn, + x0 = numpy.zeros((n_in+1)*n_out, dtype=x.dtype), + fprime = train_fn_grad, + callback = callback, + disp = 0, + maxiter = n_epochs) + end_time = time.clock() + print(('Optimization complete with best validation score of %f %%, with ' + 'test performance %f %%') % + (validation_scores[0]*100., validation_scores[1]*100.)) + + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + + +if __name__ == '__main__': + cg_optimization_mnist() + diff -r 6f606b359df3 -r a9af079892ce code_tutoriel/logistic_sgd.py --- a/code_tutoriel/logistic_sgd.py Wed Feb 10 11:15:04 2010 -0500 +++ b/code_tutoriel/logistic_sgd.py Mon Mar 29 17:42:44 2010 -0400 @@ -32,20 +32,14 @@ - textbooks: "Pattern Recognition and Machine Learning" - Christopher M. 
Bishop, section 4.3.2 - """ __docformat__ = 'restructedtext en' - -import numpy, cPickle, gzip - -import time +import numpy, time, cPickle, gzip import theano import theano.tensor as T -import theano.tensor.nnet - class LogisticRegression(object): """Multi-class Logistic Regression Class @@ -62,23 +56,26 @@ def __init__(self, input, n_in, n_out): """ Initialize the parameters of the logistic regression + :type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the - architecture (one minibatch) - + architecture (one minibatch) + + :type n_in: int :param n_in: number of input units, the dimension of the space in - which the datapoints lie + which the datapoints lie + :type n_out: int :param n_out: number of output units, the dimension of the space in - which the labels lie + which the labels lie """ # initialize with 0 the weights W as a matrix of shape (n_in, n_out) - self.W = theano.shared( value=numpy.zeros((n_in,n_out), - dtype = theano.config.floatX) ) + self.W = theano.shared(value=numpy.zeros((n_in,n_out), dtype = theano.config.floatX), + name='W') # initialize the baises b as a vector of n_out 0s - self.b = theano.shared( value=numpy.zeros((n_out,), - dtype = theano.config.floatX) ) + self.b = theano.shared(value=numpy.zeros((n_out,), dtype = theano.config.floatX), + name='b') # compute vector of class-membership probabilities in symbolic form @@ -88,6 +85,9 @@ # symbolic form self.y_pred=T.argmax(self.p_y_given_x, axis=1) + # parameters of the model + self.params = [self.W, self.b] + @@ -102,23 +102,30 @@ \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ \ell (\theta=\{W,b\}, \mathcal{D}) - + :type y: theano.tensor.TensorType :param y: corresponds to a vector that gives for each example the - :correct label + correct label Note: we use the mean instead of the sum so that - the learning rate is less dependent on the batch size + the learning rate is less dependent on the batch size """ + # y.shape[0] is (symbolically) the number of rows in y, i.e., number of examples (call it n) in the minibatch + # T.arange(y.shape[0]) is a symbolic vector which will contain [0,1,2,... n-1] + # T.log(self.p_y_given_x) is a matrix of Log-Probabilities (call it LP) with one row per example and one column per class + # LP[T.arange(y.shape[0]),y] is a vector v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., LP[n-1,y[n-1]]] + # and T.mean(LP[T.arange(y.shape[0]),y]) is the mean (across minibatch examples) of the elements in v, + # i.e., the mean log-likelihood across the minibatch. return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) - - - def errors(self, y): """Return a float representing the number of errors in the minibatch over the total number of examples of the minibatch ; zero one loss over the size of the minibatch + + :type y: theano.tensor.TensorType + :param y: corresponds to a vector that gives for each example the + correct label """ # check if y has same dimension of y_pred @@ -134,72 +141,103 @@ raise NotImplementedError() +def load_data(dataset): + ''' Loads the dataset + + :type dataset: string + :param dataset: the path to the dataset (here MNIST) + ''' + + ############# + # LOAD DATA # + ############# + print '... 
loading data' + + # Load the dataset + f = gzip.open(dataset,'rb') + train_set, valid_set, test_set = cPickle.load(f) + f.close() + + + def shared_dataset(data_xy): + """ Function that loads the dataset into shared variables + + The reason we store our dataset in shared variables is to allow + Theano to copy it into the GPU memory (when code is run on GPU). + Since copying data into the GPU is slow, copying a minibatch everytime + is needed (the default behaviour if the data is not in a shared + variable) would lead to a large decrease in performance. + """ + data_x, data_y = data_xy + shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX)) + shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX)) + # When storing data on the GPU it has to be stored as floats + # therefore we will store the labels as ``floatX`` as well + # (``shared_y`` does exactly that). But during our computations + # we need them as ints (we use labels as index, and if they are + # floats it doesn't make sense) therefore instead of returning + # ``shared_y`` we will have to cast it to int. This little hack + # lets ous get around this issue + return shared_x, T.cast(shared_y, 'int32') + + test_set_x, test_set_y = shared_dataset(test_set) + valid_set_x, valid_set_y = shared_dataset(valid_set) + train_set_x, train_set_y = shared_dataset(train_set) + + rval = [(train_set_x, train_set_y), (valid_set_x,valid_set_y), (test_set_x, test_set_y)] + return rval -def sgd_optimization_mnist( learning_rate=0.01, n_iter=100): + +def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, dataset='mnist.pkl.gz'): """ Demonstrate stochastic gradient descent optimization of a log-linear model This is demonstrated on MNIST. + :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic - gradient + gradient) - :param n_iter: maximal number of iterations ot run the optimizer + :type n_epochs: int + :param n_epochs: maximal number of epochs to run the optimizer + + :type dataset: string + :param dataset: the path of the MNIST dataset file from + http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz """ + datasets = load_data(dataset) - # Load the dataset - f = gzip.open('mnist.pkl.gz','rb') - train_set, valid_set, test_set = cPickle.load(f) - f.close() - - # make minibatches of size 20 - batch_size = 20 # sized of the minibatch + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x , test_set_y = datasets[2] - # Dealing with the training set - # get the list of training images (x) and their labels (y) - (train_set_x, train_set_y) = train_set - # initialize the list of training minibatches with empty list - train_batches = [] - for i in xrange(0, len(train_set_x), batch_size): - # add to the list of minibatches the minibatch starting at - # position i, ending at position i+batch_size - # a minibatch is a pair ; the first element of the pair is a list - # of datapoints, the second element is the list of corresponding - # labels - train_batches = train_batches + \ - [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])] + batch_size = 600 # size of the minibatch - # Dealing with the validation set - (valid_set_x, valid_set_y) = valid_set - # initialize the list of validation minibatches - valid_batches = [] - for i in xrange(0, len(valid_set_x), batch_size): - valid_batches = valid_batches + \ - [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])] - - # Dealing with the testing set - (test_set_x, 
test_set_y) = test_set - # initialize the list of testing minibatches - test_batches = [] - for i in xrange(0, len(test_set_x), batch_size): - test_batches = test_batches + \ - [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])] + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size - ishape = (28,28) # this is the size of MNIST images + ###################### + # BUILD ACTUAL MODEL # + ###################### + print '... building the model' + # allocate symbolic variables for the data - x = T.fmatrix() # the data is presented as rasterized images - y = T.lvector() # the labels are presented as 1D vector of - # [long int] labels + index = T.lscalar() # index to a [mini]batch + x = T.matrix('x') # the data is presented as rasterized images + y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels # construct the logistic regression class - classifier = LogisticRegression( \ - input=x.reshape((batch_size,28*28)), n_in=28*28, n_out=10) + # Each MNIST image has size 28*28 + classifier = LogisticRegression( input=x, n_in=28*28, n_out=10) # the cost we minimize during training is the negative log likelihood of # the model in symbolic format @@ -207,11 +245,21 @@ # compiling a Theano function that computes the mistakes that are made by # the model on a minibatch - test_model = theano.function([x,y], classifier.errors(y)) + test_model = theano.function(inputs = [index], + outputs = classifier.errors(y), + givens={ + x:test_set_x[index*batch_size:(index+1)*batch_size], + y:test_set_y[index*batch_size:(index+1)*batch_size]}) + + validate_model = theano.function( inputs = [index], + outputs = classifier.errors(y), + givens={ + x:valid_set_x[index*batch_size:(index+1)*batch_size], + y:valid_set_y[index*batch_size:(index+1)*batch_size]}) # compute the gradient of cost with respect to theta = (W,b) - g_W = T.grad(cost, classifier.W) - g_b = T.grad(cost, classifier.b) + g_W = T.grad(cost = cost, wrt = classifier.W) + g_b = T.grad(cost = cost, wrt = classifier.b) # specify how to update the parameters of the model as a dictionary updates ={classifier.W: classifier.W - learning_rate*g_W,\ @@ -220,17 +268,25 @@ # compiling a Theano function `train_model` that returns the cost, but in # the same time updates the parameter of the model based on the rules # defined in `updates` - train_model = theano.function([x, y], cost, updates = updates ) + train_model = theano.function(inputs = [index], + outputs = cost, + updates = updates, + givens={ + x:train_set_x[index*batch_size:(index+1)*batch_size], + y:train_set_y[index*batch_size:(index+1)*batch_size]}) - n_minibatches = len(train_batches) # number of minibatchers - + ############### + # TRAIN MODEL # + ############### + print '... training the model' # early-stopping parameters patience = 5000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant - validation_frequency = n_minibatches # go through this many + validation_frequency = min(n_train_batches, patience/2) + # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch @@ -239,29 +295,24 @@ best_validation_loss = float('inf') test_score = 0. 
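For reference, one SGD step of this logistic regression written out in plain numpy, with the softmax-NLL gradients that T.grad derives symbolically computed by hand (toy data, zero-initialized parameters):

import numpy

def softmax(a):
    e = numpy.exp(a - a.max(axis=1)[:, numpy.newaxis])
    return e / e.sum(axis=1)[:, numpy.newaxis]

rng = numpy.random.RandomState(0)
n, n_in, n_out, learning_rate = 6, 5, 3, 0.13
x = rng.randn(n, n_in)
y = rng.randint(0, n_out, size=n)
W = numpy.zeros((n_in, n_out))
b = numpy.zeros(n_out)

# forward pass and mean negative log-likelihood (the `cost` above)
p = softmax(numpy.dot(x, W) + b)
nll = -numpy.mean(numpy.log(p)[numpy.arange(n), y])

# gradients of the mean NLL with respect to W and b
one_hot = numpy.zeros((n, n_out))
one_hot[numpy.arange(n), y] = 1.0
delta = (p - one_hot) / n
g_W = numpy.dot(x.T, delta)
g_b = delta.sum(axis=0)

# the update rule encoded in the `updates` dictionary
W = W - learning_rate * g_W
b = b - learning_rate * g_b
print(nll)   # log(3) ~ 1.0986 before the first update, with zero-initialized W, b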
start_time = time.clock() - # have a maximum of `n_iter` iterations through the entire dataset - for iter in xrange(n_iter* n_minibatches): - # get epoch and minibatch index - epoch = iter / n_minibatches - minibatch_index = iter % n_minibatches + done_looping = False + epoch = 0 + while (epoch < n_epochs) and (not done_looping): + epoch = epoch + 1 + for minibatch_index in xrange(n_train_batches): - # get the minibatches corresponding to `iter` modulo - # `len(train_batches)` - x,y = train_batches[ minibatch_index ] - cost_ij = train_model(x,y) + minibatch_avg_cost = train_model(minibatch_index) + # iteration number + iter = epoch * n_train_batches + minibatch_index if (iter+1) % validation_frequency == 0: # compute zero-one loss on validation set - this_validation_loss = 0. - for x,y in valid_batches: - # sum up the errors for each minibatch - this_validation_loss += test_model(x,y) - # get the average by dividing with the number of minibatches - this_validation_loss /= len(valid_batches) + validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] + this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % \ - (epoch, minibatch_index+1,n_minibatches, \ + (epoch, minibatch_index+1,n_train_batches, \ this_validation_loss*100.)) @@ -275,15 +326,15 @@ best_validation_loss = this_validation_loss # test it on the test set - test_score = 0. - for x,y in test_batches: - test_score += test_model(x,y) - test_score /= len(test_batches) + test_losses = [test_model(i) for i in xrange(n_test_batches)] + test_score = numpy.mean(test_losses) + print((' epoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % \ - (epoch, minibatch_index+1, n_minibatches,test_score*100.)) + (epoch, minibatch_index+1, n_train_batches,test_score*100.)) if patience <= iter : + done_looping = True break end_time = time.clock() @@ -292,12 +343,6 @@ (best_validation_loss * 100., test_score*100.)) print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) - - - - - - if __name__ == '__main__': sgd_optimization_mnist() diff -r 6f606b359df3 -r a9af079892ce code_tutoriel/mlp.py --- a/code_tutoriel/mlp.py Wed Feb 10 11:15:04 2010 -0500 +++ b/code_tutoriel/mlp.py Mon Mar 29 17:42:44 2010 -0400 @@ -17,22 +17,65 @@ - textbooks: "Pattern Recognition and Machine Learning" - Christopher M. Bishop, section 5 -TODO: recommended preprocessing, lr ranges, regularization ranges (explain - to do lr first, then add regularization) - """ __docformat__ = 'restructedtext en' -import numpy, cPickle, gzip - +import numpy, time, cPickle, gzip import theano import theano.tensor as T -import time + +from logistic_sgd import LogisticRegression, load_data + + +class HiddenLayer(object): + def __init__(self, rng, input, n_in, n_out, activation = T.tanh): + """ + Typical hidden layer of a MLP: units are fully-connected and have + sigmoidal activation function. Weight matrix W is of shape (n_in,n_out) + and the bias vector b is of shape (n_out,). 
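The weight initialization used by these hidden layers (and by SigmoidalLayer earlier) draws uniformly from plus or minus sqrt(6/(n_in+n_out)), with biases starting at zero; a quick numpy check of that range:

import numpy

rng = numpy.random.RandomState(1234)
n_in, n_out = 784, 500

bound = numpy.sqrt(6.0 / (n_in + n_out))
W_values = rng.uniform(low=-bound, high=bound, size=(n_in, n_out))
b_values = numpy.zeros(n_out)

print(bound)                          # about 0.068 for 784 -> 500
print(abs(W_values).max() <= bound)   # True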
+ + NOTE : The nonlinearity used here is tanh + + Hidden unit activation is given by: tanh(dot(input,W) + b) + + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + + :type input: theano.tensor.dmatrix + :param input: a symbolic tensor of shape (n_examples, n_in) + + :type n_in: int + :param n_in: dimensionality of input -import theano.tensor.nnet + :type n_out: int + :param n_out: number of hidden units + + :type activation: theano.Op or function + :param activation: Non linearity to be applied in the hidden + layer + """ + self.input = input + + # `W` is initialized with `W_values` which is uniformely sampled + # from -6./sqrt(n_in+n_hidden) and 6./sqrt(n_in+n_hidden) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + W_values = numpy.asarray( rng.uniform( \ + low = -numpy.sqrt(6./(n_in+n_out)), \ + high = numpy.sqrt(6./(n_in+n_out)), \ + size = (n_in, n_out)), dtype = theano.config.floatX) + self.W = theano.shared(value = W_values) + + b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + self.output = activation(T.dot(input, self.W) + self.b) + # parameters of the model + self.params = [self.W, self.b] + class MLP(object): """Multi-Layer Perceptron Class @@ -40,188 +83,132 @@ A multilayer perceptron is a feedforward artificial neural network model that has one layer or more of hidden units and nonlinear activations. Intermidiate layers usually have as activation function thanh or the - sigmoid function while the top layer is a softamx layer. + sigmoid function (defined here by a ``SigmoidalLayer`` class) while the + top layer is a softamx layer (defined here by a ``LogisticRegression`` + class). """ - def __init__(self, input, n_in, n_hidden, n_out): + def __init__(self, rng, input, n_in, n_hidden, n_out): """Initialize the parameters for the multilayer perceptron + :type rng: numpy.random.RandomState + :param rng: a random number generator used to initialize weights + + :type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the architecture (one minibatch) + :type n_in: int :param n_in: number of input units, the dimension of the space in which the datapoints lie + :type n_hidden: int :param n_hidden: number of hidden units + :type n_out: int :param n_out: number of output units, the dimension of the space in which the labels lie """ - # initialize the parameters theta = (W1,b1,W2,b2) ; note that this - # example contains only one hidden layer, but one can have as many - # layers as he/she wishes, making the network deeper. 
The only - # problem making the network deep this way is during learning, - # backpropagation being unable to move the network from the starting - # point towards; this is where pre-training helps, giving a good - # starting point for backpropagation, but more about this in the - # other tutorials - - # `W1` is initialized with `W1_values` which is uniformely sampled - # from -6./sqrt(n_in+n_hidden) and 6./sqrt(n_in+n_hidden) - # the output of uniform if converted using asarray to dtype - # theano.config.floatX so that the code is runable on GPU - W1_values = numpy.asarray( numpy.random.uniform( \ - low = -numpy.sqrt(6./(n_in+n_hidden)), \ - high = numpy.sqrt(6./(n_in+n_hidden)), \ - size = (n_in, n_hidden)), dtype = theano.config.floatX) - # `W2` is initialized with `W2_values` which is uniformely sampled - # from -6./sqrt(n_hidden+n_out) and 6./sqrt(n_hidden+n_out) - # the output of uniform if converted using asarray to dtype - # theano.config.floatX so that the code is runable on GPU - W2_values = numpy.asarray( numpy.random.uniform( - low = -numpy.sqrt(6./(n_hidden+n_out)), \ - high= numpy.sqrt(6./(n_hidden+n_out)),\ - size= (n_hidden, n_out)), dtype = theano.config.floatX) + # Since we are dealing with a one hidden layer MLP, this will + # translate into a TanhLayer connected to the LogisticRegression + # layer; this can be replaced by a SigmoidalLayer, or a layer + # implementing any other nonlinearity + self.hiddenLayer = HiddenLayer(rng = rng, input = input, + n_in = n_in, n_out = n_hidden, + activation = T.tanh) - self.W1 = theano.shared( value = W1_values ) - self.b1 = theano.shared( value = numpy.zeros((n_hidden,), - dtype= theano.config.floatX)) - self.W2 = theano.shared( value = W2_values ) - self.b2 = theano.shared( value = numpy.zeros((n_out,), - dtype= theano.config.floatX)) + # The logistic regression layer gets as input the hidden units + # of the hidden layer + self.logRegressionLayer = LogisticRegression( + input = self.hiddenLayer.output, + n_in = n_hidden, + n_out = n_out) - # symbolic expression computing the values of the hidden layer - self.hidden = T.tanh(T.dot(input, self.W1)+ self.b1) - - # symbolic expression computing the values of the top layer - self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden, self.W2)+self.b2) - - # compute prediction as class whose probability is maximal in - # symbolic form - self.y_pred = T.argmax( self.p_y_given_x, axis =1) - # L1 norm ; one regularization option is to enforce L1 norm to # be small - self.L1 = abs(self.W1).sum() + abs(self.W2).sum() + self.L1 = abs(self.hiddenLayer.W).sum() \ + + abs(self.logRegressionLayer.W).sum() # square of L2 norm ; one regularization option is to enforce # square of L2 norm to be small - self.L2_sqr = (self.W1**2).sum() + (self.W2**2).sum() - - - - def negative_log_likelihood(self, y): - """Return the mean of the negative log-likelihood of the prediction - of this model under a given target distribution. - - .. 
math:: + self.L2_sqr = (self.hiddenLayer.W**2).sum() \ + + (self.logRegressionLayer.W**2).sum() - \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = - \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ - \ell (\theta=\{W,b\}, \mathcal{D}) - + # negative log likelihood of the MLP is given by the negative + # log likelihood of the output of the model, computed in the + # logistic regression layer + self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood + # same holds for the function computing the number of errors + self.errors = self.logRegressionLayer.errors - :param y: corresponds to a vector that gives for each example the - :correct label - """ - return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) - + # the parameters of the model are the parameters of the two layer it is + # made out of + self.params = self.hiddenLayer.params + self.logRegressionLayer.params - - def errors(self, y): - """Return a float representing the number of errors in the minibatch - over the total number of examples of the minibatch - """ - - # check if y has same dimension of y_pred - if y.ndim != self.y_pred.ndim: - raise TypeError('y should have the same shape as self.y_pred', - ('y', target.type, 'y_pred', self.y_pred.type)) - # check if y is of the correct datatype - if y.dtype.startswith('int'): - # the T.neq operator returns a vector of 0s and 1s, where 1 - # represents a mistake in prediction - return T.mean(T.neq(self.y_pred, y)) - else: - raise NotImplementedError() - - - -def sgd_optimization_mnist( learning_rate=0.01, L1_reg = 0.00, \ - L2_reg = 0.0001, n_iter=100): +def test_mlp( learning_rate=0.01, L1_reg = 0.00, L2_reg = 0.0001, n_epochs=1000, + dataset = 'mnist.pkl.gz'): """ Demonstrate stochastic gradient descent optimization for a multilayer perceptron This is demonstrated on MNIST. 
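The L1 and L2_sqr penalties above enter the training objective as cost = NLL + L1_reg*L1 + L2_reg*L2_sqr; a small numpy sketch with stand-in weight matrices:

import numpy

rng = numpy.random.RandomState(0)
W_hidden = 0.01 * rng.randn(784, 500)   # stand-in for hiddenLayer.W
W_out = 0.01 * rng.randn(500, 10)       # stand-in for logRegressionLayer.W
L1_reg, L2_reg = 0.00, 0.0001

L1 = numpy.abs(W_hidden).sum() + numpy.abs(W_out).sum()
L2_sqr = (W_hidden ** 2).sum() + (W_out ** 2).sum()

nll = 2.3   # placeholder for the data term returned by negative_log_likelihood
cost = nll + L1_reg * L1 + L2_reg * L2_sqr
print(cost)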
+ :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient + :type L1_reg: float :param L1_reg: L1-norm's weight when added to the cost (see regularization) + :type L2_reg: float :param L2_reg: L2-norm's weight when added to the cost (see regularization) - :param n_iter: maximal number of iterations ot run the optimizer + :type n_epochs: int + :param n_epochs: maximal number of epochs to run the optimizer + + :type dataset: string + :param dataset: the path of the MNIST dataset file from + http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz + """ - - # Load the dataset - f = gzip.open('mnist.pkl.gz','rb') - train_set, valid_set, test_set = cPickle.load(f) - f.close() - - # make minibatches of size 20 - batch_size = 20 # sized of the minibatch + datasets = load_data(dataset) - # Dealing with the training set - # get the list of training images (x) and their labels (y) - (train_set_x, train_set_y) = train_set - # initialize the list of training minibatches with empty list - train_batches = [] - for i in xrange(0, len(train_set_x), batch_size): - # add to the list of minibatches the minibatch starting at - # position i, ending at position i+batch_size - # a minibatch is a pair ; the first element of the pair is a list - # of datapoints, the second element is the list of corresponding - # labels - train_batches = train_batches + \ - [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])] + train_set_x, train_set_y = datasets[0] + valid_set_x, valid_set_y = datasets[1] + test_set_x , test_set_y = datasets[2] - # Dealing with the validation set - (valid_set_x, valid_set_y) = valid_set - # initialize the list of validation minibatches - valid_batches = [] - for i in xrange(0, len(valid_set_x), batch_size): - valid_batches = valid_batches + \ - [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])] - - # Dealing with the testing set - (test_set_x, test_set_y) = test_set - # initialize the list of testing minibatches - test_batches = [] - for i in xrange(0, len(test_set_x), batch_size): - test_batches = test_batches + \ - [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])] - ishape = (28,28) # this is the size of MNIST images + batch_size = 20 # size of the minibatch + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + n_valid_batches = valid_set_x.value.shape[0] / batch_size + n_test_batches = test_set_x.value.shape[0] / batch_size + + ###################### + # BUILD ACTUAL MODEL # + ###################### + print '... 
building the model' # allocate symbolic variables for the data - x = T.fmatrix() # the data is presented as rasterized images - y = T.lvector() # the labels are presented as 1D vector of - # [long int] labels + index = T.lscalar() # index to a [mini]batch + x = T.matrix('x') # the data is presented as rasterized images + y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels - # construct the logistic regression class - classifier = MLP( input=x.reshape((batch_size,28*28)),\ - n_in=28*28, n_hidden = 500, n_out=10) + rng = numpy.random.RandomState(1234) + + # construct the MLP class + classifier = MLP( rng = rng, input=x, n_in=28*28, n_hidden = 500, n_out=10) # the cost we minimize during training is the negative log likelihood of # the model plus the regularization terms (L1 and L2); cost is expressed @@ -230,36 +217,59 @@ + L1_reg * classifier.L1 \ + L2_reg * classifier.L2_sqr - # compiling a theano function that computes the mistakes that are made by - # the model on a minibatch - test_model = theano.function([x,y], classifier.errors(y)) + # compiling a Theano function that computes the mistakes that are made + # by the model on a minibatch + test_model = theano.function(inputs = [index], + outputs = classifier.errors(y), + givens={ + x:test_set_x[index*batch_size:(index+1)*batch_size], + y:test_set_y[index*batch_size:(index+1)*batch_size]}) - # compute the gradient of cost with respect to theta = (W1, b1, W2, b2) - g_W1 = T.grad(cost, classifier.W1) - g_b1 = T.grad(cost, classifier.b1) - g_W2 = T.grad(cost, classifier.W2) - g_b2 = T.grad(cost, classifier.b2) + validate_model = theano.function(inputs = [index], + outputs = classifier.errors(y), + givens={ + x:valid_set_x[index*batch_size:(index+1)*batch_size], + y:valid_set_y[index*batch_size:(index+1)*batch_size]}) + + # compute the gradient of cost with respect to theta (sotred in params) + # the resulting gradients will be stored in a list gparams + gparams = [] + for param in classifier.params: + gparam = T.grad(cost, param) + gparams.append(gparam) + # specify how to update the parameters of the model as a dictionary - updates = \ - { classifier.W1: classifier.W1 - learning_rate*g_W1 \ - , classifier.b1: classifier.b1 - learning_rate*g_b1 \ - , classifier.W2: classifier.W2 - learning_rate*g_W2 \ - , classifier.b2: classifier.b2 - learning_rate*g_b2 } + updates = {} + # given two list the zip A = [ a1,a2,a3,a4] and B = [b1,b2,b3,b4] of + # same length, zip generates a list C of same size, where each element + # is a pair formed from the two lists : + # C = [ (a1,b1), (a2,b2), (a3,b3) , (a4,b4) ] + for param, gparam in zip(classifier.params, gparams): + updates[param] = param - learning_rate*gparam - # compiling a theano function `train_model` that returns the cost, but in - # the same time updates the parameter of the model based on the rules + # compiling a Theano function `train_model` that returns the cost, but + # in the same time updates the parameter of the model based on the rules # defined in `updates` - train_model = theano.function([x, y], cost, updates = updates ) - n_minibatches = len(train_batches) - + train_model =theano.function( inputs = [index], outputs = cost, + updates = updates, + givens={ + x:train_set_x[index*batch_size:(index+1)*batch_size], + y:train_set_y[index*batch_size:(index+1)*batch_size]}) + + ############### + # TRAIN MODEL # + ############### + print '... 
training' + # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant - validation_frequency = n_minibatches # go through this many + validation_frequency = min(n_train_batches,patience/2) + # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch @@ -270,56 +280,49 @@ best_iter = 0 test_score = 0. start_time = time.clock() - # have a maximum of `n_iter` iterations through the entire dataset - for iter in xrange(n_iter* n_minibatches): + + epoch = 0 + done_looping = False - # get epoch and minibatch index - epoch = iter / n_minibatches - minibatch_index = iter % n_minibatches + while (epoch < n_epochs) and (not done_looping): + epoch = epoch + 1 + for minibatch_index in xrange(n_train_batches): - # get the minibatches corresponding to `iter` modulo - # `len(train_batches)` - x,y = train_batches[ minibatch_index ] - cost_ij = train_model(x,y) + minibatch_avg_cost = train_model(minibatch_index) + # iteration number + iter = epoch * n_train_batches + minibatch_index if (iter+1) % validation_frequency == 0: # compute zero-one loss on validation set - this_validation_loss = 0. - for x,y in valid_batches: - # sum up the errors for each minibatch - this_validation_loss += test_model(x,y) - # get the average by dividing with the number of minibatches - this_validation_loss /= len(valid_batches) + validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] + this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % \ - (epoch, minibatch_index+1, n_minibatches, \ - this_validation_loss*100.)) + (epoch, minibatch_index+1,n_train_batches, \ + this_validation_loss*100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: - #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold : patience = max(patience, iter * patience_increase) - # save best validation score and iteration number best_validation_loss = this_validation_loss - best_iter = iter - # test it on the test set - test_score = 0. - for x,y in test_batches: - test_score += test_model(x,y) - test_score /= len(test_batches) - print((' epoch %i, minibatch %i/%i, test error of best ' - 'model %f %%') % - (epoch, minibatch_index+1, n_minibatches, - test_score*100.)) + + test_losses = [test_model(i) for i in xrange(n_test_batches)] + test_score = numpy.mean(test_losses) + + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % \ + (epoch, minibatch_index+1, n_train_batches,test_score*100.)) if patience <= iter : - break + done_looping = True + break + end_time = time.clock() print(('Optimization complete. Best validation score of %f %% ' @@ -329,5 +332,5 @@ if __name__ == '__main__': - sgd_optimization_mnist() + test_mlp() diff -r 6f606b359df3 -r a9af079892ce code_tutoriel/rbm.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/rbm.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,360 @@ +"""This tutorial introduces restricted boltzmann machines (RBM) using Theano. + +Boltzmann Machines (BMs) are a particular form of energy-based model which +contain hidden variables. 
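For an RBM with binary units, the free energy of a visible configuration v is
F(v) = -dot(v, vbias) - sum(log(1 + exp(dot(v, W) + hbias))), which is what the
free_energy method below computes symbolically. A minimal numpy sketch of the
same quantity, handy for checking the symbolic version on small arrays (the
array names here are illustrative):

import numpy

def free_energy_np(v, W, hbias, vbias):
    # v: (batch, n_visible), W: (n_visible, n_hidden)
    wx_b = numpy.dot(v, W) + hbias                       # (batch, n_hidden)
    vbias_term = numpy.dot(v, vbias).sum()               # summed over the batch
    hidden_term = numpy.log(1. + numpy.exp(wx_b)).sum()  # softplus, summed
    return -hidden_term - vbias_term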
Restricted Boltzmann Machines further restrict BMs +to those without visible-visible and hidden-hidden connections. +""" + + +import numpy, time, cPickle, gzip, PIL.Image + +import theano +import theano.tensor as T +import os + +from theano.tensor.shared_randomstreams import RandomStreams + +from utils import tile_raster_images +from logistic_sgd import load_data + +class RBM(object): + """Restricted Boltzmann Machine (RBM) """ + def __init__(self, input=None, n_visible=784, n_hidden=500, \ + W = None, hbias = None, vbias = None, numpy_rng = None, + theano_rng = None): + """ + RBM constructor. Defines the parameters of the model along with + basic operations for inferring hidden from visible (and vice-versa), + as well as for performing CD updates. + + :param input: None for standalone RBMs or symbolic variable if RBM is + part of a larger graph. + + :param n_visible: number of visible units + + :param n_hidden: number of hidden units + + :param W: None for standalone RBMs or symbolic variable pointing to a + shared weight matrix in case RBM is part of a DBN network; in a DBN, + the weights are shared between RBMs and layers of a MLP + + :param hbias: None for standalone RBMs or symbolic variable pointing + to a shared hidden units bias vector in case RBM is part of a + different network + + :param vbias: None for standalone RBMs or a symbolic variable + pointing to a shared visible units bias + """ + + self.n_visible = n_visible + self.n_hidden = n_hidden + + + if W is None : + # W is initialized with `initial_W` which is uniformely sampled + # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + initial_W = numpy.asarray( numpy.random.uniform( + low = -numpy.sqrt(6./(n_hidden+n_visible)), + high = numpy.sqrt(6./(n_hidden+n_visible)), + size = (n_visible, n_hidden)), + dtype = theano.config.floatX) + # theano shared variables for weights and biases + W = theano.shared(value = initial_W, name = 'W') + + if hbias is None : + # create shared variable for hidden units bias + hbias = theano.shared(value = numpy.zeros(n_hidden, + dtype = theano.config.floatX), name='hbias') + + if vbias is None : + # create shared variable for visible units bias + vbias = theano.shared(value =numpy.zeros(n_visible, + dtype = theano.config.floatX),name='vbias') + + if numpy_rng is None: + # create a number generator + numpy_rng = numpy.random.RandomState(1234) + + if theano_rng is None : + theano_rng = RandomStreams(numpy_rng.randint(2**30)) + + + # initialize input layer for standalone RBM or layer0 of DBN + self.input = input if input else T.dmatrix('input') + + self.W = W + self.hbias = hbias + self.vbias = vbias + self.theano_rng = theano_rng + # **** WARNING: It is not a good idea to put things in this list + # other than shared variables created in this function. 
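    # Note: the updates dictionary built in cd() maps every entry of this list
    # to its new value, and theano.function only accepts shared variables as
    # the keys of its `updates` argument, so any non-shared entry placed here
    # would break that call.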
+ self.params = [self.W, self.hbias, self.vbias] + self.batch_size = self.input.shape[0] + + def free_energy(self, v_sample): + ''' Function to compute the free energy ''' + wx_b = T.dot(v_sample, self.W) + self.hbias + vbias_term = T.sum(T.dot(v_sample, self.vbias)) + hidden_term = T.sum(T.log(1+T.exp(wx_b))) + return -hidden_term - vbias_term + + def sample_h_given_v(self, v0_sample): + ''' This function infers state of hidden units given visible units ''' + # compute the activation of the hidden units given a sample of the visibles + h1_mean = T.nnet.sigmoid(T.dot(v0_sample, self.W) + self.hbias) + # get a sample of the hiddens given their activation + h1_sample = self.theano_rng.binomial(size = h1_mean.shape, n = 1, prob = h1_mean) + return [h1_mean, h1_sample] + + def sample_v_given_h(self, h0_sample): + ''' This function infers state of visible units given hidden units ''' + # compute the activation of the visible given the hidden sample + v1_mean = T.nnet.sigmoid(T.dot(h0_sample, self.W.T) + self.vbias) + # get a sample of the visible given their activation + v1_sample = self.theano_rng.binomial(size = v1_mean.shape,n = 1,prob = v1_mean) + return [v1_mean, v1_sample] + + def gibbs_hvh(self, h0_sample): + ''' This function implements one step of Gibbs sampling, + starting from the hidden state''' + v1_mean, v1_sample = self.sample_v_given_h(h0_sample) + h1_mean, h1_sample = self.sample_h_given_v(v1_sample) + return [v1_mean, v1_sample, h1_mean, h1_sample] + + def gibbs_vhv(self, v0_sample): + ''' This function implements one step of Gibbs sampling, + starting from the visible state''' + h1_mean, h1_sample = self.sample_h_given_v(v0_sample) + v1_mean, v1_sample = self.sample_v_given_h(h1_sample) + return [h1_mean, h1_sample, v1_mean, v1_sample] + + def cd(self, lr = 0.1, persistent=None): + """ + This functions implements one step of CD-1 or PCD-1 + + :param lr: learning rate used to train the RBM + :param persistent: None for CD. For PCD, shared variable containing old state + of Gibbs chain. This must be a shared variable of size (batch size, number of + hidden units). + + Returns the updates dictionary. The dictionary contains the update rules for weights + and biases but also an update of the shared variable used to store the persistent + chain, if one is used. 
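The statistics accumulated below can also be written out in plain numpy; a
minimal sketch of one CD-1 step for binary units (array names are illustrative,
and the sampling uses numpy's generator rather than the Theano RandomStreams):

import numpy

def sigmoid(x):
    return 1.0 / (1.0 + numpy.exp(-x))

def cd1_gradients(v0, W, hbias, vbias, rng=numpy.random):
    # positive phase
    ph_mean = sigmoid(numpy.dot(v0, W) + hbias)
    ph_sample = rng.binomial(1, ph_mean)
    # negative phase: one step of Gibbs sampling starting from ph_sample
    nv_mean = sigmoid(numpy.dot(ph_sample, W.T) + vbias)
    nv_sample = rng.binomial(1, nv_mean)
    nh_mean = sigmoid(numpy.dot(nv_sample, W) + hbias)
    # same gradient estimates as in cd() below
    batch_size = v0.shape[0]
    g_W = (numpy.dot(ph_mean.T, v0) - numpy.dot(nh_mean.T, nv_mean)) / batch_size
    g_hbias = (ph_mean - nh_mean).mean(axis=0)
    g_vbias = (v0 - nv_mean).mean(axis=0)
    return g_W.T, g_hbias, g_vbias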
+ """ + + # compute positive phase + ph_mean, ph_sample = self.sample_h_given_v(self.input) + + # decide how to initialize persistent chain: + # for CD, we use the newly generate hidden sample + # for PCD, we initialize from the old state of the chain + if persistent is None: + chain_start = ph_sample + else: + chain_start = persistent + + # perform actual negative phase + [nv_mean, nv_sample, nh_mean, nh_sample] = self.gibbs_hvh(chain_start) + + # determine gradients on RBM parameters + g_vbias = T.sum( self.input - nv_mean, axis = 0)/self.batch_size + g_hbias = T.sum( ph_mean - nh_mean, axis = 0)/self.batch_size + g_W = T.dot(ph_mean.T, self.input )/ self.batch_size - \ + T.dot(nh_mean.T, nv_mean )/ self.batch_size + + gparams = [g_W.T, g_hbias, g_vbias] + + # constructs the update dictionary + updates = {} + for gparam, param in zip(gparams, self.params): + updates[param] = param + gparam * lr + + if persistent: + # Note that this works only if persistent is a shared variable + updates[persistent] = T.cast(nh_sample, dtype=theano.config.floatX) + # pseudo-likelihood is a better proxy for PCD + cost = self.get_pseudo_likelihood_cost(updates) + else: + # reconstruction cross-entropy is a better proxy for CD + cost = self.get_reconstruction_cost(updates, nv_mean) + + return cost, updates + + def get_pseudo_likelihood_cost(self, updates): + """Stochastic approximation to the pseudo-likelihood""" + + # index of bit i in expression p(x_i | x_{\i}) + bit_i_idx = theano.shared(value=0, name = 'bit_i_idx') + + # binarize the input image by rounding to nearest integer + xi = T.iround(self.input) + + # calculate free energy for the given bit configuration + fe_xi = self.free_energy(xi) + + # flip bit x_i of matrix xi and preserve all other bits x_{\i} + # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx] + # NB: slice(start,stop,step) is the python object used for + # slicing, e.g. to index matrix x as follows: x[start:stop:step] + xi_flip = T.setsubtensor(xi, 1-xi[:, bit_i_idx], + idx_list=(slice(None,None,None),bit_i_idx)) + + # calculate free energy with bit flipped + fe_xi_flip = self.free_energy(xi_flip) + + # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i}))) + cost = self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi)) + + # increment bit_i_idx % number as part of updates + updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible + + return cost + + def get_reconstruction_cost(self, updates, nv_mean): + """Approximation to the reconstruction error""" + + cross_entropy = T.mean( + T.sum(self.input*T.log(nv_mean) + + (1 - self.input)*T.log(1-nv_mean), axis = 1)) + + return cross_entropy + + + +def test_rbm(learning_rate=0.1, training_epochs = 15, + dataset='mnist.pkl.gz'): + """ + Demonstrate *** + + This is demonstrated on MNIST. 
+ + :param learning_rate: learning rate used for training the RBM + + :param training_epochs: number of epochs used for training + + :param dataset: path the the pickled dataset + + """ + datasets = load_data(dataset) + + train_set_x, train_set_y = datasets[0] + test_set_x , test_set_y = datasets[2] + + + batch_size = 20 # size of the minibatch + + # compute number of minibatches for training, validation and testing + n_train_batches = train_set_x.value.shape[0] / batch_size + + # allocate symbolic variables for the data + index = T.lscalar() # index to a [mini]batch + x = T.matrix('x') # the data is presented as rasterized images + + rng = numpy.random.RandomState(123) + theano_rng = RandomStreams( rng.randint(2**30)) + + # initialize storage fot the persistent chain (state = hidden layer of chain) + persistent_chain = theano.shared(numpy.zeros((batch_size, 500))) + + # construct the RBM class + rbm = RBM( input = x, n_visible=28*28, \ + n_hidden = 500,numpy_rng = rng, theano_rng = theano_rng) + + # get the cost and the gradient corresponding to one step of CD + cost, updates = rbm.cd(lr=learning_rate, persistent=persistent_chain) + + + ################################# + # Training the RBM # + ################################# + dirname = 'lr=%.5f'%learning_rate + os.makedirs(dirname) + os.chdir(dirname) + + # it is ok for a theano function to have no output + # the purpose of train_rbm is solely to update the RBM parameters + train_rbm = theano.function([index], cost, + updates = updates, + givens = { x: train_set_x[index*batch_size:(index+1)*batch_size]}) + + plotting_time = 0. + start_time = time.clock() + + + # go through training epochs + for epoch in xrange(training_epochs): + + # go through the training set + mean_cost = [] + for batch_index in xrange(n_train_batches): + mean_cost += [train_rbm(batch_index)] + + print 'Training epoch %d, cost is '%epoch, numpy.mean(mean_cost) + + # Plot filters after each training epoch + plotting_start = time.clock() + # Construct image from the weight matrix + image = PIL.Image.fromarray(tile_raster_images( X = rbm.W.value.T, + img_shape = (28,28),tile_shape = (10,10), + tile_spacing=(1,1))) + image.save('filters_at_epoch_%i.png'%epoch) + plotting_stop = time.clock() + plotting_time += (plotting_stop - plotting_start) + + end_time = time.clock() + + pretraining_time = (end_time - start_time) - plotting_time + + print ('Training took %f minutes' %(pretraining_time/60.)) + + + ################################# + # Sampling from the RBM # + ################################# + + # find out the number of test samples + number_of_test_samples = test_set_x.value.shape[0] + + # pick random test examples, with which to initialize the persistent chain + test_idx = rng.randint(number_of_test_samples-20) + persistent_vis_chain = theano.shared(test_set_x.value[test_idx:test_idx+20]) + + # define one step of Gibbs sampling (mf = mean-field) + [hid_mf, hid_sample, vis_mf, vis_sample] = rbm.gibbs_vhv(persistent_vis_chain) + + # the sample at the end of the channel is returned by ``gibbs_1`` as + # its second output; note that this is computed as a binomial draw, + # therefore it is formed of ints (0 and 1) and therefore needs to + # be converted to the same dtype as ``persistent_vis_chain`` + vis_sample = T.cast(vis_sample, dtype=theano.config.floatX) + + # construct the function that implements our persistent chain + # we generate the "mean field" activations for plotting and the actual samples for + # reinitializing the state of our persistent chain + 
sample_fn = theano.function([], [vis_mf, vis_sample], + updates = { persistent_vis_chain:vis_sample}) + + # sample the RBM, plotting every `plot_every`-th sample; do this + # until you plot at least `n_samples` + n_samples = 10 + plot_every = 1000 + + for idx in xrange(n_samples): + + # do `plot_every` intermediate samplings of which we do not care + for jdx in xrange(plot_every): + vis_mf, vis_sample = sample_fn() + + # construct image + image = PIL.Image.fromarray(tile_raster_images( + X = vis_mf, + img_shape = (28,28), + tile_shape = (10,10), + tile_spacing = (1,1) ) ) + print ' ... plotting sample ', idx + image.save('sample_%i_step_%i.png'%(idx,idx*jdx)) + +if __name__ == '__main__': + test_rbm() diff -r 6f606b359df3 -r a9af079892ce code_tutoriel/test.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/test.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,18 @@ +#import convolutional_mlp, dbn, logistic_cg, logistic_sgd, mlp, rbm, SdA_loops, SdA +import convolutional_mlp, logistic_cg, logistic_sgd, mlp, SdA +from nose.plugins.skip import SkipTest +#TODO: dbn, rbm, SdA, SdA_loops, convolutional_mlp +def test_logistic_sgd(): + logistic_sgd.sgd_optimization_mnist(n_epochs=10) +def test_logistic_cg(): + logistic_cg.cg_optimization_mnist(n_epochs=10) +def test_mlp(): + mlp.test_mlp(n_epochs=5) +def test_convolutional_mlp(): + convolutional_mlp.evaluate_lenet5(n_epochs=5,nkerns=[5,5]) +def test_dbn(): + raise SkipTest('Implementation not finished') +def test_rbm(): + raise SkipTest('Implementation not finished') +def test_SdA(): + SdA.test_SdA(pretraining_epochs = 2, training_epochs = 3) diff -r 6f606b359df3 -r a9af079892ce code_tutoriel/utils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/code_tutoriel/utils.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,125 @@ +""" This file contains different utility functions that are not connected +in anyway to the networks presented in the tutorials, but rather help in +processing the outputs into a more understandable way. + +For example ``tile_raster_images`` helps in generating a easy to grasp +image from a set of samples or weights. +""" + + +import numpy + + +def scale_to_unit_interval(ndar,eps=1e-8): + """ Scales all values in the ndarray ndar to be between 0 and 1 """ + ndar = ndar.copy() + ndar -= ndar.min() + ndar *= 1.0 / (ndar.max()+eps) + return ndar + + +def tile_raster_images(X, img_shape, tile_shape,tile_spacing = (0,0), + scale_rows_to_unit_interval = True, output_pixel_vals = True): + """ + Transform an array with one flattened image per row, into an array in + which images are reshaped and layed out like tiles on a floor. + + This function is useful for visualizing datasets whose rows are images, + and also columns of matrices for transforming those rows + (such as the first layer of a neural net). + + :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can + be 2-D ndarrays or None; + :param X: a 2-D array in which every row is a flattened image. + + :type img_shape: tuple; (height, width) + :param img_shape: the original shape of each image + + :type tile_shape: tuple; (rows, cols) + :param tile_shape: the number of images to tile (rows, cols) + + :param output_pixel_vals: if output should be pixel values (i.e. int8 + values) or floats + + :param scale_rows_to_unit_interval: if the values need to be scaled before + being plotted to [0,1] or not + + + :returns: array suitable for viewing as an image. + (See:`PIL.Image.fromarray`.) + :rtype: a 2-d array with same dtype as X. 
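    A typical call, using random data purely for illustration:

    import numpy

    samples = numpy.random.rand(100, 28 * 28)   # one flattened 28x28 image per row
    grid = tile_raster_images(samples, img_shape=(28, 28),
                              tile_shape=(10, 10), tile_spacing=(1, 1))
    # with output_pixel_vals left at True, `grid` is a 2-D uint8 array that
    # PIL.Image.fromarray(grid).save('grid.png') can write out directly.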
+ + """ + + assert len(img_shape) == 2 + assert len(tile_shape) == 2 + assert len(tile_spacing) == 2 + + # The expression below can be re-written in a more C style as + # follows : + # + # out_shape = [0,0] + # out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] - + # tile_spacing[0] + # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] - + # tile_spacing[1] + out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp + in zip(img_shape, tile_shape, tile_spacing)] + + if isinstance(X, tuple): + assert len(X) == 4 + # Create an output numpy ndarray to store the image + if output_pixel_vals: + out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype='uint8') + else: + out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype=X.dtype) + + #colors default to 0, alpha defaults to 1 (opaque) + if output_pixel_vals: + channel_defaults = [0,0,0,255] + else: + channel_defaults = [0.,0.,0.,1.] + + for i in xrange(4): + if X[i] is None: + # if channel is None, fill it with zeros of the correct + # dtype + out_array[:,:,i] = numpy.zeros(out_shape, + dtype='uint8' if output_pixel_vals else out_array.dtype + )+channel_defaults[i] + else: + # use a recurrent call to compute the channel and store it + # in the output + out_array[:,:,i] = tile_raster_images(X[i], img_shape, tile_shape, tile_spacing, scale_rows_to_unit_interval, output_pixel_vals) + return out_array + + else: + # if we are dealing with only one channel + H, W = img_shape + Hs, Ws = tile_spacing + + # generate a matrix to store the output + out_array = numpy.zeros(out_shape, dtype='uint8' if output_pixel_vals else X.dtype) + + + for tile_row in xrange(tile_shape[0]): + for tile_col in xrange(tile_shape[1]): + if tile_row * tile_shape[1] + tile_col < X.shape[0]: + if scale_rows_to_unit_interval: + # if we should scale values to be between 0 and 1 + # do this by calling the `scale_to_unit_interval` + # function + this_img = scale_to_unit_interval(X[tile_row * tile_shape[1] + tile_col].reshape(img_shape)) + else: + this_img = X[tile_row * tile_shape[1] + tile_col].reshape(img_shape) + # add the slice to the corresponding position in the + # output array + out_array[ + tile_row * (H+Hs):tile_row*(H+Hs)+H, + tile_col * (W+Ws):tile_col*(W+Ws)+W + ] \ + = this_img * (255 if output_pixel_vals else 1) + return out_array + + + diff -r 6f606b359df3 -r a9af079892ce data_generation/__init__.py diff -r 6f606b359df3 -r a9af079892ce data_generation/pipeline/pipeline.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/pipeline/pipeline.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,420 @@ +#!/usr/bin/python +# coding: utf-8 + +from __future__ import with_statement + +# This is intended to be run as a GIMP script +#from gimpfu import * + +import sys, os, getopt +import numpy +import ift6266.data_generation.transformations.filetensor as ft +import random +import copy + +# To debug locally, also call with -s 100 (to stop after ~100) +# (otherwise we allocate all needed memory, might be loonnng and/or crash +# if, lucky like me, you have an age-old laptop creaking from everywhere) +DEBUG = False +DEBUG_X = False +if DEBUG: + DEBUG_X = False # Debug under X (pylab.show()) + +DEBUG_IMAGES_PATH = None +if DEBUG: + # UNTESTED YET + # To avoid loading NIST if you don't have it handy + # (use with debug_images_iterator(), see main()) + # To use NIST, leave as = None + DEBUG_IMAGES_PATH = None#'/home/francois/Desktop/debug_images' + +# Directory where to dump images to visualize results +# (create it, otherwise it'll crash) 
+DEBUG_OUTPUT_DIR = 'debug_out' + +DEFAULT_NIST_PATH = '/data/lisa/data/ift6266h10/train_data.ft' +DEFAULT_LABEL_PATH = '/data/lisa/data/ift6266h10/train_labels.ft' +DEFAULT_OCR_PATH = '/data/lisa/data/ocr_breuel/filetensor/unlv-corrected-2010-02-01-shuffled.ft' +DEFAULT_OCRLABEL_PATH = '/data/lisa/data/ocr_breuel/filetensor/unlv-corrected-2010-02-01-labels-shuffled.ft' +ARGS_FILE = os.environ['PIPELINE_ARGS_TMPFILE'] + +# PARSE COMMAND LINE ARGUMENTS +def get_argv(): + with open(ARGS_FILE) as f: + args = [l.rstrip() for l in f.readlines()] + return args + +def usage(): + print ''' +Usage: run_pipeline.sh [-m ...] [-z ...] [-o ...] [-p ...] + -m, --max-complexity: max complexity to generate for an image + -z, --probability-zero: probability of using complexity=0 for an image + -o, --output-file: full path to file to use for output of images + -p, --params-output-file: path to file to output params to + -x, --labels-output-file: path to file to output labels to + -f, --data-file: path to filetensor (.ft) data file (NIST) + -l, --label-file: path to filetensor (.ft) labels file (NIST labels) + -c, --ocr-file: path to filetensor (.ft) data file (OCR) + -d, --ocrlabel-file: path to filetensor (.ft) labels file (OCR labels) + -a, --prob-font: probability of using a raw font image + -b, --prob-captcha: probability of using a captcha image + -g, --prob-ocr: probability of using an ocr image + -y, --seed: the job seed + -t, --type: [default: 0:full transformations], 1:Nist-friendly transformations + ''' + +try: + opts, args = getopt.getopt(get_argv(), "r:m:z:o:p:x:s:f:l:c:d:a:b:g:y:t:", ["reload","max-complexity=", "probability-zero=", "output-file=", "params-output-file=", "labels-output-file=", +"stop-after=", "data-file=", "label-file=", "ocr-file=", "ocrlabel-file=", "prob-font=", "prob-captcha=", "prob-ocr=", "seed=","type="]) +except getopt.GetoptError, err: + # print help information and exit: + print str(err) # will print something like "option -a not recognized" + usage() + pdb.gimp_quit(0) + sys.exit(2) + +for o, a in opts: + if o in ('-y','--seed'): + random.seed(int(a)) + numpy.random.seed(int(a)) + +type_pipeline = 0 +for o, a in opts: + if o in ('-t','--type'): + type_pipeline = int(a) + +if DEBUG_X: + import pylab + pylab.ion() + +from ift6266.data_generation.transformations.PoivreSel import PoivreSel +from ift6266.data_generation.transformations.thick import Thick +from ift6266.data_generation.transformations.BruitGauss import BruitGauss +from ift6266.data_generation.transformations.DistorsionGauss import DistorsionGauss +from ift6266.data_generation.transformations.PermutPixel import PermutPixel +from ift6266.data_generation.transformations.gimp_script import GIMP1 +from ift6266.data_generation.transformations.Rature import Rature +from ift6266.data_generation.transformations.contrast import Contrast +from ift6266.data_generation.transformations.local_elastic_distortions import LocalElasticDistorter +from ift6266.data_generation.transformations.slant import Slant +from ift6266.data_generation.transformations.Occlusion import Occlusion +from ift6266.data_generation.transformations.add_background_image import AddBackground +from ift6266.data_generation.transformations.affine_transform import AffineTransformation +from ift6266.data_generation.transformations.ttf2jpg import ttf2jpg +from ift6266.data_generation.transformations.pycaptcha.Facade import generateCaptcha + +if DEBUG: + from visualizer import Visualizer + # Either put the visualizer as in the MODULES_INSTANCES list + # 
after each module you want to visualize, or in the + # AFTER_EACH_MODULE_HOOK list (but not both, it's redundant) + VISUALIZER = Visualizer(to_dir=DEBUG_OUTPUT_DIR, on_screen=False) + +###---------------------order of transformation module +if type_pipeline == 0: + MODULE_INSTANCES = [Slant(),Thick(),AffineTransformation(),LocalElasticDistorter(),GIMP1(),Rature(),Occlusion(), PermutPixel(),DistorsionGauss(),AddBackground(), PoivreSel(), BruitGauss(), Contrast()] + stop_idx = 0 +if type_pipeline == 1: + MODULE_INSTANCES = [Slant(),Thick(),AffineTransformation(),LocalElasticDistorter(),GIMP1(False),Rature(),Occlusion(), PermutPixel(),DistorsionGauss(),AddBackground(), PoivreSel(), BruitGauss(), Contrast()] + stop_idx = 5 + #we disable transformation corresponding to MODULE_INSTANCES[stop_idx:] but we still need to apply them on dummy images + #in order to be sure to have the same random generator state than with the default pipeline. + #This is not optimal (we do more calculus than necessary) but it is a quick hack to produce similar results than previous generation + + + +# These should have a "after_transform_callback(self, image)" method +# (called after each call to transform_image in a module) +AFTER_EACH_MODULE_HOOK = [] +if DEBUG: + AFTER_EACH_MODULE_HOOK = [VISUALIZER] + +# These should have a "end_transform_callback(self, final_image" method +# (called after all modules have been called) +END_TRANSFORM_HOOK = [] +if DEBUG: + END_TRANSFORM_HOOK = [VISUALIZER] + +class Pipeline(): + def __init__(self, modules, num_img, image_size=(32,32)): + self.modules = modules + self.num_img = num_img + self.num_params_stored = 0 + self.image_size = image_size + + self.init_memory() + + def init_num_params_stored(self): + # just a dummy call to regenerate_parameters() to get the + # real number of params (only those which are stored) + self.num_params_stored = 0 + for m in self.modules: + self.num_params_stored += len(m.regenerate_parameters(0.0)) + + def init_memory(self): + self.init_num_params_stored() + + total = self.num_img + num_px = self.image_size[0] * self.image_size[1] + + self.res_data = numpy.empty((total, num_px), dtype=numpy.uint8) + # +1 to store complexity + self.params = numpy.empty((total, self.num_params_stored+len(self.modules))) + self.res_labels = numpy.empty(total, dtype=numpy.int32) + + def run(self, img_iterator, complexity_iterator): + img_size = self.image_size + + should_hook_after_each = len(AFTER_EACH_MODULE_HOOK) != 0 + should_hook_at_the_end = len(END_TRANSFORM_HOOK) != 0 + + for img_no, (img, label) in enumerate(img_iterator): + sys.stdout.flush() + + global_idx = img_no + + img = img.reshape(img_size) + + param_idx = 0 + mod_idx = 0 + for mod in self.modules: + # This used to be done _per batch_, + # ie. out of the "for img" loop + complexity = complexity_iterator.next() + #better to do a complexity sampling for each transformations in order to have more variability + #otherwise a lot of images similar to the source are generated (i.e. 
when complexity is close to 0 (1/8 of the time)) + #we need to save the complexity of each transformations and the sum of these complexity is a good indicator of the overall + #complexity + self.params[global_idx, mod_idx] = complexity + mod_idx += 1 + + p = mod.regenerate_parameters(complexity) + self.params[global_idx, param_idx+len(self.modules):param_idx+len(p)+len(self.modules)] = p + param_idx += len(p) + + if not(stop_idx) or stop_idx > mod_idx: + img = mod.transform_image(img) + else: + tmp = mod.transform_image(copy.copy(img)) + #this is done to be sure to have the same global random generator state + #we don't apply the transformation on the original image but on a copy in case of in-place transformations + + if should_hook_after_each: + for hook in AFTER_EACH_MODULE_HOOK: + hook.after_transform_callback(img) + + self.res_data[global_idx] = \ + img.reshape((img_size[0] * img_size[1],))*255 + self.res_labels[global_idx] = label + + if should_hook_at_the_end: + for hook in END_TRANSFORM_HOOK: + hook.end_transform_callback(img) + + def write_output(self, output_file_path, params_output_file_path, labels_output_file_path): + with open(output_file_path, 'wb') as f: + ft.write(f, self.res_data) + + #if type_pipeline == 0: #only needed for type 0 pipeline + numpy.save(params_output_file_path, self.params) + + with open(labels_output_file_path, 'wb') as f: + ft.write(f, self.res_labels) + + +############################################################################## +# COMPLEXITY ITERATORS +# They're called once every img, to get the complexity to use for that img +# they must be infinite (should never throw StopIteration when calling next()) + +# probability of generating 0 complexity, otherwise +# uniform over 0.0-max_complexity +def range_complexity_iterator(probability_zero, max_complexity): + assert max_complexity <= 1.0 + n = numpy.random.uniform(0.0, 1.0) + n = 2.0 #hack to bug fix, having a min complexity is not necessary and we need the same seed... + while True: + if n < probability_zero: + yield 0.0 + else: + yield numpy.random.uniform(0.0, max_complexity) + +############################################################################## +# DATA ITERATORS +# They can be used to interleave different data sources etc. 
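Every iterator passed to Pipeline.run() is expected to yield (image, label)
pairs, where the image is a flattened float32 array of img_size[0]*img_size[1]
values in [0, 1] (see nist_supp_iterator below). A minimal stand-in, useful for
smoke-testing the pipeline without the NIST files (purely illustrative, not part
of the original code):

def random_image_iterator(num_img, img_size=(32, 32), seed=0):
    rng = numpy.random.RandomState(seed)
    for i in xrange(num_img):
        # 62 labels: 10 digits + 26 upper case + 26 lower case,
        # as in nist_supp_iterator
        yield rng.rand(img_size[0] * img_size[1]).astype('float32'), rng.randint(62)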
+ +''' +# Following code (DebugImages and iterator) is untested + +def load_image(filepath): + _RGB_TO_GRAYSCALE = [0.3, 0.59, 0.11, 0.0] + img = Image.open(filepath) + img = numpy.asarray(img) + if len(img.shape) > 2: + img = (img * _RGB_TO_GRAYSCALE).sum(axis=2) + return (img / 255.0).astype('float') + +class DebugImages(): + def __init__(self, images_dir_path): + import glob, os.path + self.filelist = glob.glob(os.path.join(images_dir_path, "*.png")) + +def debug_images_iterator(debug_images): + for path in debug_images.filelist: + yield load_image(path) +''' + +class NistData(): + def __init__(self, nist_path, label_path, ocr_path, ocrlabel_path): + self.train_data = open(nist_path, 'rb') + self.train_labels = open(label_path, 'rb') + self.dim = tuple(ft._read_header(self.train_data)[3]) + # in order to seek to the beginning of the file + self.train_data.close() + self.train_data = open(nist_path, 'rb') + self.ocr_data = open(ocr_path, 'rb') + self.ocr_labels = open(ocrlabel_path, 'rb') + +# cet iterator load tout en ram +def nist_supp_iterator(nist, prob_font, prob_captcha, prob_ocr, num_img): + img = ft.read(nist.train_data) + labels = ft.read(nist.train_labels) + if prob_ocr: + ocr_img = ft.read(nist.ocr_data) + ocr_labels = ft.read(nist.ocr_labels) + ttf = ttf2jpg() + L = [chr(ord('0')+x) for x in range(10)] + [chr(ord('A')+x) for x in range(26)] + [chr(ord('a')+x) for x in range(26)] + + for i in xrange(num_img): + r = numpy.random.rand() + if r <= prob_font: + yield ttf.generate_image() + elif r <=prob_font + prob_captcha: + (arr, charac) = generateCaptcha(0,1) + yield arr.astype(numpy.float32)/255, L.index(charac[0]) + elif r <= prob_font + prob_captcha + prob_ocr: + j = numpy.random.randint(len(ocr_labels)) + yield ocr_img[j].astype(numpy.float32)/255, ocr_labels[j] + else: + j = numpy.random.randint(len(labels)) + yield img[j].astype(numpy.float32)/255, labels[j] + + +# Mostly for debugging, for the moment, just to see if we can +# reload the images and parameters. +def reload(output_file_path, params_output_file_path): + images_ft = open(output_file_path, 'rb') + images_ft_dim = tuple(ft._read_header(images_ft)[3]) + + print "Images dimensions: ", images_ft_dim + + params = numpy.load(params_output_file_path) + + print "Params dimensions: ", params.shape + print params + + +############################################################################## +# MAIN + + +# Might be called locally or through dbidispatch. In all cases it should be +# passed to the GIMP executable to be able to use GIMP filters. 
+# Ex: +def _main(): + #global DEFAULT_NIST_PATH, DEFAULT_LABEL_PATH, DEFAULT_OCR_PATH, DEFAULT_OCRLABEL_PATH + #global getopt, get_argv + + max_complexity = 0.5 # default + probability_zero = 0.1 # default + output_file_path = None + params_output_file_path = None + labels_output_file_path = None + nist_path = DEFAULT_NIST_PATH + label_path = DEFAULT_LABEL_PATH + ocr_path = DEFAULT_OCR_PATH + ocrlabel_path = DEFAULT_OCRLABEL_PATH + prob_font = 0.0 + prob_captcha = 0.0 + prob_ocr = 0.0 + stop_after = None + reload_mode = False + + for o, a in opts: + if o in ('-m', '--max-complexity'): + max_complexity = float(a) + assert max_complexity >= 0.0 and max_complexity <= 1.0 + elif o in ('-r', '--reload'): + reload_mode = True + elif o in ("-z", "--probability-zero"): + probability_zero = float(a) + assert probability_zero >= 0.0 and probability_zero <= 1.0 + elif o in ("-o", "--output-file"): + output_file_path = a + elif o in ('-p', "--params-output-file"): + params_output_file_path = a + elif o in ('-x', "--labels-output-file"): + labels_output_file_path = a + elif o in ('-s', "--stop-after"): + stop_after = int(a) + elif o in ('-f', "--data-file"): + nist_path = a + elif o in ('-l', "--label-file"): + label_path = a + elif o in ('-c', "--ocr-file"): + ocr_path = a + elif o in ('-d', "--ocrlabel-file"): + ocrlabel_path = a + elif o in ('-a', "--prob-font"): + prob_font = float(a) + elif o in ('-b', "--prob-captcha"): + prob_captcha = float(a) + elif o in ('-g', "--prob-ocr"): + prob_ocr = float(a) + elif o in ('-y', "--seed"): + pass + elif o in ('-t', "--type"): + pass + else: + assert False, "unhandled option" + + if output_file_path == None or params_output_file_path == None or labels_output_file_path == None: + print "Must specify the three output files." 
+ usage() + pdb.gimp_quit(0) + sys.exit(2) + + if reload_mode: + reload(output_file_path, params_output_file_path) + else: + if DEBUG_IMAGES_PATH: + ''' + # This code is yet untested + debug_images = DebugImages(DEBUG_IMAGES_PATH) + num_img = len(debug_images.filelist) + pl = Pipeline(modules=MODULE_INSTANCES, num_img=num_img, image_size=(32,32)) + img_it = debug_images_iterator(debug_images) + ''' + else: + nist = NistData(nist_path, label_path, ocr_path, ocrlabel_path) + num_img = 819200 # 800 Mb file + if stop_after: + num_img = stop_after + pl = Pipeline(modules=MODULE_INSTANCES, num_img=num_img, image_size=(32,32)) + img_it = nist_supp_iterator(nist, prob_font, prob_captcha, prob_ocr, num_img) + + cpx_it = range_complexity_iterator(probability_zero, max_complexity) + pl.run(img_it, cpx_it) + pl.write_output(output_file_path, params_output_file_path, labels_output_file_path) + +try: + _main() +except: + print "Unexpected error" + +if DEBUG_X: + pylab.ioff() + pylab.show() + +pdb.gimp_quit(0) + diff -r 6f606b359df3 -r a9af079892ce data_generation/pipeline/testtransformations.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/pipeline/testtransformations.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,156 @@ +#!/usr/bin/env python + + + +from pylearn.io import filetensor as ft +import copy +import pygame +import time +import numpy as N + +from ttf2jpg import ttf2jpg + +#from gimpfu import * + + +from PoivreSel import PoivreSel +from thick import Thick +from BruitGauss import BruitGauss +from DistorsionGauss import DistorsionGauss +from PermutPixel import PermutPixel +from gimp_script import GIMP1 +from Rature import Rature +from contrast import Contrast +from local_elastic_distortions import LocalElasticDistorter +from slant import Slant +from Occlusion import Occlusion +from add_background_image import AddBackground +from affine_transform import AffineTransformation + +###---------------------order of transformation module +MODULE_INSTANCES = [Slant(),Thick(),AffineTransformation(),LocalElasticDistorter(),GIMP1(False)] + +###---------------------complexity associated to each of them +complexity = 0.7 +#complexity = [0.5]*len(MODULE_INSTANCES) +#complexity = [0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.] 
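# This script renders transformation samples for visual inspection: each source
# image is run through MODULE_INSTANCES with a fresh random complexity per
# module, and the results are saved as PNG grids under <path>/perimages and
# <path>/exemples by createimage() below.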
+n=100 + +def createimage(path,d): + for i in range(n): + screen.fill(0) + a=d[i,:] + off1=4*32 + off2=0 + for u in range(n): + b=N.asarray(N.reshape(a,(32,32))) + c=N.asarray([N.reshape(a*255.0,(32,32))]*3).T + new=pygame.surfarray.make_surface(c) + new=pygame.transform.scale2x(new) + new=pygame.transform.scale2x(new) + #new.set_palette(anglcolorpalette) + screen.blit(new,(0,0)) + exemple.blit(new,(0,0)) + + offset = 4*32 + offset2 = 0 + ct = 0 + ctmp = N.random.rand()*complexity + print u + for j in MODULE_INSTANCES: + #max dilation + #ctmp = N.random.rand()*complexity[ct] + ctmp = N.random.rand()*complexity + #print j.get_settings_names(), j.regenerate_parameters(ctmp) + th=j.regenerate_parameters(ctmp) + + b=j.transform_image(b) + c=N.asarray([b*255]*3).T + new=pygame.surfarray.make_surface(c) + new=pygame.transform.scale2x(new) + new=pygame.transform.scale2x(new) + if u==0: + #new.set_palette(anglcolorpalette) + screen.blit(new,(offset,offset2)) + font = pygame.font.SysFont('liberationserif',18) + text = font.render('%s '%(int(ctmp*100.0)/100.0) + j.__module__,0,(255,255,255),(0,0,0)) + #if j.__module__ == 'Rature': + # text = font.render('%s,%s'%(th[-1],int(ctmp*100.0)/100.0) + j.__module__,0,(255,255,255),(0,0,0)) + screen.blit(text,(offset,offset2+4*32)) + if ct == len(MODULE_INSTANCES)/2-1: + offset = 0 + offset2 = 4*32+20 + else: + offset += 4*32 + ct+=1 + exemple.blit(new,(off1,off2)) + if off1 != 9*4*32: + off1+=4*32 + else: + off1=0 + off2+=4*32 + pygame.image.save(exemple,path+'/perimages/%s.PNG'%i) + pygame.image.save(screen,path+'/exemples/%s.PNG'%i) + + + + +nbmodule = len(MODULE_INSTANCES) + +pygame.surfarray.use_arraytype('numpy') + +#pygame.display.init() +screen = pygame.Surface((4*(nbmodule+1)/2*32,2*(4*32+20)),depth=32) +exemple = pygame.Surface((N.ceil(N.sqrt(n))*4*32,N.ceil(N.sqrt(n))*4*32),depth=32) + +anglcolorpalette=[(x,x,x) for x in xrange(0,256)] +#pygame.Surface.set_palette(anglcolorpalette) +#screen.set_palette(anglcolorpalette) + +pygame.font.init() + +d = N.zeros((n,1024)) + +#datapath = '/data/lisa/data/ocr_breuel/filetensor/unlv-corrected-2010-02-01-shuffled.ft' +#f = open(datapath) +#d = ft.read(f) +#d = d[0:n,:]/255.0 +#createimage('/u/glorotxa/transf/OCR',d) + + + +datapath = '/data/lisa/data/nist/by_class/' +f = open(datapath+'digits_reshuffled/digits_reshuffled_train_data.ft') +d = ft.read(f) +d = d[0:n,:]/255.0 +createimage('/u/glorotxa/transf/NIST_digits',d) + + + +datapath = '/data/lisa/data/nist/by_class/' +f = open(datapath+'upper/upper_train_data.ft') +d = ft.read(f) +d = d[0:n,:]/255.0 +createimage('/u/glorotxa/transf/NIST_upper',d) + +#from Facade import * + +#for i in range(n): + #d[i,:]=N.asarray(N.reshape(generateCaptcha(0.8,0),(1,1024))/255.0,dtype='float32') + +#createimage('/u/glorotxa/transf/capcha',d) + + +#for i in range(n): + #myttf2jpg = ttf2jpg() + #d[i,:]=N.reshape(myttf2jpg.generate_image()[0],(1,1024)) +#createimage('/u/glorotxa/transf/fonts',d) + +datapath = '/data/lisa/data/nist/by_class/' +f = open(datapath+'lower/lower_train_data.ft') +d = ft.read(f) +d = d[0:n,:]/255.0 +createimage('/u/glorotxa/transf/NIST_lower',d) + + +#pygame.display.quit() diff -r 6f606b359df3 -r a9af079892ce data_generation/pipeline/visualizer.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/pipeline/visualizer.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,73 @@ +#!/usr/bin/python + +import numpy +import Image +from image_tiling import tile_raster_images +import pylab +import time + +class Visualizer(): + def __init__(self, 
num_columns=10, image_size=(32,32), to_dir=None, on_screen=False): + self.list = [] + self.image_size = image_size + self.num_columns = num_columns + + self.on_screen = on_screen + self.to_dir = to_dir + + self.cur_grid_image = None + + self.cur_index = 0 + + def visualize_stop_and_flush(self): + self.make_grid_image() + + if self.on_screen: + self.visualize() + if self.to_dir: + self.dump_to_disk() + + self.stop_and_wait() + self.flush() + + self.cur_index += 1 + + def make_grid_image(self): + num_rows = len(self.list) / self.num_columns + if len(self.list) % self.num_columns != 0: + num_rows += 1 + grid_shape = (num_rows, self.num_columns) + self.cur_grid_image = tile_raster_images(numpy.array(self.list), self.image_size, grid_shape, tile_spacing=(5,5), output_pixel_vals=False) + + def visualize(self): + pylab.imshow(self.cur_grid_image) + pylab.draw() + + def dump_to_disk(self): + gi = Image.fromarray((self.cur_grid_image * 255).astype('uint8'), "L") + gi.save(self.to_dir + "/grid_" + str(self.cur_index) + ".png") + + def stop_and_wait(self): + # can't raw_input under gimp, so sleep) + print "New image generated, sleeping 5 secs" + time.sleep(5) + + def flush(self): + self.list = [] + + def get_parameters_names(self): + return [] + + def regenerate_parameters(self): + return [] + + def after_transform_callback(self, image): + self.transform_image(image) + + def end_transform_callback(self, final_image): + self.visualize_stop_and_flush() + + def transform_image(self, image): + sz = self.image_size + self.list.append(image.copy().reshape((sz[0] * sz[1]))) + diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/BruitGauss.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/BruitGauss.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,138 @@ +#!/usr/bin/python +# coding: utf-8 + +''' +Ajout de bruit gaussien dans les donnees. A chaque iteration, un bruit poivre +et sel est ajoute, puis un lissage gaussien autour de ce point est ajoute. +On fait un nombre d'iteration = 1024*complexity/25 ce qui equivaud +a complexity/25 des points qui recoivent le centre du noyau gaussien. +Il y en a beaucoup moins que le bruit poivre et sel, car la transformation +est plutôt aggressive et touche beaucoup de pixels autour du centre + +La grandeur de la gaussienne ainsi que son ecart type sont definit par complexity +et par une composante aleatoire normale. + +On a 25 % de chances d'effectuer le bruitage + +Ce fichier prend pour acquis que les images sont donnees une a la fois +sous forme de numpy.array de 1024 (32 x 32) valeurs entre 0 et 1. + +Sylvain Pannetier Lebeuf dans le cadre de IFT6266, hiver 2010 + +''' + +import numpy +#import random +import scipy +from scipy import ndimage + +class BruitGauss(): + + def __init__(self,complexity=1,seed=6378): + self.nb_chngmax =10 #Le nombre de pixels changes. 
Seulement pour fin de calcul + self.grandeurmax = 20 + self.sigmamax = 6.0 + self.regenerate_parameters(complexity) + self.seed=seed + + #numpy.random.seed(self.seed) + + def get_seed(self): + return self.seed + + def get_settings_names(self): + return ['nb_chng','sigma_gauss','grandeur'] + + def regenerate_parameters(self, complexity): + self.effectuer =numpy.random.binomial(1,0.25) ##### On a 25% de faire un bruit ##### + + + if self.effectuer and complexity > 0: + self.nb_chng=3+int(numpy.random.rand()*self.nb_chngmax*complexity) + self.sigma_gauss=2.0 + numpy.random.rand()*self.sigmamax*complexity + self.grandeur=12+int(numpy.random.rand()*self.grandeurmax*complexity) + #creation du noyau gaussien + self.gauss=numpy.zeros((self.grandeur,self.grandeur)) + x0 = y0 = self.grandeur/2.0 + for i in xrange(self.grandeur): + for j in xrange(self.grandeur): + self.gauss[i,j]=numpy.exp(-((i-x0)**2 + (j-y0)**2) / self.sigma_gauss**2) + #creation de la fenetre de moyennage + self.moy=numpy.zeros((self.grandeur,self.grandeur)) + x0 = y0 = self.grandeur/2 + for i in xrange(0,self.grandeur): + for j in xrange(0,self.grandeur): + self.moy[i,j]=((numpy.sqrt(2*(self.grandeur/2.0)**2) -\ + numpy.sqrt(numpy.abs(i-self.grandeur/2.0)**2+numpy.abs(j-self.grandeur/2.0)**2))/numpy.sqrt((self.grandeur/2.0)**2))**5 + else: + self.sigma_gauss = 1 # eviter division par 0 + self.grandeur=1 + self.nb_chng = 0 + self.effectuer = 0 + + return self._get_current_parameters() + + def _get_current_parameters(self): + return [self.nb_chng,self.sigma_gauss,self.grandeur] + + + def transform_image(self, image): + if self.effectuer == 0: + return image + image=image.reshape((32,32)) + filtered_image = ndimage.convolve(image,self.gauss,mode='constant') + assert image.shape == filtered_image.shape + filtered_image = (filtered_image - filtered_image.min() + image.min()) / (filtered_image.max() - filtered_image.min() + image.min()) * image.max() + + #construction of the moyennage Mask + Mask = numpy.zeros((32,32)) + + for i in xrange(0,self.nb_chng): + x_bruit=int(numpy.random.randint(0,32)) + y_bruit=int(numpy.random.randint(0,32)) + offsetxmin = 0 + offsetxmax = 0 + offsetymin = 0 + offsetymax = 0 + if x_bruit < self.grandeur / 2: + offsetxmin = self.grandeur / 2 - x_bruit + if 32-x_bruit < numpy.ceil(self.grandeur / 2.0): + offsetxmax = numpy.ceil(self.grandeur / 2.0) - (32-x_bruit) + if y_bruit < self.grandeur / 2: + offsetymin = self.grandeur / 2 - y_bruit + if 32-y_bruit < numpy.ceil(self.grandeur / 2.0): + offsetymax = numpy.ceil(self.grandeur / 2.0) - (32-y_bruit) + Mask[x_bruit - self.grandeur/2 + offsetxmin : x_bruit + numpy.ceil(self.grandeur/2.0) - offsetxmax,\ + y_bruit - self.grandeur/2 + offsetymin : y_bruit + numpy.ceil(self.grandeur/2.0)- offsetymax] +=\ + self.moy[offsetxmin:self.grandeur - offsetxmax,offsetymin:self.grandeur - offsetymax] + + return numpy.asarray((image + filtered_image*Mask)/(Mask+1),dtype='float32') + +#---TESTS--- + +def _load_image(): + f = open('/home/sylvain/Dropbox/Msc/IFT6266/donnees/lower_test_data.ft') #Le jeu de donnees est en local. 
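    # Illustrative note: the nested loops in regenerate_parameters above build
    # the Gaussian kernel point by point; the same kernel can be obtained in one
    # vectorized expression, e.g.
    #   yy, xx = numpy.mgrid[0:grandeur, 0:grandeur]
    #   gauss = numpy.exp(-((yy - grandeur/2.0)**2 + (xx - grandeur/2.0)**2) / sigma_gauss**2)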
+ d = ft.read(f) + w=numpy.asarray(d[0]) + return (w/255.0).astype('float') + +def _test(complexite): + img=_load_image() + transfo = BruitGauss() + pylab.imshow(img.reshape((32,32))) + pylab.show() + print transfo.get_settings_names() + print transfo.regenerate_parameters(complexite) + + img_trans=transfo.transform_image(img) + + pylab.imshow(img_trans.reshape((32,32))) + pylab.show() + + +if __name__ == '__main__': + from pylearn.io import filetensor as ft + import pylab + _test(0.5) + + diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/DistorsionGauss.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/DistorsionGauss.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,100 @@ +#!/usr/bin/python +# coding: utf-8 + +''' +Ajout d'une composante aleatoire dans chaque pixel de l'image. +C'est une distorsion gaussienne de moyenne 0 et d'écart type complexity/10 + +Il y a 30% d'effectuer le bruitage + +Sylvain Pannetier Lebeuf dans le cadre de IFT6266, hiver 2010 + +''' + +import numpy +import random + +class DistorsionGauss(): + + def __init__(self,seed=3459): + self.ecart_type=0.1 #L'ecart type de la gaussienne + self.effectuer=1 #1=on effectue et 0=rien faire + self.seed=seed + + #Les deux generateurs sont de types differents, avoir la meme seed n'a pas d'influence + #numpy.random.seed(self.seed) + #random.seed(self.seed) + + def get_settings_names(self): + return ['effectuer'] + + def get_seed(self): + return self.seed + + def get_settings_names_determined_by_complexity(self,complexity): + return ['ecart_type'] + + def regenerate_parameters(self, complexity): + self.ecart_type=float(complexity)/10 + self.effectuer =numpy.random.binomial(1,0.3) ##### On a 30% de faire un bruit ##### + return self._get_current_parameters() + + def _get_current_parameters(self): + return [self.effectuer] + + def get_parameters_determined_by_complexity(self,complexity): + return [float(complexity)/10] + + def transform_image(self, image): + if self.effectuer == 0: + return image + + image=image.reshape(1024,1) + aleatoire=numpy.zeros((1024,1)).astype('float32') + for i in xrange(0,1024): + aleatoire[i]=float(random.gauss(0,self.ecart_type)) + image=image+aleatoire + + + #Ramener tout entre 0 et 1. Ancienne facon de normaliser. + #Resultats moins interessant je trouve. +## if numpy.min(image) < 0: +## image-=numpy.min(image) +## if numpy.max(image) > 1: +## image/=numpy.max(image) + + for i in xrange(0,1024): + image[i]=min(1,max(0,image[i])) + + return image.reshape(32,32) + + +#---TESTS--- + +def _load_image(): + f = open('/home/sylvain/Dropbox/Msc/IFT6266/donnees/lower_test_data.ft') #Le jeu de donnees est en local. 
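    # Illustrative note: the per-pixel loop in transform_image above is, in
    # distribution, equivalent to the vectorized form
    #   image = numpy.clip(image + numpy.random.normal(0, self.ecart_type, (1024, 1)), 0, 1)
    # but it would not reproduce exactly the same noise, since the original
    # draws from Python's random.gauss one pixel at a time.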
+ d = ft.read(f) + w=numpy.asarray(d[random.randint(0,100)]) + return (w/255.0).astype('float') + +def _test(complexite): + img=_load_image() + transfo = DistorsionGauss() + pylab.imshow(img.reshape((32,32))) + pylab.show() + print transfo.get_settings_names() + print transfo.regenerate_parameters(complexite) + + img_trans=transfo.transform_image(img) + + pylab.imshow(img_trans.reshape((32,32))) + pylab.show() + + +if __name__ == '__main__': + from pylearn.io import filetensor as ft + import pylab + for i in xrange(0,5): + _test(0.5) + + diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/Occlusion.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/Occlusion.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,161 @@ +#!/usr/bin/python +# coding: utf-8 + +''' +Ajout de bruit d'occlusion dans l'image originale. + +Le bruit provient d'un echantillon pris dans la seconde image puis rajoutee a +gauche ou a droite de l'image originale. De plus, il se peut aussi que le +bruit soit rajoute sur l'image originale, mais en plus pâle. + +Le fichier /data/lisa/data/ift6266h10/echantillon_occlusion.ft +(sur le reseau DIRO) est necessaire. + +Il y a 30% de chance d'avoir une occlusion quelconque. + +Sylvain Pannetier Lebeuf dans le cadre de IFT6266, hiver 2010 + +''' + + +import numpy + +from pylearn.io import filetensor as ft + +class Occlusion(): + + def __init__(self,seed=9854): + #Ces 4 variables representent la taille du "crop" sur l'image2 + #Ce "crop" est pris a partie de image1[15,15], le milieu de l'image1 + self.haut=2 + self.bas=2 + self.gauche=2 + self.droite=2 + + #Ces deux variables representent le deplacement en x et y par rapport + #au milieu du bord gauche ou droit + self.x_arrivee=0 + self.y_arrivee=0 + + #Cette variable =1 si l'image est mise a gauche et -1 si a droite + #et =0 si au centre, mais plus pale + self.endroit=-1 + + #Cette variable determine l'opacite de l'ajout dans le cas ou on est au milieu + self.opacite=0.5 #C'est completement arbitraire. Possible de le changer si voulu + + #Sert a dire si on fait quelque chose. 0=faire rien, 1 on fait quelque chose + self.appliquer=1 + + self.seed=seed + #numpy.random.seed(self.seed) + + f3 = open('/data/lisa/data/ift6266h10/echantillon_occlusion.ft') #Doit etre sur le reseau DIRO. 
+ #f3 = open('/home/sylvain/Dropbox/Msc/IFT6266/donnees/echantillon_occlusion.ft') + #Il faut arranger le path sinon + w=ft.read(f3) + f3.close() + + self.longueur=len(w) + self.d=(w.astype('float'))/255 + + + def get_settings_names(self): + return ['haut','bas','gauche','droite','x_arrivee','y_arrivee','endroit','rajout','appliquer'] + + def get_seed(self): + return self.seed + + def regenerate_parameters(self, complexity): + self.haut=min(15,int(numpy.abs(numpy.random.normal(int(8*complexity),2)))) + self.bas=min(15,int(numpy.abs(numpy.random.normal(int(8*complexity),2)))) + self.gauche=min(15,int(numpy.abs(numpy.random.normal(int(8*complexity),2)))) + self.droite=min(15,int(numpy.abs(numpy.random.normal(int(8*complexity),2)))) + if self.haut+self.bas+self.gauche+self.droite==0: #Tres improbable + self.haut=1 + self.bas=1 + self.gauche=1 + self.droite=1 + + #Ces deux valeurs seront controlees afin d'etre certain de ne pas depasser + self.x_arrivee=int(numpy.abs(numpy.random.normal(0,2))) #Complexity n'entre pas en jeu, pas besoin + self.y_arrivee=int(numpy.random.normal(0,3)) + + self.rajout=numpy.random.randint(0,self.longueur-1) #les bouts de quelle lettre + self.appliquer=numpy.random.binomial(1,0.4) ##### 40 % du temps, on met une occlusion ##### + + if complexity == 0: #On ne fait rien dans ce cas + self.applique=0 + + self.endroit=numpy.random.randint(-1,2) + + return self._get_current_parameters() + + def _get_current_parameters(self): + return [self.haut,self.bas,self.gauche,self.droite,self.x_arrivee,self.y_arrivee,self.endroit,self.rajout,self.appliquer] + + + def transform_image(self, image): + if self.appliquer == 0: #Si on fait rien, on retourne tout de suite l'image + return image + + #Attrapper le bruit d'occlusion + bruit=self.d[self.rajout].reshape((32,32))[15-self.haut:15+self.bas+1,15-self.gauche:15+self.droite+1] + + if self.x_arrivee+self.gauche+self.droite>32: + self.endroit*=-1 #On change de bord et on colle sur le cote + self.x_arrivee=0 + if self.y_arrivee-self.haut <-16: + self.y_arrivee=self.haut-16#On colle le morceau en haut + if self.y_arrivee+self.bas > 15: + self.y_arrivee=15-self.bas #On colle le morceau en bas + + if self.endroit==-1: #a gauche + for i in xrange(-self.haut,self.bas+1): + for j in xrange(0,self.gauche+self.droite+1): + image[16+self.y_arrivee+i,self.x_arrivee+j]=\ + max(image[16+self.y_arrivee+i,self.x_arrivee+j],bruit[i+self.haut,j]) + + elif self.endroit==1: #a droite + for i in xrange(-self.haut,self.bas+1): + for j in xrange(-self.gauche-self.droite,1): + image[16+self.y_arrivee+i,31-self.x_arrivee+j]=\ + max(image[16+self.y_arrivee+i,31-self.x_arrivee+j],bruit[i+self.haut,j+self.gauche+self.droite]) + + elif self.endroit==0: #au milieu + for i in xrange(-self.haut,self.bas+1): + for j in xrange(-self.gauche,self.droite+1): + image[16+i,16+j]=max(image[16+i,16+j],bruit[i+self.haut,j+self.gauche]*self.opacite) + + + return image + +#---TESTS--- + +def _load_image(): + f = open('/home/sylvain/Dropbox/Msc/IFT6266/donnees/lower_test_data.ft') #Le jeu de donnees est en local. 
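    # Two remarks on the class above (based on a reading of the code):
    # - each placement branch of transform_image max-composites the cropped
    #   patch onto the target region (numpy.maximum semantics), with the centre
    #   placement additionally faded by self.opacite, so the occluding strokes
    #   never darken pixels that are already brighter;
    # - the `complexity == 0` branch of regenerate_parameters assigns
    #   self.applique (missing the final `r`) instead of self.appliquer, so it
    #   does not actually disable the transformation at zero complexity.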
+ d = ft.read(f) + w=numpy.asarray(d[numpy.random.randint(0,50)]) + return (w/255.0).astype('float') + +def _test(complexite): + + transfo = Occlusion() + for i in xrange(0,20): + img = _load_image() + pylab.imshow(img.reshape((32,32))) + pylab.show() + print transfo.get_settings_names() + print transfo.regenerate_parameters(complexite) + + img_trans=transfo.transform_image(img.reshape((32,32))) + + print transfo.get_seed() + pylab.imshow(img_trans.reshape((32,32))) + pylab.show() + + +if __name__ == '__main__': + import pylab + import scipy + _test(0.5) diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/PermutPixel.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/PermutPixel.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,114 @@ +#!/usr/bin/python +# coding: utf-8 + +''' +Un echange de pixels est effectue entre certain pixels choisit aleatoirement +et un de ses 4 voisins, tout aussi choisi aleatoirement. + +Le nombre de pixels permutes est definit pas complexity*1024 + +Il y a proba 20% d'effectuer le bruitage + +Sylvain Pannetier Lebeuf dans le cadre de IFT6266, hiver 2010 + +''' + +import numpy +import random + +class PermutPixel(): + + def __init__(self,seed=7152): + self.nombre=10 #Le nombre de pixels a permuter + self.proportion=0.3 + self.effectuer=1 #1=on effectue, 0=rien faire + self.seed=seed + + #Les deux generateurs sont de types differents, avoir la meme seed n'a pas d'influence + #numpy.random.seed(self.seed) + #random.seed(self.seed) + + def get_seed(self): + return self.seed + + def get_settings_names(self): + return ['effectuer'] + + def get_settings_names_determined_by_complexity(self,complexity): + return ['nombre'] + + def regenerate_parameters(self, complexity): + self.proportion=float(complexity)/3 + self.nombre=int(256*self.proportion)*4 #Par multiple de 4 (256=1024/4) + self.echantillon=random.sample(xrange(0,1024),self.nombre) #Les pixels qui seront permutes + self.effectuer =numpy.random.binomial(1,0.2) ##### On a 20% de faire un bruit ##### + return self._get_current_parameters() + + def _get_current_parameters(self): + return [self.effectuer] + + def get_parameters_determined_by_complexity(self, complexity): + return [int(complexity*256)*4] + + def transform_image(self, image): + if self.effectuer==0: + return image + + image=image.reshape(1024,1) + temp=0 #variable temporaire + + for i in xrange(0,self.nombre,4): #Par bonds de 4 + #gauche + if self.echantillon[i] > 0: + temp=image[self.echantillon[i]-1] + image[self.echantillon[i]-1]=image[self.echantillon[i]] + image[self.echantillon[i]]=temp + #droite + if self.echantillon[i+1] < 1023: + temp=image[self.echantillon[i+1]+1] + image[self.echantillon[i+1]+1]=image[self.echantillon[i+1]] + image[self.echantillon[i+1]]=temp + #haut + if self.echantillon[i+2] > 31: + temp=image[self.echantillon[i+2]-32] + image[self.echantillon[i+2]-32]=image[self.echantillon[i+2]] + image[self.echantillon[i+2]]=temp + #bas + if self.echantillon[i+3] < 992: + temp=image[self.echantillon[i+3]+32] + image[self.echantillon[i+3]+32]=image[self.echantillon[i+3]] + image[self.echantillon[i+3]]=temp + + + return image.reshape((32,32)) + + +#---TESTS--- + +def _load_image(): + f = open('/home/sylvain/Dropbox/Msc/IFT6266/donnees/lower_test_data.ft') #Le jeu de donnees est en local. 
+ d = ft.read(f) + w=numpy.asarray(d[random.randint(0,100)]) + return (w/255.0).astype('float') + +def _test(complexite): + img=_load_image() + transfo = PermutPixel() + pylab.imshow(img.reshape((32,32))) + pylab.show() + print transfo.get_settings_names() + print transfo.regenerate_parameters(complexite) + + img_trans=transfo.transform_image(img) + + pylab.imshow(img_trans.reshape((32,32))) + pylab.show() + + +if __name__ == '__main__': + from pylearn.io import filetensor as ft + import pylab + for i in xrange(0,5): + _test(0.5) + + diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/PoivreSel.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/PoivreSel.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,94 @@ +#!/usr/bin/python +# coding: utf-8 + +''' +Ajout de bruit poivre et sel dans les donnees. Le bruit est distribue de facon +aleatoire tire d'une uniforme tout comme la clarte des bites changees. + +La proportion de bites aleatoires est definit par complexity/5. +Lorsque cette valeur est a 1 ==> Plus reconnaissable et 0 ==> Rien ne se passe + +On a maintenant 25% de chance d'effectuer un bruitage. + +Ce fichier prend pour acquis que les images sont donnees une a la fois +sous forme de numpy.array de 1024 (32 x 32) valeurs entre 0 et 1. + +Sylvain Pannetier Lebeuf dans le cadre de IFT6266, hiver 2010 + +''' + +import numpy +import random + +class PoivreSel(): + + def __init__(self,seed=9361): + self.proportion_bruit=0.08 #Le pourcentage des pixels qui seront bruites + self.nb_chng=10 #Le nombre de pixels changes. Seulement pour fin de calcul + self.effectuer=1 #Vaut 1 si on effectue et 0 sinon. + + self.seed=seed + #Les deux generateurs sont de types differents, avoir la meme seed n'a pas d'influence + #numpy.random.seed(self.seed) + #random.seed(self.seed) + + def get_seed(self): + return self.seed + + def get_settings_names(self): + return ['effectuer'] + + def get_settings_names_determined_by_complexity(self,complexity): + return ['proportion_bruit'] + + def regenerate_parameters(self, complexity): + self.proportion_bruit = float(complexity)/5 + self.nb_chng=int(1024*self.proportion_bruit) + self.changements=random.sample(xrange(1024),self.nb_chng) #Les pixels qui seront changes + self.effectuer =numpy.random.binomial(1,0.25) ##### On a 25% de faire un bruit ##### + return self._get_current_parameters() + + def _get_current_parameters(self): + return [self.effectuer] + + def get_parameters_determined_by_complexity(self, complexity): + return [float(complexity)/5] + + def transform_image(self, image): + if self.effectuer == 0: + return image + + image=image.reshape(1024,1) + for j in xrange(0,self.nb_chng): + image[self.changements[j]]=numpy.random.random() #On determine les nouvelles valeurs des pixels changes + return image.reshape(32,32) + + +#---TESTS--- + +def _load_image(): + f = open('/home/sylvain/Dropbox/Msc/IFT6266/donnees/lower_test_data.ft') #Le jeu de donnees est en local. 
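# A vectorised sketch of the salt-and-pepper noise applied by PoivreSel above:
# a proportion complexity/5 of the 1024 pixels is replaced by values drawn
# uniformly in [0, 1). The 25% gating of the class is left out for brevity and
# the function name is illustrative.
import numpy

def poivre_sel_sketch(image, complexity, rng=numpy.random):
    flat = image.reshape(-1).copy()
    n_changed = int(flat.size * complexity / 5.0)
    idx = rng.permutation(flat.size)[:n_changed]     # pixels to corrupt
    flat[idx] = rng.random_sample(n_changed)         # new random intensities
    return flat.reshape(image.shape)

# usage: poivre_sel_sketch(numpy.zeros((32, 32)), complexity=1.0)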
+ d = ft.read(f) + w=numpy.asarray(d[0]) + return (w/255.0).astype('float') + +def _test(complexite): + img=_load_image() + transfo = PoivreSel() + pylab.imshow(img.reshape((32,32))) + pylab.show() + print transfo.get_settings_names() + print transfo.regenerate_parameters(complexite) + + img_trans=transfo.transform_image(img) + + pylab.imshow(img_trans.reshape((32,32))) + pylab.show() + + +if __name__ == '__main__': + from pylearn.io import filetensor as ft + import pylab + _test(0.5) + + diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/Rature.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/Rature.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,255 @@ +#!/usr/bin/python +# coding: utf-8 + +''' +Ajout d'une rature sur le caractère. La rature est en fait un 1 qui recoit une +rotation et qui est ensuite appliqué sur le caractère. Un grossissement, puis deux +erosions sont effectuees sur le 1 afin qu'il ne soit plus reconnaissable. +Il y a des chances d'avoir plus d'une seule rature ! + +Il y a 15% d'effectuer une rature. + +Ce fichier prend pour acquis que les images sont donnees une a la fois +sous forme de numpy.array de 1024 (32 x 32) valeurs entre 0 et 1. + +Sylvain Pannetier Lebeuf dans le cadre de IFT6266, hiver 2010 + +''' + +import numpy, Image, random +import scipy.ndimage.morphology +from pylearn.io import filetensor as ft + + +class Rature(): + + def __init__(self,seed=1256): + self.angle=0 #Angle en degre de la rotation (entre 0 et 180) + self.numero=0 #Le numero du 1 choisi dans la banque de 1 + self.gauche=-1 #Le numero de la colonne la plus a gauche contenant le 1 + self.droite=-1 + self.haut=-1 + self.bas=-1 + self.faire=1 #1=on effectue et 0=fait rien + + self.crop_haut=0 + self.crop_gauche=0 #Ces deux valeurs sont entre 0 et 31 afin de definir + #l'endroit ou sera pris le crop dans l'image du 1 + + self.largeur_bande=-1 #La largeur de la bande + self.smooth=-1 #La largeur de la matrice carree servant a l'erosion + self.nb_ratures=-1 #Le nombre de ratures appliques + self.fini=0 #1=fini de mettre toutes les couches 0=pas fini + self.complexity=0 #Pour garder en memoire la complexite si plusieurs couches sont necessaires + self.seed=seed + + #numpy.random.seed(self.seed) + + f3 = open('/data/lisa/data/ift6266h10/un_rature.ft') #Doit etre sur le reseau DIRO. + #f3 = open('/home/sylvain/Dropbox/Msc/IFT6266/donnees/un_rature.ft') + #Il faut arranger le path sinon + w=ft.read(f3) + f3.close() + self.d=(w.astype('float'))/255 + + self.patch=self.d[0].reshape((32,32)) #La patch de rature qui sera appliquee sur l'image + + def get_settings_names(self): + return ['angle','numero','faire','crop_haut','crop_gauche','largeur_bande','smooth','nb_ratures'] + + def get_seed(self): + return self.seed + + def regenerate_parameters(self, complexity,next_rature = False): + + + self.numero=random.randint(0,4999) #Ces bornes sont inclusives ! 
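# All of the transformation modules added in this changeset expose the same
# informal interface: get_settings_names(), regenerate_parameters(complexity)
# and transform_image(image). A small driver sketch, with an illustrative
# class list and complexity value, showing how they are meant to be chained:
def apply_pipeline_sketch(image, transformations, complexity=0.5):
    for transfo in transformations:
        params = transfo.regenerate_parameters(complexity)   # resampled per image
        print transfo.__class__.__name__, transfo.get_settings_names(), params
        image = transfo.transform_image(image)
    return image

# usage (hypothetical): apply_pipeline_sketch(img, [PoivreSel(), PermutPixel()])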
+ self.fini=0 + self.complexity=complexity + + if float(complexity) > 0: + + self.gauche=self.droite=self.haut=self.bas=-1 #Remet tout a -1 + + self.angle=int(numpy.random.normal(90,100*complexity)) + + self.faire=numpy.random.binomial(1,0.15) ##### 15% d'effectuer une rature ##### + if next_rature: + self.faire = 1 + #self.faire=1 #Pour tester seulement + + self.crop_haut=random.randint(0,17) + self.crop_gauche=random.randint(0,17) + if complexity <= 0.25 : + self.smooth=6 + elif complexity <= 0.5: + self.smooth=5 + elif complexity <= 0.75: + self.smooth=4 + else: + self.smooth=3 + + p = numpy.random.rand() + if p < 0.5: + self.nb_ratures= 1 + else: + if p < 0.8: + self.nb_ratures = 2 + else: + self.nb_ratures = 3 + + #Creation de la "patch" de rature qui sera appliquee sur l'image + if self.faire == 1: + self.get_size() + self.get_image_rot() #On fait la "patch" + + else: + self.faire=0 #On ne fait rien si complexity=0 !! + + return self._get_current_parameters() + + + def get_image_rot(self): + image2=(self.d[self.numero].reshape((32,32))[self.haut:self.bas,self.gauche:self.droite]) + + im = Image.fromarray(numpy.asarray(image2*255,dtype='uint8')) + + #La rotation et le resize sont de belle qualite afin d'avoir une image nette + im2 = im.rotate(self.angle,Image.BICUBIC,expand=False) + im3=im2.resize((50,50),Image.ANTIALIAS) + + grosse=numpy.asarray(numpy.asarray(im3)/255.0,dtype='float32') + crop=grosse[self.haut:self.haut+32,self.gauche:self.gauche+32] + + self.get_patch(crop) + + def get_patch(self,crop): + smooting = numpy.ones((self.smooth,self.smooth)) + #Il y a deux erosions afin d'avoir un beau resultat. Pas trop large et + #pas trop mince + trans=scipy.ndimage.morphology.grey_erosion\ + (crop,size=smooting.shape,structure=smooting,mode='wrap') + trans1=scipy.ndimage.morphology.grey_erosion\ + (trans,size=smooting.shape,structure=smooting,mode='wrap') + + + patch_img=Image.fromarray(numpy.asarray(trans1*255,dtype='uint8')) + + patch_img2=patch_img.crop((4,4,28,28)).resize((32,32)) #Pour contrer les effets de bords ! + + trans2=numpy.asarray(numpy.asarray(patch_img2)/255.0,dtype='float32') + + + #Tout ramener entre 0 et 1 + trans2=trans2-trans2.min() #On remet tout positif + trans2=trans2/trans2.max() + + #La rayure a plus de chance d'etre en bas ou oblique le haut a 10h + if random.random() <= 0.5: #On renverse la matrice dans ce cas + for i in xrange(0,32): + self.patch[i,:]=trans2[31-i,:] + else: + self.patch=trans2 + + + + + def get_size(self): + image=self.d[self.numero].reshape((32,32)) + + #haut + for i in xrange(0,32): + for j in xrange(0,32): + if(image[i,j]) != 0: + if self.haut == -1: + self.haut=i + break + if self.haut > -1: + break + + #bas + for i in xrange(31,-1,-1): + for j in xrange(0,32): + if(image[i,j]) != 0: + if self.bas == -1: + self.bas=i + break + if self.bas > -1: + break + + #gauche + for i in xrange(0,32): + for j in xrange(0,32): + if(image[j,i]) != 0: + if self.gauche == -1: + self.gauche=i + break + if self.gauche > -1: + break + + #droite + for i in xrange(31,-1,-1): + for j in xrange(0,32): + if(image[j,i]) != 0: + if self.droite == -1: + self.droite=i + break + if self.droite > -1: + break + + + def _get_current_parameters(self): + return [self.angle,self.numero,self.faire,self.crop_haut,self.crop_gauche,self.largeur_bande,self.smooth,self.nb_ratures] + + def transform_image(self, image): + if self.faire == 0: #Rien faire !! 
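# A condensed sketch of the "rature" patch construction done by
# get_image_rot() and get_patch() above: a glyph is rotated, upsampled, eroded
# twice with a flat structuring element so it becomes a thick smooth stroke,
# then rescaled to [0, 1]. Sizes and the smoothing width are illustrative; the
# real class also crops around the glyph's bounding box first.
import numpy, Image
import scipy.ndimage.morphology

def make_rature_patch_sketch(glyph, angle, smooth=4):
    # glyph: 32x32 float array in [0, 1]; returns a 32x32 stroke patch
    im = Image.fromarray(numpy.asarray(glyph * 255, dtype='uint8'))
    im = im.rotate(angle, Image.BICUBIC, expand=False).resize((50, 50), Image.ANTIALIAS)
    arr = numpy.asarray(im, dtype='float32') / 255.0
    footprint = numpy.ones((smooth, smooth))
    for _ in range(2):    # two erosions: thick but not too wide
        arr = scipy.ndimage.morphology.grey_erosion(arr, size=footprint.shape,
                                                    structure=footprint, mode='wrap')
    arr = arr[9:41, 9:41]                 # central 32x32 crop of the 50x50 canvas
    arr = arr - arr.min()                 # bring everything back above zero
    return arr / max(arr.max(), 1e-8)     # and normalise to [0, 1]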
+ return image + + if self.fini == 0: #S'il faut rajouter des couches + patch_temp=self.patch + for w in xrange(1,self.nb_ratures): + self.regenerate_parameters(self.complexity,1) + for i in xrange(0,32): + for j in xrange(0,32): + patch_temp[i,j]=max(patch_temp[i,j],self.patch[i,j]) + self.fini=1 + self.patch=patch_temp + + for i in xrange(0,32): + for j in xrange(0,32): + image[i,j]=max(image[i,j],self.patch[i,j]) + self.patch*=0 #Remise a zero de la patch (pas necessaire) + return image + + +#---TESTS--- + +def _load_image(): + f = open('/home/sylvain/Dropbox/Msc/IFT6266/donnees/lower_test_data.ft') #Le jeu de donnees est en local. + d = ft.read(f) + w=numpy.asarray(d[0:1000]) + return (w/255.0).astype('float') + +def _test(complexite): + img=_load_image() + transfo = Rature() + for i in xrange(0,10): + img2=img[random.randint(0,1000)] + pylab.imshow(img2.reshape((32,32))) + pylab.show() + print transfo.get_settings_names() + print transfo.regenerate_parameters(complexite) + img2=img2.reshape((32,32)) + + img2_trans=transfo.transform_image(img2) + + pylab.imshow(img2_trans.reshape((32,32))) + pylab.show() + + +if __name__ == '__main__': + from pylearn.io import filetensor as ft + import pylab + _test(1) + + diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/__init__.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,6 @@ +from pycaptcha import Facade + +__all__ = ['PoivreSel','thick','BruitGauss','DistorsionGauss','PermutPixel',\ + 'gimp_script','Rature','contrast','local_elastic_distortions', \ + 'slant','Occlusion','add_background_image','affine_transform',\ + 'ttf2jpg','Facade'] diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/add_background_image.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/add_background_image.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,112 @@ +#!/usr/bin/python +# -*- coding: iso-8859-1 -*- + +''' + Implementation of random background adding to a specific image + + Author: Guillaume Sicard +''' + +import sys, os, random +import cPickle +import Image, numpy + +class AddBackground(): + def __init__(self, threshold = 128, complexity = 1): + self.h = 32 + self.w = 32 + self.threshold = 1; + try: #in order to load locally if it is available + self.bg_image_file = '/Tmp/image_net/' + f=open(self.bg_image_file+'filelist.pkl') + except: + self.bg_image_file = '/data/lisa/data/ift6266h10/image_net/' + f=open(self.bg_image_file+'filelist.pkl') + self.image_files = cPickle.load(f) + f.close() + self.regenerate_parameters(complexity) + + def get_current_parameters(self): + return [self.contrast] + # get threshold value + def get_settings_names(self): + return ['contrast'] + + # no need, except for testmod.py + def regenerate_parameters(self, complexity): + self.contrast = 1-numpy.random.rand()*complexity + return [self.contrast] + + # load an image + def load_image(self,filename): + image = Image.open(filename).convert('L') + image = numpy.asarray(image) + image = (image / 255.0).astype(numpy.float32) + return image + + # save an image + def save_image(self,array, filename): + image = (array * 255.0).astype('int') + image = Image.fromarray(image) + if (filename != ''): + image.save(filename) + else: + image.show() + + # make a random 32x32 crop of an image + def rand_crop(self,image): + i_w, i_h = image.shape + x, y = random.randint(0, i_w - self.w), random.randint(0, i_h - self.h) + return image[x:x + self.w, 
y:y + self.h] + + # select a random background image from "bg_image_file" and crops it + def rand_bg_image(self,maximage): + i = random.randint(0, len(self.image_files) - 1) + + image = self.load_image(self.bg_image_file + self.image_files[i]) + self.bg_image = self.rand_crop(image) + maxbg = self.bg_image.max() + self.bg_image = self.bg_image / maxbg * ( max(maximage - self.contrast,0.0) ) + + # set "bg_image" as background to "image", based on a pixels threshold + def set_bg(self,image): + tensor = numpy.asarray([self.bg_image,image],dtype='float32') + return tensor.max(0) + + # transform an image file and return an array + def transform_image_from_file(self, filename): + self.rand_bg_image() + image = self.load_image(filename) + image = self.set_bg(image) + return image + + # standard array to array transform + def transform_image(self, image): + self.rand_bg_image(image.max()) + image = self.set_bg(image) + return image + + # test method + def test(self,filename): + import time + + sys.stdout.write('Starting addBackground test : loading image') + sys.stdout.flush() + + image = self.load_image(filename) + + t = 0 + n = 500 + for i in range(n): + t0 = time.time() + image2 = self.transform_image(image) + t = ( i * t + (time.time() - t0) ) / (i + 1) + sys.stdout.write('.') + sys.stdout.flush() + + print "Done!\nAverage time : " + str(1000 * t) + " ms" + +if __name__ == '__main__': + + myAddBackground = AddBackground() + myAddBackground.test('./images/0-LiberationSans-Italic.ttf.jpg') diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/affine_transform.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/affine_transform.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,89 @@ +#!/usr/bin/python +# coding: utf-8 + +''' +Simple implementation of random affine transformations based on the Python +Imaging Module affine transformations. + + +Author: Razvan Pascanu +''' + +import numpy, Image + + + +class AffineTransformation(): + def __init__( self, complexity = .5): + self.shape = (32,32) + self.complexity = complexity + params = numpy.random.uniform(size=6) -.5 + self.a = 1. + params[0]*.6*complexity + self.b = 0. + params[1]*.6*complexity + self.c = params[2]*8.*complexity + self.d = 0. + params[3]*.6*complexity + self.e = 1. + params[4]*.6*complexity + self.f = params[5]*8.*complexity + + + def _get_current_parameters(self): + return [self.a, self.b, self.c, self.d, self.e, self.f] + + def get_settings_names(self): + return ['a','b','c','d','e','f'] + + def regenerate_parameters(self, complexity): + # generate random affine transformation + # a point (x',y') of the new image corresponds to (x,y) of the old + # image where : + # x' = params[0]*x + params[1]*y + params[2] + # y' = params[3]*x + params[4]*y _ params[5] + + # the ranges are set manually as to look acceptable + + self.complexity = complexity + params = numpy.random.uniform(size=6) -.5 + self.a = 1. + params[0]*.8*complexity + self.b = 0. + params[1]*.8*complexity + self.c = params[2]*9.*complexity + self.d = 0. + params[3]*.8*complexity + self.e = 1. 
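# The background compositing in set_bg() above boils down to a per-pixel
# maximum between the character and a contrast-scaled crop of a background
# photo. A hedged, self-contained restatement, with synthetic noise standing
# in for the ImageNet crop:
import numpy

def add_background_sketch(image, background, contrast):
    bg = background / max(background.max(), 1e-8)
    bg = bg * max(image.max() - contrast, 0.0)    # dim the background
    return numpy.maximum(bg, image)               # same effect as tensor.max(0) above

# usage: add_background_sketch(img, numpy.random.rand(32, 32), contrast=0.7)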
+ params[4]*.8*complexity + self.f = params[5]*9.*complexity + return self._get_current_parameters() + + + + + def transform_image(self,NIST_image): + + im = Image.fromarray( \ + numpy.asarray(\ + NIST_image.reshape(self.shape)*255.0, dtype='uint8')) + nwim = im.transform( (32,32), Image.AFFINE, [self.a,self.b,self.c,self.d,self.e,self.f]) + return numpy.asarray(numpy.asarray(nwim)/255.0,dtype='float32') + + + +if __name__ =='__main__': + print 'random test' + + from pylearn.io import filetensor as ft + import pylab + + datapath = '/data/lisa/data/nist/by_class/' + + f = open(datapath+'digits/digits_train_data.ft') + d = ft.read(f) + f.close() + + + transformer = AffineTransformation() + id = numpy.random.randint(30) + + pylab.figure() + pylab.imshow(d[id].reshape((32,32))) + pylab.figure() + pylab.imshow(transformer.transform_image(d[id]).reshape((32,32))) + + pylab.show() + diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/contrast.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/contrast.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,137 @@ +#!/usr/bin/python +# coding: utf-8 + +''' +Simple implementation of random contrast. This always switch half the time the polarity. +then it decides of a random contrast dependant of the complexity, the mean of the maximum and minimum +pixel value stays 0 (to avoid import bias change between exemples). + +Author: Xavier Glorot +''' + +import numpy as N +import copy + + +class Contrast(): + def __init__(self,complexity = 1): + #---------- private attributes + self.__nx__ = 32 #xdim of the images + self.__ny__ = 32 #ydim of the images + self.__Pinvert__ = 0.5 #probability to switch polarity + self.__mincontrast__ = 0.15 + self.__resolution__ = 256 + self.__rangecontrastres__ = self.__resolution__ - N.int(self.__mincontrast__*self.__resolution__) + #------------------------------------------------ + + #---------- generation parameters + self.regenerate_parameters(complexity) + #------------------------------------------------ + + def _get_current_parameters(self): + return [self.invert,self.contrast] + + def get_settings_names(self): + return ['invert','contrast'] + + def regenerate_parameters(self, complexity): + self.invert = (N.random.uniform() < self.__Pinvert__) + self.contrast = self.__resolution__ - N.random.randint(1 + self.__rangecontrastres__ * complexity) + return self._get_current_parameters() + + def transform_1_image(self,image): #the real transformation method + maxi = image.max() + mini = image.min() + if self.invert: + newimage = 1 - (self.__resolution__- self.contrast) / (2 * float(self.__resolution__)) -\ + (image - mini) / float(maxi - mini) * self.contrast / float(self.__resolution__) + else: + newimage = (self.__resolution__- self.contrast) / (2 * float(self.__resolution__)) +\ + (image - mini) / float(maxi - mini) * self.contrast / float(self.__resolution__) + if image.dtype == 'uint8': + return N.asarray(newimage*255,dtype='uint8') + else: + return N.asarray(newimage,dtype=image.dtype) + + def transform_image(self,image): #handling different format + if image.shape == (self.__nx__,self.__ny__): + return self.transform_1_image(image) + if image.ndim == 3: + newimage = copy.copy(image) + for i in range(image.shape[0]): + newimage[i,:,:] = self.transform_1_image(image[i,:,:]) + return newimage + if image.ndim == 2 and image.shape != (self.__nx__,self.__ny__): + newimage = N.reshape(image,(image.shape[0],self.__nx__,self.__ny__)) + for i in range(image.shape[0]): + 
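# A sketch of the random affine warp used above: PIL's Image.AFFINE takes six
# coefficients (a, b, c, d, e, f) and, for every output pixel (x, y), samples
# the input at (a*x + b*y + c, d*x + e*y + f). The coefficient ranges follow
# regenerate_parameters(); the helper name is ours.
import numpy, Image

def random_affine_sketch(image32, complexity=0.5, rng=numpy.random):
    p = rng.uniform(size=6) - 0.5
    coeffs = [1.0 + p[0] * 0.8 * complexity,    # a
              p[1] * 0.8 * complexity,          # b
              p[2] * 9.0 * complexity,          # c (x translation)
              p[3] * 0.8 * complexity,          # d
              1.0 + p[4] * 0.8 * complexity,    # e
              p[5] * 9.0 * complexity]          # f (y translation)
    im = Image.fromarray(numpy.asarray(image32.reshape((32, 32)) * 255.0, dtype='uint8'))
    out = im.transform((32, 32), Image.AFFINE, coeffs)
    return numpy.asarray(out, dtype='float32') / 255.0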
newimage[i,:,:] = self.transform_1_image(newimage[i,:,:]) + return N.reshape(newimage,image.shape) + if image.ndim == 1: + newimage = N.reshape(image,(self.__nx__,self.__ny__)) + newimage = self.transform_1_image(newimage) + return N.reshape(newimage,image.shape) + assert False #should never go there + + + + +#test on NIST (you need pylearn and access to NIST to do that) + +if __name__ == '__main__': + + from pylearn.io import filetensor as ft + import copy + import pygame + import time + datapath = '/data/lisa/data/nist/by_class/' + f = open(datapath+'digits/digits_train_data.ft') + d = ft.read(f) + + pygame.surfarray.use_arraytype('numpy') + + pygame.display.init() + screen = pygame.display.set_mode((8*2*32,8*32),0,8) + anglcolorpalette=[(x,x,x) for x in xrange(0,256)] + screen.set_palette(anglcolorpalette) + + MyContrast = Contrast() + + debut=time.time() + MyContrast.transform_image(d) + fin=time.time() + print '------------------------------------------------' + print d.shape[0],' images transformed in :', fin-debut, ' seconds' + print '------------------------------------------------' + print (fin-debut)/d.shape[0]*1000000,' microseconds per image' + print '------------------------------------------------' + print MyContrast.get_settings_names() + print MyContrast._get_current_parameters() + print MyContrast.regenerate_parameters(0) + print MyContrast.regenerate_parameters(0.5) + print MyContrast.regenerate_parameters(1) + for i in range(10000): + a=d[i,:] + b=N.asarray(N.reshape(a,(32,32))).T + + new=pygame.surfarray.make_surface(b) + new=pygame.transform.scale2x(new) + new=pygame.transform.scale2x(new) + new=pygame.transform.scale2x(new) + new.set_palette(anglcolorpalette) + screen.blit(new,(0,0)) + + print MyContrast.get_settings_names(), MyContrast.regenerate_parameters(1) + c=MyContrast.transform_image(a) + b=N.asarray(N.reshape(c,(32,32))).T + + new=pygame.surfarray.make_surface(b) + new=pygame.transform.scale2x(new) + new=pygame.transform.scale2x(new) + new=pygame.transform.scale2x(new) + new.set_palette(anglcolorpalette) + screen.blit(new,(8*32,0)) + + pygame.display.update() + raw_input('Press Enter') + + pygame.display.quit() diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/filetensor.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/filetensor.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,232 @@ +""" +Read and write the matrix file format described at +U{http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html} + +The format is for dense tensors: + + - magic number indicating type and endianness - 4bytes + - rank of tensor - int32 + - dimensions - int32, int32, int32, ... + - + +The number of dimensions and rank is slightly tricky: + - for scalar: rank=0, dimensions = [1, 1, 1] + - for vector: rank=1, dimensions = [?, 1, 1] + - for matrix: rank=2, dimensions = [?, ?, 1] + +For rank >= 3, the number of dimensions matches the rank exactly. + + +@todo: add complex type support + +""" +import sys +import numpy + +def _prod(lst): + p = 1 + for l in lst: + p *= l + return p + +_magic_dtype = { + 0x1E3D4C51 : ('float32', 4), + #0x1E3D4C52 : ('packed matrix', 0), #what is a packed matrix? 
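# The contrast change in transform_1_image() above rescales the image into a
# band of width contrast/resolution centred at 0.5, optionally inverting the
# polarity. A one-function restatement (assumes the image is not constant):
import numpy

def random_contrast_sketch(image, contrast, resolution=256, invert=False):
    lo, hi = image.min(), image.max()
    unit = (image - lo) / float(hi - lo)                  # rescale to [0, 1]
    margin = (resolution - contrast) / (2.0 * resolution)
    if invert:
        return 1.0 - margin - unit * contrast / float(resolution)
    return margin + unit * contrast / float(resolution)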
+ 0x1E3D4C53 : ('float64', 8), + 0x1E3D4C54 : ('int32', 4), + 0x1E3D4C55 : ('uint8', 1), + 0x1E3D4C56 : ('int16', 2), + } +_dtype_magic = { + 'float32': 0x1E3D4C51, + #'packed matrix': 0x1E3D4C52, + 'float64': 0x1E3D4C53, + 'int32': 0x1E3D4C54, + 'uint8': 0x1E3D4C55, + 'int16': 0x1E3D4C56 + } + +def _read_int32(f): + """unpack a 4-byte integer from the current position in file f""" + s = f.read(4) + s_array = numpy.fromstring(s, dtype='int32') + return s_array.item() + +def _read_header(f, debug=False): + """ + :returns: data type, element size, rank, shape, size + """ + #what is the data type of this matrix? + #magic_s = f.read(4) + #magic = numpy.fromstring(magic_s, dtype='int32') + magic = _read_int32(f) + magic_t, elsize = _magic_dtype[magic] + if debug: + print 'header magic', magic, magic_t, elsize + if magic_t == 'packed matrix': + raise NotImplementedError('packed matrix not supported') + + #what is the rank of the tensor? + ndim = _read_int32(f) + if debug: print 'header ndim', ndim + + #what are the dimensions of the tensor? + dim = numpy.fromfile(f, dtype='int32', count=max(ndim,3))[:ndim] + dim_size = _prod(dim) + if debug: print 'header dim', dim, dim_size + + return magic_t, elsize, ndim, dim, dim_size + +class arraylike(object): + """Provide an array-like interface to the filetensor in f. + + The rank parameter to __init__ controls how this object interprets the underlying tensor. + Its behaviour should be clear from the following example. + Suppose the underlying tensor is MxNxK. + + - If rank is 0, self[i] will be a scalar and len(self) == M*N*K. + + - If rank is 1, self[i] is a vector of length K, and len(self) == M*N. + + - If rank is 3, self[i] is a 3D tensor of size MxNxK, and len(self)==1. + + - If rank is 5, self[i] is a 5D tensor of size 1x1xMxNxK, and len(self) == 1. + + + :note: Objects of this class generally require exclusive use of the underlying file handle, because + they call seek() every time you access an element. 
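# An illustration of the on-disk layout described above: a short run of int32s
# (magic, rank, then at least three dimension fields) followed by the raw
# elements. This toy writer/reader uses numpy only and a placeholder path; the
# magic number is the float32 code from the table above.
import numpy

def write_toy_filetensor(path, vec):
    vec = numpy.asarray(vec, dtype='float32')
    header = numpy.asarray([0x1E3D4C51,        # magic: float32
                            1,                 # rank: vector
                            vec.size, 1, 1],   # dims, padded to three entries
                           dtype='int32')
    f = open(path, 'wb')
    header.tofile(f)
    vec.tofile(f)
    f.close()

def read_toy_filetensor(path):
    f = open(path, 'rb')
    magic, rank = numpy.fromfile(f, dtype='int32', count=2)
    dims = numpy.fromfile(f, dtype='int32', count=max(rank, 3))[:rank]
    data = numpy.fromfile(f, dtype='float32').reshape(dims)
    f.close()
    return data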
+ """ + + f = None + """File-like object""" + + magic_t = None + """numpy data type of array""" + + elsize = None + """number of bytes per scalar element""" + + ndim = None + """Rank of underlying tensor""" + + dim = None + """tuple of array dimensions (aka shape)""" + + dim_size = None + """number of scalars in the tensor (prod of dim)""" + + f_start = None + """The file position of the first element of the tensor""" + + readshape = None + """tuple of array dimensions of the block that we read""" + + readsize = None + """number of elements we must read for each block""" + + def __init__(self, f, rank=0, debug=False): + self.f = f + self.magic_t, self.elsize, self.ndim, self.dim, self.dim_size = _read_header(f,debug) + self.f_start = f.tell() + + if rank <= self.ndim: + self.readshape = tuple(self.dim[self.ndim-rank:]) + else: + self.readshape = tuple(self.dim) + + #self.readshape = tuple(self.dim[self.ndim-rank:]) if rank <= self.ndim else tuple(self.dim) + + if rank <= self.ndim: + padding = tuple() + else: + padding = (1,) * (rank - self.ndim) + + #padding = tuple() if rank <= self.ndim else (1,) * (rank - self.ndim) + self.returnshape = padding + self.readshape + self.readsize = _prod(self.readshape) + if debug: print 'READ PARAM', self.readshape, self.returnshape, self.readsize + + def __len__(self): + return _prod(self.dim[:self.ndim-len(self.readshape)]) + + def __getitem__(self, idx): + if idx >= len(self): + raise IndexError(idx) + self.f.seek(self.f_start + idx * self.elsize * self.readsize) + return numpy.fromfile(self.f, + dtype=self.magic_t, + count=self.readsize).reshape(self.returnshape) + + +# +# TODO: implement item selection: +# e.g. load('some mat', subtensor=(:6, 2:5)) +# +# This function should be memory efficient by: +# - allocating an output matrix at the beginning +# - seeking through the file, reading subtensors from multiple places +def read(f, subtensor=None, debug=False): + """Load all or part of file 'f' into a numpy ndarray + + @param f: file from which to read + @type f: file-like object + + If subtensor is not None, it should be like the argument to + numpy.ndarray.__getitem__. The following two expressions should return + equivalent ndarray objects, but the one on the left may be faster and more + memory efficient if the underlying file f is big. + + read(f, subtensor) <===> read(f)[*subtensor] + + Support for subtensors is currently spotty, so check the code to see if your + particular type of subtensor is supported. + + """ + magic_t, elsize, ndim, dim, dim_size = _read_header(f,debug) + f_start = f.tell() + + rval = None + if subtensor is None: + rval = numpy.fromfile(f, dtype=magic_t, count=_prod(dim)).reshape(dim) + elif isinstance(subtensor, slice): + if subtensor.step not in (None, 1): + raise NotImplementedError('slice with step', subtensor.step) + if subtensor.start not in (None, 0): + bytes_per_row = _prod(dim[1:]) * elsize + f.seek(f_start + subtensor.start * bytes_per_row) + dim[0] = min(dim[0], subtensor.stop) - subtensor.start + rval = numpy.fromfile(f, dtype=magic_t, count=_prod(dim)).reshape(dim) + else: + raise NotImplementedError('subtensor access not written yet:', subtensor) + + return rval + +def write(f, mat): + """Write a numpy.ndarray to file. 
+ + @param f: file into which to write + @type f: file-like object + + @param mat: array to write to file + @type mat: numpy ndarray or compatible + + """ + def _write_int32(f, i): + i_array = numpy.asarray(i, dtype='int32') + if 0: print 'writing int32', i, i_array + i_array.tofile(f) + + try: + _write_int32(f, _dtype_magic[str(mat.dtype)]) + except KeyError: + raise TypeError('Invalid ndarray dtype for filetensor format', mat.dtype) + + _write_int32(f, len(mat.shape)) + shape = mat.shape + if len(shape) < 3: + shape = list(shape) + [1] * (3 - len(shape)) + if 0: print 'writing shape =', shape + for sh in shape: + _write_int32(f, sh) + mat.tofile(f) + diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/gimp_script.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/gimp_script.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,75 @@ +#!/usr/bin/env python +# coding: utf-8 + +''' +Filtres GIMP sous Python +Auteur: Nicolas Boulanger-Lewandowski +Date: Hiver 2010 + +run with: gimp -i --batch-interpreter python-fu-eval --batch - < gimp_script.py +end with: pdb.gimp_quit(0) + +Implémente le motionblur et le pinch +''' + +from gimpfu import * +import numpy + +img = gimp.Image(32, 32, GRAY) +img.disable_undo() +layer1 = gimp.Layer(img, "layer1", 32, 32, GRAY_IMAGE, 100, NORMAL_MODE) +img.add_layer(layer1, 0) +dest_rgn = layer1.get_pixel_rgn(0, 0, 32, 32, True) + +def setpix(image): + dest_rgn[:,:] = (image.T*255).astype(numpy.uint8).tostring() + layer1.flush() + layer1.update(0, 0, 32, 32) + +def getpix(): + return numpy.fromstring(dest_rgn[:,:], 'UInt8').astype(numpy.float32).reshape((32,32)).T / 255.0 + +class GIMP1(): + def __init__(self, blur_bool = True): + #This is used to avoid blurring for PNIST + self.blur_bool = blur_bool + + def get_settings_names(self, blur_bool = True): + return ['mblur_length', 'mblur_angle', 'pinch'] + + def regenerate_parameters(self, complexity): + if complexity: + self.mblur_length = abs(int(round(numpy.random.normal(0, 3*complexity)))) + else: + self.mblur_length = 0 + self.mblur_angle = int(round(numpy.random.uniform(0,360))) + self.pinch = numpy.random.uniform(-complexity, 0.7*complexity) + + return [self.mblur_length, self.mblur_angle, self.pinch] + + def transform_image(self, image): + if self.mblur_length or self.pinch: + setpix(image) + if self.mblur_length and self.blur_bool: + pdb.plug_in_mblur(img, layer1, 0, self.mblur_length, self.mblur_angle, 0, 0) + if self.pinch: + pdb.plug_in_whirl_pinch(img, layer1, 0.0, self.pinch, 1.0) + image = getpix() + + return image + +# test +if __name__ == '__main__': + import Image + im = numpy.asarray(Image.open("a.bmp").convert("L")) / 255.0 + + test = GIMP1() + print test.get_settings_names(), '=', test.regenerate_parameters(1) + #for i in range(1000): + im = test.transform_image(im) + + import pylab + pylab.imshow(im, pylab.matplotlib.cm.Greys_r) + pylab.show() + + pdb.gimp_quit(0) diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/image_tiling.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/image_tiling.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,86 @@ +""" +Illustrate filters (or data) in a grid of small image-shaped tiles. 
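# The GIMP filters above only run inside GIMP's Python-Fu interpreter; the
# header of gimp_script.py documents the batch invocation. Below, a hedged
# sketch of driving that from a normal Python process by piping the script on
# stdin, exactly as the "run with:" comment describes. The gimp binary on the
# PATH and the script location are assumptions.
import subprocess

def run_gimp_script_sketch(script_path='gimp_script.py'):
    cmd = ['gimp', '-i', '--batch-interpreter', 'python-fu-eval', '--batch', '-']
    proc = subprocess.Popen(cmd, stdin=subprocess.PIPE)
    proc.communicate(open(script_path).read())    # script ends with pdb.gimp_quit(0)
    return proc.returncode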
+ +Note: taken from the pylearn codebase on Feb 4, 2010 (fsavard) +""" + +import numpy +from PIL import Image + +def scale_to_unit_interval(ndar,eps=1e-8): + ndar = ndar.copy() + ndar -= ndar.min() + ndar *= 1.0 / (ndar.max()+eps) + return ndar + +def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0,0), + scale_rows_to_unit_interval=True, + output_pixel_vals=True + ): + """ + Transform an array with one flattened image per row, into an array in which images are + reshaped and layed out like tiles on a floor. + + This function is useful for visualizing datasets whose rows are images, and also columns of + matrices for transforming those rows (such as the first layer of a neural net). + + :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can be 2-D ndarrays or None + :param X: a 2-D array in which every row is a flattened image. + :type img_shape: tuple; (height, width) + :param img_shape: the original shape of each image + :type tile_shape: tuple; (rows, cols) + :param tile_shape: the number of images to tile (rows, cols) + + :returns: array suitable for viewing as an image. (See:`PIL.Image.fromarray`.) + :rtype: a 2-d array with same dtype as X. + + """ + assert len(img_shape) == 2 + assert len(tile_shape) == 2 + assert len(tile_spacing) == 2 + + out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp + in zip(img_shape, tile_shape, tile_spacing)] + + if isinstance(X, tuple): + assert len(X) == 4 + if output_pixel_vals: + out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype='uint8') + else: + out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype=X.dtype) + + #colors default to 0, alpha defaults to 1 (opaque) + if output_pixel_vals: + channel_defaults = [0,0,0,255] + else: + channel_defaults = [0.,0.,0.,1.] + + for i in xrange(4): + if X[i] is None: + out_array[:,:,i] = numpy.zeros(out_shape, + dtype='uint8' if output_pixel_vals else out_array.dtype + )+channel_defaults[i] + else: + out_array[:,:,i] = tile_raster_images(X[i], img_shape, tile_shape, tile_spacing, scale_rows_to_unit_interval, output_pixel_vals) + return out_array + + else: + H, W = img_shape + Hs, Ws = tile_spacing + + out_array = numpy.zeros(out_shape, dtype='uint8' if output_pixel_vals else X.dtype) + for tile_row in xrange(tile_shape[0]): + for tile_col in xrange(tile_shape[1]): + if tile_row * tile_shape[1] + tile_col < X.shape[0]: + if scale_rows_to_unit_interval: + this_img = scale_to_unit_interval(X[tile_row * tile_shape[1] + tile_col].reshape(img_shape)) + else: + this_img = X[tile_row * tile_shape[1] + tile_col].reshape(img_shape) + out_array[ + tile_row * (H+Hs):tile_row*(H+Hs)+H, + tile_col * (W+Ws):tile_col*(W+Ws)+W + ] \ + = this_img * (255 if output_pixel_vals else 1) + return out_array + + diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/local_elastic_distortions.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/local_elastic_distortions.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,456 @@ +#!/usr/bin/python +# coding: utf-8 + +''' +Implementation of elastic distortions as described in +Simard, Steinkraus, Platt, "Best Practices for Convolutional + Neural Networks Applied to Visual Document Analysis", 2003 + +Author: François Savard +Date: Fall 2009, revised Winter 2010 + +Usage: create the Distorter with proper alpha, sigma etc. + Then each time you want to change the distortion field applied, + call regenerate_field(). 
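# A quick usage sketch for tile_raster_images() above: lay a batch of
# flattened 32x32 samples out on a grid and save the mosaic with PIL. The
# random batch and the output path are placeholders, and the function is
# assumed to be in scope (it is defined in image_tiling.py just above).
import numpy
from PIL import Image

batch = numpy.random.rand(100, 32 * 32).astype('float32')    # 100 fake samples
mosaic = tile_raster_images(batch, img_shape=(32, 32), tile_shape=(10, 10),
                            tile_spacing=(1, 1))
Image.fromarray(mosaic).save('/tmp/mosaic.png')               # placeholder path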
+ + (The point behind this is that regeneration takes some time, + so we better reuse the fields a few times) +''' + +import sys +import math +import numpy +import numpy.random +import scipy.signal # convolve2d + +_TEST_DIR = "/u/savardf/ift6266/debug_images/" + +def _raw_zeros(size): + return [[0 for i in range(size[1])] for j in range(size[0])] + +class ElasticDistortionParams(): + def __init__(self, image_size=(32,32), alpha=0.0, sigma=0.0): + self.image_size = image_size + self.alpha = alpha + self.sigma = sigma + + h,w = self.image_size + + self.matrix_tl_corners_rows = _raw_zeros((h,w)) + self.matrix_tl_corners_cols = _raw_zeros((h,w)) + + self.matrix_tr_corners_rows = _raw_zeros((h,w)) + self.matrix_tr_corners_cols = _raw_zeros((h,w)) + + self.matrix_bl_corners_rows = _raw_zeros((h,w)) + self.matrix_bl_corners_cols = _raw_zeros((h,w)) + + self.matrix_br_corners_rows = _raw_zeros((h,w)) + self.matrix_br_corners_cols = _raw_zeros((h,w)) + + # those will hold the precomputed ratios for + # bilinear interpolation + self.matrix_tl_multiply = numpy.zeros((h,w)) + self.matrix_tr_multiply = numpy.zeros((h,w)) + self.matrix_bl_multiply = numpy.zeros((h,w)) + self.matrix_br_multiply = numpy.zeros((h,w)) + + def alpha_sigma(self): + return [self.alpha, self.sigma] + +class LocalElasticDistorter(): + def __init__(self, image_size=(32,32)): + self.image_size = image_size + + self.current_complexity_10 = 0 + self.current_complexity = 0 + + # number of precomputed fields + # (principle: as complexity doesn't change often, we can + # precompute a certain number of fields for a given complexity, + # each with its own parameters. That way, we have good + # randomization, but we're much faster). + self.to_precompute_per_complexity = 50 + + # Both use ElasticDistortionParams + self.current_params = None + self.precomputed_params = [[] for i in range(10)] + + # + self.kernel_size = None + self.kernel = None + + # set some defaults + self.regenerate_parameters(0.0) + + def get_settings_names(self): + return [] + + def _floor_complexity(self, complexity): + return self._to_complexity_10(complexity) / 10.0 + + def _to_complexity_10(self, complexity): + return min(9, max(0, int(complexity * 10))) + + def regenerate_parameters(self, complexity): + complexity_10 = self._to_complexity_10(complexity) + + if complexity_10 != self.current_complexity_10: + self.current_complexity_10 = complexity_10 + self.current_complexity = self._floor_complexity(complexity) + + if len(self.precomputed_params[complexity_10]) <= self.to_precompute_per_complexity: + # not yet enough params generated, produce one more + # and append to list + new_params = self._initialize_new_params() + new_params = self._generate_fields(new_params) + self.current_params = new_params + self.precomputed_params[complexity_10].append(new_params) + else: + # if we have enough precomputed fields, just select one + # at random and set parameters to match what they were + # when the field was generated + idx = numpy.random.randint(0, len(self.precomputed_params[complexity_10])) + self.current_params = self.precomputed_params[complexity_10][idx] + + # don't return anything, to avoid storing deterministic parameters + return [] # self.current_params.alpha_sigma() + + def get_parameters_determined_by_complexity(self, complexity): + tmp_params = self._initialize_new_params(_floor_complexity(complexity)) + return tmp_params.alpha_sigma() + + def get_settings_names_determined_by_complexity(self, complexity): + return ['alpha', 'sigma'] + + # adapted from 
http://blenderartists.org/forum/showthread.php?t=163361 + def _gen_gaussian_kernel(self, sigma): + # the kernel size can change DRAMATICALLY the time + # for the blur operation... so even though results are better + # with a bigger kernel, we need to compromise here + # 1*s is very different from 2*s, but there's not much difference + # between 2*s and 4*s + ks = self.kernel_size + s = sigma + target_ks = (1.5*s, 1.5*s) + if not ks is None and ks[0] == target_ks[0] and ks[1] == target_ks[1]: + # kernel size is good, ok, no need to regenerate + return + self.kernel_size = target_ks + h,w = self.kernel_size + a,b = h/2.0, w/2.0 + y,x = numpy.ogrid[0:w, 0:h] + gauss = numpy.exp(-numpy.square((x-a)/s))*numpy.exp(-numpy.square((y-b)/s)) + # Normalize so we don't reduce image intensity + self.kernel = gauss/gauss.sum() + + def _gen_distortion_field(self, params): + self._gen_gaussian_kernel(params.sigma) + + # we add kernel_size on all four sides so blurring + # with the kernel produces a smoother result on borders + ks0 = self.kernel_size[0] + ks1 = self.kernel_size[1] + sz0 = self.image_size[1] + ks0 + sz1 = self.image_size[0] + ks1 + field = numpy.random.uniform(-1.0, 1.0, (sz0, sz1)) + field = scipy.signal.convolve2d(field, self.kernel, mode='same') + + # crop only image_size in the middle + field = field[ks0:ks0+self.image_size[0], ks1:ks1+self.image_size[1]] + + return params.alpha * field + + + def _initialize_new_params(self, complexity=None): + if not complexity: + complexity = self.current_complexity + + params = ElasticDistortionParams(self.image_size) + + # pour faire progresser la complexité un peu plus vite + # tout en gardant les extrêmes de 0.0 et 1.0 + complexity = complexity ** (1./3.) + + # the smaller the alpha, the closest the pixels are fetched + # a max of 10 is reasonable + params.alpha = complexity * 10.0 + + # the bigger the sigma, the smoother is the distortion + # max of 1 is "reasonable", but produces VERY noisy results + # And the bigger the sigma, the bigger the blur kernel, and the + # slower the field generation, btw. + params.sigma = 10.0 - (7.0 * complexity) + + return params + + def _generate_fields(self, params): + ''' + Here's how the code works: + - We first generate "distortion fields" for x and y with these steps: + - Uniform noise over [-1, 1] in a matrix of size (h,w) + - Blur with a Gaussian kernel of spread sigma + - Multiply by alpha + - Then (conceptually) to compose the distorted image, we loop over each pixel + of the new image and use the corresponding x and y distortions + (from the matrices generated above) to identify pixels + of the old image from which we fetch color data. As the + coordinates are not integer, we interpolate between the + 4 nearby pixels (top left, top right etc.). + - That's just conceptually. Here I'm using matrix operations + to speed up the computation. I first identify the 4 nearby + pixels in the old image for each pixel in the distorted image. + I can then use them as "fancy indices" to extract the proper + pixels for each new pixel. + - Then I multiply those extracted nearby points by precomputed + ratios for the bilinear interpolation. 
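# The recipe spelled out in this docstring (uniform noise, Gaussian blur,
# scaling by alpha, then a bilinear lookup) can also be written compactly with
# scipy.ndimage, which is useful as a cross-check of the hand-rolled bilinear
# code below. This is an alternative sketch, not the class's own method; the
# alpha/sigma semantics follow _initialize_new_params().
import numpy
import scipy.ndimage

def elastic_distort_sketch(image, alpha, sigma, rng=numpy.random):
    h, w = image.shape
    dx = scipy.ndimage.gaussian_filter(rng.uniform(-1, 1, (h, w)), sigma) * alpha
    dy = scipy.ndimage.gaussian_filter(rng.uniform(-1, 1, (h, w)), sigma) * alpha
    rows, cols = numpy.mgrid[0:h, 0:w].astype('float32')
    coords = numpy.array([rows + dy, cols + dx])
    # order=1 is bilinear interpolation; samples outside the image come back as 0
    return scipy.ndimage.map_coordinates(image, coords, order=1, cval=0.0)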
+ ''' + + p = params + + dist_fields = [None, None] + dist_fields[0] = self._gen_distortion_field(params) + dist_fields[1] = self._gen_distortion_field(params) + + #pylab.imshow(dist_fields[0]) + #pylab.show() + + # regenerate distortion index matrices + # "_rows" are row indices + # "_cols" are column indices + # (separated due to the way fancy indexing works in numpy) + h,w = p.image_size + + for y in range(h): + for x in range(w): + distort_x = dist_fields[0][y,x] + distort_y = dist_fields[1][y,x] + + # the "target" is the coordinate we fetch color data from + # (in the original image) + # target_left and _top are the rounded coordinate on the + # left/top of this target (float) coordinate + target_pixel = (y+distort_y, x+distort_x) + + target_left = int(math.floor(x + distort_x)) + target_top = int(math.floor(y + distort_y)) + + index_tl = [target_top, target_left] + index_tr = [target_top, target_left+1] + index_bl = [target_top+1, target_left] + index_br = [target_top+1, target_left+1] + + # x_ratio is the ratio of importance of left pixels + # y_ratio is the """" of top pixels + # (in bilinear combination) + y_ratio = 1.0 - (target_pixel[0] - target_top) + x_ratio = 1.0 - (target_pixel[1] - target_left) + + # We use a default background color of 0 for displacements + # outside of boundaries of the image. + + # if top left outside bounds + if index_tl[0] < 0 or index_tl[0] >= h or index_tl[1] < 0 or index_tl[1] >= w: + p.matrix_tl_corners_rows[y][x] = 0 + p.matrix_tl_corners_cols[y][x] = 0 + p.matrix_tl_multiply[y,x] = 0 + else: + p.matrix_tl_corners_rows[y][x] = index_tl[0] + p.matrix_tl_corners_cols[y][x] = index_tl[1] + p.matrix_tl_multiply[y,x] = x_ratio*y_ratio + + # if top right outside bounds + if index_tr[0] < 0 or index_tr[0] >= h or index_tr[1] < 0 or index_tr[1] >= w: + p.matrix_tr_corners_rows[y][x] = 0 + p.matrix_tr_corners_cols[y][x] = 0 + p.matrix_tr_multiply[y,x] = 0 + else: + p.matrix_tr_corners_rows[y][x] = index_tr[0] + p.matrix_tr_corners_cols[y][x] = index_tr[1] + p.matrix_tr_multiply[y,x] = (1.0-x_ratio)*y_ratio + + # if bottom left outside bounds + if index_bl[0] < 0 or index_bl[0] >= h or index_bl[1] < 0 or index_bl[1] >= w: + p.matrix_bl_corners_rows[y][x] = 0 + p.matrix_bl_corners_cols[y][x] = 0 + p.matrix_bl_multiply[y,x] = 0 + else: + p.matrix_bl_corners_rows[y][x] = index_bl[0] + p.matrix_bl_corners_cols[y][x] = index_bl[1] + p.matrix_bl_multiply[y,x] = x_ratio*(1.0-y_ratio) + + # if bottom right outside bounds + if index_br[0] < 0 or index_br[0] >= h or index_br[1] < 0 or index_br[1] >= w: + p.matrix_br_corners_rows[y][x] = 0 + p.matrix_br_corners_cols[y][x] = 0 + p.matrix_br_multiply[y,x] = 0 + else: + p.matrix_br_corners_rows[y][x] = index_br[0] + p.matrix_br_corners_cols[y][x] = index_br[1] + p.matrix_br_multiply[y,x] = (1.0-x_ratio)*(1.0-y_ratio) + + # not really necessary, but anyway + return p + + def transform_image(self, image): + p = self.current_params + + # index pixels to get the 4 corners for bilinear combination + tl_pixels = image[p.matrix_tl_corners_rows, p.matrix_tl_corners_cols] + tr_pixels = image[p.matrix_tr_corners_rows, p.matrix_tr_corners_cols] + bl_pixels = image[p.matrix_bl_corners_rows, p.matrix_bl_corners_cols] + br_pixels = image[p.matrix_br_corners_rows, p.matrix_br_corners_cols] + + # bilinear ratios, elemwise multiply + tl_pixels = numpy.multiply(tl_pixels, p.matrix_tl_multiply) + tr_pixels = numpy.multiply(tr_pixels, p.matrix_tr_multiply) + bl_pixels = numpy.multiply(bl_pixels, p.matrix_bl_multiply) + br_pixels = 
numpy.multiply(br_pixels, p.matrix_br_multiply) + + # sum to finish bilinear combination + return numpy.sum([tl_pixels,tr_pixels,bl_pixels,br_pixels], axis=0).astype(numpy.float32) + +# TESTS ---------------------------------------------------------------------- + +def _load_image(filepath): + _RGB_TO_GRAYSCALE = [0.3, 0.59, 0.11, 0.0] + img = Image.open(filepath) + img = numpy.asarray(img) + if len(img.shape) > 2: + img = (img * _RGB_TO_GRAYSCALE).sum(axis=2) + return (img / 255.0).astype('float') + +def _specific_test(): + imgpath = os.path.join(_TEST_DIR, "d.png") + img = _load_image(imgpath) + dist = LocalElasticDistorter((32,32)) + print dist.regenerate_parameters(0.5) + img = dist.transform_image(img) + print dist.get_parameters_determined_by_complexity(0.4) + pylab.imshow(img) + pylab.show() + +def _complexity_tests(): + imgpath = os.path.join(_TEST_DIR, "d.png") + dist = LocalElasticDistorter((32,32)) + orig_img = _load_image(imgpath) + html_content = '''Original:
''' + for complexity in numpy.arange(0.0, 1.1, 0.1): + html_content += '
Complexity: ' + str(complexity) + '
' + for i in range(10): + t1 = time.time() + dist.regenerate_parameters(complexity) + t2 = time.time() + print "diff", t2-t1 + img = dist.transform_image(orig_img) + filename = "complexity_" + str(complexity) + "_" + str(i) + ".png" + new_path = os.path.join(_TEST_DIR, filename) + _save_image(img, new_path) + html_content += '' + html_content += "" + html_file = open(os.path.join(_TEST_DIR, "complexity.html"), "w") + html_file.write(html_content) + html_file.close() + +def _complexity_benchmark(): + imgpath = os.path.join(_TEST_DIR, "d.png") + dist = LocalElasticDistorter((32,32)) + orig_img = _load_image(imgpath) + + for cpx in (0.21, 0.35): + # time the first 10 + t1 = time.time() + for i in range(10): + dist.regenerate_parameters(cpx) + img = dist.transform_image(orig_img) + t2 = time.time() + + print "first 10, total = ", t2-t1, ", avg=", (t2-t1)/10 + + # time the next 40 + t1 = time.time() + for i in range(40): + dist.regenerate_parameters(cpx) + img = dist.transform_image(orig_img) + t2 = time.time() + + print "next 40, total = ", t2-t1, ", avg=", (t2-t1)/40 + + # time the next 50 + t1 = time.time() + for i in range(50): + dist.regenerate_parameters(cpx) + img = dist.transform_image(orig_img) + t2 = time.time() + + print "next 50, total = ", t2-t1, ", avg=", (t2-t1)/50 + + # time the next 1000 + t1 = time.time() + for i in range(1000): + dist.regenerate_parameters(cpx) + img = dist.transform_image(orig_img) + t2 = time.time() + + print "next 1000, total = ", t2-t1, ", avg=", (t2-t1)/1000 + + # time the next 1000 with old complexity + t1 = time.time() + for i in range(1000): + dist.regenerate_parameters(0.21) + img = dist.transform_image(orig_img) + t2 = time.time() + + print "next 1000, total = ", t2-t1, ", avg=", (t2-t1)/1000 + + + + +def _save_image(img, path): + img2 = Image.fromarray((img * 255).astype('uint8'), "L") + img2.save(path) + +# TODO: reformat to follow new class... 
it function of complexity now +''' +def _distorter_tests(): + #import pylab + #pylab.imshow(img) + #pylab.show() + + for letter in ("d", "a", "n", "o"): + img = _load_image("tests/" + letter + ".png") + for alpha in (1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0): + for sigma in (1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0): + id = LocalElasticDistorter((32,32)) + img2 = id.distort_image(img) + img2 = Image.fromarray((img2 * 255).astype('uint8'), "L") + img2.save("tests/"+letter+"_alpha"+str(alpha)+"_sigma"+str(sigma)+".png") +''' + +def _benchmark(): + img = _load_image("tests/d.png") + dist = LocalElasticDistorter((32,32)) + dist.regenerate_parameters(0.0) + import time + t1 = time.time() + for i in range(10000): + if i % 1000 == 0: + print "-" + dist.distort_image(img) + t2 = time.time() + print "t2-t1", t2-t1 + print "avg", 10000/(t2-t1) + +if __name__ == '__main__': + import time + import pylab + import Image + import os.path + #_distorter_tests() + #_benchmark() + #_specific_test() + #_complexity_tests() + _complexity_benchmark() + + + diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/.DS_Store Binary file data_generation/transformations/pycaptcha/.DS_Store has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/BUGS --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/BUGS Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,7 @@ +Known bugs: + +- PersistentFactory() is almost certainly horrible at concurrent access +- Tests are never invalidated with PersistentStorage(), as they aren't written back to the database +- All files in Captcha/data are installed, including silly things like .svn directories and *~ + + diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/COPYING --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/COPYING Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,19 @@ +Copyright (c) 2004 Micah Dowty + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/.DS_Store Binary file data_generation/transformations/pycaptcha/Captcha/.DS_Store has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/Base.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/Base.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,127 @@ +""" Captcha.Base + +Base class for all types of CAPTCHA tests. 
All tests have one or +more solution, determined when the test is generated. Solutions +can be any python object, + +All tests can be solved by presenting at least some preset number +of correct solutions. Some tests may only have one solution and require +one solution, but other tests may require N correct solutions of M +possible solutions. +""" +# +# PyCAPTCHA Package +# Copyright (C) 2004 Micah Dowty +# + +import random, string, time, shelve + +__all__ = ["BaseCaptcha", "Factory", "PersistentFactory"] + + +def randomIdentifier(alphabet = string.ascii_letters + string.digits, + length = 24): + return "".join([random.choice(alphabet) for i in xrange(length)]) + + +class BaseCaptcha(object): + """Base class for all CAPTCHA tests""" + # Subclasses can override these to set the solution criteria + minCorrectSolutions = 1 + maxIncorrectSolutions = 0 + + def __init__(self): + self.solutions = [] + self.valid = True + + # Each test has a unique identifier, used to refer to that test + # later, and a creation time so it can expire later. + self.id = randomIdentifier() + self.creationTime = time.time() + + def addSolution(self, solution): + self.solutions.append(solution) + + def testSolutions(self, solutions): + """Test whether the given solutions are sufficient for this CAPTCHA. + A given CAPTCHA can only be tested once, after that it is invalid + and always returns False. This makes random guessing much less effective. + """ + if not self.valid: + return False + self.valid = False + + numCorrect = 0 + numIncorrect = 0 + + for solution in solutions: + if solution in self.solutions: + numCorrect += 1 + else: + numIncorrect += 1 + + return numCorrect >= self.minCorrectSolutions and \ + numIncorrect <= self.maxIncorrectSolutions + + +class Factory(object): + """Creates BaseCaptcha instances on demand, and tests solutions. + CAPTCHAs expire after a given amount of time, given in seconds. + The default is 15 minutes. + """ + def __init__(self, lifetime=60*15): + self.lifetime = lifetime + self.storedInstances = {} + + def new(self, cls, *args, **kwargs): + """Create a new instance of our assigned BaseCaptcha subclass, passing + it any extra arguments we're given. This stores the result for + later testing. + """ + self.clean() + inst = cls(*args, **kwargs) + self.storedInstances[inst.id] = inst + return inst + + def get(self, id): + """Retrieve the CAPTCHA with the given ID. If it's expired already, + this will return None. A typical web application will need to + new() a CAPTCHA when generating an html page, then get() it later + when its images or sounds must be rendered. + """ + return self.storedInstances.get(id) + + def clean(self): + """Removed expired tests""" + expiredIds = [] + now = time.time() + for inst in self.storedInstances.itervalues(): + if inst.creationTime + self.lifetime < now: + expiredIds.append(inst.id) + for id in expiredIds: + del self.storedInstances[id] + + def test(self, id, solutions): + """Test the given list of solutions against the BaseCaptcha instance + created earlier with the given id. Returns True if the test passed, + False on failure. In either case, the test is invalidated. Returns + False in the case of an invalid id. + """ + self.clean() + inst = self.storedInstances.get(id) + if not inst: + return False + result = inst.testSolutions(solutions) + return result + + +class PersistentFactory(Factory): + """A simple persistent factory, for use in CGI or multi-process environments + where the state must remain across python interpreter sessions. 
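# A hedged usage sketch of the Factory/BaseCaptcha protocol defined above:
# create a test through a factory, present it, then check the answers exactly
# once (a second check of the same id fails by design). PseudoGimpy is one of
# the concrete tests shipped with PyCAPTCHA and is assumed importable in this
# vendored copy.
from Captcha.Base import Factory
from Captcha.Visual.Tests import PseudoGimpy

factory = Factory(lifetime=60 * 15)            # ids expire after 15 minutes
test = factory.new(PseudoGimpy)                # stored under test.id
print test.id, test.solutions
print factory.test(test.id, test.solutions)    # True: correct, first attempt
print factory.test(test.id, test.solutions)    # False: each test is single-use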
+ This implementation uses the 'shelve' module. + """ + def __init__(self, filename, lifetime=60*15): + Factory.__init__(self, lifetime) + self.storedInstances = shelve.open(filename) + +### The End ### diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/File.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/File.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,59 @@ +""" Captcha.File + +Utilities for finding and picking random files from our 'data' directory +""" +# +# PyCAPTCHA Package +# Copyright (C) 2004 Micah Dowty +# + +import os, random, cPickle + +# Determine the data directory. This can be overridden after import-time if needed. +dataDir = os.path.join(os.path.split(os.path.abspath(__file__))[0], "data") + + +class RandomFileFactory(object): + """Given a list of files and/or directories, this picks a random file. + Directories are searched for files matching any of a list of extensions. + Files are relative to our data directory plus a subclass-specified base path. + """ + extensions = [] + basePath = "." + + def __init__(self, *fileList): + self.fileList = fileList + self._fullPaths = None + + def _checkExtension(self, name): + """Check the file against our given list of extensions""" + for ext in self.extensions: + if name.endswith(ext): + return True + return False + + def _findFullPaths(self): + """From our given file list, find a list of full paths to files""" + paths = [] + for name in self.fileList: + if name[0] == '/': + path = name + else: + path = os.path.join(dataDir, self.basePath, name) + if os.path.isdir(path): + f = open(path + '/filelist.pkl') + filelist = cPickle.load(f) + f.close() + for content in filelist: + if self._checkExtension(content): + paths.append(os.path.join(path, content)) + else: + paths.append(path) + return paths + + def pick(self): + if self._fullPaths is None: + self._fullPaths = self._findFullPaths() + return random.choice(self._fullPaths) + +### The End ### diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/Visual/Backgrounds.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/Visual/Backgrounds.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,95 @@ +""" Captcha.Visual.Backgrounds + +Background layers for visual CAPTCHAs +""" +# +# PyCAPTCHA Package +# Copyright (C) 2004 Micah Dowty +# + +from Captcha.Visual import Layer, Pictures +import random, os +import ImageDraw, Image + + +class SolidColor(Layer): + """A solid color background. Very weak on its own, but good + to combine with other backgrounds. + """ + def __init__(self, color="white"): + self.color = color + + def render(self, image): + image.paste(self.color) + + +class Grid(Layer): + """A grid of lines, with a given foreground color. + The size is given in pixels. The background is transparent, + so another layer (like SolidColor) should be put behind it. 
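# A sketch of how RandomFileFactory above is meant to be subclassed: declare
# the accepted extensions and a base path under the package data directory,
# then pick() returns one matching file at random. Note that _findFullPaths()
# reads a 'filelist.pkl' index inside each directory instead of calling
# os.listdir(). The subclass, the 'fonts' base path and the 'vera' directory
# are illustrative and assume that index file exists.
from Captcha.File import RandomFileFactory

class ExampleFontFactory(RandomFileFactory):
    extensions = ['.ttf', '.TTF']
    basePath = 'fonts'                   # resolved relative to Captcha/data/

factory = ExampleFontFactory('vera')     # a directory (or file) under data/fonts
print factory.pick()                     # absolute path of one random .ttf file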
+ """ + def __init__(self, size=16, foreground="black"): + self.size = size + self.foreground = foreground + self.offset = (random.uniform(0, self.size), + random.uniform(0, self.size)) + + def render(self, image): + draw = ImageDraw.Draw(image) + + for i in xrange(image.size[0] / self.size + 1): + draw.line( (i*self.size+self.offset[0], 0, + i*self.size+self.offset[0], image.size[1]), fill=self.foreground) + + for i in xrange(image.size[0] / self.size + 1): + draw.line( (0, i*self.size+self.offset[1], + image.size[0], i*self.size+self.offset[1]), fill=self.foreground) + + +class TiledImage(Layer): + """Pick a random image and a random offset, and tile the rendered image with it""" + def __init__(self, imageFactory=Pictures.abstract): + self.tileName = imageFactory.pick() + self.offset = (random.uniform(0, 1), + random.uniform(0, 1)) + + def render(self, image): + tile = Image.open(self.tileName) + for j in xrange(-1, int(image.size[1] / tile.size[1]) + 1): + for i in xrange(-1, int(image.size[0] / tile.size[0]) + 1): + dest = (int((self.offset[0] + i) * tile.size[0]), + int((self.offset[1] + j) * tile.size[1])) + image.paste(tile, dest) + + +class CroppedImage(Layer): + """Pick a random image, cropped randomly. Source images should be larger than the CAPTCHA.""" + def __init__(self, imageFactory=Pictures.nature): + self.imageName = imageFactory.pick() + self.align = (random.uniform(0,1), + random.uniform(0,1)) + + def render(self, image): + i = Image.open(self.imageName) + image.paste(i, (int(self.align[0] * (image.size[0] - i.size[0])), + int(self.align[1] * (image.size[1] - i.size[1])))) + + +class RandomDots(Layer): + """Draw random colored dots""" + def __init__(self, colors=("white", "black"), dotSize=4, numDots=400): + self.colors = colors + self.dotSize = dotSize + self.numDots = numDots + self.seed = random.random() + + def render(self, image): + r = random.Random(self.seed) + for i in xrange(self.numDots): + bx = int(r.uniform(0, image.size[0]-self.dotSize)) + by = int(r.uniform(0, image.size[1]-self.dotSize)) + image.paste(r.choice(self.colors), (bx, by, + bx+self.dotSize-1, + by+self.dotSize-1)) + +### The End ### diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/Visual/Base.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/Visual/Base.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,70 @@ +""" Captcha.Visual.BAse + +Base classes for visual CAPTCHAs. We use the Python Imaging Library +to manipulate these images. +""" +# +# PyCAPTCHA Package +# Copyright (C) 2004 Micah Dowty +# + +import Captcha +import Image + +__all__ = ['ImageCaptcha', 'Layer'] + + +class ImageCaptcha(Captcha.BaseCaptcha): + """Base class for image-based CAPTCHA tests. + The render() function generates the CAPTCHA image at the given size by + combining Layer instances from self.layers, which should be created by + the subclass-defined getLayers(). + """ + defaultSize = (32,32) + # anciennement a defaultSize(256,96) + def __init__(self, *args, **kwargs): + Captcha.BaseCaptcha.__init__(self) + self._layers = self.getLayers(*args, **kwargs) + + def getImage(self): + """Get a PIL image representing this CAPTCHA test, creating it if necessary""" + if not self._image: + self._image = self.render() + return self._image + + def getLayers(self): + """Subclasses must override this to return a list of Layer instances to render. + Lists within the list of layers are recursively rendered. 
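A sketch of such an override (illustrative only, not part of this changeset; the class name SimpleGimpy is made up). It mirrors the tests defined later in Captcha/Visual/Tests.py and shows how a nested list groups related layers:

from Captcha import Words
from Captcha.Visual import ImageCaptcha, Text, Backgrounds, Distortions

class SimpleGimpy(ImageCaptcha):
    def getLayers(self):
        word = Words.defaultWordList.pick()
        self.addSolution(word)
        return [
            Backgrounds.SolidColor("white"),
            [Text.TextLayer(word, borderSize=1, textColor="black"),
             Distortions.SineWarp()],    # nested: the text and its distortion
        ]

img = SimpleGimpy().render()             # renders at defaultSize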
+ """ + return [] + + def render(self, size=None): + """Render this CAPTCHA, returning a PIL image""" + if size is None: + size = self.defaultSize + img = Image.new("L", size) + # img = Image.new("RGB", size) + return self._renderList(self._layers, Image.new("L", size)) + + def _renderList(self, l, img): + for i in l: + if type(i) == tuple or type(i) == list: + img = self._renderList(i, img) + else: + img = i.render(img) or img + return img + + +class Layer(object): + """A renderable object representing part of a CAPTCHA. + The render() function should return approximately the same result, regardless + of the image size. This means any randomization must occur in the constructor. + + If the render() function returns something non-None, it is taken as an image to + replace the current image with. This can be used to implement transformations + that result in a separate image without having to copy the results back to the first. + """ + def render(self, img): + pass + +### The End ### diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/Visual/Distortions.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/Visual/Distortions.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,117 @@ +""" Captcha.Visual.Distortions + +Distortion layers for visual CAPTCHAs +""" +# +# PyCAPTCHA Package +# Copyright (C) 2004 Micah Dowty +# + +from Captcha.Visual import Layer +import ImageDraw, Image +import random, math + + +class WigglyBlocks(Layer): + """Randomly select and shift blocks of the image""" + def __init__(self, blockSize=3, sigma=0.01, iterations=300): + self.blockSize = blockSize + self.sigma = sigma + self.iterations = iterations + self.seed = random.random() + + def render(self, image): + r = random.Random(self.seed) + for i in xrange(self.iterations): + # Select a block + bx = int(r.uniform(0, image.size[0]-self.blockSize)) + by = int(r.uniform(0, image.size[1]-self.blockSize)) + block = image.crop((bx, by, bx+self.blockSize-1, by+self.blockSize-1)) + + # Figure out how much to move it. + # The call to floor() is important so we always round toward + # 0 rather than to -inf. Just int() would bias the block motion. + mx = int(math.floor(r.normalvariate(0, self.sigma))) + my = int(math.floor(r.normalvariate(0, self.sigma))) + + # Now actually move the block + image.paste(block, (bx+mx, by+my)) + + +class WarpBase(Layer): + """Abstract base class for image warping. Subclasses define a + function that maps points in the output image to points in the input image. + This warping engine runs a grid of points through this transform and uses + PIL's mesh transform to warp the image. 
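A sketch of the output-to-input mapping described above (illustrative only, not part of this changeset; ShiftWarp is a made-up name, and SineWarp below is the subclass actually shipped here):

from Captcha.Visual.Distortions import WarpBase

class ShiftWarp(WarpBase):
    def __init__(self, shift=5):
        self.shift = shift

    def getTransform(self, image):
        # Each output point (x, y) is sampled from input point
        # (x + shift, y + shift), sliding the content up and to the left;
        # render() clamps coordinates that fall outside the source image.
        return lambda x, y: (x + self.shift, y + self.shift)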
+ """ + filtering = Image.BILINEAR + resolution = 10 + + def getTransform(self, image): + """Return a transformation function, subclasses should override this""" + return lambda x, y: (x, y) + + def render(self, image): + r = self.resolution + xPoints = image.size[0] / r + 2 + yPoints = image.size[1] / r + 2 + f = self.getTransform(image) + + # Create a list of arrays with transformed points + xRows = [] + yRows = [] + for j in xrange(yPoints): + xRow = [] + yRow = [] + for i in xrange(xPoints): + x, y = f(i*r, j*r) + + # Clamp the edges so we don't get black undefined areas + x = max(0, min(image.size[0]-1, x)) + y = max(0, min(image.size[1]-1, y)) + + xRow.append(x) + yRow.append(y) + xRows.append(xRow) + yRows.append(yRow) + + # Create the mesh list, with a transformation for + # each square between points on the grid + mesh = [] + for j in xrange(yPoints-1): + for i in xrange(xPoints-1): + mesh.append(( + # Destination rectangle + (i*r, j*r, + (i+1)*r, (j+1)*r), + # Source quadrilateral + (xRows[j ][i ], yRows[j ][i ], + xRows[j+1][i ], yRows[j+1][i ], + xRows[j+1][i+1], yRows[j+1][i+1], + xRows[j ][i+1], yRows[j ][i+1]), + )) + + return image.transform(image.size, Image.MESH, mesh, self.filtering) + + +class SineWarp(WarpBase): + """Warp the image using a random composition of sine waves""" + + def __init__(self, + amplitudeRange = (3, 6.5), + periodRange = (0.04, 0.1), + ): + self.amplitude = random.uniform(*amplitudeRange) + self.period = random.uniform(*periodRange) + self.offset = (random.uniform(0, math.pi * 2 / self.period), + random.uniform(0, math.pi * 2 / self.period)) + + def getTransform(self, image): + return (lambda x, y, + a = self.amplitude, + p = self.period, + o = self.offset: + (math.sin( (y+o[0])*p )*a + x, + math.sin( (x+o[1])*p )*a + y)) + +### The End ### diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/Visual/Pictures.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/Visual/Pictures.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,23 @@ +""" Captcha.Visual.Pictures + +Random collections of images +""" +# +# PyCAPTCHA Package +# Copyright (C) 2004 Micah Dowty +# + +from Captcha import File +import Image + + +class ImageFactory(File.RandomFileFactory): + """A factory that generates random images from a list""" + extensions = [".png", ".jpeg"] + basePath = "pictures" + + +abstract = ImageFactory("abstract") +nature = ImageFactory("nature") + +### The End ### diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/Visual/Tests.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/Visual/Tests.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,65 @@ +""" Captcha.Visual.Tests + +Visual CAPTCHA tests +""" +# +# PyCAPTCHA Package +# Copyright (C) 2004 Micah Dowty +# + +from Captcha.Visual import Text, Backgrounds, Distortions, ImageCaptcha +from Captcha import Words +import random + +__all__ = ["PseudoGimpy", "AngryGimpy", "AntiSpam"] + + +class PseudoGimpy(ImageCaptcha): + """A relatively easy CAPTCHA that's somewhat easy on the eyes""" + def getLayers(self): + word = Words.defaultWordList.pick() + self.addSolution(word) + return [ + # random.choice([ + # Backgrounds.CroppedImage(), + # Backgrounds.TiledImage(), + # ]), + Text.TextLayer(word, borderSize=1), + Distortions.SineWarp(), + ] + + +class AngryGimpy(ImageCaptcha): + """A harder but less visually pleasing CAPTCHA""" + def getLayers(self): + word = 
Words.defaultWordList.pick() + self.addSolution(word) + return [ + # suppression du background + # Backgrounds.TiledImage(), + # Backgrounds.RandomDots(), + Text.TextLayer(word, borderSize=1), + # Distortions.SineWarp(periodRange = (0.04, 0.07)) + Distortions.WigglyBlocks(), + ] + + +class AntiSpam(ImageCaptcha): + """A fixed-solution CAPTCHA that can be used to hide email addresses or URLs from bots""" + fontFactory = Text.FontFactory(20, "vera/VeraBd.ttf") + defaultSize = (512,50) + + def getLayers(self, solution="murray@example.com"): + self.addSolution(solution) + + textLayer = Text.TextLayer(solution, + borderSize = 2, + fontFactory = self.fontFactory) + + return [ + Backgrounds.CroppedImage(), + textLayer, + Distortions.SineWarp(amplitudeRange = (3, 5)), + ] + +### The End ### diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/Visual/Text.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/Visual/Text.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,111 @@ +""" Captcha.Visual.Text + +Text generation for visual CAPTCHAs. +""" +# +# PyCAPTCHA Package +# Copyright (C) 2004 Micah Dowty +# + +import random, os +from Captcha import Visual, File +import ImageFont, ImageDraw + + +class FontFactory(File.RandomFileFactory): + """Picks random fonts and/or sizes from a given list. + 'sizes' can be a single size or a (min,max) tuple. + If any of the given files are directories, all *.ttf found + in that directory will be added. + """ + extensions = [".ttf", ".TTF"] + basePath = "fonts" + +# arguments variables a modifier pour mettre le chemin vers les fontes. + def __init__(self, sizes, *fileNames): + File.RandomFileFactory.__init__(self, *fileNames) + + if type(sizes) is tuple: + self.minSize = sizes[0] + self.maxSize = sizes[1] + else: + self.minSize = sizes + self.maxSize = sizes + + def pick(self): + """Returns a (fileName, size) tuple that can be passed to ImageFont.truetype()""" + fileName = File.RandomFileFactory.pick(self) + size = int(random.uniform(self.minSize, self.maxSize) + 0.5) + return (fileName, size) + +# Predefined font factories +defaultFontFactory = FontFactory(25, "/Tmp/allfonts") +#defaultFontFactory = FontFactory((30, 40), "vera") + +class TextLayer(Visual.Layer): + """Represents a piece of text rendered within the image. + Alignment is given such that (0,0) places the text in the + top-left corner and (1,1) places it in the bottom-left. + + The font and alignment are optional, if not specified one is + chosen randomly. If no font factory is specified, the default is used. 
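A sketch of supplying an explicit factory instead of defaultFontFactory (illustrative only, not part of this changeset; the image size and output file name are arbitrary, and vera/VeraBd.ttf is the bundled font already used by AntiSpam above):

import Image
from Captcha.Visual import Text

bold = Text.FontFactory((20, 30), "vera/VeraBd.ttf")  # size drawn from 20-30
layer = Text.TextLayer("hello",
                       fontFactory=bold,
                       alignment=(0.5, 0.5),           # centre of the image
                       borderSize=1)
img = Image.new("L", (256, 96))
layer.render(img)
img.save("text.png")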
+ """ + def __init__(self, text, + alignment = None, + font = None, + fontFactory = None, + textColor = "white", + borderSize = 0, + borderColor = None, + ): + if fontFactory is None: + global defaultFontFactory + fontFactory = defaultFontFactory + + if font is None: + font = fontFactory.pick() + + if alignment is None: + alignment = (random.uniform(0,1), + random.uniform(0,1)) + + self.text = text + self.alignment = alignment + self.font = font + self.textColor = textColor + self.borderSize = borderSize + self.borderColor = borderColor + + def render(self, img): + + i=1 + while True: + try: + font = ImageFont.truetype(*self.font) + break + except: + print "try#", i, self.font + i += 1 + if i>10: raise + + textSize = font.getsize(self.text) + draw = ImageDraw.Draw(img) + + # Find the text's origin given our alignment and current image size + x = int((img.size[0] - textSize[0] - self.borderSize*2) * self.alignment[0] + 0.5) + y = int((img.size[1] - textSize[1] - self.borderSize*2) * self.alignment[1] + 0.5) + + # Draw the border if we need one. This is slow and ugly, but there doesn't + # seem to be a better way with PIL. + if self.borderSize > 0: + for bx in (-1,0,1): + for by in (-1,0,1): + if bx and by: + draw.text((x + bx * self.borderSize, + y + by * self.borderSize), + self.text, font=font, fill=self.borderColor) + + # And the text itself... + draw.text((x,y), self.text, font=font, fill=self.textColor) + +### The End ### diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/Visual/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/Visual/__init__.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,14 @@ +""" Captcha.Visual + +This package contains functionality specific to visual CAPTCHA tests. + +""" +# +# PyCAPTCHA Package +# Copyright (C) 2004 Micah Dowty +# + +# Convenience imports +from Base import * + +### The End ### diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/Words.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/Words.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,58 @@ +""" Captcha.Words + +Utilities for managing word lists and finding random words +""" +# +# PyCAPTCHA Package +# Copyright (C) 2004 Micah Dowty +# + +import random, os +import File + + +class WordList(object): + """A class representing a word list read from disk lazily. + Blank lines and comment lines starting with '#' are ignored. + Any number of words per line may be used. The list can + optionally ingore words not within a given length range. 
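A sketch of the length filter described above (illustrative only, not part of this changeset; the bounds are example values, and "basic-english" is the word list file that WordList resolves under Captcha/data/words):

from Captcha import Words

short_words = Words.WordList("basic-english", minLength=3, maxLength=5)
print short_words.pick()    # the file is read lazily on the first pick()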
+ """ + def __init__(self, fileName, minLength=None, maxLength=None): + self.words = None + self.fileName = fileName + self.minLength = minLength + self.maxLength = maxLength + + def read(self): + """Read words from disk""" + f = open(os.path.join(File.dataDir, "words", self.fileName)) + + self.words = [] + for line in f.xreadlines(): + line = line.strip() + if not line: + continue + if line[0] == '#': + continue + for word in line.split(): + if self.minLength is not None and len(word) < self.minLength: + continue + if self.maxLength is not None and len(word) > self.maxLength: + continue + self.words.append(word) + + def pick(self): + """Pick a random word from the list, reading it in if necessary""" + if self.words is None: + self.read() + return random.choice(self.words) + + +# Define several shared word lists that are read from disk on demand +basic_english = WordList("basic-english") +basic_english_restricted = WordList("basic-english", minLength=5, maxLength=8) +characters = WordList("characters") +defaultWordList = characters + + +### The End ### diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/__init__.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,41 @@ +""" Captcha + +This is the PyCAPTCHA package, a collection of Python modules +implementing CAPTCHAs: automated tests that humans should pass, +but current computer programs can't. These tests are often +used for security. + +See http://www.captcha.net for more information and examples. + +This project was started because the CIA project, written in +Python, needed a CAPTCHA to automate its user creation process +safely. All existing implementations the author could find were +written in Java or for the .NET framework, so a simple Python +alternative was needed. +""" +# +# PyCAPTCHA Package +# Copyright (C) 2004 Micah Dowty +# + +__version__ = "0.3-pre" + + +# Check the python version here before we proceed further +requiredPythonVersion = (2,2,1) +def checkVersion(): + import sys, string + if sys.version_info < requiredPythonVersion: + raise Exception("%s requires at least Python %s, found %s instead." 
% ( + name, + string.join(map(str, requiredPythonVersion), "."), + string.join(map(str, sys.version_info), "."))) +checkVersion() + + +# Convenience imports +from Base import * +import File +import Words + +### The End ### diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/.DS_Store Binary file data_generation/transformations/pycaptcha/Captcha/data/.DS_Store has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/.DS_Store Binary file data_generation/transformations/pycaptcha/Captcha/data/fonts/.DS_Store has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/allfonts --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/data/fonts/allfonts Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,1 @@ +/Tmp/allfonts \ No newline at end of file diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/others/._atari-small.bdf Binary file data_generation/transformations/pycaptcha/Captcha/data/fonts/others/._atari-small.bdf has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/others/._cursive.bdf Binary file data_generation/transformations/pycaptcha/Captcha/data/fonts/others/._cursive.bdf has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/others/._radon-wide.bdf Binary file data_generation/transformations/pycaptcha/Captcha/data/fonts/others/._radon-wide.bdf has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/others/CIDFnmap --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/data/fonts/others/CIDFnmap Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,10 @@ +/Dotum-Bold (/usr/share/fonts/truetype/unfonts/UnDotumBold.ttf) /Adobe-Korea1-Unicode ; +/ZenHei (/usr/share/fonts/truetype/wqy/wqy-zenhei.ttf) /Adobe-GB1-Unicode ; +/Batang-Regular (/usr/share/fonts/truetype/unfonts/UnBatang.ttf) /Adobe-Korea1-Unicode ; +/VL-PGothic-Regular (/usr/share/fonts/truetype/vlgothic/VL-PGothic-Regular.ttf) /Adobe-Japan1-Unicode ; +/Dotum-Regular (/usr/share/fonts/truetype/unfonts/UnDotum.ttf) /Adobe-Korea1-Unicode ; +/VL-Gothic-Regular-JaH (/usr/share/fonts/truetype/vlgothic/VL-Gothic-Regular.ttf) /Adobe-Japan2-Unicode ; +/VL-Gothic-Regular (/usr/share/fonts/truetype/vlgothic/VL-Gothic-Regular.ttf) /Adobe-Japan1-Unicode ; +/VL-PGothic-Regular-JaH (/usr/share/fonts/truetype/vlgothic/VL-PGothic-Regular.ttf) /Adobe-Japan2-Unicode ; +/ZenHei-CNS (/usr/share/fonts/truetype/wqy/wqy-zenhei.ttf) /Adobe-CNS1-Unicode ; +/Batang-Bold (/usr/share/fonts/truetype/unfonts/UnBatangBold.ttf) /Adobe-Korea1-Unicode ; diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/others/FAPIfontmap --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/data/fonts/others/FAPIfontmap Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,155 @@ +/Garuda-Oblique << /Path (/usr/share/fonts/truetype/thai/Garuda-Oblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/DejaVu-Sans << /Path (/usr/share/fonts/truetype/ttf-dejavu/DejaVuSans.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/KacstOne << /Path (/usr/share/fonts/truetype/ttf-kacst/KacstOne.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Vemana2000 << /Path 
(/usr/share/fonts/truetype/ttf-indic-fonts-core/Vemana.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/LiberationSerif-Bold << /Path (/usr/share/fonts/truetype/ttf-liberation/LiberationSerif-Bold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/TlwgTypo-Bold << /Path (/usr/share/fonts/truetype/thai/TlwgTypo-Bold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/FreeSerif-BoldItalic << /Path (/usr/share/fonts/truetype/freefont/FreeSerifBoldItalic.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Umpush-Oblique << /Path (/usr/share/fonts/truetype/thai/Umpush-Oblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/LiberationMono-Italic << /Path (/usr/share/fonts/truetype/ttf-liberation/LiberationMono-Italic.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Malige << /Path (/usr/share/fonts/truetype/ttf-indic-fonts-core/Malige-b.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Loma-Oblique << /Path (/usr/share/fonts/truetype/thai/Loma-Oblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/KacstBook << /Path (/usr/share/fonts/truetype/ttf-kacst/KacstBook.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Norasi-BoldItalic << /Path (/usr/share/fonts/truetype/thai/Norasi-BoldItalic.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/DejaVu-Sans-Bold << /Path (/usr/share/fonts/truetype/ttf-dejavu/DejaVuSans-Bold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Norasi-BoldOblique << /Path (/usr/share/fonts/truetype/thai/Norasi-BoldOblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/FreeMono-BoldOblique << /Path (/usr/share/fonts/truetype/freefont/FreeMonoBoldOblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/DejaVu-Serif << /Path (/usr/share/fonts/truetype/ttf-dejavu/DejaVuSerif.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/KacstOffice << /Path (/usr/share/fonts/truetype/ttf-kacst/KacstOffice.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/TlwgTypist-Oblique << /Path (/usr/share/fonts/truetype/thai/TlwgTypist-Oblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/LiberationSans-Italic << /Path (/usr/share/fonts/truetype/ttf-liberation/LiberationSans-Italic.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Waree-Oblique << /Path (/usr/share/fonts/truetype/thai/Waree-Oblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/LiberationMono-BoldItalic << /Path (/usr/share/fonts/truetype/ttf-liberation/LiberationMono-BoldItalic.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/KacstFarsi << /Path (/usr/share/fonts/truetype/ttf-kacst/KacstFarsi.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/TlwgMono-Oblique << /Path (/usr/share/fonts/truetype/thai/TlwgMono-Oblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Garuda-BoldOblique << /Path (/usr/share/fonts/truetype/thai/Garuda-BoldOblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/FreeSans-BoldOblique << /Path (/usr/share/fonts/truetype/freefont/FreeSansBoldOblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/utkal << /Path (/usr/share/fonts/truetype/ttf-indic-fonts-core/utkal.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/FreeSerif-Italic << /Path (/usr/share/fonts/truetype/freefont/FreeSerifItalic.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/TlwgTypist-Bold << /Path (/usr/share/fonts/truetype/thai/TlwgTypist-Bold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/LiberationSerif-Italic << /Path (/usr/share/fonts/truetype/ttf-liberation/LiberationSerif-Italic.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; 
+/Sawasdee-BoldOblique << /Path (/usr/share/fonts/truetype/thai/SawasdeeBoldOblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Umpush-BoldOblique << /Path (/usr/share/fonts/truetype/thai/Umpush-BoldOblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/cmex10 << /Path (/usr/share/fonts/truetype/latex-xft-fonts/cmex10.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/FreeMono-Bold << /Path (/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Norasi-Bold << /Path (/usr/share/fonts/truetype/thai/Norasi-Bold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/LiberationSans-Regular << /Path (/usr/share/fonts/truetype/ttf-liberation/LiberationSans-Regular.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Loma << /Path (/usr/share/fonts/truetype/thai/Loma.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/wasy10 << /Path (/usr/share/fonts/truetype/latex-xft-fonts/wasy10.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Kinnari-BoldItalic << /Path (/usr/share/fonts/truetype/thai/Kinnari-BoldItalic.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/KacstNaskh << /Path (/usr/share/fonts/truetype/ttf-kacst/KacstNaskh.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/LiberationSans-Bold << /Path (/usr/share/fonts/truetype/ttf-liberation/LiberationSans-Bold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Waree << /Path (/usr/share/fonts/truetype/thai/Waree.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Garuda << /Path (/usr/share/fonts/truetype/thai/Garuda.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/cmsy10 << /Path (/usr/share/fonts/truetype/latex-xft-fonts/cmsy10.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/TlwgTypist-BoldOblique << /Path (/usr/share/fonts/truetype/thai/TlwgTypist-BoldOblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Sawasdee-Bold << /Path (/usr/share/fonts/truetype/thai/SawasdeeBold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Purisa << /Path (/usr/share/fonts/truetype/thai/Purisa.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/KacstPoster << /Path (/usr/share/fonts/truetype/ttf-kacst/KacstPoster.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/FreeSans-Oblique << /Path (/usr/share/fonts/truetype/freefont/FreeSansOblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/TlwgTypo-BoldOblique << /Path (/usr/share/fonts/truetype/thai/TlwgTypo-BoldOblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Lohit-Punjabi << /Path (/usr/share/fonts/truetype/ttf-indic-fonts-core/lohit_pa.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Waree-BoldOblique << /Path (/usr/share/fonts/truetype/thai/Waree-BoldOblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/TlwgTypewriter-BoldOblique << /Path (/usr/share/fonts/truetype/thai/TlwgTypewriter-BoldOblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Garuda-Bold << /Path (/usr/share/fonts/truetype/thai/Garuda-Bold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/RachanaMedium << /Path (/usr/share/fonts/truetype/ttf-malayalam-fonts/Rachana_04.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/KacstArt << /Path (/usr/share/fonts/truetype/ttf-kacst/KacstArt.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/LiberationMono-Bold << /Path (/usr/share/fonts/truetype/ttf-liberation/LiberationMono-Bold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/TlwgTypo-Oblique << /Path (/usr/share/fonts/truetype/thai/TlwgTypo-Oblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; 
+/FreeSerif-Bold << /Path (/usr/share/fonts/truetype/freefont/FreeSerifBold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/LiberationSerif-BoldItalic << /Path (/usr/share/fonts/truetype/ttf-liberation/LiberationSerif-BoldItalic.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/KacstDecorative << /Path (/usr/share/fonts/truetype/ttf-kacst/KacstDecorative.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Lohit-Hindi << /Path (/usr/share/fonts/truetype/ttf-indic-fonts-core/lohit_hi.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Umpush-LightOblique << /Path (/usr/share/fonts/truetype/thai/Umpush-LightOblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/FreeSerif-Medium << /Path (/usr/share/fonts/truetype/freefont/FreeSerif.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/mry_KacstQurn << /Path (/usr/share/fonts/truetype/ttf-kacst/mry_KacstQurn.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/KacstDigital << /Path (/usr/share/fonts/truetype/ttf-kacst/KacstDigital.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/DejaVu-Sans-Mono-Bold << /Path (/usr/share/fonts/truetype/ttf-dejavu/DejaVuSansMono-Bold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Lohit-Gujarati << /Path (/usr/share/fonts/truetype/ttf-indic-fonts-core/lohit_gu.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/LiberationMono-Regular << /Path (/usr/share/fonts/truetype/ttf-liberation/LiberationMono-Regular.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/KacstLetter << /Path (/usr/share/fonts/truetype/ttf-kacst/KacstLetter.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/TlwgTypo << /Path (/usr/share/fonts/truetype/thai/TlwgTypo.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/msbm10 << /Path (/usr/share/fonts/truetype/latex-xft-fonts/msbm10.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/TlwgMono-Bold << /Path (/usr/share/fonts/truetype/thai/TlwgMono-Bold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/DejaVu-Sans-Mono << /Path (/usr/share/fonts/truetype/ttf-dejavu/DejaVuSansMono.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Norasi-Italic << /Path (/usr/share/fonts/truetype/thai/Norasi-Italic.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/KacstTitleL << /Path (/usr/share/fonts/truetype/ttf-kacst/KacstTitleL.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/TlwgTypewriter << /Path (/usr/share/fonts/truetype/thai/TlwgTypewriter.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/FreeMono-Medium << /Path (/usr/share/fonts/truetype/freefont/FreeMono.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Norasi-Oblique << /Path (/usr/share/fonts/truetype/thai/Norasi-Oblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/TlwgTypewriter-Oblique << /Path (/usr/share/fonts/truetype/thai/TlwgTypewriter-Oblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Phetsarath << /Path (/usr/share/fonts/truetype/ttf-lao/Phetsarath_OT.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/mukti << /Path (/usr/share/fonts/truetype/ttf-indic-fonts-core/MuktiNarrow.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Sawasdee-Oblique << /Path (/usr/share/fonts/truetype/thai/SawasdeeOblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/cmr10 << /Path (/usr/share/fonts/truetype/latex-xft-fonts/cmr10.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Umpush-Light << /Path (/usr/share/fonts/truetype/thai/Umpush-Light.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Umpush-Bold << /Path (/usr/share/fonts/truetype/thai/Umpush-Bold.ttf) /FontType 1 /FAPI 
/FreeType /SubfontId 0 >> ; +/DejaVu-Serif-Bold << /Path (/usr/share/fonts/truetype/ttf-dejavu/DejaVuSerif-Bold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/KacstTitle << /Path (/usr/share/fonts/truetype/ttf-kacst/KacstTitle.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Norasi << /Path (/usr/share/fonts/truetype/thai/Norasi.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Kinnari-Oblique << /Path (/usr/share/fonts/truetype/thai/Kinnari-Oblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/muktinarrow << /Path (/usr/share/fonts/truetype/ttf-indic-fonts-core/MuktiNarrowBold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Kinnari-Italic << /Path (/usr/share/fonts/truetype/thai/Kinnari-Italic.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/kacstPen << /Path (/usr/share/fonts/truetype/ttf-kacst/kacstPen.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Kinnari-BoldOblique << /Path (/usr/share/fonts/truetype/thai/Kinnari-BoldOblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/TlwgTypewriter-Bold << /Path (/usr/share/fonts/truetype/thai/TlwgTypewriter-Bold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/FreeMono-Oblique << /Path (/usr/share/fonts/truetype/freefont/FreeMonoOblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/FreeSans-Medium << /Path (/usr/share/fonts/truetype/freefont/FreeSans.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/LiberationSerif-Regular << /Path (/usr/share/fonts/truetype/ttf-liberation/LiberationSerif-Regular.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Umpush << /Path (/usr/share/fonts/truetype/thai/Umpush.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Sawasdee << /Path (/usr/share/fonts/truetype/thai/Sawasdee.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/TlwgMono << /Path (/usr/share/fonts/truetype/thai/TlwgMono.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/KacstQurn << /Path (/usr/share/fonts/truetype/ttf-kacst/KacstQurn.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Kinnari << /Path (/usr/share/fonts/truetype/thai/Kinnari.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/TlwgMono-BoldOblique << /Path (/usr/share/fonts/truetype/thai/TlwgMono-BoldOblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/KacstScreen << /Path (/usr/share/fonts/truetype/ttf-kacst/KacstScreen.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/FreeSans-Bold << /Path (/usr/share/fonts/truetype/freefont/FreeSansBold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/msam10 << /Path (/usr/share/fonts/truetype/latex-xft-fonts/msam10.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/cmmi10 << /Path (/usr/share/fonts/truetype/latex-xft-fonts/cmmi10.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Lohit-Tamil << /Path (/usr/share/fonts/truetype/ttf-indic-fonts-core/lohit_ta.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/TlwgTypist << /Path (/usr/share/fonts/truetype/thai/TlwgTypist.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Waree-Bold << /Path (/usr/share/fonts/truetype/thai/Waree-Bold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Kinnari-Bold << /Path (/usr/share/fonts/truetype/thai/Kinnari-Bold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Loma-Bold << /Path (/usr/share/fonts/truetype/thai/Loma-Bold.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/LiberationSans-BoldItalic << /Path (/usr/share/fonts/truetype/ttf-liberation/LiberationSans-BoldItalic.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Loma-BoldOblique << /Path 
(/usr/share/fonts/truetype/thai/Loma-BoldOblique.ttf) /FontType 1 /FAPI /FreeType /SubfontId 0 >> ; +/Palatino-Italic /URWPalladioL-Ital ; +/Palatino-Bold /URWPalladioL-Bold ; +/AvantGarde-BookOblique /URWGothicL-BookObli ; +/Times-Bold /NimbusRomNo9L-Medi ; +/HelveticaNarrow-BoldOblique /NimbusSanL-BoldCondItal ; +/Times-Roman /NimbusRomNo9L-Regu ; +/NewCenturySchlbk-Italic /CenturySchL-Ital ; +/HelveticaNarrow /NimbusSanL-ReguCond ; +/Helvetica-Narrow-Bold /NimbusSanL-BoldCond ; +/Bookman-Light /URWBookmanL-Ligh ; +/Palatino-BoldItalic /URWPalladioL-BoldItal ; +/Traditional /KacstBook ; +/Times-BoldItalic /NimbusRomNo9L-MediItal ; +/AvantGarde-Book /URWGothicL-Book ; +/AvantGarde-DemiOblique /URWGothicL-DemiObli ; +/Helvetica-Narrow-Oblique /NimbusSanL-ReguCondItal ; +/Helvetica-Bold /NimbusSanL-Bold ; +/Courier-Oblique /NimbusMonL-ReguObli ; +/Times-Italic /NimbusRomNo9L-ReguItal ; +/Courier /NimbusMonL-Regu ; +/Bookman-Demi /URWBookmanL-DemiBold ; +/Helvetica-BoldOblique /NimbusSanL-BoldItal ; +/Helvetica-Oblique /NimbusSanL-ReguItal ; +/HelveticaNarrow-Oblique /NimbusSanL-ReguCondItal ; +/NewCenturySchlbk-BoldItalic /CenturySchL-BoldItal ; +/Courier-BoldOblique /NimbusMonL-BoldObli ; +/HelveticaNarrow-Bold /NimbusSanL-BoldCond ; +/AvantGarde-Demi /URWGothicL-Demi ; +/Bookman-LightItalic /URWBookmanL-LighItal ; +/ZapfDingbats /Dingbats ; +/Helvetica-Narrow-BoldOblique /NimbusSanL-BoldCondItal ; +/ZapfChancery-MediumItalic /URWChanceryL-MediItal ; +/Helvetica /NimbusSanL-Regu ; +/Bookman-DemiItalic /URWBookmanL-DemiBoldItal ; +/Palatino-Roman /URWPalladioL-Roma ; +/NewCenturySchlbk-Bold /CenturySchL-Bold ; +/NewCenturySchlbk-Roman /CenturySchL-Roma ; +/Courier-Bold /NimbusMonL-Bold ; +/Arabic /KacstBook ; +/Helvetica-Narrow /NimbusSanL-ReguCond ; diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/others/Fontmap --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/data/fonts/others/Fontmap Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,116 @@ +/LMTypewriter10-CapsOblique (lmtcso10.pfb) ; +/Dingbats (d050000l.pfb) ; +/URWBookmanL-DemiBoldItal (b018035l.pfb) ; +/LMSansQuotation8-Bold (lmssqbx8.pfb) ; +/Symbol (Symbol.pfb) ; +/LMTypewriterVarWd10-DarkOblique (lmvtko10.pfb) ; +/LMRoman10-Demi (lmb10.pfb) ; +/URWPalladioL-Ital (p052023l.pfb) ; +/LMTypewriter10-DarkOblique (lmtko10.pfb) ; +/NimbusSanL-Regu (n019003l.pfb) ; +/LMTypewriter10-Italic (lmtti10.pfb) ; +/LMSansQuotation8-BoldOblique (lmssqbo8.pfb) ; +/URWPalladioL-Roma (p052003l.pfb) ; +/LMTypewriterVarWd10-Light (lmvtl10.pfb) ; +/NimbusRomNo9L-Medi (n021004l.pfb) ; +/NimbusSanL-ReguItal (n019023l.pfb) ; +/NimbusMonL-Regu (n022003l.pfb) ; +/LMSans10-Bold (lmssbx10.pfb) ; +/LMRoman10-CapsOblique (lmcsco10.pfb) ; +/CenturySchL-Roma (c059013l.pfb) ; +/URWGothicL-BookObli (a010033l.pfb) ; +/LMTypewriter10-LightCondensedOblique (lmtlco10.pfb) ; +/LMSans10-DemiCondensedOblique (lmssdo10.pfb) ; +/LMRoman10-CapsRegular (lmcsc10.pfb) ; +/CenturySchL-BoldItal (c059036l.pfb) ; +/LMRoman10-DemiOblique (lmbo10.pfb) ; +/LMRoman10-Unslanted (lmu10.pfb) ; +/LMRoman10-Bold (lmbx10.pfb) ; +/LMSans10-DemiCondensed (lmssdc10.pfb) ; +/URWChanceryL-MediItal (z003034l.pfb) ; +/URWGothicL-DemiObli (a010035l.pfb) ; +/LMTypewriterVarWd10-Oblique (lmvtto10.pfb) ; +/NimbusMonL-Bold (n022004l.pfb) ; +/LMTypewriter10-Oblique (lmtto10.pfb) ; +/LMRoman10-BoldItalic (lmbxi10.pfb) ; +/NimbusSanL-ReguCond (n019043l.pfb) ; +/CenturySchL-Bold (c059016l.pfb) ; 
+/LMTypewriterVarWd10-Regular (lmvtt10.pfb) ; +/URWBookmanL-Ligh (b018012l.pfb) ; +/LMSansQuotation8-Regular (lmssq8.pfb) ; +/LMSans10-Regular (lmss10.pfb) ; +/LMSans10-Oblique (lmsso10.pfb) ; +/NimbusSanL-BoldCond (n019044l.pfb) ; +/LMRoman10-Regular (lmr10.pfb) ; +/LMTypewriter10-LightCondensed (lmtlc10.pfb) ; +/LMTypewriterVarWd10-Dark (lmvtk10.pfb) ; +/LMTypewriter10-CapsRegular (lmtcsc10.pfb) ; +/LMSansQuotation8-Oblique (lmssqo8.pfb) ; +/StandardSymL (s050000l.pfb) ; +/NimbusRomNo9L-Regu (n021003l.pfb) ; +/LMTypewriterVarWd10-LightOblique (lmvtlo10.pfb) ; +/URWPalladioL-BoldItal (p052024l.pfb) ; +/CenturySchL-Ital (c059033l.pfb) ; +/LMRoman10-Dunhill (lmdunh10.pfb) ; +/URWPalladioL-Bold (p052004l.pfb) ; +/URWGothicL-Book (a010013l.pfb) ; +/LMTypewriter10-Dark (lmtk10.pfb) ; +/NimbusSanL-BoldItal (n019024l.pfb) ; +/URWGothicL-Demi (a010015l.pfb) ; +/LMTypewriter10-LightOblique (lmtlo10.pfb) ; +/LMTypewriter10-Light (lmtl10.pfb) ; +/NimbusSanL-BoldCondItal (n019064l.pfb) ; +/LMRoman10-Italic (lmri10.pfb) ; +/LMRoman10-DunhillOblique (lmduno10.pfb) ; +/NimbusMonL-ReguObli (n022023l.pfb) ; +/LMRoman10-Oblique (lmro10.pfb) ; +/NimbusSanL-ReguCondItal (n019063l.pfb) ; +/NimbusRomNo9L-MediItal (n021024l.pfb) ; +/LMRoman10-BoldOblique (lmbxo10.pfb) ; +/URWBookmanL-DemiBold (b018015l.pfb) ; +/NimbusSanL-Bold (n019004l.pfb) ; +/LMSans10-BoldOblique (lmssbo10.pfb) ; +/URWBookmanL-LighItal (b018032l.pfb) ; +/NimbusMonL-BoldObli (n022024l.pfb) ; +/NimbusRomNo9L-ReguItal (n021023l.pfb) ; +/LMTypewriter10-Regular (lmtt10.pfb) ; +/Palatino-Italic /URWPalladioL-Ital ; +/Palatino-Bold /URWPalladioL-Bold ; +/AvantGarde-BookOblique /URWGothicL-BookObli ; +/Times-Bold /NimbusRomNo9L-Medi ; +/HelveticaNarrow-BoldOblique /NimbusSanL-BoldCondItal ; +/Times-Roman /NimbusRomNo9L-Regu ; +/NewCenturySchlbk-Italic /CenturySchL-Ital ; +/HelveticaNarrow /NimbusSanL-ReguCond ; +/Helvetica-Narrow-Bold /NimbusSanL-BoldCond ; +/Bookman-Light /URWBookmanL-Ligh ; +/Palatino-BoldItalic /URWPalladioL-BoldItal ; +/Traditional /KacstBook ; +/Times-BoldItalic /NimbusRomNo9L-MediItal ; +/AvantGarde-Book /URWGothicL-Book ; +/AvantGarde-DemiOblique /URWGothicL-DemiObli ; +/Helvetica-Narrow-Oblique /NimbusSanL-ReguCondItal ; +/Helvetica-Bold /NimbusSanL-Bold ; +/Courier-Oblique /NimbusMonL-ReguObli ; +/Times-Italic /NimbusRomNo9L-ReguItal ; +/Courier /NimbusMonL-Regu ; +/Bookman-Demi /URWBookmanL-DemiBold ; +/Helvetica-BoldOblique /NimbusSanL-BoldItal ; +/Helvetica-Oblique /NimbusSanL-ReguItal ; +/HelveticaNarrow-Oblique /NimbusSanL-ReguCondItal ; +/NewCenturySchlbk-BoldItalic /CenturySchL-BoldItal ; +/Courier-BoldOblique /NimbusMonL-BoldObli ; +/HelveticaNarrow-Bold /NimbusSanL-BoldCond ; +/AvantGarde-Demi /URWGothicL-Demi ; +/Bookman-LightItalic /URWBookmanL-LighItal ; +/ZapfDingbats /Dingbats ; +/Helvetica-Narrow-BoldOblique /NimbusSanL-BoldCondItal ; +/ZapfChancery-MediumItalic /URWChanceryL-MediItal ; +/Helvetica /NimbusSanL-Regu ; +/Bookman-DemiItalic /URWBookmanL-DemiBoldItal ; +/Palatino-Roman /URWPalladioL-Roma ; +/NewCenturySchlbk-Bold /CenturySchL-Bold ; +/NewCenturySchlbk-Roman /CenturySchL-Roma ; +/Courier-Bold /NimbusMonL-Bold ; +/Arabic /KacstBook ; +/Helvetica-Narrow /NimbusSanL-ReguCond ; diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/others/cidfmap --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/data/fonts/others/cidfmap Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,10 @@ +/Dotum-Bold << /FileType /TrueType /Path 
(/usr/share/fonts/truetype/unfonts/UnDotumBold.ttf) /SubfontID 0 /CSI [(Korea1) 0] >> ; +/ZenHei << /FileType /TrueType /Path (/usr/share/fonts/truetype/wqy/wqy-zenhei.ttf) /SubfontID 0 /CSI [(GB1) 0] >> ; +/Batang-Regular << /FileType /TrueType /Path (/usr/share/fonts/truetype/unfonts/UnBatang.ttf) /SubfontID 0 /CSI [(Korea1) 0] >> ; +/VL-PGothic-Regular << /FileType /TrueType /Path (/usr/share/fonts/truetype/vlgothic/VL-PGothic-Regular.ttf) /SubfontID 0 /CSI [(Japan1) 0] >> ; +/Dotum-Regular << /FileType /TrueType /Path (/usr/share/fonts/truetype/unfonts/UnDotum.ttf) /SubfontID 0 /CSI [(Korea1) 0] >> ; +/VL-Gothic-Regular-JaH << /FileType /TrueType /Path (/usr/share/fonts/truetype/vlgothic/VL-Gothic-Regular.ttf) /SubfontID 0 /CSI [(Japan2) 0] >> ; +/VL-Gothic-Regular << /FileType /TrueType /Path (/usr/share/fonts/truetype/vlgothic/VL-Gothic-Regular.ttf) /SubfontID 0 /CSI [(Japan1) 0] >> ; +/VL-PGothic-Regular-JaH << /FileType /TrueType /Path (/usr/share/fonts/truetype/vlgothic/VL-PGothic-Regular.ttf) /SubfontID 0 /CSI [(Japan2) 0] >> ; +/ZenHei-CNS << /FileType /TrueType /Path (/usr/share/fonts/truetype/wqy/wqy-zenhei.ttf) /SubfontID 0 /CSI [(CNS1) 0] >> ; +/Batang-Bold << /FileType /TrueType /Path (/usr/share/fonts/truetype/unfonts/UnBatangBold.ttf) /SubfontID 0 /CSI [(Korea1) 0] >> ; diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/COPYRIGHT.TXT --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/COPYRIGHT.TXT Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,124 @@ +Bitstream Vera Fonts Copyright + +The fonts have a generous copyright, allowing derivative works (as +long as "Bitstream" or "Vera" are not in the names), and full +redistribution (so long as they are not *sold* by themselves). They +can be be bundled, redistributed and sold with any software. + +The fonts are distributed under the following copyright: + +Copyright +========= + +Copyright (c) 2003 by Bitstream, Inc. All Rights Reserved. Bitstream +Vera is a trademark of Bitstream, Inc. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of the fonts accompanying this license ("Fonts") and associated +documentation files (the "Font Software"), to reproduce and distribute +the Font Software, including without limitation the rights to use, +copy, merge, publish, distribute, and/or sell copies of the Font +Software, and to permit persons to whom the Font Software is furnished +to do so, subject to the following conditions: + +The above copyright and trademark notices and this permission notice +shall be included in all copies of one or more of the Font Software +typefaces. + +The Font Software may be modified, altered, or added to, and in +particular the designs of glyphs or characters in the Fonts may be +modified and additional glyphs or characters may be added to the +Fonts, only if the fonts are renamed to names not containing either +the words "Bitstream" or the word "Vera". + +This License becomes null and void to the extent applicable to Fonts +or Font Software that has been modified and is distributed under the +"Bitstream Vera" names. + +The Font Software may be sold as part of a larger software package but +no copy of one or more of the Font Software typefaces may be sold by +itself. 
+ +THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL +BITSTREAM OR THE GNOME FOUNDATION BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, +OR CONSEQUENTIAL DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR +OTHERWISE, ARISING FROM, OUT OF THE USE OR INABILITY TO USE THE FONT +SOFTWARE OR FROM OTHER DEALINGS IN THE FONT SOFTWARE. + +Except as contained in this notice, the names of Gnome, the Gnome +Foundation, and Bitstream Inc., shall not be used in advertising or +otherwise to promote the sale, use or other dealings in this Font +Software without prior written authorization from the Gnome Foundation +or Bitstream Inc., respectively. For further information, contact: +fonts at gnome dot org. + +Copyright FAQ +============= + + 1. I don't understand the resale restriction... What gives? + + Bitstream is giving away these fonts, but wishes to ensure its + competitors can't just drop the fonts as is into a font sale system + and sell them as is. It seems fair that if Bitstream can't make money + from the Bitstream Vera fonts, their competitors should not be able to + do so either. You can sell the fonts as part of any software package, + however. + + 2. I want to package these fonts separately for distribution and + sale as part of a larger software package or system. Can I do so? + + Yes. A RPM or Debian package is a "larger software package" to begin + with, and you aren't selling them independently by themselves. + See 1. above. + + 3. Are derivative works allowed? + Yes! + + 4. Can I change or add to the font(s)? + Yes, but you must change the name(s) of the font(s). + + 5. Under what terms are derivative works allowed? + + You must change the name(s) of the fonts. This is to ensure the + quality of the fonts, both to protect Bitstream and Gnome. We want to + ensure that if an application has opened a font specifically of these + names, it gets what it expects (though of course, using fontconfig, + substitutions could still could have occurred during font + opening). You must include the Bitstream copyright. Additional + copyrights can be added, as per copyright law. Happy Font Hacking! + + 6. If I have improvements for Bitstream Vera, is it possible they might get + adopted in future versions? + + Yes. The contract between the Gnome Foundation and Bitstream has + provisions for working with Bitstream to ensure quality additions to + the Bitstream Vera font family. Please contact us if you have such + additions. Note, that in general, we will want such additions for the + entire family, not just a single font, and that you'll have to keep + both Gnome and Jim Lyles, Vera's designer, happy! To make sense to add + glyphs to the font, they must be stylistically in keeping with Vera's + design. Vera cannot become a "ransom note" font. Jim Lyles will be + providing a document describing the design elements used in Vera, as a + guide and aid for people interested in contributing to Vera. + + 7. I want to sell a software package that uses these fonts: Can I do so? + + Sure. Bundle the fonts with your software and sell your software + with the fonts. That is the intent of the copyright. + + 8. If applications have built the names "Bitstream Vera" into them, + can I override this somehow to use fonts of my choosing? 
+ + This depends on exact details of the software. Most open source + systems and software (e.g., Gnome, KDE, etc.) are now converting to + use fontconfig (see www.fontconfig.org) to handle font configuration, + selection and substitution; it has provisions for overriding font + names and subsituting alternatives. An example is provided by the + supplied local.conf file, which chooses the family Bitstream Vera for + "sans", "serif" and "monospace". Other software (e.g., the XFree86 + core server) has other mechanisms for font substitution. + diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/README.TXT --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/README.TXT Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,11 @@ +Contained herin is the Bitstream Vera font family. + +The Copyright information is found in the COPYRIGHT.TXT file (along +with being incoporated into the fonts themselves). + +The releases notes are found in the file "RELEASENOTES.TXT". + +We hope you enjoy Vera! + + Bitstream, Inc. + The Gnome Project diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/RELEASENOTES.TXT --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/RELEASENOTES.TXT Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,162 @@ +Bitstream Vera Fonts - April 16, 2003 +===================================== + +The version number of these fonts is 1.10 to distinguish them from the +beta test fonts. + +Note that the Vera copyright is incorporated in the fonts themselves. +The License field in the fonts contains the copyright license as it +appears below. The TrueType copyright field is not large enough to +contain the full license, so the license is incorporated (as you might +think if you thought about it) into the license field, which +unfortunately can be obscure to find. (In pfaedit, see: Element->Font +Info->TTFNames->License). + +Our apologies for it taking longer to complete the fonts than planned. +Beta testers requested a tighter line spacing (less leading) and Jim +Lyles redesigned Vera's accents to bring its line spacing to more +typical of other fonts. This took additional time and effort. Our +thanks to Jim for this effort above and beyond the call of duty. + +There are four monospace and sans faces (normal, oblique, bold, bold +oblique) and two serif faces (normal and bold). Fontconfig/Xft2 (see +www.fontconfig.org) can artificially oblique the serif faces for you: +this loses hinting and distorts the faces slightly, but is visibly +different than normal and bold, and reasonably pleasing. + +On systems with fontconfig 2.0 or 2.1 installed, making your sans, +serif and monospace fonts default to these fonts is very easy. Just +drop the file local.conf into your /etc/fonts directory. This will +make the Bitstream fonts your default fonts for all applications using +fontconfig (if sans, serif, or monospace names are used, as they often +are as default values in many desktops). The XML in local.conf may +need modification to enable subpixel decimation, if appropriate, +however, the commented out phrase does so for XFree86 4.3, in the case +that the server does not have sufficient information to identify the +use of a flat panel. Fontconfig 2.2 adds Vera to the list of font +families and will, by default use it as the default sans, serif and +monospace fonts. 
+ +During the testing of the final Vera fonts, we learned that screen +fonts in general are only typically hinted to work correctly at +integer pixel sizes. Vera is coded internally for integer sizes only. +We need to investigate further to see if there are commonly used fonts +that are hinted to be rounded but are not rounded to integer sizes due +to oversights in their coding. + +Most fonts work best at 8 pixels and below if anti-aliased only, as +the amount of work required to hint well at smaller and smaller sizes +becomes astronomical. GASP tables are typically used to control +whether hinting is used or not, but Freetype/Xft does not currently +support GASP tables (which are present in Vera). + +To mitigate this problem, both for Vera and other fonts, there will be +(very shortly) a new fontconfig 2.2 release that will, by default not +apply hints if the size is below 8 pixels. if you should have a font +that in fact has been hinted more agressively, you can use fontconfig +to note this exception. We believe this should improve many hinted +fonts in addition to Vera, though implemeting GASP support is likely +the right long term solution. + +Font rendering in Gnome or KDE is the combination of algorithms in +Xft2 and Freetype, along with hinting in the fonts themselves. It is +vital to have sufficient information to disentangle problems that you +may observe. + +Note that having your font rendering system set up correctly is vital +to proper judgement of problems of the fonts: + + * Freetype may or may not be configured to in ways that may + implement execution of possibly patented (in some parts of the world) + TrueType hinting algorithms, particularly at small sizes. Best + results are obtained while using these algorithms. + + * The freetype autohinter (used when the possibly patented + algorithms are not used) continues to improve with each release. If + you are using the autohinter, please ensure you are using an up to + date version of freetype before reporting problems. + + * Please identify what version of freetype you are using in any + bug reports, and how your freetype is configured. + + * Make sure you are not using the freetype version included in + XFree86 4.3, as it has bugs that significantly degrade most fonts, + including Vera. if you build XFree86 4.3 from source yourself, you may + have installed this broken version without intending it (as I + did). Vera was verified with the recently released Freetype 2.1.4. On + many systems, 'ldd" can be used to see which freetype shared library + is actually being used. + + * Xft/X Render does not (yet) implement gamma correction. This + causes significant problems rendering white text on a black background + (causing partial pixels to be insufficiently shaded) if the gamma of + your monitor has not been compensated for, and minor problems with + black text on a while background. The program "xgamma" can be used to + set a gamma correction value in the X server's color pallette. Most + monitors have a gamma near 2. + + * Note that the Vera family uses minimal delta hinting. Your + results on other systems when not used anti-aliased may not be + entirely satisfying. We are primarily interested in reports of + problems on open source systems implementing Xft2/fontconfig/freetype + (which implements antialiasing and hinting adjustements, and + sophisticated subpixel decimation on flatpanels). 
Also, the + algorithms used by Xft2 adjust the hints to integer widths and the + results are crisper on open source systems than on Windows or + MacIntosh. + + * Your fontconfig may (probably does) predate the release of + fontconfig 2.2, and you may see artifacts not present when the font is + used at very small sizes with hinting enabled. "vc-list -V" can be + used to see what version you have installed. + +We believe and hope that these fonts will resolve the problems +reported during beta test. The largest change is the reduction of +leading (interline spacing), which had annoyed a number of people, and +reduced Vera's utility for some applcations. The Vera monospace font +should also now make '0' and 'O' and '1' and 'l' more clearly +distinguishable. + +The version of these fonts is version 1.10. Fontconfig should be +choosing the new version of the fonts if both the released fonts and +beta test fonts are installed (though please discard them: they have +names of form tt20[1-12]gn.ttf). Note that older versions of +fontconfig sometimes did not rebuild their cache correctly when new +fonts are installed: please upgrade to fontconfig 2.2. "fc-cache -f" +can be used to force rebuilding fontconfig's cache files. + +If you note problems, please send them to fonts at gnome dot org, with +exactly which face and size and unicode point you observe the problem +at. The xfd utility from XFree86 CVS may be useful for this (e.g. "xfd +-fa sans"). A possibly more useful program to examine fonts at a +variety of sizes is the "waterfall" program found in Keith Packard's +CVS. + + $ cvs -d :pserver:anoncvs@keithp.com:/local/src/CVS login + Logging in to :pserver:anoncvs@keithp.com:2401/local/src/CVS + CVS password: + $ cvs -d :pserver:anoncvs@keithp.com:/local/src/CVS co waterfall + $ cd waterfall + $ xmkmf -a + $ make + # make install + # make install.man + +Again, please make sure you are running an up-to-date freetype, and +that you are only examining integer sizes. + +Reporting Problems +================== + +Please send problem reports to fonts at gnome org, with the following +information: + + 1. Version of Freetype, Xft2 and fontconfig + 2. Whether TT hinting is being used, or the autohinter + 3. Application being used + 4. Character/Unicode code point that has problems (if applicable) + 5. Version of which operating system + 6. Please include a screenshot, when possible. + +Please check the fonts list archives before reporting problems to cut +down on duplication. 
diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/Vera.ttf Binary file data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/Vera.ttf has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/VeraBI.ttf Binary file data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/VeraBI.ttf has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/VeraBd.ttf Binary file data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/VeraBd.ttf has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/VeraIt.ttf Binary file data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/VeraIt.ttf has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/VeraMoBI.ttf Binary file data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/VeraMoBI.ttf has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/VeraMoBd.ttf Binary file data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/VeraMoBd.ttf has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/VeraMoIt.ttf Binary file data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/VeraMoIt.ttf has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/VeraMono.ttf Binary file data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/VeraMono.ttf has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/VeraSe.ttf Binary file data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/VeraSe.ttf has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/VeraSeBd.ttf Binary file data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/VeraSeBd.ttf has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/local.conf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/data/fonts/vera/local.conf Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,32 @@ + + + + + + + + serif + + Bitstream Vera Serif + + + + sans-serif + + Bitstream Vera Sans + + + + monospace + + Bitstream Vera Sans Mono + + + diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/pictures/.DS_Store Binary file data_generation/transformations/pycaptcha/Captcha/data/pictures/.DS_Store has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/1.jpeg Binary file data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/1.jpeg has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/10.jpeg Binary file data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/10.jpeg has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/11.jpeg Binary file data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/11.jpeg has changed diff -r 6f606b359df3 -r a9af079892ce 
data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/12.jpeg Binary file data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/12.jpeg has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/2.jpeg Binary file data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/2.jpeg has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/3.jpeg Binary file data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/3.jpeg has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/4.jpeg Binary file data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/4.jpeg has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/5.jpeg Binary file data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/5.jpeg has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/6.jpeg Binary file data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/6.jpeg has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/7.jpeg Binary file data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/7.jpeg has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/8.jpeg Binary file data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/8.jpeg has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/9.jpeg Binary file data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/9.jpeg has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/data/pictures/abstract/README Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,3 @@ +These images were created by the author with Fyre, expressly for PyCAPTCHA. 
+ +Copyright (c) 2004 Micah Dowty diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/pictures/nature/Craig_Barrington_ocotillo_and_mountains.jpeg Binary file data_generation/transformations/pycaptcha/Captcha/data/pictures/nature/Craig_Barrington_ocotillo_and_mountains.jpeg has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/pictures/nature/Kerry_Carloy_Chisos_Sunset.jpeg Binary file data_generation/transformations/pycaptcha/Captcha/data/pictures/nature/Kerry_Carloy_Chisos_Sunset.jpeg has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/pictures/nature/Paul_Dowty_Mt_Bross.jpeg Binary file data_generation/transformations/pycaptcha/Captcha/data/pictures/nature/Paul_Dowty_Mt_Bross.jpeg has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/pictures/nature/README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/data/pictures/nature/README Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,2 @@ +These are uncopyrighted images gathered from various sources, +including the author's family and national park service web sites. \ No newline at end of file diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/words/README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/data/words/README Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,4 @@ +These word lists are from various sources: + +basic-english: + http://simple.wikipedia.org/wiki/Basic_English_Alphabetical_Wordlist diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/words/basic-english --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/data/words/basic-english Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,852 @@ +a +able +about +account +acid +across +act +addition +adjustment +advertisement +agreement +after +again +against +air +all +almost +among +amount +amusement +and +angle +angry +animal +answer +ant +any +apparatus +apple +approval +arch +argument +arm +army +art +as +at +attack +attempt +attention +attraction +authority +automatic +awake +baby +back +bad +bag +balance +ball +band +base +basin +basket +bath +be +beautiful +because +bed +bee +before +behavior +belief +bell +bent +berry +between +bird +birth +bit +bite +bitter +black +blade +blood +blow +blue +board +boat +body +boiling +bone +book +boot +bottle +box +boy +brain +brake +branch +brass +bread +breath +brick +bridge +bright +broken +brother +brown +brush +bucket +building +bulb +burn +burst +business +but +butter +button +by +cake +camera +canvas +card +care +carriage +cart +cat +cause +certain +chain +chalk +chance +change +cheap +cheese +chemical +chest +chief +chin +church +circle +clean +clear +clock +cloth +cloud +coal +coat +cold +collar +color +comb +come +comfort +committee +common +company +comparison +competition +complete +complex +condition +connection +conscious +control +cook +copper +copy +cord +cork +cotton +cough +country +cover +cow +crack +credit +crime +cruel +crush +cry +cup +current +curtain +curve +cushion +cut +damage +danger +dark +daughter +day +dead +dear +death +debt +decision +deep +degree +delicate +dependent +design +desire +destruction +detail +development +different +digestion +direction +dirty +discovery +discussion +disease +disgust +distance 
+distribution +division +do +dog +door +down +doubt +drain +drawer +dress +drink +driving +drop +dry +dust +ear +early +earth +east +edge +education +effect +egg +elastic +electric +end +engine +enough +equal +error +even +event +ever +every +example +exchange +existence +expansion +experience +expert +eye +face +fact +fall +false +family +far +farm +fat +father +fear +feather +feeble +feeling +female +fertile +fiction +field +fight +finger +fire +first +fish +fixed +flag +flame +flat +flight +floor +flower +fly +fold +food +foolish +foot +for +force +fork +form +forward +fowl +frame +free +frequent +friend +from +front +fruit +full +future +garden +general +get +girl +give +glass +glove +go +goat +gold +good +government +grain +grass +great +green +grey/gray +grip +group +growth +guide +gun +hair +hammer +hand +hanging +happy +harbor +hard +harmony +hat +hate +have +he +head +healthy +hearing +heart +heat +help +here +high +history +hole +hollow +hook +hope +horn +horse +hospital +hour +house +how +humor +ice +idea +if +ill +important +impulse +in +increase +industry +ink +insect +instrument +insurance +interest +invention +iron +island +jelly +jewel +join +journey +judge +jump +keep +kettle +key +kick +kind +kiss +knee +knife +knot +knowledge +land +language +last +late +laugh +law +lead +leaf +learning +leather +left +leg +let +letter +level +library +lift +light +like +limit +line +linen +lip +liquid +list +little +less +least +living +lock +long +loose +loss +loud +love +low +machine +make +male +man +manager +map +mark +market +married +match +material +mass +may +meal +measure +meat +medical +meeting +memory +metal +middle +military +milk +mind +mine +minute +mist +mixed +money +monkey +month +moon +morning +mother +motion +mountain +mouth +move +much +more +most +muscle +music +nail +name +narrow +nation +natural +near +necessary +neck +need +needle +nerve +net +new +news +night +no +noise +normal +north +nose +not +note +now +number +nut +observation +of +off +offer +office +oil +old +on +only +open +operation +opposite +opinion +other +or +orange +order +organization +ornament +out +oven +over +owner +page +pain +paint +paper +parallel +parcel +part +past +paste +payment +peace +pen +pencil +person +physical +picture +pig +pin +pipe +place +plane +plant +plate +play +please +pleasure +plough/plow +pocket +point +poison +polish +political +poor +porter +position +possible +pot +potato +powder +power +present +price +print +prison +private +probable +process +produce +profit +property +prose +protest +public +pull +pump +punishment +purpose +push +put +quality +question +quick +quiet +quite +rail +rain +range +rat +rate +ray +reaction +red +reading +ready +reason +receipt +record +regret +regular +relation +religion +representative +request +respect +responsible +rest +reward +rhythm +rice +right +ring +river +road +rod +roll +roof +room +root +rough +round +rub +rule +run +sad +safe +sail +salt +same +sand +say +scale +school +science +scissors +screw +sea +seat +second +secret +secretary +see +seed +selection +self +send +seem +sense +separate +serious +servant +sex +shade +shake +shame +sharp +sheep +shelf +ship +shirt +shock +shoe +short +shut +side +sign +silk +silver +simple +sister +size +skin +skirt +sky +sleep +slip +slope +slow +small +smash +smell +smile +smoke +smooth +snake +sneeze +snow +so +soap +society +sock +soft +solid +some +son +song +sort +sound +south +soup +space +spade +special +sponge +spoon +spring +square +stamp +stage +star +start +statement +station 
+steam +stem +steel +step +stick +still +stitch +stocking +stomach +stone +stop +store +story +strange +street +stretch +sticky +stiff +straight +strong +structure +substance +sugar +suggestion +summer +support +surprise +such +sudden +sun +sweet +swim +system +table +tail +take +talk +tall +taste +tax +teaching +tendency +test +than +that +the +then +theory +there +thick +thin +thing +this +thought +thread +throat +though +through +thumb +thunder +ticket +tight +tired +till +time +tin +to +toe +together +tomorrow +tongue +tooth +top +touch +town +trade +train +transport +tray +tree +trick +trousers +true +trouble +turn +twist +umbrella +under +unit +use +up +value +verse +very +vessel +view +violent +voice +walk +wall +waiting +war +warm +wash +waste +watch +water +wave +wax +way +weather +week +weight +well +west +wet +wheel +when +where +while +whip +whistle +white +who +why +wide +will +wind +window +wine +wing +winter +wire +wise +with +woman +wood +wool +word +work +worm +wound +writing +wrong +year +yellow +yes +yesterday +you +young \ No newline at end of file diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Captcha/data/words/characters --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Captcha/data/words/characters Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,62 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/Facade.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/Facade.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,35 @@ +#!/usr/bin/env python +import sys, os +curdir = os.path.dirname(__file__) +if curdir != '': + sys.path.append(curdir) + +from Captcha.Visual.Tests import PseudoGimpy, AngryGimpy +import numpy + +# Une fonction simple pour generer un captcha +# ease : represente la difficulte du captcha a generer +# 0 = facile et 1 (ou autre chose) = difficile +#solution : specifie si on veut en retour un array numpy representant +#l image ou un tuple contenant l'array et la solution du captcha. + +# Des fontes additionnelles peuvent etre ajoutees au dossier pyCaptcha/Captcha/data/fonts/others +# Le programme choisit une fonte aleatoirement dans ce dossier ainsi que le dossir vera. + + +def generateCaptcha (ease=0, solution=0): + + if ease == 1: + g = AngryGimpy() + + else: + g = PseudoGimpy() + + i = g.render() + a = numpy.asarray(i) + + if solution == 0: + return a + + else : + return (a, g.solutions) diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/README Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,51 @@ +====================== +Python CAPTCHA package +====================== + +About +----- + +This is the PyCAPTCHA package, a collection of Python modules +implementing CAPTCHAs: automated tests that humans should pass, +but current computer programs can't. These tests are often +used for security. + +See http://www.captcha.net for more information and examples. + +This project was started because the CIA project, written in +Python, needed a CAPTCHA to automate its user creation process +safely. 
All existing implementations the author could find were +written in Java or for the .NET framework, so a simple Python +alternative was needed. + +Examples +-------- + +Included are several example programs: + + - simple_example.py is a bare-bones example that just generates + and displays an image. + + - http_example.py is a longer example that uses BaseHTTPServer + to simulate a CAPTCHA's use in a web environment. Running this + example and connecting to it from your web browser is a quick + and easy way to see PyCAPTCHA in action + + - modpython_example.py is a version of http_example that runs + from an Apache server equipped with a properly configured + mod_python. + + +Dependencies +------------ + +- Python 2.2.1 or later +- the Python Imaging Library, required for visual CAPTCHAs + + +Contacts +-------- + +Micah Dowty + +'scanline' on irc.freenode.net diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/__init__.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,1 @@ +__all__ = ['Facade','Captcha'] diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/http_example.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/http_example.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,154 @@ +#!/usr/bin/env python +# +# An example that presents CAPTCHA tests in a web environment +# and gives the user a chance to solve them. Run it, optionally +# specifying a port number on the command line, then point your web +# browser at the given URL. +# + +from Captcha.Visual import Tests +from Captcha import Factory +import BaseHTTPServer, urlparse, sys + + +class RequestHandler(BaseHTTPServer.BaseHTTPRequestHandler): + def do_GET(self): + scheme, host, path, parameters, query, fragment = urlparse.urlparse(self.path) + + # Split the path into segments + pathSegments = path.split('/')[1:] + + # Split the query into key-value pairs + args = {} + for pair in query.split("&"): + if pair.find("=") >= 0: + key, value = pair.split("=", 1) + args.setdefault(key, []).append(value) + else: + args[pair] = [] + + # A hack so it works with a proxy configured for VHostMonster :) + if pathSegments[0] == "vhost": + pathSegments = pathSegments[3:] + + if pathSegments[0] == "": + self.handleRootPage(args.get('test', Tests.__all__)[0]) + + elif pathSegments[0] == "images": + self.handleImagePage(pathSegments[1]) + + elif pathSegments[0] == "solutions": + self.handleSolutionPage(pathSegments[1], args['word'][0]) + + else: + self.handle404() + + def handle404(self): + self.send_response(404) + self.send_header("Content-Type", "text/html") + self.end_headers() + self.wfile.write("

No such resource

") + + def handleRootPage(self, testName): + self.send_response(200) + self.send_header("Content-Type", "text/html") + self.end_headers() + + test = self.captchaFactory.new(getattr(Tests, testName)) + + # Make a list of tests other than the one we're using + others = [] + for t in Tests.__all__: + if t != testName: + others.append('
  • %s
  • ' % (t,t)) + others = "\n".join(others) + + self.wfile.write(""" + +PyCAPTCHA Example + + +

    PyCAPTCHA Example

    +

    + %s: + %s +

    + +

    +

    +

    + Enter the word shown: + +
    +

    + +

    +Or try... +

      +%s +
    +

    + + + +""" % (test.__class__.__name__, test.__doc__, test.id, test.id, others)) + + def handleImagePage(self, id): + test = self.captchaFactory.get(id) + if not test: + return self.handle404() + + self.send_response(200) + self.send_header("Content-Type", "image/jpeg") + self.end_headers() + test.render().save(self.wfile, "JPEG") + + def handleSolutionPage(self, id, word): + test = self.captchaFactory.get(id) + if not test: + return self.handle404() + + if not test.valid: + # Invalid tests will always return False, to prevent + # random trial-and-error attacks. This could be confusing to a user... + result = "Test invalidated, try another test" + elif test.testSolutions([word]): + result = "Correct" + else: + result = "Incorrect" + + self.send_response(200) + self.send_header("Content-Type", "text/html") + self.end_headers() + self.wfile.write(""" + +PyCAPTCHA Example + + +

    PyCAPTCHA Example

    +

    %s

    +

    +

    %s

    +

    You guessed: %s

    +

    Possible solutions: %s

    +

    Try again

    + + +""" % (test.__class__.__name__, test.id, result, word, ", ".join(test.solutions))) + + +def main(port): + print "Starting server at http://localhost:%d/" % port + handler = RequestHandler + handler.captchaFactory = Factory() + BaseHTTPServer.HTTPServer(('', port), RequestHandler).serve_forever() + +if __name__ == "__main__": + # The port number can be specified on the command line, default is 8080 + if len(sys.argv) >= 2: + port = int(sys.argv[1]) + else: + port = 8080 + main(port) + +### The End ### diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/modpython_example.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/modpython_example.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,113 @@ +# +# An example that presents CAPTCHA tests in a web environment +# and gives the user a chance to solve them. +# +# This example is for use with Apache using mod_python and its +# Publisher handler. For example, if your apache configuration +# included something like: +# +# AddHandler python-program .py +# PythonHandler mod_python.publisher +# +# You could place this script anywhere in your web space to see +# the demo. +# +# --Micah +# + +from Captcha.Visual import Tests +import Captcha +from mod_python import apache + + +def _getFactory(req): + return Captcha.PersistentFactory("/tmp/pycaptcha_%s" % req.interpreter) + + +def test(req, name=Tests.__all__[0]): + """Show a newly generated CAPTCHA of the given class. + Default is the first class name given in Tests.__all__ + """ + test = _getFactory(req).new(getattr(Tests, name)) + + # Make a list of tests other than the one we're using + others = [] + for t in Tests.__all__: + if t != name: + others.append('
  • %s
  • ' % (t,t)) + others = "\n".join(others) + + return """ + +PyCAPTCHA Example + + +

    PyCAPTCHA Example (for mod_python)

    +

    + %s: + %s +

    + +

    +

    +

    + Enter the word shown: + + +
    +

    + +

    +Or try... +

      +%s +
    +

    + + + +""" % (test.__class__.__name__, test.__doc__, test.id, test.id, others) + + +def image(req, id): + """Generate an image for the CAPTCHA with the given ID string""" + test = _getFactory(req).get(id) + if not test: + raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND + req.content_type = "image/jpeg" + test.render().save(req, "JPEG") + return apache.OK + + +def solution(req, id, word): + """Grade a CAPTCHA given a solution word""" + test = _getFactory(req).get(id) + if not test: + raise apache.SERVER_RETURN, apache.HTTP_NOT_FOUND + + if not test.valid: + # Invalid tests will always return False, to prevent + # random trial-and-error attacks. This could be confusing to a user... + result = "Test invalidated, try another test" + elif test.testSolutions([word]): + result = "Correct" + else: + result = "Incorrect" + + return """ + +PyCAPTCHA Example + + +

    PyCAPTCHA Example

    +

    %s

    +

    +

    %s

    +

    You guessed: %s

    +

    Possible solutions: %s

    +

    Try again

    + + +""" % (test.__class__.__name__, test.id, result, word, ", ".join(test.solutions)) + +### The End ### diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/output.png Binary file data_generation/transformations/pycaptcha/output.png has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/setup.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/setup.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,28 @@ +#!/usr/bin/env python +from distutils.core import setup +from setup.my_install_data import * + +setup (name = "PyCAPTCHA", + version = "0.4", + description = "A Python framework for CAPTCHA tests", + maintainer = "Micah Dowty", + maintainer_email = "micah@navi.cx", + license = "MIT", + packages = [ + 'Captcha', + 'Captcha.Visual', + ], + cmdclass = { + 'install_data': my_install_data, + }, + data_files = [Data_Files( + preserve_path = 1, + base_dir = 'install_lib', + copy_to = 'Captcha/data', + strip_dirs = 2, + template = [ + 'graft Captcha/data', + ], + )], + ) + diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/setup/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/setup/__init__.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,1 @@ +# Extra modules for use with distutils diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/setup/my_install_data.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/setup/my_install_data.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,191 @@ +"""my_install_data.py + +Provides a more sophisticated facility to install data files +than distutils' install_data does. +You can specify your files as a template like in MANIFEST.in +and you have more control over the copy process. + +Copyright 2000 by Rene Liebscher, Germany. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +Note: +This licence is only for this file. +PyOpenGL has its own licence. (it is almost identical.) 
+""" + +# created 2000/08/01, Rene Liebscher + +########################################################################### +# import some modules we need + +import os,sys,string +from types import StringType,TupleType,ListType +from distutils.util import change_root +from distutils.filelist import FileList +from distutils.command.install_data import install_data + +########################################################################### +# a container class for our more sophisticated install mechanism + +class Data_Files: + """ container for list of data files. + supports alternate base_dirs e.g. 'install_lib','install_header',... + supports a directory where to copy files + supports templates as in MANIFEST.in + supports preserving of paths in filenames + eg. foo/xyz is copied to base_dir/foo/xyz + supports stripping of leading dirs of source paths + eg. foo/bar1/xyz, foo/bar2/abc can be copied to bar1/xyz, bar2/abc + """ + + def __init__(self,base_dir=None,files=None,copy_to=None,template=None,preserve_path=0,strip_dirs=0): + self.base_dir = base_dir + self.files = files + self.copy_to = copy_to + self.template = template + self.preserve_path = preserve_path + self.strip_dirs = strip_dirs + self.finalized = 0 + + def warn (self, msg): + sys.stderr.write ("warning: %s: %s\n" % + ("install_data", msg)) + + def debug_print (self, msg): + """Print 'msg' to stdout if the global DEBUG (taken from the + DISTUTILS_DEBUG environment variable) flag is true. + """ + from distutils.core import DEBUG + if DEBUG: + print msg + + + def finalize(self): + """ complete the files list by processing the given template """ + if self.finalized: + return + if self.files == None: + self.files = [] + if self.template != None: + if type(self.template) == StringType: + self.template = string.split(self.template,";") + filelist = FileList(self.warn,self.debug_print) + for line in self.template: + filelist.process_template_line(string.strip(line)) + filelist.sort() + filelist.remove_duplicates() + self.files.extend(filelist.files) + self.finalized = 1 + +# end class Data_Files + +########################################################################### +# a more sophisticated install routine than distutils install_data + +class my_install_data (install_data): + + def check_data(self,d): + """ check if data are in new format, if not create a suitable object. 
+ returns finalized data object + """ + if not isinstance(d, Data_Files): + self.warn(("old-style data files list found " + "-- please convert to Data_Files instance")) + if type(d) is TupleType: + if len(d) != 2 or not (type(d[1]) is ListType): + raise DistutilsSetupError, \ + ("each element of 'data_files' option must be an " + "Data File instance, a string or 2-tuple (string,[strings])") + d = Data_Files(copy_to=d[0],files=d[1]) + else: + if not (type(d) is StringType): + raise DistutilsSetupError, \ + ("each element of 'data_files' option must be an " + "Data File instance, a string or 2-tuple (string,[strings])") + d = Data_Files(files=[d]) + d.finalize() + return d + + def run(self): + self.outfiles = [] + install_cmd = self.get_finalized_command('install') + + for d in self.data_files: + d = self.check_data(d) + + install_dir = self.install_dir + # alternative base dir given => overwrite install_dir + if d.base_dir != None: + install_dir = getattr(install_cmd,d.base_dir) + + # copy to an other directory + if d.copy_to != None: + if not os.path.isabs(d.copy_to): + # relatiev path to install_dir + dir = os.path.join(install_dir, d.copy_to) + elif install_cmd.root: + # absolute path and alternative root set + dir = change_root(self.root,d.copy_to) + else: + # absolute path + dir = d.copy_to + else: + # simply copy to install_dir + dir = install_dir + # warn if necceassary + self.warn("setup script did not provide a directory to copy files to " + " -- installing right in '%s'" % install_dir) + + dir=os.path.normpath(dir) + # create path + self.mkpath(dir) + + # copy all files + for src in d.files: + if d.strip_dirs > 0: + dst = string.join(string.split(src,os.sep)[d.strip_dirs:],os.sep) + else: + dst = src + if d.preserve_path: + # preserve path in filename + self.mkpath(os.path.dirname(os.path.join(dir,dst))) + out = self.copy_file(src, os.path.join(dir,dst)) + else: + out = self.copy_file(src, dir) + if type(out) is TupleType: + out = out[0] + self.outfiles.append(out) + + return self.outfiles + + def get_inputs (self): + inputs = [] + for d in self.data_files: + d = self.check_data(d) + inputs.append(d.files) + return inputs + + def get_outputs (self): + return self.outfiles + + +########################################################################### + diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/simple_example.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/simple_example.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# +# A very simple example that creates a random image from the +# PseudoGimpy CAPTCHA, saves and shows it, and prints the list +# of solutions. Normally you would call testSolutions rather +# than reading this list yourself. 
+# +from Captcha.Visual.Tests import PseudoGimpy, AngryGimpy +import numpy +#from numpy import * + +#g = AngryGimpy() +#i = g.render() +#a = numpy.asarray(i) +#b = numpy.zeros((2, 2), numpy.int8) +#c = a == b +#print c +#i.save("output.png") +#i.show() +#print a +#print g.solutions diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/test.png Binary file data_generation/transformations/pycaptcha/test.png has changed diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/pycaptcha/transformations.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/pycaptcha/transformations.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,25 @@ + +import Numeric, Image + #""" Transforme une image PIL en objet numpy.array et vice versa""" + + +def image2array(im): + #""" image vers array numpy""" + if im.mode not in ("L", "F"): + raise ValueError, "can only convert single-layer images" + if im.mode == "L": + a = Numeric.fromstring(im.tostring(), Numeric.UnsignedInt8) + else: + a = Numeric.fromstring(im.tostring(), Numeric.Float32) + a.shape = im.size[1], im.size[0] + return a + +def array2image(a): + #""" array numpy vers image""" + if a.typecode() == Numeric.UnsignedInt8: + mode = "L" + elif a.typecode() == Numeric.Float32: + mode = "F" + else: + raise ValueError, "unsupported image mode" + return Image.fromstring(mode, (a.shape[1], a.shape[0]), a.tostring()) diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/slant.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/slant.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,104 @@ +#!/usr/bin/python +# coding: utf-8 + +''' +Author: Youssouf + +this module add a slant effect to the image. + +To obtain the slant effect, each row of the array is shifted proportionately by a step controlled by the complexity. + +''' + +import numpy + + +class Slant(): + def __init__(self, complexity=1): + #---------- private attributes + self.direction = 1 + self.angle = 0 + + #---------- generation parameters + self.regenerate_parameters(complexity) + #------------------------------------------------ + + def _get_current_parameters(self): + return [self.angle, self.direction] + + def get_settings_names(self): + return ['angle', 'direction'] + + def regenerate_parameters(self, complexity): + self.angle = numpy.random.uniform(0.0, complexity) + P = numpy.random.uniform() + self.direction = 1; + if P < 0.5: + self.direction = -1; + return self._get_current_parameters() + + + def transform_image(self,image): + if self.angle == 0: + return image + + ysize, xsize = image.shape + slant = self.direction*self.angle + + output = image.copy() + + # shift all the rows + for i in range(ysize): + line = image[i] + delta = round((i*slant)) % xsize + line1 = line[:xsize-delta] + line2 = line[xsize-delta:xsize] + + output[i][delta:xsize] = line1 + output[i][0:delta] = line2 + + + #correction to center the image + correction = (self.direction)*round(self.angle*ysize/2) + correction = (xsize - correction) % xsize + + # center the region + line1 = output[0:ysize,0:xsize-correction].copy() + line2 = output[0:ysize,xsize-correction:xsize].copy() + output[0:ysize,correction:xsize] = line1 + output[0:ysize,0:correction] = line2 + + + return output + + +# Test function +# Load an image in local and create several samples of the effect on the +# original image with different parameter. All the samples are saved in a single image, the 1st image being the original. 
+ +def test_slant(): + import scipy + img_name = "test_img/mnist_0.png" + dest_img_name = "test_img/slanted.png" + nb_samples = 10 + im = Image.open(img_name) + im = im.convert("L") + image = numpy.asarray(im) + + image_final = image + slant = Slant() + for i in range(nb_samples): + slant.regenerate_parameters(1) + image_slant = slant.transform_image(image) + image_final = scipy.hstack((image_final,image_slant)) + + im = Image.fromarray(image_final.astype('uint8'), "L") + im.save(dest_img_name) + +# Test +if __name__ == '__main__': + import sys, os, fnmatch + import Image + + test_slant() + diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/testmod.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/testmod.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,130 @@ +# This script is to test your modules to see if they conform to the module API +# defined on the wiki. +import random, numpy, gc, time, math, sys + +# this is an example module that does stupid image value shifting + +class DummyModule(object): + def get_settings_names(self): + return ['value'] + + def regenerate_parameters(self, complexity): + self._value = random.gauss(0, 0.5*complexity) + return [self._value] + + def transform_image(self, image): + return numpy.clip(image+self._value, 0, 1) + +#import + +# instanciate your class here (rather than DummyModule) +mod = DummyModule() + +def error(msg): + print "ERROR:", msg + sys.exit(1) + +def warn(msg): + print "WARNING:", msg + +def timeit(f, lbl): + + gc.disable() + t = time.time() + f() + est = time.time() - t + gc.enable() + + loops = max(1, int(10**math.floor(math.log(10/est, 10)))) + + gc.disable() + t = time.time() + for _ in xrange(loops): + f() + + print lbl, "(", loops, "loops ):", (time.time() - t)/loops, "s" + gc.enable() + +######################## +# get_settings_names() # +######################## + +print "Testing get_settings_names()" + +names = mod.get_settings_names() + +if type(names) is not list: + error("Must return a list") + +if not all(type(e) is str for e in names): + warn("The elements of the list should be strings") + +########################### +# regenerate_parameters() # +########################### + +print "Testing regenerate_parameters()" + +params = mod.regenerate_parameters(0.2) + +if type(params) is not list: + error("Must return a list") + +if len(params) != len(names): + error("the returned parameter list must have the same length as the number of parameters") + +params2 = mod.regenerate_parameters(0.2) +if len(names) != 0 and params == params2: + error("the complexity parameter determines the distribution of the parameters, not their value") + +mod.regenerate_parameters(0.0) +mod.regenerate_parameters(1.0) + +mod.regenerate_parameters(0.5) + +##################### +# transform_image() # +##################### + +print "Testing transform_image()" + +imgr = numpy.random.random_sample((32, 32)).astype(numpy.float32) +img1 = numpy.ones((32, 32), dtype=numpy.float32) +img0 = numpy.zeros((32, 32), dtype=numpy.float32) + +resr = mod.transform_image(imgr) + +if type(resr) is not numpy.ndarray: + error("Must return an ndarray") + +if resr.shape != (32, 32): + error("Must return 32x32 array") + +if resr.dtype != numpy.float32: + error("Must return float32 array") + +res1 = mod.transform_image(img1) +res0 = mod.transform_image(img0) + +if res1.max() > 1.0 or res0.max() > 1.0: + error("Must keep array values between 0 and 1") + +if res1.min() < 0.0 or res0.min() < 0.0: + error("Must keep array values 
between 0 and 1") + +mod.regenerate_parameters(0.0) +mod.transform_image(imgr) +mod.regenerate_parameters(1.0) +mod.transform_image(imgr) + +print "Bonus Stage: timings" + +timeit(lambda: None, "empty") +timeit(lambda: mod.regenerate_parameters(0.5), "regenerate_parameters()") +timeit(lambda: mod.transform_image(imgr), "tranform_image()") + +def f(): + mod.regenerate_parameters(0.2) + mod.transform_image(imgr) + +timeit(f, "regen and transform") diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/thick.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/thick.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,198 @@ +#!/usr/bin/python +# coding: utf-8 + +''' +Simple implementation of random thickness deformation using morphological +operation of scipy. +Only one morphological operation applied (dilation or erosion), the kernel is random +out of a list of 12 symmetric kernels. (only 5 to be chosen for erosion because it can +hurt the recognizability of the charater and 12 for dilation). + +Author: Xavier Glorot + +''' + +import scipy.ndimage.morphology +import numpy as N + + +class Thick(): + def __init__(self,complexity = 1): + #---------- private attributes + self.__nx__ = 32 #xdim of the images + self.__ny__ = 32 #ydim of the images + self.__erodemax__ = 5 #nb of index max of erode structuring elements + self.__dilatemax__ = 9 #nb of index max of dilation structuring elements + self.__structuring_elements__ = [N.asarray([[1,1]]),N.asarray([[1],[1]]),\ + N.asarray([[1,1],[1,1]]),N.asarray([[0,1,0],[1,1,1],[0,1,0]]),\ + N.asarray([[1,1,1],[1,1,1]]),N.asarray([[1,1],[1,1],[1,1]]),\ + N.asarray([[1,1,1],[1,1,1],[1,1,1]]),\ + N.asarray([[1,1,1,1],[1,1,1,1],[1,1,1,1]]),\ + N.asarray([[1,1,1],[1,1,1],[1,1,1],[1,1,1]]),\ + N.asarray([[0,0,1,0,0],[0,1,1,1,0],[1,1,1,1,1],[0,1,1,1,0],[0,0,1,0,0]]),\ + N.asarray([[1,1,1,1],[1,1,1,1]]),N.asarray([[1,1],[1,1],[1,1],[1,1]])] + #------------------------------------------------ + + #---------- generation parameters + self.regenerate_parameters(complexity) + #------------------------------------------------ + + def _get_current_parameters(self): + return [self.thick_param] + + def get_settings_names(self): + return ['thick_param'] + + def regenerate_parameters(self, complexity): + self.erodenb = N.ceil(complexity * self.__erodemax__) + self.dilatenb = N.ceil(complexity * self.__dilatemax__) + self.Perode = self.erodenb / (self.dilatenb + self.erodenb + 1.0) + self.Pdilate = self.dilatenb / (self.dilatenb + self.erodenb + 1.0) + assert (self.Perode + self.Pdilate <= 1) & (self.Perode + self.Pdilate >= 0) + assert (complexity >= 0) & (complexity <= 1) + P = N.random.uniform() + if P>1-(self.Pdilate+self.Perode): + if P>1-(self.Pdilate+self.Perode)+self.Perode: + self.meth = 1 + self.nb=N.random.randint(self.dilatenb) + else: + self.meth = -1 + self.nb=N.random.randint(self.erodenb) + else: + self.meth = 0 + self.nb = -1 + self.thick_param = self.meth*self.nb + return self._get_current_parameters() + + def transform_1_image(self,image): #the real transformation method + if self.meth!=0: + maxi = float(N.max(image)) + mini = float(N.min(image)) + + imagenorm=image/maxi + + if self.meth==1: + trans=scipy.ndimage.morphology.grey_dilation\ + (imagenorm,size=self.__structuring_elements__[self.nb].shape,structure=self.__structuring_elements__[self.nb]) + else: + trans=scipy.ndimage.morphology.grey_erosion\ + (imagenorm,size=self.__structuring_elements__[self.nb].shape,structure=self.__structuring_elements__[self.nb]) + 
+ #------renormalizing + maxit = N.max(trans) + minit = N.min(trans) + trans= N.asarray((trans - (minit+mini)) / (maxit - (minit+mini)) * maxi,dtype=image.dtype) + #-------- + return trans + else: + return image + + def transform_image(self,image): #handling different format + if image.shape == (self.__nx__,self.__ny__): + return self.transform_1_image(image) + if image.ndim == 3: + newimage = copy.copy(image) + for i in range(image.shape[0]): + newimage[i,:,:] = self.transform_1_image(image[i,:,:]) + return newimage + if image.ndim == 2 and image.shape != (self.__nx__,self.__ny__): + newimage = N.reshape(image,(image.shape[0],self.__nx__,self.__ny__)) + for i in range(image.shape[0]): + newimage[i,:,:] = self.transform_1_image(newimage[i,:,:]) + return N.reshape(newimage,image.shape) + if image.ndim == 1: + newimage = N.reshape(image,(self.__nx__,self.__ny__)) + newimage = self.transform_1_image(newimage) + return N.reshape(newimage,image.shape) + assert False #should never go there + + + + +#test on NIST (you need pylearn and access to NIST to do that) + +if __name__ == '__main__': + + from pylearn.io import filetensor as ft + import copy + import pygame + import time + datapath = '/data/lisa/data/nist/by_class/' + f = open(datapath+'digits/digits_train_data.ft') + d = ft.read(f) + + pygame.surfarray.use_arraytype('numpy') + + pygame.display.init() + screen = pygame.display.set_mode((8*4*32,8*32),0,8) + anglcolorpalette=[(x,x,x) for x in xrange(0,256)] + screen.set_palette(anglcolorpalette) + + MyThick = Thick() + + #debut=time.time() + #MyThick.transform_image(d) + #fin=time.time() + #print '------------------------------------------------' + #print d.shape[0],' images transformed in :', fin-debut, ' seconds' + #print '------------------------------------------------' + #print (fin-debut)/d.shape[0]*1000000,' microseconds per image' + #print '------------------------------------------------' + #print MyThick.get_settings_names() + #print MyThick._get_current_parameters() + #print MyThick.regenerate_parameters(0) + #print MyThick.regenerate_parameters(0.5) + #print MyThick.regenerate_parameters(1) + for i in range(10000): + a=d[i,:] + b=N.asarray(N.reshape(a,(32,32))).T + + new=pygame.surfarray.make_surface(b) + new=pygame.transform.scale2x(new) + new=pygame.transform.scale2x(new) + new=pygame.transform.scale2x(new) + new.set_palette(anglcolorpalette) + screen.blit(new,(0,0)) + + #max dilation + MyThick.meth=1 + MyThick.nb=MyThick.__dilatemax__ + c=MyThick.transform_image(a) + b=N.asarray(N.reshape(c,(32,32))).T + + new=pygame.surfarray.make_surface(b) + new=pygame.transform.scale2x(new) + new=pygame.transform.scale2x(new) + new=pygame.transform.scale2x(new) + new.set_palette(anglcolorpalette) + screen.blit(new,(8*32,0)) + + #max erosion + MyThick.meth=-1 + MyThick.nb=MyThick.__erodemax__ + c=MyThick.transform_image(a) + b=N.asarray(N.reshape(c,(32,32))).T + + new=pygame.surfarray.make_surface(b) + new=pygame.transform.scale2x(new) + new=pygame.transform.scale2x(new) + new=pygame.transform.scale2x(new) + new.set_palette(anglcolorpalette) + screen.blit(new,(8*2*32,0)) + + #random + print MyThick.get_settings_names(), MyThick.regenerate_parameters(1) + c=MyThick.transform_image(a) + b=N.asarray(N.reshape(c,(32,32))).T + + new=pygame.surfarray.make_surface(b) + new=pygame.transform.scale2x(new) + new=pygame.transform.scale2x(new) + new=pygame.transform.scale2x(new) + new.set_palette(anglcolorpalette) + screen.blit(new,(8*3*32,0)) + + pygame.display.update() + raw_input('Press Enter') + + 
pygame.display.quit() diff -r 6f606b359df3 -r a9af079892ce data_generation/transformations/ttf2jpg.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_generation/transformations/ttf2jpg.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,123 @@ +#!/usr/bin/python +# -*- coding: iso-8859-1 -*- + +''' + Implementation of font image generator + download fonts from http://www.dafont.com for exemple + + Author: Guillaume Sicard +''' + +import sys, os, fnmatch, random +import Image, ImageFont, ImageDraw, numpy +import cPickle + +class ttf2jpg(): + def __init__(self, font_file = ''): + self.w = 32 + self.h = 32 + self.font_dir = '/Tmp/allfonts/' + self.font_file = font_file + self.image_dir = './images/' + self.pattern = '*.ttf' + self.char_list = [] + for i in range(0,10): + self.char_list.append(chr(ord('0') + i) ) + for i in range(0,26): + self.char_list.append(chr(ord('A') + i) ) + for i in range(0,26): + self.char_list.append(chr(ord('a') + i) ) + f = open( self.font_dir + 'filelist.pkl' ,'r') + self.font_files = cPickle.load(f) + f.close() + + # get font name + def get_settings_names(self): + return [self.font_file] + + # save an image + def save_image(self,array, filename = ''): + image = (array * 255.0).astype('int') + image = Image.fromarray(image).convert('L') + if (filename != ''): + image.save(filename) + else: + image.show() + + # set a random font for character generation + def set_random_font(self): + i = random.randint(0, len(self.font_files) - 1) + self.font_file = self.font_dir + self.font_files[i] + + # return a picture array of "text" with font "font_file" + def create_image(self, text): + # create a w x h black picture, and a drawing space + image = Image.new('L', (self.w, self.h), 'Black') + draw = ImageDraw.Draw(image) + + # load the font with the right size + font = ImageFont.truetype(self.font_file, 28) + d_w,d_h = draw.textsize(text, font=font) + + # write text and aligns it + draw.text(((32 - d_w) / 2, ((32 - d_h) / 2)), text, font=font, fill='White') + + image = numpy.asarray(image) + image = (image / 255.0).astype(numpy.float32) + + return image + + # write all the letters and numbers into pictures + def process_font(self): + for i in range(0, len(self.char_list) ): + image = self.create_image(self.char_list[i]) + self.save_image(image, self.image_dir + self.char_list[i] + '-' + os.path.basename(self.font_file) + '.jpg') + sys.stdout.write('.') + sys.stdout.flush() + return (len(self.char_list)) + + # generate the character from the font_file and returns a numpy array + def generate_image_from_char(self, character, font_file = ''): + if (font_file != ''): + self.font_file = font_file + + return self.create_image(character) + + # generate random character from random font file as a numpy array + def generate_image(self): + self.set_random_font() + i = random.randint(0, len(self.char_list) - 1) + return self.generate_image_from_char(self.char_list[i]), i + + # test method, create character images for all fonts in "font_dir" in dir "image_dir" + def test(self): + import time + + # look for ttf files + files = os.listdir(self.font_dir) + font_files = fnmatch.filter(files, self.pattern) + + # create "image_dir" if it doesn't exist + if not os.path.isdir(self.image_dir): + os.mkdir(self.image_dir) + + sys.stdout.write( str(len(font_files)) + ' fonts found, generating jpg images in folder ' + self.image_dir ) + sys.stdout.flush() + + # main loop + t = time.time() + n = 0 + + for font_file in font_files: + self.font_file = self.font_dir + font_file + n += self.process_font() 
+ t = time.time() - t + + sys.stdout.write('\nall done!\n' + str(n) + ' images generated in ' + str(t) + 's (average : ' + str(1000 * t / n) + ' ms/im)\n') + +if __name__ == '__main__': + + myttf2jpg = ttf2jpg() + #myttf2jpg.test() + image, i = myttf2jpg.generate_image() + myttf2jpg.save_image(image, '') diff -r 6f606b359df3 -r a9af079892ce datasets/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datasets/__init__.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,1 @@ +from defs import * diff -r 6f606b359df3 -r a9af079892ce datasets/dataset.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datasets/dataset.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,43 @@ +from dsetiter import DataIterator + +class DataSet(object): + def test(self, batchsize, bufsize=None): + r""" + Returns an iterator over the test examples. + + Parameters + batchsize (int) -- the size of the minibatches + bufsize (int, optional) -- the size of the in-memory buffer, + 0 to disable. + """ + return self._return_it(batchsize, bufsize, self._test) + + def train(self, batchsize, bufsize=None): + r""" + Returns an iterator over the training examples. + + Parameters + batchsize (int) -- the size of the minibatches + bufsize (int, optional) -- the size of the in-memory buffer, + 0 to disable. + """ + return self._return_it(batchsize, bufsize, self._train) + + def valid(self, batchsize, bufsize=None): + r""" + Returns an iterator over the validation examples. + + Parameters + batchsize (int) -- the size of the minibatches + bufsize (int, optional) -- the size of the in-memory buffer, + 0 to disable. + """ + return self._return_it(batchsize, bufsize, self._valid) + + def _return_it(batchsize, bufsize, data): + r""" + Must return an iterator over the specified dataset (`data`). + + Implement this in subclassses. 
+ """ + raise NotImplemented diff -r 6f606b359df3 -r a9af079892ce datasets/defs.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datasets/defs.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,57 @@ +__all__ = ['nist_digits', 'nist_lower', 'nist_upper', 'nist_all', 'ocr', + 'nist_P07', 'mnist'] + +from ftfile import FTDataSet +from gzpklfile import GzpklDataSet +import theano +import os + +# if the environmental variables exist, get the path from them, +# otherwise fall back on the default +NIST_PATH = os.getenv('NIST_PATH','/data/lisa/data/nist/by_class/') +DATA_PATH = os.getenv('DATA_PATH','/data/lisa/data/ift6266h10/') + +nist_digits = lambda maxsize=None: FTDataSet(train_data = [os.path.join(NIST_PATH,'digits/digits_train_data.ft')], + train_lbl = [os.path.join(NIST_PATH,'digits/digits_train_labels.ft')], + test_data = [os.path.join(NIST_PATH,'digits/digits_test_data.ft')], + test_lbl = [os.path.join(NIST_PATH,'digits/digits_test_labels.ft')], + indtype=theano.config.floatX, inscale=255., maxsize=maxsize) +nist_lower = lambda maxsize=None: FTDataSet(train_data = [os.path.join(NIST_PATH,'lower/lower_train_data.ft')], + train_lbl = [os.path.join(NIST_PATH,'lower/lower_train_labels.ft')], + test_data = [os.path.join(NIST_PATH,'lower/lower_test_data.ft')], + test_lbl = [os.path.join(NIST_PATH,'lower/lower_test_labels.ft')], + indtype=theano.config.floatX, inscale=255., maxsize=maxsize) +nist_upper = lambda maxsize=None: FTDataSet(train_data = [os.path.join(NIST_PATH,'upper/upper_train_data.ft')], + train_lbl = [os.path.join(NIST_PATH,'upper/upper_train_labels.ft')], + test_data = [os.path.join(NIST_PATH,'upper/upper_test_data.ft')], + test_lbl = [os.path.join(NIST_PATH,'upper/upper_test_labels.ft')], + indtype=theano.config.floatX, inscale=255., maxsize=maxsize) + +nist_all = lambda maxsize=None: FTDataSet(train_data = [os.path.join(DATA_PATH,'train_data.ft')], + train_lbl = [os.path.join(DATA_PATH,'train_labels.ft')], + test_data = [os.path.join(DATA_PATH,'test_data.ft')], + test_lbl = [os.path.join(DATA_PATH,'test_labels.ft')], + valid_data = [os.path.join(DATA_PATH,'valid_data.ft')], + valid_lbl = [os.path.join(DATA_PATH,'valid_labels.ft')], + indtype=theano.config.floatX, inscale=255., maxsize=maxsize) + +ocr = lambda maxsize=None: FTDataSet(train_data = [os.path.join(DATA_PATH,'ocr_train_data.ft')], + train_lbl = [os.path.join(DATA_PATH,'ocr_train_labels.ft')], + test_data = [os.path.join(DATA_PATH,'ocr_test_data.ft')], + test_lbl = [os.path.join(DATA_PATH,'ocr_test_labels.ft')], + valid_data = [os.path.join(DATA_PATH,'ocr_valid_data.ft')], + valid_lbl = [os.path.join(DATA_PATH,'ocr_valid_labels.ft')], + indtype=theano.config.floatX, inscale=255., maxsize=maxsize) + +#There is 2 more arguments here to can choose smaller datasets based on the file number. 
+#This is usefull to get different data for pre-training and finetuning +nist_P07 = lambda maxsize=None, min_file=0, max_file=100: FTDataSet(train_data = [os.path.join(DATA_PATH,'data/P07_train'+str(i)+'_data.ft') for i in range(min_file, max_file)], + train_lbl = [os.path.join(DATA_PATH,'data/P07_train'+str(i)+'_labels.ft') for i in range(min_file, max_file)], + test_data = [os.path.join(DATA_PATH,'data/P07_test_data.ft')], + test_lbl = [os.path.join(DATA_PATH,'data/P07_test_labels.ft')], + valid_data = [os.path.join(DATA_PATH,'data/P07_valid_data.ft')], + valid_lbl = [os.path.join(DATA_PATH,'data/P07_valid_labels.ft')], + indtype=theano.config.floatX, inscale=255., maxsize=maxsize) + +mnist = lambda maxsize=None: GzpklDataSet(os.path.join(DATA_PATH,'mnist.pkl.gz'), + maxsize=maxsize) diff -r 6f606b359df3 -r a9af079892ce datasets/dsetiter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datasets/dsetiter.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,167 @@ +import numpy + +class DummyFile(object): + def __init__(self, size): + self.size = size + + def read(self, num): + if num > self.size: + num = self.size + self.size -= num + return numpy.zeros((num, 3, 2)) + +class DataIterator(object): + + def __init__(self, files, batchsize, bufsize=None): + r""" + Makes an iterator which will read examples from `files` + and return them in `batchsize` lots. + + Parameters: + files -- list of numpy readers + batchsize -- (int) the size of returned batches + bufsize -- (int, default=None) internal read buffer size. + + Tests: + >>> d = DataIterator([DummyFile(930)], 10, 100) + >>> d.batchsize + 10 + >>> d.bufsize + 100 + >>> d = DataIterator([DummyFile(1)], 10) + >>> d.batchsize + 10 + >>> d.bufsize + 10000 + >>> d = DataIterator([DummyFile(1)], 99) + >>> d.batchsize + 99 + >>> d.bufsize + 9999 + >>> d = DataIterator([DummyFile(1)], 10, 121) + >>> d.batchsize + 10 + >>> d.bufsize + 120 + >>> d = DataIterator([DummyFile(1)], 10, 1) + >>> d.batchsize + 10 + >>> d.bufsize + 10 + >>> d = DataIterator([DummyFile(1)], 2000) + >>> d.batchsize + 2000 + >>> d.bufsize + 20000 + >>> d = DataIterator([DummyFile(1)], 2000, 31254) + >>> d.batchsize + 2000 + >>> d.bufsize + 30000 + >>> d = DataIterator([DummyFile(1)], 2000, 10) + >>> d.batchsize + 2000 + >>> d.bufsize + 2000 + """ + self.batchsize = batchsize + if bufsize is None: + self.bufsize = max(10*batchsize, 10000) + else: + self.bufsize = bufsize + self.bufsize -= self.bufsize % self.batchsize + if self.bufsize < self.batchsize: + self.bufsize = self.batchsize + self.files = iter(files) + self.curfile = self.files.next() + self.empty = False + self._fill_buf() + + def _fill_buf(self): + r""" + Fill the internal buffer. + + Will fill across files in case the current one runs out. + + Test: + >>> d = DataIterator([DummyFile(20)], 10, 10) + >>> d._fill_buf() + >>> d.curpos + 0 + >>> len(d.buffer) + 10 + >>> d = DataIterator([DummyFile(11), DummyFile(9)], 10, 10) + >>> d._fill_buf() + >>> len(d.buffer) + 10 + >>> d._fill_buf() + Traceback (most recent call last): + ... + StopIteration + >>> d = DataIterator([DummyFile(10), DummyFile(9)], 10, 10) + >>> d._fill_buf() + >>> len(d.buffer) + 9 + >>> d._fill_buf() + Traceback (most recent call last): + ... 
+ StopIteration + """ + self.buffer = None + if self.empty: + raise StopIteration + buf = self.curfile.read(self.bufsize) + + while len(buf) < self.bufsize: + try: + self.curfile = self.files.next() + except StopIteration: + self.empty = True + if len(buf) == 0: + raise + break + tmpbuf = self.curfile.read(self.bufsize - len(buf)) + buf = numpy.row_stack((buf, tmpbuf)) + + self.cursize = len(buf) + self.buffer = buf + self.curpos = 0 + + def __next__(self): + r""" + Returns the next portion of the dataset. + + Test: + >>> d = DataIterator([DummyFile(20)], 10, 20) + >>> len(d.next()) + 10 + >>> len(d.next()) + 10 + >>> d.next() + Traceback (most recent call last): + ... + StopIteration + >>> d.next() + Traceback (most recent call last): + ... + StopIteration + >>> d = DataIterator([DummyFile(13)], 10, 50) + >>> len(d.next()) + 10 + >>> len(d.next()) + 3 + >>> d.next() + Traceback (most recent call last): + ... + StopIteration + """ + if self.curpos >= self.cursize: + self._fill_buf() + res = self.buffer[self.curpos:self.curpos+self.batchsize] + self.curpos += self.batchsize + return res + + next = __next__ + + def __iter__(self): + return self diff -r 6f606b359df3 -r a9af079892ce datasets/ftfile.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datasets/ftfile.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,229 @@ +from pylearn.io.filetensor import _read_header, _prod +import numpy, theano +from dataset import DataSet +from dsetiter import DataIterator +from itertools import izip, imap + +class FTFile(object): + def __init__(self, fname, scale=1, dtype=None): + r""" + Tests: + >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft') + """ + self.file = open(fname, 'rb') + self.magic_t, self.elsize, _, self.dim, _ = _read_header(self.file, False) + self.size = self.dim[0] + self.scale = scale + self.dtype = dtype + + def skip(self, num): + r""" + Skips `num` items in the file. + + If `num` is negative, skips size-num. + + Tests: + >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft') + >>> f.size + 58646 + >>> f.elsize + 4 + >>> f.file.tell() + 20 + >>> f.skip(1000) + >>> f.file.tell() + 4020 + >>> f.size + 57646 + >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft') + >>> f.size + 58646 + >>> f.file.tell() + 20 + >>> f.skip(-1000) + >>> f.file.tell() + 230604 + >>> f.size + 1000 + """ + if num < 0: + num += self.size + if num < 0: + raise ValueError('Skipping past the start of the file') + if num >= self.size: + self.size = 0 + else: + self.size -= num + f_start = self.file.tell() + self.file.seek(f_start + (self.elsize * _prod(self.dim[1:]) * num)) + + def read(self, num): + r""" + Reads `num` elements from the file and return the result as a + numpy matrix. Last read is truncated. 
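Sketch of how FTFile combines with DataIterator to stream a file in minibatches (same label file as in the doctests; the `ift6266.datasets.*` import paths are assumed from the repository layout):

from ift6266.datasets.ftfile import FTFile
from ift6266.datasets.dsetiter import DataIterator

# Read the NIST digit test labels ten at a time.
labels = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft')
batches = DataIterator([labels], 10)
print batches.next()   # first batch: array([6, 7, 4, 7, 5, 6, 4, 8, 0, 9], dtype=int32)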
+ + Tests: + >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_labels.ft') + >>> f.read(1) + array([6], dtype=int32) + >>> f.read(10) + array([7, 4, 7, 5, 6, 4, 8, 0, 9, 6], dtype=int32) + >>> f.skip(58630) + >>> f.read(10) + array([9, 2, 4, 2, 8], dtype=int32) + >>> f.read(10) + array([], dtype=int32) + >>> f = FTFile('/data/lisa/data/nist/by_class/digits/digits_test_data.ft') + >>> f.read(1) + array([[0, 0, 0, ..., 0, 0, 0]], dtype=uint8) + """ + if num > self.size: + num = self.size + self.dim[0] = num + self.size -= num + res = numpy.fromfile(self.file, dtype=self.magic_t, count=_prod(self.dim)).reshape(self.dim) + if self.dtype is not None: + res = res.astype(self.dtype) + if self.scale != 1: + res /= self.scale + return res + +class FTSource(object): + def __init__(self, file, skip=0, size=None, maxsize=None, + dtype=None, scale=1): + r""" + Create a data source from a possible subset of a .ft file. + + Parameters: + `file` -- (string) the filename + `skip` -- (int, optional) amount of examples to skip from + the start of the file. If negative, skips + filesize - skip. + `size` -- (int, optional) truncates number of examples + read (after skipping). If negative truncates to + filesize - size (also after skipping). + `maxsize` -- (int, optional) the maximum size of the file + `dtype` -- (dtype, optional) convert the data to this + dtype after reading. + `scale` -- (number, optional) scale (that is divide) the + data by this number (after dtype conversion, if + any). + + Tests: + >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft') + >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1000) + >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=10) + >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=100, size=120) + """ + self.file = file + self.skip = skip + self.size = size + self.dtype = dtype + self.scale = scale + self.maxsize = maxsize + + def open(self): + r""" + Returns an FTFile that corresponds to this dataset. + + Tests: + >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft') + >>> f = s.open() + >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=1) + >>> len(s.open().read(2)) + 1 + >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646) + >>> s.open().size + 1000 + >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', skip=57646, size=1) + >>> s.open().size + 1 + >>> s = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft', size=-10) + >>> s.open().size + 58636 + """ + f = FTFile(self.file, scale=self.scale, dtype=self.dtype) + if self.skip != 0: + f.skip(self.skip) + if self.size is not None and self.size < f.size: + if self.size < 0: + f.size += self.size + if f.size < 0: + f.size = 0 + else: + f.size = self.size + if self.maxsize is not None and f.size > self.maxsize: + f.size = self.maxsize + return f + +class FTData(object): + r""" + This is a list of FTSources. 
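A small sketch of FTSource's negative `skip`/`size` semantics (figures taken from the doctests above, where the file holds 58646 examples; module path assumed):

from ift6266.datasets.ftfile import FTSource

head = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft',
                size=-1000)   # keep everything except the last 1000 examples
print head.open().size        # -> 57646

tail = FTSource('/data/lisa/data/nist/by_class/digits/digits_test_data.ft',
                skip=-1000)   # keep only the last 1000 examples
print tail.open().size        # -> 1000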
+ """ + def __init__(self, datafiles, labelfiles, skip=0, size=None, maxsize=None, + inscale=1, indtype=None, outscale=1, outdtype=None): + if maxsize is not None: + maxsize /= len(datafiles) + self.inputs = [FTSource(f, skip, size, maxsize, scale=inscale, dtype=indtype) + for f in datafiles] + self.outputs = [FTSource(f, skip, size, maxsize, scale=outscale, dtype=outdtype) + for f in labelfiles] + + def open_inputs(self): + return [f.open() for f in self.inputs] + + def open_outputs(self): + return [f.open() for f in self.outputs] + + +class FTDataSet(DataSet): + def __init__(self, train_data, train_lbl, test_data, test_lbl, + valid_data=None, valid_lbl=None, indtype=None, outdtype=None, + inscale=1, outscale=1, maxsize=None): + r""" + Defines a DataSet from a bunch of files. + + Parameters: + `train_data` -- list of train data files + `train_label` -- list of train label files (same length as `train_data`) + `test_data`, `test_labels` -- same thing as train, but for + test. The number of files + can differ from train. + `valid_data`, `valid_labels` -- same thing again for validation. + (optional) + `indtype`, `outdtype`, -- see FTSource.__init__() + `inscale`, `outscale` (optional) + `maxsize` -- maximum size of the set returned + + + If `valid_data` and `valid_labels` are not supplied then a sample + approximately equal in size to the test set is taken from the train + set. + """ + if valid_data is None: + total_valid_size = sum(FTFile(td).size for td in test_data) + if maxsize is not None: + total_valid_size = min(total_valid_size, maxsize) + valid_size = total_valid_size/len(train_data) + self._train = FTData(train_data, train_lbl, size=-valid_size, + inscale=inscale, outscale=outscale, + indtype=indtype, outdtype=outdtype, + maxsize=maxsize) + self._valid = FTData(train_data, train_lbl, skip=-valid_size, + inscale=inscale, outscale=outscale, + indtype=indtype, outdtype=outdtype, + maxsize=maxsize) + else: + self._train = FTData(train_data, train_lbl, maxsize=maxsize, + inscale=inscale, outscale=outscale, + indtype=indtype, outdtype=outdtype) + self._valid = FTData(valid_data, valid_lbl, maxsize=maxsize, + inscale=inscale, outscale=outscale, + indtype=indtype, outdtype=outdtype) + self._test = FTData(test_data, test_lbl, maxsize=maxsize, + inscale=inscale, outscale=outscale, + indtype=indtype, outdtype=outdtype) + + def _return_it(self, batchsize, bufsize, ftdata): + return izip(DataIterator(ftdata.open_inputs(), batchsize, bufsize), + DataIterator(ftdata.open_outputs(), batchsize, bufsize)) diff -r 6f606b359df3 -r a9af079892ce datasets/gzpklfile.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datasets/gzpklfile.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,40 @@ +import gzip +try: + import cPickle as pickle +except ImportError: + import pickle + +from dataset import DataSet +from dsetiter import DataIterator +from itertools import izip + +class ArrayFile(object): + def __init__(self, ary): + self.ary = ary + self.pos = 0 + + def read(self, num): + res = self.ary[self.pos:self.pos+num] + self.pos += num + return res + +class GzpklDataSet(DataSet): + def __init__(self, fname, maxsize): + self._fname = fname + self.maxsize = maxsize + self._train = 0 + self._valid = 1 + self._test = 2 + + def _load(self): + f = gzip.open(self._fname, 'rb') + try: + self.datas = pickle.load(f) + finally: + f.close() + + def _return_it(self, batchsz, bufsz, id): + if not hasattr(self, 'datas'): + self._load() + return izip(DataIterator([ArrayFile(self.datas[id][0][:self.maxsize])], batchsz, bufsz), + 
DataIterator([ArrayFile(self.datas[id][1][:self.maxsize])], batchsz, bufsz)) diff -r 6f606b359df3 -r a9af079892ce deep/__init__.py diff -r 6f606b359df3 -r a9af079892ce deep/autoencoder/DA_training.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/autoencoder/DA_training.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,606 @@ +""" + This tutorial introduces stacked denoising auto-encoders (SdA) using Theano. + + Denoising autoencoders are the building blocks for SDAE. + They are based on auto-encoders as the ones used in Bengio et al. 2007. + An autoencoder takes an input x and first maps it to a hidden representation + y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting + latent representation y is then mapped back to a "reconstructed" vector + z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight + matrix W' can optionally be constrained such that W' = W^T, in which case + the autoencoder is said to have tied weights. The network is trained such + that to minimize the reconstruction error (the error between x and z). + + For the denosing autoencoder, during training, first x is corrupted into + \tilde{x}, where \tilde{x} is a partially destroyed version of x by means + of a stochastic mapping. Afterwards y is computed as before (using + \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction + error is now measured between z and the uncorrupted input x, which is + computed as the cross-entropy : + - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)] + + For X iteration of the main program loop it takes *** minutes on an + Intel Core i7 and *** minutes on GPU (NVIDIA GTX 285 graphics processor). + + + References : + - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and + Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103, + 2008 + - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise + Training of Deep Networks, Advances in Neural Information Processing + Systems 19, 2007 + +""" + +import numpy +import theano +import time +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +import gzip +import cPickle + +from pylearn.io import filetensor as ft + +class dA(): + """Denoising Auto-Encoder class (dA) + + A denoising autoencoders tries to reconstruct the input from a corrupted + version of it by projecting it first in a latent space and reprojecting + it afterwards back in the input space. Please refer to Vincent et al.,2008 + for more details. If x is the input then equation (1) computes a partially + destroyed version of x by means of a stochastic mapping q_D. Equation (2) + computes the projection of the input into the latent space. Equation (3) + computes the reconstruction of the input, while equation (4) computes the + reconstruction error. + + .. math:: + + \tilde{x} ~ q_D(\tilde{x}|x) (1) + + y = s(W \tilde{x} + b) (2) + + z = s(W' y + b') (3) + + L(x,z) = -sum_{k=1}^d [x_k \log z_k + (1-x_k) \log( 1-z_k)] (4) + + """ + + def __init__(self, n_visible= 784, n_hidden= 500, complexity = 0.1, input= None): + """ + Initialize the DAE class by specifying the number of visible units (the + dimension d of the input ), the number of hidden units ( the dimension + d' of the latent or hidden space ) and by giving a symbolic variable + for the input. Such a symbolic variable is useful when the input is + the result of some computations. For example when dealing with SDAEs, + the dA on layer 2 gets as input the output of the DAE on layer 1. 
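A minimal stacking sketch for what is described here, using the dA class defined below (the 32*32 / 500 / 400 layer sizes are arbitrary illustration values):

import theano.tensor as T

x = T.dmatrix('x')
da1 = dA(n_visible=32*32, n_hidden=500, complexity=0.1, input=x)
# The second dA consumes the *uncorrupted* hidden code of the first one.
da2 = dA(n_visible=500, n_hidden=400, complexity=0.1, input=da1.hidden_values)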
+ This output can be written as a function of the input to the entire + model, and as such can be computed by theano whenever needed. + + :param n_visible: number of visible units + + :param n_hidden: number of hidden units + + :param input: a symbolic description of the input or None + + """ + self.n_visible = n_visible + self.n_hidden = n_hidden + + # create a Theano random generator that gives symbolic random values + theano_rng = RandomStreams() + # create a numpy random generator + numpy_rng = numpy.random.RandomState() + + # print the parameter of the DA + if True : + print 'input size = %d' %n_visible + print 'hidden size = %d' %n_hidden + print 'complexity = %2.2f' %complexity + + # initial values for weights and biases + # note : W' was written as `W_prime` and b' as `b_prime` + + # W is initialized with `initial_W` which is uniformely sampled + # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + initial_W = numpy.asarray( numpy.random.uniform( \ + low = -numpy.sqrt(6./(n_visible+n_hidden)), \ + high = numpy.sqrt(6./(n_visible+n_hidden)), \ + size = (n_visible, n_hidden)), dtype = theano.config.floatX) + initial_b = numpy.zeros(n_hidden) + + # W' is initialized with `initial_W_prime` which is uniformely sampled + # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + initial_b_prime= numpy.zeros(n_visible) + + + # theano shared variables for weights and biases + self.W = theano.shared(value = initial_W, name = "W") + self.b = theano.shared(value = initial_b, name = "b") + # tied weights, therefore W_prime is W transpose + self.W_prime = self.W.T + self.b_prime = theano.shared(value = initial_b_prime, name = "b'") + + # if no input is given, generate a variable representing the input + if input == None : + # we use a matrix because we expect a minibatch of several examples, + # each example being a row + x = T.dmatrix(name = 'input') + else: + x = input + # Equation (1) + # note : first argument of theano.rng.binomial is the shape(size) of + # random numbers that it should produce + # second argument is the number of trials + # third argument is the probability of success of any trial + # + # this will produce an array of 0s and 1s where 1 has a + # probability of 0.9 and 0 of 0.1 + + tilde_x = theano_rng.binomial( x.shape, 1, 1-complexity) * x + # Equation (2) + # note : y is stored as an attribute of the class so that it can be + # used later when stacking dAs. + self.y = T.nnet.sigmoid(T.dot(tilde_x, self.W ) + self.b) + # Equation (3) + z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime) + # Equation (4) + self.L = - T.sum( x*T.log(z) + (1-x)*T.log(1-z), axis=1 ) + # note : L is now a vector, where each element is the cross-entropy cost + # of the reconstruction of the corresponding example of the + # minibatch. We need to compute the average of all these to get + # the cost of the minibatch + self.cost = T.mean(self.L) + # note : y is computed from the corrupted `tilde_x`. 
Later on, + # we will need the hidden layer obtained from the uncorrupted + # input when for example we will pass this as input to the layer + # above + self.hidden_values = T.nnet.sigmoid( T.dot(x, self.W) + self.b) + + + +def sgd_optimization_nist( learning_rate=0.01, \ + n_iter = 300, n_code_layer = 400, \ + complexity = 0.1): + """ + Demonstrate stochastic gradient descent optimization for a denoising autoencoder + + This is demonstrated on MNIST. + + :param learning_rate: learning rate used (factor for the stochastic + gradient + + :param pretraining_epochs: number of epoch to do pretraining + + :param pretrain_lr: learning rate to be used during pre-training + + :param n_iter: maximal number of iterations ot run the optimizer + + """ + #open file to save the validation and test curve + filename = 'lr_' + str(learning_rate) + 'ni_' + str(n_iter) + 'nc_' + str(n_code_layer) + \ + 'c_' + str(complexity) + '.txt' + + result_file = open(filename, 'w') + + + + data_path = '/data/lisa/data/nist/by_class/' + f = open(data_path+'all/all_train_data.ft') + g = open(data_path+'all/all_train_labels.ft') + h = open(data_path+'all/all_test_data.ft') + i = open(data_path+'all/all_test_labels.ft') + + train_set_x = ft.read(f) + train_set_y = ft.read(g) + test_set_x = ft.read(h) + test_set_y = ft.read(i) + + f.close() + g.close() + i.close() + h.close() + + # make minibatches of size 20 + batch_size = 20 # sized of the minibatch + + #create a validation set the same size as the test size + #use the end of the training array for this purpose + #discard the last remaining so we get a %batch_size number + test_size=len(test_set_y) + test_size = int(test_size/batch_size) + test_size*=batch_size + train_size = len(train_set_x) + train_size = int(train_size/batch_size) + train_size*=batch_size + validation_size =test_size + offset = train_size-test_size + if True: + print 'train size = %d' %train_size + print 'test size = %d' %test_size + print 'valid size = %d' %validation_size + print 'offset = %d' %offset + + + #train_set = (train_set_x,train_set_y) + train_batches = [] + for i in xrange(0, train_size-test_size, batch_size): + train_batches = train_batches + \ + [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])] + + test_batches = [] + for i in xrange(0, test_size, batch_size): + test_batches = test_batches + \ + [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])] + + valid_batches = [] + for i in xrange(0, test_size, batch_size): + valid_batches = valid_batches + \ + [(train_set_x[offset+i:offset+i+batch_size], \ + train_set_y[offset+i:offset+i+batch_size])] + + + ishape = (32,32) # this is the size of NIST images + + # allocate symbolic variables for the data + x = T.fmatrix() # the data is presented as rasterized images + y = T.lvector() # the labels are presented as 1D vector of + # [long int] labels + + # construct the denoising autoencoder class + n_ins = 32*32 + encoder = dA(n_ins, n_code_layer, complexity, input = x.reshape((batch_size,n_ins))) + + # Train autoencoder + + # compute gradients of the layer parameters + gW = T.grad(encoder.cost, encoder.W) + gb = T.grad(encoder.cost, encoder.b) + gb_prime = T.grad(encoder.cost, encoder.b_prime) + # compute the updated value of the parameters after one step + updated_W = encoder.W - gW * learning_rate + updated_b = encoder.b - gb * learning_rate + updated_b_prime = encoder.b_prime - gb_prime * learning_rate + + # defining the function that evaluate the symbolic description of + # one update step + train_model = theano.function([x], 
encoder.cost, updates=\ + { encoder.W : updated_W, \ + encoder.b : updated_b, \ + encoder.b_prime : updated_b_prime } ) + + + + + # compiling a theano function that computes the mistakes that are made + # by the model on a minibatch + test_model = theano.function([x], encoder.cost) + + normalize = numpy.asarray(255, dtype=theano.config.floatX) + + + n_minibatches = len(train_batches) + + # early-stopping parameters + patience = 10000000 / batch_size # look as this many examples regardless + patience_increase = 2 # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = n_minibatches # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + + best_params = None + best_validation_loss = float('inf') + best_iter = 0 + test_score = 0. + start_time = time.clock() + # have a maximum of `n_iter` iterations through the entire dataset + for iter in xrange(n_iter* n_minibatches): + + # get epoch and minibatch index + epoch = iter / n_minibatches + minibatch_index = iter % n_minibatches + + # get the minibatches corresponding to `iter` modulo + # `len(train_batches)` + x,y = train_batches[ minibatch_index ] + ''' + if iter == 0: + b = numpy.asarray(255, dtype=theano.config.floatX) + x = x / b + print x + print y + print x.__class__ + print x.shape + print x.dtype.name + print y.dtype.name + print x.min(), x.max() + ''' + + cost_ij = train_model(x/normalize) + + if (iter+1) % validation_frequency == 0: + # compute zero-one loss on validation set + this_validation_loss = 0. + for x,y in valid_batches: + # sum up the errors for each minibatch + this_validation_loss += test_model(x/normalize) + # get the average by dividing with the number of minibatches + this_validation_loss /= len(valid_batches) + + print('epoch %i, minibatch %i/%i, validation error %f ' % \ + (epoch, minibatch_index+1, n_minibatches, \ + this_validation_loss)) + + # save value in file + result_file.write(str(epoch) + ' ' + str(this_validation_loss)+ '\n') + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, iter * patience_increase) + + best_validation_loss = this_validation_loss + best_iter = iter + # test it on the test set + + test_score = 0. + for x,y in test_batches: + test_score += test_model(x/normalize) + test_score /= len(test_batches) + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f ') % + (epoch, minibatch_index+1, n_minibatches, + test_score)) + + if patience <= iter : + print('iter (%i) is superior than patience(%i). break', (iter, patience)) + break + + + + end_time = time.clock() + print(('Optimization complete with best validation score of %f ,' + 'with test performance %f ') % + (best_validation_loss, test_score)) + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + + + result_file.close() + + return (best_validation_loss, test_score, (end_time-start_time)/60, best_iter) + +def sgd_optimization_mnist( learning_rate=0.01, \ + n_iter = 1, n_code_layer = 400, \ + complexity = 0.1): + """ + Demonstrate stochastic gradient descent optimization for a denoising autoencoder + + This is demonstrated on MNIST. 
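The early-stopping bookkeeping used by the training loops in this file, reduced to a self-contained sketch (the validation losses are made up; in the real loops `iter` counts minibatches and validation only happens every `validation_frequency` steps):

patience, patience_increase, improvement_threshold = 4, 2, 0.995
best = float('inf')
for it, loss in enumerate([1.00, 0.90, 0.899, 0.50]):
    if loss < best:
        if loss < best * improvement_threshold:
            # significant improvement: wait at least it*patience_increase
            # iterations before giving up
            patience = max(patience, it * patience_increase)
        best = loss
    if patience <= it:
        break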
+ + :param learning_rate: learning rate used (factor for the stochastic + gradient + + :param pretraining_epochs: number of epoch to do pretraining + + :param pretrain_lr: learning rate to be used during pre-training + + :param n_iter: maximal number of iterations ot run the optimizer + + """ + #open file to save the validation and test curve + filename = 'lr_' + str(learning_rate) + 'ni_' + str(n_iter) + 'nc_' + str(n_code_layer) + \ + 'c_' + str(complexity) + '.txt' + + result_file = open(filename, 'w') + + # Load the dataset + f = gzip.open('/u/lisa/HTML/deep/data/mnist/mnist.pkl.gz','rb') + train_set, valid_set, test_set = cPickle.load(f) + f.close() + + # make minibatches of size 20 + batch_size = 20 # sized of the minibatch + + # Dealing with the training set + # get the list of training images (x) and their labels (y) + (train_set_x, train_set_y) = train_set + # initialize the list of training minibatches with empty list + train_batches = [] + for i in xrange(0, len(train_set_x), batch_size): + # add to the list of minibatches the minibatch starting at + # position i, ending at position i+batch_size + # a minibatch is a pair ; the first element of the pair is a list + # of datapoints, the second element is the list of corresponding + # labels + train_batches = train_batches + \ + [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])] + + # Dealing with the validation set + (valid_set_x, valid_set_y) = valid_set + # initialize the list of validation minibatches + valid_batches = [] + for i in xrange(0, len(valid_set_x), batch_size): + valid_batches = valid_batches + \ + [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])] + + # Dealing with the testing set + (test_set_x, test_set_y) = test_set + # initialize the list of testing minibatches + test_batches = [] + for i in xrange(0, len(test_set_x), batch_size): + test_batches = test_batches + \ + [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])] + + + ishape = (28,28) # this is the size of MNIST images + + # allocate symbolic variables for the data + x = T.fmatrix() # the data is presented as rasterized images + y = T.lvector() # the labels are presented as 1D vector of + # [long int] labels + + # construct the denoising autoencoder class + n_ins = 28*28 + encoder = dA(n_ins, n_code_layer, complexity, input = x.reshape((batch_size,n_ins))) + + # Train autoencoder + + # compute gradients of the layer parameters + gW = T.grad(encoder.cost, encoder.W) + gb = T.grad(encoder.cost, encoder.b) + gb_prime = T.grad(encoder.cost, encoder.b_prime) + # compute the updated value of the parameters after one step + updated_W = encoder.W - gW * learning_rate + updated_b = encoder.b - gb * learning_rate + updated_b_prime = encoder.b_prime - gb_prime * learning_rate + + # defining the function that evaluate the symbolic description of + # one update step + train_model = theano.function([x], encoder.cost, updates=\ + { encoder.W : updated_W, \ + encoder.b : updated_b, \ + encoder.b_prime : updated_b_prime } ) + + + + + # compiling a theano function that computes the mistakes that are made + # by the model on a minibatch + test_model = theano.function([x], encoder.cost) + + + + + n_minibatches = len(train_batches) + + # early-stopping parameters + patience = 10000# look as this many examples regardless + patience_increase = 2 # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = n_minibatches # go through this 
many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + + best_params = None + best_validation_loss = float('inf') + best_iter = 0 + test_score = 0. + start_time = time.clock() + # have a maximum of `n_iter` iterations through the entire dataset + for iter in xrange(n_iter* n_minibatches): + + # get epoch and minibatch index + epoch = iter / n_minibatches + minibatch_index = iter % n_minibatches + + # get the minibatches corresponding to `iter` modulo + # `len(train_batches)` + x,y = train_batches[ minibatch_index ] + cost_ij = train_model(x) + + if (iter+1) % validation_frequency == 0: + # compute zero-one loss on validation set + this_validation_loss = 0. + for x,y in valid_batches: + # sum up the errors for each minibatch + this_validation_loss += test_model(x) + # get the average by dividing with the number of minibatches + this_validation_loss /= len(valid_batches) + + print('epoch %i, minibatch %i/%i, validation error %f ' % \ + (epoch, minibatch_index+1, n_minibatches, \ + this_validation_loss)) + + # save value in file + result_file.write(str(epoch) + ' ' + str(this_validation_loss)+ '\n') + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, iter * patience_increase) + + best_validation_loss = this_validation_loss + best_iter = iter + # test it on the test set + + test_score = 0. + for x,y in test_batches: + test_score += test_model(x) + test_score /= len(test_batches) + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f ') % + (epoch, minibatch_index+1, n_minibatches, + test_score)) + + if patience <= iter : + print('iter (%i) is superior than patience(%i). 
break', iter, patience) + break + + + end_time = time.clock() + print(('Optimization complete with best validation score of %f ,' + 'with test performance %f ') % + (best_validation_loss, test_score)) + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + + + result_file.close() + + return (best_validation_loss, test_score, (end_time-start_time)/60, best_iter) + + +def experiment(state,channel): + + (best_validation_loss, test_score, minutes_trained, iter) = \ + sgd_optimization_mnist(state.learning_rate, state.n_iter, state.n_code_layer, + state.complexity) + + state.best_validation_loss = best_validation_loss + state.test_score = test_score + state.minutes_trained = minutes_trained + state.iter = iter + + return channel.COMPLETE + +def experiment_nist(state,channel): + + (best_validation_loss, test_score, minutes_trained, iter) = \ + sgd_optimization_nist(state.learning_rate, state.n_iter, state.n_code_layer, + state.complexity) + + state.best_validation_loss = best_validation_loss + state.test_score = test_score + state.minutes_trained = minutes_trained + state.iter = iter + + return channel.COMPLETE + + +if __name__ == '__main__': + + sgd_optimization_nist() + + diff -r 6f606b359df3 -r a9af079892ce deep/autoencoder/__init__.py diff -r 6f606b359df3 -r a9af079892ce deep/convolutional_dae/__init__.py diff -r 6f606b359df3 -r a9af079892ce deep/convolutional_dae/run_exp.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/convolutional_dae/run_exp.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,85 @@ +from ift6266.deep.convolutional_dae.scdae import * + +class dumb(object): + def save(self): + pass + +def go(state, channel): + from ift6266 import datasets + from ift6266.deep.convolutional_dae.sgd_opt import sgd_opt + import pylearn, theano, ift6266 + import pylearn.version + import sys + + # params: bsize, pretrain_lr, train_lr, nfilts1, nfilts2, nftils3, nfilts4 + # pretrain_rounds, noise, mlp_sz + + pylearn.version.record_versions(state, [theano, ift6266, pylearn]) + # TODO: maybe record pynnet version? + channel.save() + + dset = datasets.nist_digits() + + nfilts = [] + if state.nfilts1 != 0: + nfilts.append(state.nfilts1) + if state.nfilts2 != 0: + nfilts.append(state.nfilts2) + if state.nfilts3 != 0: + nfilts.append(state.nfilts3) + if state.nfilts4 != 0: + nfilts.append(state.nfilts4) + + fsizes = [(5,5)]*len(nfilts) + subs = [(2,2)]*len(nfilts) + noise = [state.noise]*len(nfilts) + + pretrain_funcs, trainf, evalf, net = build_funcs( + img_size=(32, 32), + batch_size=state.bsize, + filter_sizes=fsizes, + num_filters=nfilts, + subs=subs, + noise=noise, + mlp_sizes=[state.mlp_sz], + out_size=62, + dtype=numpy.float32, + pretrain_lr=state.pretrain_lr, + train_lr=state.train_lr) + + t_it = repeat_itf(dset.train, state.bsize) + pretrain_fs, train, valid, test = massage_funcs( + t_it, t_it, dset, state.bsize, + pretrain_funcs, trainf,evalf) + + series = create_series() + + print "pretraining ..." + sys.stdout.flush() + do_pretrain(pretrain_fs, state.pretrain_rounds, series['recons_error']) + + print "training ..." 
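`t_it` above is an endless minibatch stream; a self-contained sketch of the repeat_itf idea from scdae.py (a toy iterator factory stands in for dset.train):

def repeat_forever(itf, *args, **kwargs):
    # restart the underlying iterator whenever it runs out
    while True:
        for e in itf(*args, **kwargs):
            yield e

stream = repeat_forever(lambda n: range(n), 3)
print [stream.next() for _ in range(7)]   # -> [0, 1, 2, 0, 1, 2, 0]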
+ sys.stdout.flush() + best_valid, test_score = sgd_opt(train, valid, test, + training_epochs=100000, patience=10000, + patience_increase=2., + improvement_threshold=0.995, + validation_frequency=1000, + series=series, net=net) + state.best_valid = best_valid + state.test_score = test_score + channel.save() + return channel.COMPLETE + +if __name__ == '__main__': + st = dumb() + st.bsize = 100 + st.pretrain_lr = 0.01 + st.train_lr = 0.1 + st.nfilts1 = 4 + st.nfilts2 = 4 + st.nfilts3 = 0 + st.pretrain_rounds = 500 + st.noise=0.2 + st.mlp_sz = 500 + go(st, dumb()) diff -r 6f606b359df3 -r a9af079892ce deep/convolutional_dae/scdae.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/convolutional_dae/scdae.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,219 @@ +from pynnet import * + +import numpy +import theano +import theano.tensor as T + +from itertools import izip +from ift6266.utils.seriestables import * + +class cdae(LayerStack): + def __init__(self, filter_size, num_filt, num_in, subsampling, corruption, + dtype): + LayerStack.__init__(self, [ConvAutoencoder(filter_size=filter_size, + num_filt=num_filt, + num_in=num_in, + noisyness=corruption, + dtype=dtype), + MaxPoolLayer(subsampling)]) + + def build(self, input, input_shape=None): + LayerStack.build(self, input, input_shape) + self.cost = self.layers[0].cost + self.pre_params = self.layers[0].pre_params + +def scdae(filter_sizes, num_filts, subsamplings, corruptions, dtype): + layers = [] + old_nfilt = 1 + for fsize, nfilt, subs, corr in izip(filter_sizes, num_filts, + subsamplings, corruptions): + layers.append(cdae(fsize, nfilt, old_nfilt, subs, corr, dtype)) + old_nfilt = nfilt + return LayerStack(layers) + +def mlp(layer_sizes, dtype): + layers = [] + old_size = layer_sizes[0] + for size in layer_sizes[1:]: + layers.append(SimpleLayer(old_size, size, activation=nlins.tanh, + dtype=dtype)) + old_size = size + return LayerStack(layers) + +def scdae_net(in_size, num_in, filter_sizes, num_filts, subsamplings, + corruptions, layer_sizes, out_size, dtype): + rl1 = ReshapeLayer((None,)+in_size) + ls = scdae(num_in, filter_sizes, num_filts, subsamplings, + corruptions, dtype) + x = T.tensor4() + ls.build(x, input_shape=(1,)+in_size) + outs = numpy.prod(ls.output_shape) + rl2 = ReshapeLayer((None, outs)) + layer_sizes = [outs]+layer_sizes + ls2 = mlp(layer_sizes, dtype) + lrl = SimpleLayer(layer_sizes[-1], out_size, activation=nlins.softmax) + return NNet([rl1, ls, rl2, ls2, lrl], error=errors.nll) + +def build_funcs(batch_size, img_size, filter_sizes, num_filters, subs, + noise, mlp_sizes, out_size, dtype, pretrain_lr, train_lr): + + n = scdae_net((1,)+img_size, batch_size, filter_sizes, num_filters, subs, + noise, mlp_sizes, out_size, dtype) + + n.save('start.net') + + x = T.fmatrix('x') + y = T.ivector('y') + + def pretrainfunc(net, alpha): + up = trainers.get_updates(net.pre_params, net.cost, alpha) + return theano.function([x], net.cost, updates=up) + + def trainfunc(net, alpha): + up = trainers.get_updates(net.params, net.cost, alpha) + return theano.function([x, y], net.cost, updates=up) + + n.build(x, y, input_shape=(bsize, 1)+img_size) + pretrain_funcs_opt = [pretrainfunc(l, pretrain_lr) for l in n.layers[1].layers] + trainf_opt = trainfunc(n, train_lr) + evalf_opt = theano.function([x, y], errors.class_error(n.output, y)) + + n.build(x, y) + pretrain_funcs_reg = [pretrainfunc(l, 0.01) for l in n.layers[1].layers] + trainf_reg = trainfunc(n, 0.1) + evalf_reg = theano.function([x, y], errors.class_error(n.output, y)) + + def 
select_f(f1, f2, bsize): + def f(x): + if x.shape[0] == bsize: + return f1(x) + else: + return f2(x) + return f + + pretrain_funcs = [select_f(p_opt, p_reg, batch_size) for p_opt, p_reg in zip(pretrain_funcs_opt, pretrain_funcs_reg)] + + def select_f2(f1, f2, bsize): + def f(x, y): + if x.shape[0] == bsize: + return f1(x, y) + else: + return f2(x, y) + return f + + trainf = select_f2(trainf_opt, trainf_reg, batch_size) + evalf = select_f2(evalf_opt, evalf_reg, batch_size) + return pretrain_funcs, trainf, evalf, n + +def do_pretrain(pretrain_funcs, pretrain_epochs, serie): + for layer, f in enumerate(pretrain_funcs): + for epoch in xrange(pretrain_epochs): + serie.append((layer, epoch), f()) + +def massage_funcs(pretrain_it, train_it, dset, batch_size, pretrain_funcs, + trainf, evalf): + def pretrain_f(f): + def res(): + for x, y in pretrain_it: + yield f(x) + it = res() + return lambda: it.next() + + pretrain_fs = map(pretrain_f, pretrain_funcs) + + def train_f(f): + def dset_it(): + for x, y in train_it: + yield f(x, y) + it = dset_it() + return lambda: it.next() + + train = train_f(trainf) + + def eval_f(f, dsetf): + def res(): + c = 0 + i = 0 + for x, y in dsetf(batch_size): + i += x.shape[0] + c += f(x, y)*x.shape[0] + return c/i + return res + + test = eval_f(evalf, dset.test) + valid = eval_f(evalf, dset.valid) + + return pretrain_fs, train, valid, test + +def repeat_itf(itf, *args, **kwargs): + while True: + for e in itf(*args, **kwargs): + yield e + +def create_series(): + import tables + + series = {} + h5f = tables.openFile('series.h5', 'w') + + series['recons_error'] = AccumulatorSeriesWrapper( + base_series=ErrorSeries(error_name='reconstruction_error', + table_name='reconstruction_error', + hdf5_file=h5f, + index_names=('layer', 'epoch'), + title="Reconstruction error (mse)"), + reduce_every=100) + + series['train_error'] = AccumulatorSeriesWrapper( + base_series=ErrorSeries(error_name='training_error', + table_name='training_error', + hdf5_file=h5f, + index_names=('iter',), + title='Training error (nll)'), + reduce_every=100) + + series['valid_error'] = ErrorSeries(error_name='valid_error', + table_name='valid_error', + hdf5_file=h5f, + index_names=('iter',), + title='Validation error (class)') + + series['test_error'] = ErrorSeries(error_name='test_error', + table_name='test_error', + hdf5_file=h5f, + index_names=('iter',), + title='Test error (class)') + + return series + +if __name__ == '__main__': + from ift6266 import datasets + from sgd_opt import sgd_opt + import sys, time + + batch_size = 100 + dset = datasets.mnist() + + pretrain_funcs, trainf, evalf, net = build_funcs( + img_size = (28, 28), + batch_size=batch_size, filter_sizes=[(5,5), (3,3)], + num_filters=[4, 4], subs=[(2,2), (2,2)], noise=[0.2, 0.2], + mlp_sizes=[500], out_size=10, dtype=numpy.float32, + pretrain_lr=0.01, train_lr=0.1) + + t_it = repeat_itf(dset.train, batch_size) + pretrain_fs, train, valid, test = massage_funcs( + t_it, t_it, dset, batch_size, + pretrain_funcs, trainf, evalf) + + print "pretraining ...", + sys.stdout.flush() + start = time.time() + do_pretrain(pretrain_fs, 2500, DummySeries()) + end = time.time() + print "done (in", end-start, "s)" + + sgd_opt(train, valid, test, training_epochs=10000, patience=1000, + patience_increase=2., improvement_threshold=0.995, + validation_frequency=250) + diff -r 6f606b359df3 -r a9af079892ce deep/convolutional_dae/sgd_opt.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/convolutional_dae/sgd_opt.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 
+1,66 @@ +import time +import sys, os + +from ift6266.utils.seriestables import * + +default_series = { + 'train_error' : DummySeries(), + 'valid_error' : DummySeries(), + 'test_error' : DummySeries() + } + +def sgd_opt(train, valid, test, training_epochs=10000, patience=10000, + patience_increase=2., improvement_threshold=0.995, net=None, + validation_frequency=None, series=default_series): + + if validation_frequency is None: + validation_frequency = patience/2 + + start_time = time.clock() + + best_params = None + best_validation_loss = float('inf') + test_score = 0. + + start_time = time.clock() + + for epoch in xrange(1, training_epochs+1): + series['train_error'].append((epoch,), train()) + + if epoch % validation_frequency == 0: + this_validation_loss = valid() + series['valid_error'].append((epoch,), this_validation_loss*100.) + print('epoch %i, validation error %f %%' % \ + (epoch, this_validation_loss*100.)) + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, epoch * patience_increase) + + # save best validation score and epoch number + best_validation_loss = this_validation_loss + best_epoch = epoch + + # test it on the test set + test_score = test() + series['test_error'].append((epoch,), test_score*100.) + print((' epoch %i, test error of best model %f %%') % + (epoch, test_score*100.)) + if net is not None: + net.save('best.net.new') + os.rename('best.net.new', 'best.net') + + if patience <= epoch: + break + + end_time = time.clock() + print(('Optimization complete with best validation score of %f %%,' + 'with test performance %f %%') % + (best_validation_loss * 100., test_score*100.)) + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + + return best_validation_loss, test_score diff -r 6f606b359df3 -r a9af079892ce deep/convolutional_dae/stacked_convolutional_dae.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/convolutional_dae/stacked_convolutional_dae.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,323 @@ +import numpy +import theano +import time +import sys +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams +#import theano.sandbox.softsign + +from theano.tensor.signal import downsample +from theano.tensor.nnet import conv + +from ift6266 import datasets +from ift6266.baseline.log_reg.log_reg import LogisticRegression + +batch_size = 100 + +class SigmoidalLayer(object): + def __init__(self, rng, input, n_in, n_out): + + self.input = input + + W_values = numpy.asarray( rng.uniform( \ + low = -numpy.sqrt(6./(n_in+n_out)), \ + high = numpy.sqrt(6./(n_in+n_out)), \ + size = (n_in, n_out)), dtype = theano.config.floatX) + self.W = theano.shared(value = W_values) + + b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + self.output = T.tanh(T.dot(input, self.W) + self.b) + self.params = [self.W, self.b] + +class dA_conv(object): + + def __init__(self, input, filter_shape, corruption_level = 0.1, + shared_W = None, shared_b = None, image_shape = None, + poolsize = (2,2)): + + theano_rng = RandomStreams() + + fan_in = numpy.prod(filter_shape[1:]) + fan_out = filter_shape[0] * numpy.prod(filter_shape[2:]) + + center = theano.shared(value = 1, name="center") + scale = theano.shared(value = 2, name="scale") + + if shared_W != None and shared_b != None : + 
self.W = shared_W + self.b = shared_b + else: + initial_W = numpy.asarray( numpy.random.uniform( + low = -numpy.sqrt(6./(fan_in+fan_out)), + high = numpy.sqrt(6./(fan_in+fan_out)), + size = filter_shape), dtype = theano.config.floatX) + initial_b = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) + self.W = theano.shared(value = initial_W, name = "W") + self.b = theano.shared(value = initial_b, name = "b") + + + initial_b_prime= numpy.zeros((filter_shape[1],),dtype=theano.config.floatX) + + self.b_prime = theano.shared(value = initial_b_prime, name = "b_prime") + + self.x = input + + self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level,dtype=theano.config.floatX) * self.x + + conv1_out = conv.conv2d(self.tilde_x, self.W, filter_shape=filter_shape, + image_shape=image_shape, border_mode='valid') + + self.y = T.tanh(conv1_out + self.b.dimshuffle('x', 0, 'x', 'x')) + + da_filter_shape = [ filter_shape[1], filter_shape[0], + filter_shape[2], filter_shape[3] ] + initial_W_prime = numpy.asarray( numpy.random.uniform( \ + low = -numpy.sqrt(6./(fan_in+fan_out)), \ + high = numpy.sqrt(6./(fan_in+fan_out)), \ + size = da_filter_shape), dtype = theano.config.floatX) + self.W_prime = theano.shared(value = initial_W_prime, name = "W_prime") + + conv2_out = conv.conv2d(self.y, self.W_prime, + filter_shape = da_filter_shape, + border_mode='full') + + self.z = (T.tanh(conv2_out + self.b_prime.dimshuffle('x', 0, 'x', 'x'))+center) / scale + + scaled_x = (self.x + center) / scale + + self.L = - T.sum( scaled_x*T.log(self.z) + (1-scaled_x)*T.log(1-self.z), axis=1 ) + + self.cost = T.mean(self.L) + + self.params = [ self.W, self.b, self.b_prime ] + +class LeNetConvPoolLayer(object): + + def __init__(self, rng, input, filter_shape, image_shape=None, poolsize=(2,2)): + self.input = input + + W_values = numpy.zeros(filter_shape, dtype=theano.config.floatX) + self.W = theano.shared(value=W_values) + + b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) + self.b = theano.shared(value=b_values) + + conv_out = conv.conv2d(input, self.W, + filter_shape=filter_shape, image_shape=image_shape) + + + fan_in = numpy.prod(filter_shape[1:]) + fan_out = filter_shape[0] * numpy.prod(filter_shape[2:]) / numpy.prod(poolsize) + + W_bound = numpy.sqrt(6./(fan_in + fan_out)) + self.W.value = numpy.asarray( + rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), + dtype = theano.config.floatX) + + + pooled_out = downsample.max_pool2D(conv_out, poolsize, ignore_border=True) + + self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) + self.params = [self.W, self.b] + + +class SdA(): + def __init__(self, input, n_ins_mlp, conv_hidden_layers_sizes, + mlp_hidden_layers_sizes, corruption_levels, rng, n_out, + pretrain_lr, finetune_lr, img_shape): + + self.layers = [] + self.pretrain_functions = [] + self.params = [] + self.conv_n_layers = len(conv_hidden_layers_sizes) + self.mlp_n_layers = len(mlp_hidden_layers_sizes) + + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + + for i in xrange( self.conv_n_layers ): + filter_shape=conv_hidden_layers_sizes[i][0] + image_shape=conv_hidden_layers_sizes[i][1] + max_poolsize=conv_hidden_layers_sizes[i][2] + + if i == 0 : + layer_input=self.x.reshape((self.x.shape[0], 1) + img_shape) + else: + layer_input=self.layers[-1].output + + layer = LeNetConvPoolLayer(rng, input=layer_input, + image_shape=image_shape, + filter_shape=filter_shape, + 
poolsize=max_poolsize) + print 'Convolutional layer', str(i+1), 'created' + + self.layers += [layer] + self.params += layer.params + + da_layer = dA_conv(corruption_level = corruption_levels[0], + input = layer_input, + shared_W = layer.W, shared_b = layer.b, + filter_shape = filter_shape, + image_shape = image_shape ) + + gparams = T.grad(da_layer.cost, da_layer.params) + + updates = {} + for param, gparam in zip(da_layer.params, gparams): + updates[param] = param - gparam * pretrain_lr + + update_fn = theano.function([self.x], da_layer.cost, updates = updates) + + self.pretrain_functions += [update_fn] + + for i in xrange( self.mlp_n_layers ): + if i == 0 : + input_size = n_ins_mlp + else: + input_size = mlp_hidden_layers_sizes[i-1] + + if i == 0 : + if len( self.layers ) == 0 : + layer_input=self.x + else : + layer_input = self.layers[-1].output.flatten(2) + else: + layer_input = self.layers[-1].output + + layer = SigmoidalLayer(rng, layer_input, input_size, + mlp_hidden_layers_sizes[i] ) + + self.layers += [layer] + self.params += layer.params + + print 'MLP layer', str(i+1), 'created' + + self.logLayer = LogisticRegression(input=self.layers[-1].output, \ + n_in=mlp_hidden_layers_sizes[-1], n_out=n_out) + self.params += self.logLayer.params + + cost = self.logLayer.negative_log_likelihood(self.y) + + gparams = T.grad(cost, self.params) + + updates = {} + for param,gparam in zip(self.params, gparams): + updates[param] = param - gparam*finetune_lr + + self.finetune = theano.function([self.x, self.y], cost, updates = updates) + + self.errors = self.logLayer.errors(self.y) + +def sgd_optimization_mnist(learning_rate=0.1, pretraining_epochs = 1, + pretrain_lr = 0.1, training_epochs = 1000, + kernels = [[4,5,5], [4,3,3]], mlp_layers=[500], + corruption_levels = [0.2, 0.2, 0.2], + batch_size = batch_size, img_shape=(28, 28), + max_pool_layers = [[2,2], [2,2]], + dataset=datasets.mnist(5000)): + + # allocate symbolic variables for the data + index = T.lscalar() # index to a [mini]batch + x = T.matrix('x') # the data is presented as rasterized images + y = T.ivector('y') # the labels are presented as 1d vector of + # [int] labels + + layer0_input = x.reshape((x.shape[0],1)+img_shape) + + rng = numpy.random.RandomState(1234) + conv_layers=[] + init_layer = [[kernels[0][0],1,kernels[0][1],kernels[0][2]], + None, # do not specify the batch size since it can + # change for the last one and then theano will + # crash. 
+ max_pool_layers[0]] + conv_layers.append(init_layer) + + conv_n_out = (img_shape[0]-kernels[0][2]+1)/max_pool_layers[0][0] + + for i in range(1,len(kernels)): + layer = [[kernels[i][0],kernels[i-1][0],kernels[i][1],kernels[i][2]], + None, # same comment as for init_layer + max_pool_layers[i] ] + conv_layers.append(layer) + conv_n_out = (conv_n_out - kernels[i][2]+1)/max_pool_layers[i][0] + + network = SdA(input = layer0_input, n_ins_mlp = kernels[-1][0]*conv_n_out**2, + conv_hidden_layers_sizes = conv_layers, + mlp_hidden_layers_sizes = mlp_layers, + corruption_levels = corruption_levels, n_out = 62, + rng = rng , pretrain_lr = pretrain_lr, + finetune_lr = learning_rate, img_shape=img_shape) + + test_model = theano.function([network.x, network.y], network.errors) + + start_time = time.clock() + for i in xrange(len(network.layers)-len(mlp_layers)): + for epoch in xrange(pretraining_epochs): + for x, y in dataset.train(batch_size): + c = network.pretrain_functions[i](x) + print 'pre-training convolution layer %i, epoch %d, cost '%(i,epoch), c + + patience = 10000 # look as this many examples regardless + patience_increase = 2. # WAIT THIS MUCH LONGER WHEN A NEW BEST IS + # FOUND + improvement_threshold = 0.995 # a relative improvement of this much is + + validation_frequency = patience/2 + + best_params = None + best_validation_loss = float('inf') + test_score = 0. + start_time = time.clock() + + done_looping = False + epoch = 0 + iter = 0 + + while (epoch < training_epochs) and (not done_looping): + epoch = epoch + 1 + for x, y in dataset.train(batch_size): + + cost_ij = network.finetune(x, y) + iter += 1 + + if iter % validation_frequency == 0: + validation_losses = [test_model(xv, yv) for xv, yv in dataset.valid(batch_size)] + this_validation_loss = numpy.mean(validation_losses) + print('epoch %i, iter %i, validation error %f %%' % \ + (epoch, iter, this_validation_loss*100.)) + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, iter * patience_increase) + + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = iter + + # test it on the test set + test_losses = [test_model(xt, yt) for xt, yt in dataset.test(batch_size)] + test_score = numpy.mean(test_losses) + print((' epoch %i, iter %i, test error of best ' + 'model %f %%') % + (epoch, iter, test_score*100.)) + + if patience <= iter : + done_looping = True + break + + end_time = time.clock() + print(('Optimization complete with best validation score of %f %%,' + 'with test performance %f %%') % + (best_validation_loss * 100., test_score*100.)) + print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) + +if __name__ == '__main__': + sgd_optimization_mnist() + diff -r 6f606b359df3 -r a9af079892ce deep/stacked_dae/__init__.py diff -r 6f606b359df3 -r a9af079892ce deep/stacked_dae/config.py.example --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/config.py.example Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,119 @@ +# ---------------------------------------------------------------------------- +# BEGIN EXPERIMENT ISOLATION CODE + +''' +This makes sure we use the codebase clone created for this experiment. +I.e. 
if you want to make modifications to the codebase but don't want your +running experiment code to be impacted by those changes, first copy the +codebase somewhere, and configure this section. It will make sure we import +from the right place. + +MUST BE DONE BEFORE IMPORTING ANYTHING ELSE +(Leave this comment there so others will understand what's going on) +''' + +# Place where you copied modules that should be fixed for this experiment +codebase_clone_path = "/u/savardf/ift6266/experiment_clones/ift6266_experiment10" + +# Places where there might be conflicting modules from your $PYTHONPATH +remove_these_from_pythonpath = ["/u/savardf/ift6266/dev_code"] + +import sys +sys.path[0:0] = [codebase_clone_path] + +# remove paths we specifically don't want in $PYTHONPATH +for bad_path in remove_these_from_pythonpath: + sys.path[:] = [el for el in sys.path if not el in (bad_path, bad_path+"/")] + +# Make the imports +import ift6266 + +# Just making sure we're importing from the right place +modules_to_check = [ift6266] +for module in modules_to_check: + if not codebase_clone_path in module.__path__[0]: + raise RuntimeError("Module loaded from incorrect path "+module.__path__[0]) + +# Path to pass to jobman sqlschedule. IMPORTANT TO CHANGE TO REFLECT YOUR CLONE. +# Make sure this is accessible from the default $PYTHONPATH (in your .bashrc) +# (and make sure every subdirectory has its __init__.py file) +EXPERIMENT_PATH = "ift6266_experiment10.ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint" + +# END EXPERIMENT ISOLATION CODE +# ---------------------------------------------------------------------------- + +from jobman import DD + +''' +These are parameters used by nist_sda.py. They'll end up as globals in there. + +Rename this file to config.py and configure as needed. +DON'T add the renamed file to the repository, as others might use it +without realizing it, with dire consequences. +''' + +# Set this to True when you want to run cluster tests, ie. you want +# to run on the cluster, many jobs, but want to reduce the training +# set size and the number of epochs, so you know everything runs +# fine on the cluster. +# Set this PRIOR to inserting your test jobs in the DB. +TEST_CONFIG = False + +# save params at training end +SAVE_PARAMS = False + +NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all' +NIST_ALL_TRAIN_SIZE = 649081 +# valid et test =82587 82587 + +# change "sandbox" when you're ready +JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_sandbox_db/yourtablenamehere' + +# reduce training set to that many examples +REDUCE_TRAIN_TO = None +# that's a max, it usually doesn't get to that point +MAX_FINETUNING_EPOCHS = 1000 +# number of minibatches before taking means for valid error etc. +REDUCE_EVERY = 100 + +if TEST_CONFIG: + REDUCE_TRAIN_TO = 1000 + MAX_FINETUNING_EPOCHS = 2 + REDUCE_EVERY = 10 + + +# This is to configure insertion of jobs on the cluster. +# Possible values the hyperparameters can take. These are then +# combined with produit_cartesien_jobs so we get a list of all +# possible combinations, each one resulting in a job inserted +# in the jobman DB. +JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001], + 'pretraining_epochs_per_layer': [10,20], + 'hidden_layers_sizes': [300,800], + 'corruption_levels': [0.1,0.2,0.3], + 'minibatch_size': [20], + 'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS], + 'finetuning_lr':[0.1, 0.01], #0.001 was very bad, so we leave it out + 'num_hidden_layers':[2,3]} + +# Just useful for tests... 
minimal number of epochs +# (This is used when running a single job, locally, when +# calling ./nist_sda.py test_jobman_entrypoint +DEFAULT_HP_NIST = DD({'finetuning_lr':0.1, + 'pretraining_lr':0.1, + 'pretraining_epochs_per_layer':2, + 'max_finetuning_epochs':2, + 'hidden_layers_sizes':800, + 'corruption_levels':0.2, + 'minibatch_size':20, + 'reduce_train_to':10000, + 'num_hidden_layers':1}) + +# To reinsert duplicate of jobs that crashed +REINSERT_COLS = ['pretraining_lr','pretraining_epochs_per_layer','hidden_layers_sizes','corruption_levels','minibatch_size','max_finetuning_epochs','finetuning_lr','num_hidden_layers','dupe'] +REINSERT_JOB_VALS = [\ + [0.1,10,800,0.3,20,1000,0.01,3,2], + [0.1,10,800,0.4,20,1000,0.01,3,2], + [0.1,10,800,0.3,20,1000,0.005,3,2], + [0.1,10,800,0.6,20,1000,0.005,3,2]] + diff -r 6f606b359df3 -r a9af079892ce deep/stacked_dae/nist_sda.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/nist_sda.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,194 @@ +#!/usr/bin/python +# coding: utf-8 + +# Must be imported first +from config import * + +import ift6266 +import pylearn + +import numpy +import theano +import time + +import pylearn.version +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +import copy +import sys +import os +import os.path + +from jobman import DD +import jobman, jobman.sql +from pylearn.io import filetensor + +from utils import produit_cartesien_jobs, jobs_from_reinsert_list + +from sgd_optimization import SdaSgdOptimizer + +#from ift6266.utils.scalar_series import * +from ift6266.utils.seriestables import * +import tables + +from ift6266 import datasets + +''' +Function called by jobman upon launching each job +Its path is the one given when inserting jobs: see EXPERIMENT_PATH +''' +def jobman_entrypoint(state, channel): + # record mercurial versions of each package + pylearn.version.record_versions(state,[theano,ift6266,pylearn]) + # TODO: remove this, bad for number of simultaneous requests on DB + channel.save() + + # For test runs, we don't want to use the whole dataset so + # reduce it to fewer elements if asked to. + rtt = None + if state.has_key('reduce_train_to'): + rtt = state['reduce_train_to'] + elif REDUCE_TRAIN_TO: + rtt = REDUCE_TRAIN_TO + + n_ins = 32*32 + n_outs = 62 # 10 digits, 26*2 (lower, capitals) + + examples_per_epoch = NIST_ALL_TRAIN_SIZE + if rtt: + examples_per_epoch = rtt + + series = create_series(state.num_hidden_layers) + + print "Creating optimizer with state, ", state + + dataset = None + if rtt: + dataset = datasets.nist_all(maxsize=rtt) + else: + dataset = datasets.nist_all() + + optimizer = SdaSgdOptimizer(dataset=dataset, + hyperparameters=state, \ + n_ins=n_ins, n_outs=n_outs,\ + examples_per_epoch=examples_per_epoch, \ + series=series, + save_params=SAVE_PARAMS) + + optimizer.pretrain(dataset) + channel.save() + + optimizer.finetune(dataset) + channel.save() + + return channel.COMPLETE + +# These Series objects are used to save various statistics +# during the training. +def create_series(num_hidden_layers): + + # Replace series we don't want to save with DummySeries, e.g. 
+ # series['training_error'] = DummySeries() + + series = {} + + basedir = os.getcwd() + + h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w") + + # reconstruction + reconstruction_base = \ + ErrorSeries(error_name="reconstruction_error", + table_name="reconstruction_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Reconstruction error (mean over "+str(REDUCE_EVERY)+" minibatches)") + series['reconstruction_error'] = \ + AccumulatorSeriesWrapper(base_series=reconstruction_base, + reduce_every=REDUCE_EVERY) + + # train + training_base = \ + ErrorSeries(error_name="training_error", + table_name="training_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Training error (mean over "+str(REDUCE_EVERY)+" minibatches)") + series['training_error'] = \ + AccumulatorSeriesWrapper(base_series=training_base, + reduce_every=REDUCE_EVERY) + + # valid and test are not accumulated/mean, saved directly + series['validation_error'] = \ + ErrorSeries(error_name="validation_error", + table_name="validation_error", + hdf5_file=h5f, + index_names=('epoch','minibatch')) + + series['test_error'] = \ + ErrorSeries(error_name="test_error", + table_name="test_error", + hdf5_file=h5f, + index_names=('epoch','minibatch')) + + param_names = [] + for i in range(num_hidden_layers): + param_names += ['layer%d_W'%i, 'layer%d_b'%i, 'layer%d_bprime'%i] + param_names += ['logreg_layer_W', 'logreg_layer_b'] + + # comment out series we don't want to save + series['params'] = SharedParamsStatisticsWrapper( + new_group_name="params", + base_group="/", + arrays_names=param_names, + hdf5_file=h5f, + index_names=('epoch',)) + + return series + +# Perform insertion into the Postgre DB based on combination +# of hyperparameter values above +# (see comment for produit_cartesien_jobs() to know how it works) +def jobman_insert_nist(): + jobs = produit_cartesien_jobs(JOB_VALS) + + db = jobman.sql.db(JOBDB) + for job in jobs: + job.update({jobman.sql.EXPERIMENT: EXPERIMENT_PATH}) + jobman.sql.insert_dict(job, db) + + print "inserted" + +def jobman_REinsert_nist(): + jobs = jobs_from_reinsert_list(REINSERT_COLS, REINSERT_JOB_VALS) + + db = jobman.sql.db(JOBDB) + for job in jobs: + job.update({jobman.sql.EXPERIMENT: EXPERIMENT_PATH}) + jobman.sql.insert_dict(job, db) + + print "reinserted" + + + +if __name__ == '__main__': + + args = sys.argv[1:] + + #if len(args) > 0 and args[0] == 'load_nist': + # test_load_nist() + + if len(args) > 0 and args[0] == 'jobman_insert': + jobman_insert_nist() + + if len(args) > 0 and args[0] == 'reinsert': + jobman_REinsert_nist() + + elif len(args) > 0 and args[0] == 'test_jobman_entrypoint': + chanmock = DD({'COMPLETE':0,'save':(lambda:None)}) + jobman_entrypoint(DEFAULT_HP_NIST, chanmock) + + else: + print "Bad arguments" + diff -r 6f606b359df3 -r a9af079892ce deep/stacked_dae/old/__init__.py diff -r 6f606b359df3 -r a9af079892ce deep/stacked_dae/old/mnist_sda.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/old/mnist_sda.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,45 @@ +#!/usr/bin/python +# coding: utf-8 + +# TODO: This probably doesn't work anymore, adapt to new code in sgd_opt +# Parameterize call to sgd_optimization for MNIST + +import numpy +import theano +import time +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +from sgd_optimization import SdaSgdOptimizer +import cPickle, gzip +from jobman import DD + +MNIST_LOCATION = '/u/savardf/datasets/mnist.pkl.gz' + +def 
sgd_optimization_mnist(learning_rate=0.1, pretraining_epochs = 2, \ + pretrain_lr = 0.1, training_epochs = 5, \ + dataset='mnist.pkl.gz'): + # Load the dataset + f = gzip.open(dataset,'rb') + # this gives us train, valid, test (each with .x, .y) + dataset = cPickle.load(f) + f.close() + + n_ins = 28*28 + n_outs = 10 + + hyperparameters = DD({'finetuning_lr':learning_rate, + 'pretraining_lr':pretrain_lr, + 'pretraining_epochs_per_layer':pretraining_epochs, + 'max_finetuning_epochs':training_epochs, + 'hidden_layers_sizes':[100], + 'corruption_levels':[0.2], + 'minibatch_size':20}) + + optimizer = SdaSgdOptimizer(dataset, hyperparameters, n_ins, n_outs) + optimizer.pretrain() + optimizer.finetune() + +if __name__ == '__main__': + sgd_optimization_mnist(dataset=MNIST_LOCATION) + diff -r 6f606b359df3 -r a9af079892ce deep/stacked_dae/old/nist_sda.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/old/nist_sda.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,260 @@ +#!/usr/bin/python +# coding: utf-8 + +import ift6266 +import pylearn + +import numpy +import theano +import time + +import pylearn.version +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +import copy +import sys +import os +import os.path + +from jobman import DD +import jobman, jobman.sql +from pylearn.io import filetensor + +from utils import produit_cartesien_jobs + +from sgd_optimization import SdaSgdOptimizer + +from ift6266.utils.scalar_series import * + +############################################################################## +# GLOBALS + +TEST_CONFIG = False + +NIST_ALL_LOCATION = '/data/lisa/data/nist/by_class/all' +JOBDB = 'postgres://ift6266h10@gershwin/ift6266h10_db/fsavard_sda4' +EXPERIMENT_PATH = "ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint" + +REDUCE_TRAIN_TO = None +MAX_FINETUNING_EPOCHS = 1000 +# number of minibatches before taking means for valid error etc. +REDUCE_EVERY = 1000 + +if TEST_CONFIG: + REDUCE_TRAIN_TO = 1000 + MAX_FINETUNING_EPOCHS = 2 + REDUCE_EVERY = 10 + +# Possible values the hyperparameters can take. These are then +# combined with produit_cartesien_jobs so we get a list of all +# possible combinations, each one resulting in a job inserted +# in the jobman DB. +JOB_VALS = {'pretraining_lr': [0.1, 0.01],#, 0.001],#, 0.0001], + 'pretraining_epochs_per_layer': [10,20], + 'hidden_layers_sizes': [300,800], + 'corruption_levels': [0.1,0.2,0.3], + 'minibatch_size': [20], + 'max_finetuning_epochs':[MAX_FINETUNING_EPOCHS], + 'finetuning_lr':[0.1, 0.01], #0.001 was very bad, so we leave it out + 'num_hidden_layers':[2,3]} + +# Just useful for tests... minimal number of epochs +DEFAULT_HP_NIST = DD({'finetuning_lr':0.1, + 'pretraining_lr':0.1, + 'pretraining_epochs_per_layer':20, + 'max_finetuning_epochs':2, + 'hidden_layers_sizes':800, + 'corruption_levels':0.2, + 'minibatch_size':20, + #'reduce_train_to':300, + 'num_hidden_layers':2}) + +''' +Function called by jobman upon launching each job +Its path is the one given when inserting jobs: +ift6266.deep.stacked_dae.nist_sda.jobman_entrypoint +''' +def jobman_entrypoint(state, channel): + # record mercurial versions of each package + pylearn.version.record_versions(state,[theano,ift6266,pylearn]) + channel.save() + + workingdir = os.getcwd() + + print "Will load NIST" + + nist = NIST(minibatch_size=20) + + print "NIST loaded" + + # For test runs, we don't want to use the whole dataset so + # reduce it to fewer elements if asked to. 
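# ---------------------------------------------------------------------------
# [Editor's note] Illustrative aside, not part of the original changeset.
# The lines just below choose the training-set cap with a simple precedence:
# a per-job 'reduce_train_to' stored in the jobman state wins over the
# module-level REDUCE_TRAIN_TO default, and None means "use the full set".
# A minimal, dependency-free version of that lookup (names hypothetical):
def pick_reduce_train_to(state, module_default=None):
    """Return the per-job cap if present, else the module default (may be None)."""
    if 'reduce_train_to' in state:      # per-job override coming from the DB
        return state['reduce_train_to']
    return module_default
# e.g. pick_reduce_train_to({'reduce_train_to': 1000}, 649081) -> 1000
#      pick_reduce_train_to({}, None)                          -> None
# ---------------------------------------------------------------------------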
+ rtt = None + if state.has_key('reduce_train_to'): + rtt = state['reduce_train_to'] + elif REDUCE_TRAIN_TO: + rtt = REDUCE_TRAIN_TO + + if rtt: + print "Reducing training set to "+str(rtt)+ " examples" + nist.reduce_train_set(rtt) + + train,valid,test = nist.get_tvt() + dataset = (train,valid,test) + + n_ins = 32*32 + n_outs = 62 # 10 digits, 26*2 (lower, capitals) + + # b,b',W for each hidden layer + # + b,W of last layer (logreg) + numparams = state.num_hidden_layers * 3 + 2 + series_mux = None + series_mux = create_series(workingdir, numparams) + + print "Creating optimizer with state, ", state + + optimizer = SdaSgdOptimizer(dataset=dataset, hyperparameters=state, \ + n_ins=n_ins, n_outs=n_outs,\ + input_divider=255.0, series_mux=series_mux) + + optimizer.pretrain() + channel.save() + + optimizer.finetune() + channel.save() + + return channel.COMPLETE + +# These Series objects are used to save various statistics +# during the training. +def create_series(basedir, numparams): + mux = SeriesMultiplexer() + + # comment out series we don't want to save + mux.add_series(AccumulatorSeries(name="reconstruction_error", + reduce_every=REDUCE_EVERY, # every 1000 batches, we take the mean and save + mean=True, + directory=basedir, flush_every=1)) + + mux.add_series(AccumulatorSeries(name="training_error", + reduce_every=REDUCE_EVERY, # every 1000 batches, we take the mean and save + mean=True, + directory=basedir, flush_every=1)) + + mux.add_series(BaseSeries(name="validation_error", directory=basedir, flush_every=1)) + mux.add_series(BaseSeries(name="test_error", directory=basedir, flush_every=1)) + + mux.add_series(ParamsArrayStats(numparams,name="params",directory=basedir)) + + return mux + +# Perform insertion into the Postgre DB based on combination +# of hyperparameter values above +# (see comment for produit_cartesien_jobs() to know how it works) +def jobman_insert_nist(): + jobs = produit_cartesien_jobs(JOB_VALS) + + db = jobman.sql.db(JOBDB) + for job in jobs: + job.update({jobman.sql.EXPERIMENT: EXPERIMENT_PATH}) + jobman.sql.insert_dict(job, db) + + print "inserted" + +class NIST: + def __init__(self, minibatch_size, basepath=None, reduce_train_to=None): + global NIST_ALL_LOCATION + + self.minibatch_size = minibatch_size + self.basepath = basepath and basepath or NIST_ALL_LOCATION + + self.set_filenames() + + # arrays of 2 elements: .x, .y + self.train = [None, None] + self.test = [None, None] + + self.load_train_test() + + self.valid = [[], []] + self.split_train_valid() + if reduce_train_to: + self.reduce_train_set(reduce_train_to) + + def get_tvt(self): + return self.train, self.valid, self.test + + def set_filenames(self): + self.train_files = ['all_train_data.ft', + 'all_train_labels.ft'] + + self.test_files = ['all_test_data.ft', + 'all_test_labels.ft'] + + def load_train_test(self): + self.load_data_labels(self.train_files, self.train) + self.load_data_labels(self.test_files, self.test) + + def load_data_labels(self, filenames, pair): + for i, fn in enumerate(filenames): + f = open(os.path.join(self.basepath, fn)) + pair[i] = filetensor.read(f) + f.close() + + def reduce_train_set(self, max): + self.train[0] = self.train[0][:max] + self.train[1] = self.train[1][:max] + + if max < len(self.test[0]): + for ar in (self.test, self.valid): + ar[0] = ar[0][:max] + ar[1] = ar[1][:max] + + def split_train_valid(self): + test_len = len(self.test[0]) + + new_train_x = self.train[0][:-test_len] + new_train_y = self.train[1][:-test_len] + + self.valid[0] = self.train[0][-test_len:] + 
self.valid[1] = self.train[1][-test_len:] + + self.train[0] = new_train_x + self.train[1] = new_train_y + +def test_load_nist(): + print "Will load NIST" + + import time + t1 = time.time() + nist = NIST(20) + t2 = time.time() + + print "NIST loaded. time delta = ", t2-t1 + + tr,v,te = nist.get_tvt() + + print "Lenghts: ", len(tr[0]), len(v[0]), len(te[0]) + + raw_input("Press any key") + +if __name__ == '__main__': + + import sys + + args = sys.argv[1:] + + if len(args) > 0 and args[0] == 'load_nist': + test_load_nist() + + elif len(args) > 0 and args[0] == 'jobman_insert': + jobman_insert_nist() + + elif len(args) > 0 and args[0] == 'test_jobman_entrypoint': + chanmock = DD({'COMPLETE':0,'save':(lambda:None)}) + jobman_entrypoint(DEFAULT_HP_NIST, chanmock) + + else: + print "Bad arguments" + diff -r 6f606b359df3 -r a9af079892ce deep/stacked_dae/old/sgd_optimization.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/old/sgd_optimization.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,234 @@ +#!/usr/bin/python +# coding: utf-8 + +# Generic SdA optimization loop, adapted from the deeplearning.net tutorial + +import numpy +import theano +import time +import datetime +import theano.tensor as T +import sys + +from jobman import DD +import jobman, jobman.sql + +from stacked_dae import SdA + +def shared_dataset(data_xy): + data_x, data_y = data_xy + #shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX)) + #shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX)) + #shared_y = T.cast(shared_y, 'int32') + shared_x = theano.shared(data_x) + shared_y = theano.shared(data_y) + return shared_x, shared_y + +class DummyMux(): + def append(self, param1, param2): + pass + +class SdaSgdOptimizer: + def __init__(self, dataset, hyperparameters, n_ins, n_outs, input_divider=1.0, series_mux=None): + self.dataset = dataset + self.hp = hyperparameters + self.n_ins = n_ins + self.n_outs = n_outs + self.input_divider = input_divider + + if not series_mux: + series_mux = DummyMux() + print "No series multiplexer set" + self.series_mux = series_mux + + self.rng = numpy.random.RandomState(1234) + + self.init_datasets() + self.init_classifier() + + sys.stdout.flush() + + def init_datasets(self): + print "init_datasets" + sys.stdout.flush() + + train_set, valid_set, test_set = self.dataset + self.test_set_x, self.test_set_y = shared_dataset(test_set) + self.valid_set_x, self.valid_set_y = shared_dataset(valid_set) + self.train_set_x, self.train_set_y = shared_dataset(train_set) + + # compute number of minibatches for training, validation and testing + self.n_train_batches = self.train_set_x.value.shape[0] / self.hp.minibatch_size + self.n_valid_batches = self.valid_set_x.value.shape[0] / self.hp.minibatch_size + # remove last batch in case it's incomplete + self.n_test_batches = (self.test_set_x.value.shape[0] / self.hp.minibatch_size) - 1 + + def init_classifier(self): + print "Constructing classifier" + + # we don't want to save arrays in DD objects, so + # we recreate those arrays here + nhl = self.hp.num_hidden_layers + layers_sizes = [self.hp.hidden_layers_sizes] * nhl + corruption_levels = [self.hp.corruption_levels] * nhl + + # construct the stacked denoising autoencoder class + self.classifier = SdA( \ + train_set_x= self.train_set_x, \ + train_set_y = self.train_set_y,\ + batch_size = self.hp.minibatch_size, \ + n_ins= self.n_ins, \ + hidden_layers_sizes = layers_sizes, \ + n_outs = self.n_outs, \ + corruption_levels = corruption_levels,\ + rng = 
self.rng,\ + pretrain_lr = self.hp.pretraining_lr, \ + finetune_lr = self.hp.finetuning_lr,\ + input_divider = self.input_divider ) + + #theano.printing.pydotprint(self.classifier.pretrain_functions[0], "function.graph") + + sys.stdout.flush() + + def train(self): + self.pretrain() + self.finetune() + + def pretrain(self): + print "STARTING PRETRAINING, time = ", datetime.datetime.now() + sys.stdout.flush() + + #time_acc_func = 0.0 + #time_acc_total = 0.0 + + start_time = time.clock() + ## Pre-train layer-wise + for i in xrange(self.classifier.n_layers): + # go through pretraining epochs + for epoch in xrange(self.hp.pretraining_epochs_per_layer): + # go through the training set + for batch_index in xrange(self.n_train_batches): + #t1 = time.clock() + c = self.classifier.pretrain_functions[i](batch_index) + #t2 = time.clock() + + #time_acc_func += t2 - t1 + + #if batch_index % 500 == 0: + # print "acc / total", time_acc_func / (t2 - start_time), time_acc_func + + self.series_mux.append("reconstruction_error", c) + + print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c + sys.stdout.flush() + + self.series_mux.append("params", self.classifier.all_params) + + end_time = time.clock() + + print ('Pretraining took %f minutes' %((end_time-start_time)/60.)) + self.hp.update({'pretraining_time': end_time-start_time}) + + sys.stdout.flush() + + def finetune(self): + print "STARTING FINETUNING, time = ", datetime.datetime.now() + + index = T.lscalar() # index to a [mini]batch + minibatch_size = self.hp.minibatch_size + + # create a function to compute the mistakes that are made by the model + # on the validation set, or testing set + shared_divider = theano.shared(numpy.asarray(self.input_divider, dtype=theano.config.floatX)) + test_model = theano.function([index], self.classifier.errors, + givens = { + self.classifier.x: self.test_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider, + self.classifier.y: self.test_set_y[index*minibatch_size:(index+1)*minibatch_size]}) + + validate_model = theano.function([index], self.classifier.errors, + givens = { + self.classifier.x: self.valid_set_x[index*minibatch_size:(index+1)*minibatch_size] / shared_divider, + self.classifier.y: self.valid_set_y[index*minibatch_size:(index+1)*minibatch_size]}) + + + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2. # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(self.n_train_batches, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + best_params = None + best_validation_loss = float('inf') + test_score = 0. 
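# ---------------------------------------------------------------------------
# [Editor's note] Illustrative aside, not part of the original changeset.
# The early-stopping bookkeeping in the loop below boils down to: whenever the
# validation loss improves by more than `improvement_threshold` (relatively),
# push `patience` out to at least `iter * patience_increase`; training stops
# once the iteration counter passes `patience`.  A stripped-down version:
def update_patience(patience, iteration, this_loss, best_loss,
                    patience_increase=2., improvement_threshold=0.995):
    """Return the (possibly extended) patience after one validation check."""
    if this_loss < best_loss * improvement_threshold:
        patience = max(patience, iteration * patience_increase)
    return patience
# With patience=10000, a significant improvement at iteration 8000 extends the
# horizon to 16000 minibatches; with no further improvement, looping ends there.
# ---------------------------------------------------------------------------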
+ start_time = time.clock() + + done_looping = False + epoch = 0 + + while (epoch < self.hp.max_finetuning_epochs) and (not done_looping): + epoch = epoch + 1 + for minibatch_index in xrange(self.n_train_batches): + + cost_ij = self.classifier.finetune(minibatch_index) + iter = epoch * self.n_train_batches + minibatch_index + + self.series_mux.append("training_error", cost_ij) + + if (iter+1) % validation_frequency == 0: + + validation_losses = [validate_model(i) for i in xrange(self.n_valid_batches)] + this_validation_loss = numpy.mean(validation_losses) + self.series_mux.append("validation_error", this_validation_loss) + print('epoch %i, minibatch %i/%i, validation error %f %%' % \ + (epoch, minibatch_index+1, self.n_train_batches, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, iter * patience_increase) + + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = iter + + # test it on the test set + test_losses = [test_model(i) for i in xrange(self.n_test_batches)] + test_score = numpy.mean(test_losses) + self.series_mux.append("test_error", test_score) + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, self.n_train_batches, + test_score*100.)) + + sys.stdout.flush() + + self.series_mux.append("params", self.classifier.all_params) + + if patience <= iter : + done_looping = True + break + + end_time = time.clock() + self.hp.update({'finetuning_time':end_time-start_time,\ + 'best_validation_error':best_validation_loss,\ + 'test_score':test_score, + 'num_finetuning_epochs':epoch}) + + print(('Optimization complete with best validation score of %f %%,' + 'with test performance %f %%') % + (best_validation_loss * 100., test_score*100.)) + print ('The finetuning ran for %f minutes' % ((end_time-start_time)/60.)) + + + diff -r 6f606b359df3 -r a9af079892ce deep/stacked_dae/old/stacked_dae.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/old/stacked_dae.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,287 @@ +#!/usr/bin/python +# coding: utf-8 + +import numpy +import theano +import time +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams +import copy + +from utils import update_locals + +# taken from LeDeepNet/daa.py +# has a special case when taking log(0) (defined =0) +# modified to not take the mean anymore +from theano.tensor.xlogx import xlogx, xlogy0 +# it's target*log(output) +def binary_cross_entropy(target, output, sum_axis=1): + XE = xlogy0(target, output) + xlogy0((1 - target), (1 - output)) + return -T.sum(XE, axis=sum_axis) + +class LogisticRegression(object): + def __init__(self, input, n_in, n_out): + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) + self.W = theano.shared( value=numpy.zeros((n_in,n_out), + dtype = theano.config.floatX) ) + # initialize the baises b as a vector of n_out 0s + self.b = theano.shared( value=numpy.zeros((n_out,), + dtype = theano.config.floatX) ) + # compute vector of class-membership probabilities in symbolic form + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred=T.argmax(self.p_y_given_x, axis=1) + + # list of 
parameters for this layer + self.params = [self.W, self.b] + + def negative_log_likelihood(self, y): + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + def errors(self, y): + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + + +class SigmoidalLayer(object): + def __init__(self, rng, input, n_in, n_out): + self.input = input + + W_values = numpy.asarray( rng.uniform( \ + low = -numpy.sqrt(6./(n_in+n_out)), \ + high = numpy.sqrt(6./(n_in+n_out)), \ + size = (n_in, n_out)), dtype = theano.config.floatX) + self.W = theano.shared(value = W_values) + + b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + self.output = T.nnet.sigmoid(T.dot(input, self.W) + self.b) + self.params = [self.W, self.b] + + + +class dA(object): + def __init__(self, n_visible= 784, n_hidden= 500, corruption_level = 0.1,\ + input = None, shared_W = None, shared_b = None): + self.n_visible = n_visible + self.n_hidden = n_hidden + + # create a Theano random generator that gives symbolic random values + theano_rng = RandomStreams() + + if shared_W != None and shared_b != None : + self.W = shared_W + self.b = shared_b + else: + # initial values for weights and biases + # note : W' was written as `W_prime` and b' as `b_prime` + + # W is initialized with `initial_W` which is uniformely sampled + # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + initial_W = numpy.asarray( numpy.random.uniform( \ + low = -numpy.sqrt(6./(n_hidden+n_visible)), \ + high = numpy.sqrt(6./(n_hidden+n_visible)), \ + size = (n_visible, n_hidden)), dtype = theano.config.floatX) + initial_b = numpy.zeros(n_hidden, dtype = theano.config.floatX) + + + # theano shared variables for weights and biases + self.W = theano.shared(value = initial_W, name = "W") + self.b = theano.shared(value = initial_b, name = "b") + + + initial_b_prime= numpy.zeros(n_visible) + # tied weights, therefore W_prime is W transpose + self.W_prime = self.W.T + self.b_prime = theano.shared(value = initial_b_prime, name = "b'") + + # if no input is given, generate a variable representing the input + if input == None : + # we use a matrix because we expect a minibatch of several examples, + # each example being a row + self.x = T.dmatrix(name = 'input') + else: + self.x = input + # Equation (1) + # keep 90% of the inputs the same and zero-out randomly selected subset of 10% of the inputs + # note : first argument of theano.rng.binomial is the shape(size) of + # random numbers that it should produce + # second argument is the number of trials + # third argument is the probability of success of any trial + # + # this will produce an array of 0s and 1s where 1 has a + # probability of 1 - ``corruption_level`` and 0 with + # ``corruption_level`` + self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level) * self.x + # Equation (2) + # note : y is stored as an attribute of the class so that it can be + # used later when stacking dAs. 
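# ---------------------------------------------------------------------------
# [Editor's note] Illustrative aside, not part of the original changeset.
# The corruption step a few lines above (Equation (1)) zeroes out roughly
# `corruption_level` of the input entries by multiplying the input with a
# Bernoulli(1 - corruption_level) mask.  The same operation in plain numpy,
# for intuition (helper name hypothetical):
import numpy
def corrupt(x, corruption_level, rng=numpy.random):
    """Zero out ~corruption_level of the entries of x (denoising-AE input noise)."""
    mask = rng.binomial(n=1, p=1.0 - corruption_level, size=x.shape)
    return mask * x
# e.g. corrupt(numpy.ones((2, 4)), 0.25) keeps about 75% of the entries intact.
# ---------------------------------------------------------------------------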
+ self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b) + # Equation (3) + self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime) + # Equation (4) + # note : we sum over the size of a datapoint; if we are using minibatches, + # L will be a vector, with one entry per example in minibatch + #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) + #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1) + + # bypassing z to avoid running to log(0) + #self.z_a = T.dot(self.y, self.W_prime) + self.b_prime) + #self.L = -T.sum( self.x * (T.log(1)-T.log(1+T.exp(-self.z_a))) \ + # + (1.0-self.x) * (T.log(1)-T.log(1+T.exp(-self.z_a))), axis=1 ) + + # I added this epsilon to avoid getting log(0) and 1/0 in grad + # This means conceptually that there'd be no probability of 0, but that + # doesn't seem to me as important (maybe I'm wrong?). + eps = 0.00000001 + eps_1 = 1-eps + self.L = - T.sum( self.x * T.log(eps + eps_1*self.z) \ + + (1-self.x)*T.log(eps + eps_1*(1-self.z)), axis=1 ) + # note : L is now a vector, where each element is the cross-entropy cost + # of the reconstruction of the corresponding example of the + # minibatch. We need to compute the average of all these to get + # the cost of the minibatch + self.cost = T.mean(self.L) + + self.params = [ self.W, self.b, self.b_prime ] + + +class SdA(object): + def __init__(self, train_set_x, train_set_y, batch_size, n_ins, + hidden_layers_sizes, n_outs, + corruption_levels, rng, pretrain_lr, finetune_lr, input_divider=1.0): + # Just to make sure those are not modified somewhere else afterwards + hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes) + corruption_levels = copy.deepcopy(corruption_levels) + + update_locals(self, locals()) + + self.layers = [] + self.pretrain_functions = [] + self.params = [] + # MODIF: added this so we also get the b_primes + # (not used for finetuning... 
still using ".params") + self.all_params = [] + self.n_layers = len(hidden_layers_sizes) + + print "Creating SdA with params:" + print "batch_size", batch_size + print "hidden_layers_sizes", hidden_layers_sizes + print "corruption_levels", corruption_levels + print "n_ins", n_ins + print "n_outs", n_outs + print "pretrain_lr", pretrain_lr + print "finetune_lr", finetune_lr + print "input_divider", input_divider + print "----" + + self.shared_divider = theano.shared(numpy.asarray(input_divider, dtype=theano.config.floatX)) + + if len(hidden_layers_sizes) < 1 : + raiseException (' You must have at least one hidden layer ') + + + # allocate symbolic variables for the data + index = T.lscalar() # index to a [mini]batch + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + + for i in xrange( self.n_layers ): + # construct the sigmoidal layer + + # the size of the input is either the number of hidden units of + # the layer below or the input size if we are on the first layer + if i == 0 : + input_size = n_ins + else: + input_size = hidden_layers_sizes[i-1] + + # the input to this layer is either the activation of the hidden + # layer below or the input of the SdA if you are on the first + # layer + if i == 0 : + layer_input = self.x + else: + layer_input = self.layers[-1].output + + layer = SigmoidalLayer(rng, layer_input, input_size, + hidden_layers_sizes[i] ) + # add the layer to the + self.layers += [layer] + self.params += layer.params + + # Construct a denoising autoencoder that shared weights with this + # layer + dA_layer = dA(input_size, hidden_layers_sizes[i], \ + corruption_level = corruption_levels[0],\ + input = layer_input, \ + shared_W = layer.W, shared_b = layer.b) + + self.all_params += dA_layer.params + + # Construct a function that trains this dA + # compute gradients of layer parameters + gparams = T.grad(dA_layer.cost, dA_layer.params) + # compute the list of updates + updates = {} + for param, gparam in zip(dA_layer.params, gparams): + updates[param] = param - gparam * pretrain_lr + + # create a function that trains the dA + update_fn = theano.function([index], dA_layer.cost, \ + updates = updates, + givens = { + self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider}) + # collect this function into a list + self.pretrain_functions += [update_fn] + + + # We now need to add a logistic layer on top of the MLP + self.logLayer = LogisticRegression(\ + input = self.layers[-1].output,\ + n_in = hidden_layers_sizes[-1], n_out = n_outs) + + self.params += self.logLayer.params + self.all_params += self.logLayer.params + # construct a function that implements one step of finetunining + + # compute the cost, defined as the negative log likelihood + cost = self.logLayer.negative_log_likelihood(self.y) + # compute the gradients with respect to the model parameters + gparams = T.grad(cost, self.params) + # compute list of updates + updates = {} + for param,gparam in zip(self.params, gparams): + updates[param] = param - gparam*finetune_lr + + self.finetune = theano.function([index], cost, + updates = updates, + givens = { + self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider, + self.y : train_set_y[index*batch_size:(index+1)*batch_size]} ) + + # symbolic variable that points to the number of errors made on the + # minibatch given by self.x and self.y + + self.errors = self.logLayer.errors(self.y) + +if __name__ == '__main__': + 
import sys + args = sys.argv[1:] + diff -r 6f606b359df3 -r a9af079892ce deep/stacked_dae/old/utils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/old/utils.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,69 @@ +#!/usr/bin/python +# coding: utf-8 + +from __future__ import with_statement + +from jobman import DD + +# from pylearn codebase +# useful in __init__(param1, param2, etc.) to save +# values in self.param1, self.param2... just call +# update_locals(self, locals()) +def update_locals(obj, dct): + if 'self' in dct: + del dct['self'] + obj.__dict__.update(dct) + +# from a dictionary of possible values for hyperparameters, e.g. +# hp_values = {'learning_rate':[0.1, 0.01], 'num_layers': [1,2]} +# create a list of other dictionaries representing all the possible +# combinations, thus in this example creating: +# [{'learning_rate': 0.1, 'num_layers': 1}, ...] +# (similarly for combinations (0.1, 2), (0.01, 1), (0.01, 2)) +def produit_cartesien_jobs(val_dict): + job_list = [DD()] + all_keys = val_dict.keys() + + for key in all_keys: + possible_values = val_dict[key] + new_job_list = [] + for val in possible_values: + for job in job_list: + to_insert = job.copy() + to_insert.update({key: val}) + new_job_list.append(to_insert) + job_list = new_job_list + + return job_list + +def test_produit_cartesien_jobs(): + vals = {'a': [1,2], 'b': [3,4,5]} + print produit_cartesien_jobs(vals) + + +# taken from http://stackoverflow.com/questions/276052/how-to-get-current-cpu-and-ram-usage-in-python +"""Simple module for getting amount of memory used by a specified user's +processes on a UNIX system. +It uses UNIX ps utility to get the memory usage for a specified username and +pipe it to awk for summing up per application memory usage and return the total. +Python's Popen() from subprocess module is used for spawning ps and awk. 
+ +""" + +import subprocess + +class MemoryMonitor(object): + + def __init__(self, username): + """Create new MemoryMonitor instance.""" + self.username = username + + def usage(self): + """Return int containing memory used by user's processes.""" + self.process = subprocess.Popen("ps -u %s -o rss | awk '{sum+=$1} END {print sum}'" % self.username, + shell=True, + stdout=subprocess.PIPE, + ) + self.stdout_list = self.process.communicate()[0].split('\n') + return int(self.stdout_list[0]) + diff -r 6f606b359df3 -r a9af079892ce deep/stacked_dae/sgd_optimization.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/sgd_optimization.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,236 @@ +#!/usr/bin/python +# coding: utf-8 + +# Generic SdA optimization loop, adapted from the deeplearning.net tutorial + +from __future__ import with_statement + +import numpy +import theano +import time +import datetime +import theano.tensor as T +import sys + +from jobman import DD +import jobman, jobman.sql + +from stacked_dae import SdA + +from ift6266.utils.seriestables import * + +default_series = { \ + 'reconstruction_error' : DummySeries(), + 'training_error' : DummySeries(), + 'validation_error' : DummySeries(), + 'test_error' : DummySeries(), + 'params' : DummySeries() + } + +class SdaSgdOptimizer: + def __init__(self, dataset, hyperparameters, n_ins, n_outs, + examples_per_epoch, series=default_series, + save_params=False): + self.dataset = dataset + self.hp = hyperparameters + self.n_ins = n_ins + self.n_outs = n_outs + + self.save_params = save_params + + self.ex_per_epoch = examples_per_epoch + self.mb_per_epoch = examples_per_epoch / self.hp.minibatch_size + + self.series = series + + self.rng = numpy.random.RandomState(1234) + + self.init_classifier() + + sys.stdout.flush() + + def init_classifier(self): + print "Constructing classifier" + + # we don't want to save arrays in DD objects, so + # we recreate those arrays here + nhl = self.hp.num_hidden_layers + layers_sizes = [self.hp.hidden_layers_sizes] * nhl + corruption_levels = [self.hp.corruption_levels] * nhl + + # construct the stacked denoising autoencoder class + self.classifier = SdA( \ + batch_size = self.hp.minibatch_size, \ + n_ins= self.n_ins, \ + hidden_layers_sizes = layers_sizes, \ + n_outs = self.n_outs, \ + corruption_levels = corruption_levels,\ + rng = self.rng,\ + pretrain_lr = self.hp.pretraining_lr, \ + finetune_lr = self.hp.finetuning_lr) + + #theano.printing.pydotprint(self.classifier.pretrain_functions[0], "function.graph") + + sys.stdout.flush() + + def train(self): + self.pretrain(self.dataset) + self.finetune(self.dataset) + + def pretrain(self,dataset): + print "STARTING PRETRAINING, time = ", datetime.datetime.now() + sys.stdout.flush() + + start_time = time.clock() + ## Pre-train layer-wise + for i in xrange(self.classifier.n_layers): + # go through pretraining epochs + for epoch in xrange(self.hp.pretraining_epochs_per_layer): + # go through the training set + batch_index=0 + for x,y in dataset.train(self.hp.minibatch_size): + c = self.classifier.pretrain_functions[i](x) + + self.series["reconstruction_error"].append((epoch, batch_index), c) + batch_index+=1 + + #if batch_index % 100 == 0: + # print "100 batches" + + print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c + sys.stdout.flush() + + self.series['params'].append((epoch,), self.classifier.all_params) + + end_time = time.clock() + + print ('Pretraining took %f minutes' %((end_time-start_time)/60.)) + self.hp.update({'pretraining_time': 
end_time-start_time}) + + sys.stdout.flush() + + def finetune(self,dataset): + print "STARTING FINETUNING, time = ", datetime.datetime.now() + + minibatch_size = self.hp.minibatch_size + + # create a function to compute the mistakes that are made by the model + # on the validation set, or testing set + test_model = \ + theano.function( + [self.classifier.x,self.classifier.y], self.classifier.errors) + # givens = { + # self.classifier.x: ensemble_x, + # self.classifier.y: ensemble_y]}) + + validate_model = \ + theano.function( + [self.classifier.x,self.classifier.y], self.classifier.errors) + # givens = { + # self.classifier.x: , + # self.classifier.y: ]}) + + + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2. # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(self.mb_per_epoch, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + + best_params = None + best_validation_loss = float('inf') + test_score = 0. + start_time = time.clock() + + done_looping = False + epoch = 0 + + total_mb_index = 0 + + while (epoch < self.hp.max_finetuning_epochs) and (not done_looping): + epoch = epoch + 1 + minibatch_index = -1 + for x,y in dataset.train(minibatch_size): + minibatch_index += 1 + cost_ij = self.classifier.finetune(x,y) + total_mb_index += 1 + + self.series["training_error"].append((epoch, minibatch_index), cost_ij) + + if (total_mb_index+1) % validation_frequency == 0: + + iter = dataset.valid(minibatch_size) + validation_losses = [validate_model(x,y) for x,y in iter] + this_validation_loss = numpy.mean(validation_losses) + + self.series["validation_error"].\ + append((epoch, minibatch_index), this_validation_loss*100.) + + print('epoch %i, minibatch %i/%i, validation error %f %%' % \ + (epoch, minibatch_index+1, self.mb_per_epoch, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, total_mb_index * patience_increase) + + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = total_mb_index + + # test it on the test set + iter = dataset.test(minibatch_size) + test_losses = [test_model(x,y) for x,y in iter] + test_score = numpy.mean(test_losses) + + self.series["test_error"].\ + append((epoch, minibatch_index), test_score*100.) 
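# ---------------------------------------------------------------------------
# [Editor's note] Illustrative aside, not part of the original changeset.
# Every statistic in this file is recorded through the same two-argument call,
# series[name].append(index_tuple, value), as in the test_error line just
# above.  A no-op stand-in with that interface (conceptually what the
# DummySeries defaults at the top of the file provide) is enough to disable
# saving a given statistic without touching the training loop:
class NoOpSeries(object):
    def append(self, index, value):
        pass  # silently drop the measurement
# e.g. series['params'] = NoOpSeries()  # stop dumping parameter snapshots
# ---------------------------------------------------------------------------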
+ + print((' epoch %i, minibatch %i/%i, test error of best ' + 'model %f %%') % + (epoch, minibatch_index+1, self.mb_per_epoch, + test_score*100.)) + + sys.stdout.flush() + + self.series['params'].append((epoch,), self.classifier.all_params) + + if patience <= total_mb_index: + done_looping = True + break + + end_time = time.clock() + self.hp.update({'finetuning_time':end_time-start_time,\ + 'best_validation_error':best_validation_loss,\ + 'test_score':test_score, + 'num_finetuning_epochs':epoch}) + + if self.save_params: + save_params(self.classifier.all_params, "weights.dat") + + print(('Optimization complete with best validation score of %f %%,' + 'with test performance %f %%') % + (best_validation_loss * 100., test_score*100.)) + print ('The finetuning ran for %f minutes' % ((end_time-start_time)/60.)) + + + +def save_params(all_params, filename): + import pickle + with open(filename, 'wb') as f: + values = [p.value for p in all_params] + + # -1 for HIGHEST_PROTOCOL + pickle.dump(values, f, -1) + diff -r 6f606b359df3 -r a9af079892ce deep/stacked_dae/stacked_dae.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/stacked_dae.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,292 @@ +#!/usr/bin/python +# coding: utf-8 + +import numpy +import theano +import time +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams +import copy + +from utils import update_locals + +# taken from LeDeepNet/daa.py +# has a special case when taking log(0) (defined =0) +# modified to not take the mean anymore +from theano.tensor.xlogx import xlogx, xlogy0 +# it's target*log(output) +def binary_cross_entropy(target, output, sum_axis=1): + XE = xlogy0(target, output) + xlogy0((1 - target), (1 - output)) + return -T.sum(XE, axis=sum_axis) + +class LogisticRegression(object): + def __init__(self, input, n_in, n_out): + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) + self.W = theano.shared( value=numpy.zeros((n_in,n_out), + dtype = theano.config.floatX) ) + # initialize the baises b as a vector of n_out 0s + self.b = theano.shared( value=numpy.zeros((n_out,), + dtype = theano.config.floatX) ) + # compute vector of class-membership probabilities in symbolic form + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred=T.argmax(self.p_y_given_x, axis=1) + + # list of parameters for this layer + self.params = [self.W, self.b] + + def negative_log_likelihood(self, y): + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + def errors(self, y): + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + + +class SigmoidalLayer(object): + def __init__(self, rng, input, n_in, n_out): + self.input = input + + W_values = numpy.asarray( rng.uniform( \ + low = -numpy.sqrt(6./(n_in+n_out)), \ + high = numpy.sqrt(6./(n_in+n_out)), \ + size = (n_in, n_out)), dtype = theano.config.floatX) + self.W = theano.shared(value = W_values) + + b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + self.output = 
T.nnet.sigmoid(T.dot(input, self.W) + self.b) + self.params = [self.W, self.b] + + + +class dA(object): + def __init__(self, n_visible= 784, n_hidden= 500, corruption_level = 0.1,\ + input = None, shared_W = None, shared_b = None): + self.n_visible = n_visible + self.n_hidden = n_hidden + + # create a Theano random generator that gives symbolic random values + theano_rng = RandomStreams() + + if shared_W != None and shared_b != None : + self.W = shared_W + self.b = shared_b + else: + # initial values for weights and biases + # note : W' was written as `W_prime` and b' as `b_prime` + + # W is initialized with `initial_W` which is uniformely sampled + # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + initial_W = numpy.asarray( numpy.random.uniform( \ + low = -numpy.sqrt(6./(n_hidden+n_visible)), \ + high = numpy.sqrt(6./(n_hidden+n_visible)), \ + size = (n_visible, n_hidden)), dtype = theano.config.floatX) + initial_b = numpy.zeros(n_hidden, dtype = theano.config.floatX) + + + # theano shared variables for weights and biases + self.W = theano.shared(value = initial_W, name = "W") + self.b = theano.shared(value = initial_b, name = "b") + + + initial_b_prime= numpy.zeros(n_visible) + # tied weights, therefore W_prime is W transpose + self.W_prime = self.W.T + self.b_prime = theano.shared(value = initial_b_prime, name = "b'") + + # if no input is given, generate a variable representing the input + if input == None : + # we use a matrix because we expect a minibatch of several examples, + # each example being a row + self.x = T.dmatrix(name = 'input') + else: + self.x = input + # Equation (1) + # keep 90% of the inputs the same and zero-out randomly selected subset of 10% of the inputs + # note : first argument of theano.rng.binomial is the shape(size) of + # random numbers that it should produce + # second argument is the number of trials + # third argument is the probability of success of any trial + # + # this will produce an array of 0s and 1s where 1 has a + # probability of 1 - ``corruption_level`` and 0 with + # ``corruption_level`` + self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level, dtype=theano.config.floatX) * self.x + # Equation (2) + # note : y is stored as an attribute of the class so that it can be + # used later when stacking dAs. + self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b) + # Equation (3) + #self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime) + # Equation (4) + # note : we sum over the size of a datapoint; if we are using minibatches, + # L will be a vector, with one entry per example in minibatch + #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) + #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1) + + # bypassing z to avoid running to log(0) + z_a = T.dot(self.y, self.W_prime) + self.b_prime + log_sigmoid = T.log(1.) - T.log(1.+T.exp(-z_a)) + # log(1-sigmoid(z_a)) + log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a)) + self.L = -T.sum( self.x * (log_sigmoid) \ + + (1.0-self.x) * (log_1_sigmoid), axis=1 ) + + # I added this epsilon to avoid getting log(0) and 1/0 in grad + # This means conceptually that there'd be no probability of 0, but that + # doesn't seem to me as important (maybe I'm wrong?). 
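# ---------------------------------------------------------------------------
# [Editor's note] Illustrative aside, not part of the original changeset.
# The active cost above stays in the log domain to avoid ever evaluating
# log(0):   log(sigmoid(a))     = -log(1 + exp(-a))
#           log(1 - sigmoid(a)) = -a - log(1 + exp(-a)).
# A small numpy check of those identities (away from extreme values of `a`,
# where exp(-a) itself would overflow):
import numpy
a = numpy.array([-5.0, -1.0, 0.0, 1.0, 5.0])
sig = 1.0 / (1.0 + numpy.exp(-a))
assert numpy.allclose(numpy.log(sig), -numpy.log1p(numpy.exp(-a)))
assert numpy.allclose(numpy.log(1.0 - sig), -a - numpy.log1p(numpy.exp(-a)))
# ---------------------------------------------------------------------------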
+ #eps = 0.00000001 + #eps_1 = 1-eps + #self.L = - T.sum( self.x * T.log(eps + eps_1*self.z) \ + # + (1-self.x)*T.log(eps + eps_1*(1-self.z)), axis=1 ) + # note : L is now a vector, where each element is the cross-entropy cost + # of the reconstruction of the corresponding example of the + # minibatch. We need to compute the average of all these to get + # the cost of the minibatch + self.cost = T.mean(self.L) + + self.params = [ self.W, self.b, self.b_prime ] + + +class SdA(object): + def __init__(self, batch_size, n_ins, + hidden_layers_sizes, n_outs, + corruption_levels, rng, pretrain_lr, finetune_lr): + # Just to make sure those are not modified somewhere else afterwards + hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes) + corruption_levels = copy.deepcopy(corruption_levels) + + update_locals(self, locals()) + + self.layers = [] + self.pretrain_functions = [] + self.params = [] + # MODIF: added this so we also get the b_primes + # (not used for finetuning... still using ".params") + self.all_params = [] + self.n_layers = len(hidden_layers_sizes) + + print "Creating SdA with params:" + print "batch_size", batch_size + print "hidden_layers_sizes", hidden_layers_sizes + print "corruption_levels", corruption_levels + print "n_ins", n_ins + print "n_outs", n_outs + print "pretrain_lr", pretrain_lr + print "finetune_lr", finetune_lr + print "----" + + if len(hidden_layers_sizes) < 1 : + raiseException (' You must have at least one hidden layer ') + + + # allocate symbolic variables for the data + #index = T.lscalar() # index to a [mini]batch + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + + for i in xrange( self.n_layers ): + # construct the sigmoidal layer + + # the size of the input is either the number of hidden units of + # the layer below or the input size if we are on the first layer + if i == 0 : + input_size = n_ins + else: + input_size = hidden_layers_sizes[i-1] + + # the input to this layer is either the activation of the hidden + # layer below or the input of the SdA if you are on the first + # layer + if i == 0 : + layer_input = self.x + else: + layer_input = self.layers[-1].output + + layer = SigmoidalLayer(rng, layer_input, input_size, + hidden_layers_sizes[i] ) + # add the layer to the + self.layers += [layer] + self.params += layer.params + + # Construct a denoising autoencoder that shared weights with this + # layer + dA_layer = dA(input_size, hidden_layers_sizes[i], \ + corruption_level = corruption_levels[0],\ + input = layer_input, \ + shared_W = layer.W, shared_b = layer.b) + + self.all_params += dA_layer.params + + # Construct a function that trains this dA + # compute gradients of layer parameters + gparams = T.grad(dA_layer.cost, dA_layer.params) + # compute the list of updates + updates = {} + for param, gparam in zip(dA_layer.params, gparams): + updates[param] = param - gparam * pretrain_lr + + # create a function that trains the dA + update_fn = theano.function([self.x], dA_layer.cost, \ + updates = updates)#, + # givens = { + # self.x : ensemble}) + # collect this function into a list + #update_fn = theano.function([index], dA_layer.cost, \ + # updates = updates, + # givens = { + # self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider}) + # collect this function into a list + self.pretrain_functions += [update_fn] + + + # We now need to add a logistic layer on top of the MLP + self.logLayer = LogisticRegression(\ + input 
= self.layers[-1].output,\ + n_in = hidden_layers_sizes[-1], n_out = n_outs) + + self.params += self.logLayer.params + self.all_params += self.logLayer.params + # construct a function that implements one step of finetunining + + # compute the cost, defined as the negative log likelihood + cost = self.logLayer.negative_log_likelihood(self.y) + # compute the gradients with respect to the model parameters + gparams = T.grad(cost, self.params) + # compute list of updates + updates = {} + for param,gparam in zip(self.params, gparams): + updates[param] = param - gparam*finetune_lr + + self.finetune = theano.function([self.x,self.y], cost, + updates = updates)#, + # givens = { + # self.x : train_set_x[index*batch_size:(index+1)*batch_size]/self.shared_divider, + # self.y : train_set_y[index*batch_size:(index+1)*batch_size]} ) + + # symbolic variable that points to the number of errors made on the + # minibatch given by self.x and self.y + + self.errors = self.logLayer.errors(self.y) + +if __name__ == '__main__': + import sys + args = sys.argv[1:] + diff -r 6f606b359df3 -r a9af079892ce deep/stacked_dae/utils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/utils.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,79 @@ +#!/usr/bin/python +# coding: utf-8 + +from __future__ import with_statement + +from jobman import DD + +# from pylearn codebase +# useful in __init__(param1, param2, etc.) to save +# values in self.param1, self.param2... just call +# update_locals(self, locals()) +def update_locals(obj, dct): + if 'self' in dct: + del dct['self'] + obj.__dict__.update(dct) + +# from a dictionary of possible values for hyperparameters, e.g. +# hp_values = {'learning_rate':[0.1, 0.01], 'num_layers': [1,2]} +# create a list of other dictionaries representing all the possible +# combinations, thus in this example creating: +# [{'learning_rate': 0.1, 'num_layers': 1}, ...] +# (similarly for combinations (0.1, 2), (0.01, 1), (0.01, 2)) +def produit_cartesien_jobs(val_dict): + job_list = [DD()] + all_keys = val_dict.keys() + + for key in all_keys: + possible_values = val_dict[key] + new_job_list = [] + for val in possible_values: + for job in job_list: + to_insert = job.copy() + to_insert.update({key: val}) + new_job_list.append(to_insert) + job_list = new_job_list + + return job_list + +def test_produit_cartesien_jobs(): + vals = {'a': [1,2], 'b': [3,4,5]} + print produit_cartesien_jobs(vals) + + +def jobs_from_reinsert_list(cols, job_vals): + job_list = [] + for vals in job_vals: + job = DD() + for i, col in enumerate(cols): + job[col] = vals[i] + job_list.append(job) + + return job_list + +# taken from http://stackoverflow.com/questions/276052/how-to-get-current-cpu-and-ram-usage-in-python +"""Simple module for getting amount of memory used by a specified user's +processes on a UNIX system. +It uses UNIX ps utility to get the memory usage for a specified username and +pipe it to awk for summing up per application memory usage and return the total. +Python's Popen() from subprocess module is used for spawning ps and awk. 
+ +""" + +import subprocess + +class MemoryMonitor(object): + + def __init__(self, username): + """Create new MemoryMonitor instance.""" + self.username = username + + def usage(self): + """Return int containing memory used by user's processes.""" + self.process = subprocess.Popen("ps -u %s -o rss | awk '{sum+=$1} END {print sum}'" % self.username, + shell=True, + stdout=subprocess.PIPE, + ) + self.stdout_list = self.process.communicate()[0].split('\n') + return int(self.stdout_list[0]) + diff -r 6f606b359df3 -r a9af079892ce deep/stacked_dae/v_sylvain/__init__.py diff -r 6f606b359df3 -r a9af079892ce deep/stacked_dae/v_sylvain/nist_sda.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/v_sylvain/nist_sda.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,245 @@ +#!/usr/bin/python +# coding: utf-8 + +import ift6266 +import pylearn + +import numpy +import theano +import time + +import pylearn.version +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +import copy +import sys +import os +import os.path + +from jobman import DD +import jobman, jobman.sql +from pylearn.io import filetensor + +from utils import produit_cartesien_jobs +from copy import copy + +from sgd_optimization import SdaSgdOptimizer + +#from ift6266.utils.scalar_series import * +from ift6266.utils.seriestables import * +import tables + +from ift6266 import datasets +from config import * + +''' +Function called by jobman upon launching each job +Its path is the one given when inserting jobs: see EXPERIMENT_PATH +''' +def jobman_entrypoint(state, channel): + # record mercurial versions of each package + pylearn.version.record_versions(state,[theano,ift6266,pylearn]) + # TODO: remove this, bad for number of simultaneous requests on DB + channel.save() + + # For test runs, we don't want to use the whole dataset so + # reduce it to fewer elements if asked to. 
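# ---------------------------------------------------------------------------
# [Editor's note] Illustrative aside, not part of the original changeset.
# jobman_insert_nist() at the bottom of this file expands a JOB_VALS grid with
# produit_cartesien_jobs (defined in utils.py earlier in this changeset).  A
# compact itertools equivalent, returning plain dicts instead of jobman DD
# objects, shows what that expansion does; with the JOB_VALS dictionaries
# shown earlier in this changeset (2*2*2*3*1*1*2*2 combinations) it would
# yield 96 jobs.
import itertools
def grid_jobs(val_dict):
    """All combinations of the per-hyperparameter value lists, as plain dicts."""
    keys = sorted(val_dict)
    return [dict(zip(keys, combo))
            for combo in itertools.product(*(val_dict[k] for k in keys))]
# e.g. len(grid_jobs({'pretraining_lr': [0.1, 0.01], 'num_hidden_layers': [2, 3]})) -> 4
# ---------------------------------------------------------------------------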
+ rtt = None + if state.has_key('reduce_train_to'): + rtt = state['reduce_train_to'] + elif REDUCE_TRAIN_TO: + rtt = REDUCE_TRAIN_TO + + n_ins = 32*32 + n_outs = 62 # 10 digits, 26*2 (lower, capitals) + + examples_per_epoch = NIST_ALL_TRAIN_SIZE + + series = create_series(state.num_hidden_layers) + + print "Creating optimizer with state, ", state + + optimizer = SdaSgdOptimizer(dataset=datasets.nist_all(), + hyperparameters=state, \ + n_ins=n_ins, n_outs=n_outs,\ + examples_per_epoch=examples_per_epoch, \ + series=series, + max_minibatches=rtt) + + parameters=[] + #Number of files of P07 used for pretraining + nb_file=0 + if state['pretrain_choice'] == 0: + print('\n\tpretraining with NIST\n') + optimizer.pretrain(datasets.nist_all()) + elif state['pretrain_choice'] == 1: + #To know how many file will be used during pretraining + nb_file = state['pretraining_epochs_per_layer'] + state['pretraining_epochs_per_layer'] = 1 #Only 1 time over the dataset + if nb_file >=100: + sys.exit("The code does not support this much pretraining epoch (99 max with P07).\n"+ + "You have to correct the code (and be patient, P07 is huge !!)\n"+ + "or reduce the number of pretraining epoch to run the code (better idea).\n") + print('\n\tpretraining with P07') + optimizer.pretrain(datasets.nist_P07(min_file=0,max_file=nb_file)) + channel.save() + + #Set some of the parameters used for the finetuning + if state.has_key('finetune_set'): + finetune_choice=state['finetune_set'] + else: + finetune_choice=FINETUNE_SET + + if state.has_key('max_finetuning_epochs'): + max_finetune_epoch_NIST=state['max_finetuning_epochs'] + else: + max_finetune_epoch_NIST=MAX_FINETUNING_EPOCHS + + if state.has_key('max_finetuning_epochs_P07'): + max_finetune_epoch_P07=state['max_finetuning_epochs_P07'] + else: + max_finetune_epoch_P07=max_finetune_epoch_NIST + + #Decide how the finetune is done + + if finetune_choice == 0: + print('\n\n\tfinetune with NIST\n\n') + optimizer.reload_parameters('params_pretrain.txt') + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(min_file=nb_file),max_finetune_epoch_NIST,ind_test=1) + channel.save() + if finetune_choice == 1: + print('\n\n\tfinetune with P07\n\n') + optimizer.reload_parameters('params_pretrain.txt') + optimizer.finetune(datasets.nist_P07(min_file=nb_file),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0) + channel.save() + if finetune_choice == 2: + print('\n\n\tfinetune with NIST followed by P07\n\n') + optimizer.reload_parameters('params_pretrain.txt') + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(min_file=nb_file),max_finetune_epoch_NIST,ind_test=21) + optimizer.finetune(datasets.nist_P07(min_file=nb_file),datasets.nist_all(),max_finetune_epoch_P07,ind_test=20) + channel.save() + if finetune_choice == 3: + print('\n\n\tfinetune with NIST only on the logistic regression on top (but validation on P07).\n\ + All hidden units output are input of the logistic regression\n\n') + optimizer.reload_parameters('params_pretrain.txt') + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(min_file=nb_file),max_finetune_epoch_NIST,ind_test=1,special=1) + + + if finetune_choice==-1: + print('\nSERIE OF 3 DIFFERENT FINETUNINGS') + print('\n\n\tfinetune with NIST\n\n') + optimizer.reload_parameters('params_pretrain.txt') + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(min_file=nb_file),max_finetune_epoch_NIST,ind_test=1) + channel.save() + print('\n\n\tfinetune with P07\n\n') + optimizer.reload_parameters('params_pretrain.txt') + 
optimizer.finetune(datasets.nist_P07(min_file=nb_file),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0) + channel.save() + print('\n\n\tfinetune with NIST (done earlier) followed by P07 (written here)\n\n') + optimizer.reload_parameters('params_finetune_NIST.txt') + optimizer.finetune(datasets.nist_P07(min_file=nb_file),datasets.nist_all(),max_finetune_epoch_P07,ind_test=20) + channel.save() + print('\n\n\tfinetune with NIST only on the logistic regression on top.\n\ + All hidden units output are input of the logistic regression\n\n') + optimizer.reload_parameters('params_pretrain.txt') + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(min_file=nb_file),max_finetune_epoch_NIST,ind_test=1,special=1) + channel.save() + + channel.save() + + return channel.COMPLETE + +# These Series objects are used to save various statistics +# during the training. +def create_series(num_hidden_layers): + + # Replace series we don't want to save with DummySeries, e.g. + # series['training_error'] = DummySeries() + + series = {} + + basedir = os.getcwd() + + h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w") + + # reconstruction + reconstruction_base = \ + ErrorSeries(error_name="reconstruction_error", + table_name="reconstruction_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Reconstruction error (mean over "+str(REDUCE_EVERY)+" minibatches)") + series['reconstruction_error'] = \ + AccumulatorSeriesWrapper(base_series=reconstruction_base, + reduce_every=REDUCE_EVERY) + + # train + training_base = \ + ErrorSeries(error_name="training_error", + table_name="training_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Training error (mean over "+str(REDUCE_EVERY)+" minibatches)") + series['training_error'] = \ + AccumulatorSeriesWrapper(base_series=training_base, + reduce_every=REDUCE_EVERY) + + # valid and test are not accumulated/mean, saved directly + series['validation_error'] = \ + ErrorSeries(error_name="validation_error", + table_name="validation_error", + hdf5_file=h5f, + index_names=('epoch','minibatch')) + + series['test_error'] = \ + ErrorSeries(error_name="test_error", + table_name="test_error", + hdf5_file=h5f, + index_names=('epoch','minibatch')) + + param_names = [] + for i in range(num_hidden_layers): + param_names += ['layer%d_W'%i, 'layer%d_b'%i, 'layer%d_bprime'%i] + param_names += ['logreg_layer_W', 'logreg_layer_b'] + + # comment out series we don't want to save + series['params'] = SharedParamsStatisticsWrapper( + new_group_name="params", + base_group="/", + arrays_names=param_names, + hdf5_file=h5f, + index_names=('epoch',)) + + return series + +# Perform insertion into the Postgre DB based on combination +# of hyperparameter values above +# (see comment for produit_cartesien_jobs() to know how it works) +def jobman_insert_nist(): + jobs = produit_cartesien_jobs(JOB_VALS) + + db = jobman.sql.db(JOBDB) + for job in jobs: + job.update({jobman.sql.EXPERIMENT: EXPERIMENT_PATH}) + jobman.sql.insert_dict(job, db) + + print "inserted" + +if __name__ == '__main__': + + args = sys.argv[1:] + + #if len(args) > 0 and args[0] == 'load_nist': + # test_load_nist() + + if len(args) > 0 and args[0] == 'jobman_insert': + jobman_insert_nist() + + elif len(args) > 0 and args[0] == 'test_jobman_entrypoint': + chanmock = DD({'COMPLETE':0,'save':(lambda:None)}) + jobman_entrypoint(DD(DEFAULT_HP_NIST), chanmock) + + else: + print "Bad arguments" + diff -r 6f606b359df3 -r a9af079892ce deep/stacked_dae/v_sylvain/nist_sda_retrieve.py --- /dev/null 
Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/v_sylvain/nist_sda_retrieve.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,251 @@ +#!/usr/bin/python +# coding: utf-8 + +import ift6266 +import pylearn + +import numpy +import theano +import time + +import pylearn.version +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams + +import copy +import sys +import os +import os.path + +from jobman import DD +import jobman, jobman.sql +from pylearn.io import filetensor + +from utils import produit_cartesien_jobs +from copy import copy + +from sgd_optimization import SdaSgdOptimizer + +#from ift6266.utils.scalar_series import * +from ift6266.utils.seriestables import * +import tables + +from ift6266 import datasets +from config2 import * + +''' +Function called by jobman upon launching each job +Its path is the one given when inserting jobs: see EXPERIMENT_PATH +''' +def jobman_entrypoint(state, channel): + # record mercurial versions of each package + pylearn.version.record_versions(state,[theano,ift6266,pylearn]) + # TODO: remove this, bad for number of simultaneous requests on DB + channel.save() + + # For test runs, we don't want to use the whole dataset so + # reduce it to fewer elements if asked to. + rtt = None + if state.has_key('reduce_train_to'): + rtt = state['reduce_train_to'] + elif REDUCE_TRAIN_TO: + rtt = REDUCE_TRAIN_TO + + n_ins = 32*32 + n_outs = 62 # 10 digits, 26*2 (lower, capitals) + + examples_per_epoch = NIST_ALL_TRAIN_SIZE + + series = create_series(state.num_hidden_layers) + + print "Creating optimizer with state, ", state + + optimizer = SdaSgdOptimizer(dataset=datasets.nist_all(), + hyperparameters=state, \ + n_ins=n_ins, n_outs=n_outs,\ + examples_per_epoch=examples_per_epoch, \ + series=series, + max_minibatches=rtt) + + parameters=[] + #Number of files of P07 used for pretraining + nb_file=0 +## if state['pretrain_choice'] == 0: +## print('\n\tpretraining with NIST\n') +## optimizer.pretrain(datasets.nist_all()) +## elif state['pretrain_choice'] == 1: +## #To know how many file will be used during pretraining +## nb_file = state['pretraining_epochs_per_layer'] +## state['pretraining_epochs_per_layer'] = 1 #Only 1 time over the dataset +## if nb_file >=100: +## sys.exit("The code does not support this much pretraining epoch (99 max with P07).\n"+ +## "You have to correct the code (and be patient, P07 is huge !!)\n"+ +## "or reduce the number of pretraining epoch to run the code (better idea).\n") +## print('\n\tpretraining with P07') +## optimizer.pretrain(datasets.nist_P07(min_file=0,max_file=nb_file)) + print ('Retrieve pre-train done earlier') + + sys.stdout.flush() + + #Set some of the parameters used for the finetuning + if state.has_key('finetune_set'): + finetune_choice=state['finetune_set'] + else: + finetune_choice=FINETUNE_SET + + if state.has_key('max_finetuning_epochs'): + max_finetune_epoch_NIST=state['max_finetuning_epochs'] + else: + max_finetune_epoch_NIST=MAX_FINETUNING_EPOCHS + + if state.has_key('max_finetuning_epochs_P07'): + max_finetune_epoch_P07=state['max_finetuning_epochs_P07'] + else: + max_finetune_epoch_P07=max_finetune_epoch_NIST + + #Decide how the finetune is done + + if finetune_choice == 0: + print('\n\n\tfinetune with NIST\n\n') + optimizer.reload_parameters('/u/pannetis/IFT6266/ift6266/deep/stacked_dae/v_sylvain/ift6266h10_db/pannetis_finetuningSDA/1/params_pretrain.txt') + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(min_file=nb_file),max_finetune_epoch_NIST,ind_test=1) + 
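# A minimal `state` sketch for a local test run of jobman_entrypoint() (placeholder
# values, not the DEFAULT_HP_NIST from config2.py; only keys that this entrypoint
# and SdaSgdOptimizer actually read are listed):
from jobman import DD

example_state = DD({
    'reduce_train_to': 1000,            # shrink the dataset for a quick test run
    'num_hidden_layers': 3,
    'hidden_layers_sizes': 800,         # scalar, replicated once per hidden layer
    'corruption_levels': 0.2,           # idem
    'minibatch_size': 100,
    'pretraining_epochs_per_layer': 1,
    'pretraining_lr': 0.01,
    'finetuning_lr': 0.01,
    'max_finetuning_epochs': 2,
    'max_finetuning_epochs_P07': 1,
    'finetune_set': 0,                  # 0: NIST, 1: P07, 2: NIST then P07, 3: log-reg only, -1: the whole series
})
# Same channel stub as in the test_jobman_entrypoint branch of __main__:
#   chanmock = DD({'COMPLETE': 0, 'save': (lambda: None)})
#   jobman_entrypoint(example_state, chanmock)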
channel.save() + if finetune_choice == 1: + print('\n\n\tfinetune with P07\n\n') + optimizer.reload_parameters('/u/pannetis/IFT6266/ift6266/deep/stacked_dae/v_sylvain/ift6266h10_db/pannetis_finetuningSDA/1/params_pretrain.txt') + optimizer.finetune(datasets.nist_P07(min_file=nb_file),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0) + channel.save() + if finetune_choice == 2: + print('\n\n\tfinetune with NIST followed by P07\n\n') + optimizer.reload_parameters('/u/pannetis/IFT6266/ift6266/deep/stacked_dae/v_sylvain/ift6266h10_db/pannetis_finetuningSDA/1/params_pretrain.txt') + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(min_file=nb_file),max_finetune_epoch_NIST,ind_test=21) + optimizer.finetune(datasets.nist_P07(min_file=nb_file),datasets.nist_all(),max_finetune_epoch_P07,ind_test=20) + channel.save() + if finetune_choice == 3: + print('\n\n\tfinetune with NIST only on the logistic regression on top (but validation on P07).\n\ + All hidden units output are input of the logistic regression\n\n') + optimizer.reload_parameters('/u/pannetis/IFT6266/ift6266/deep/stacked_dae/v_sylvain/ift6266h10_db/pannetis_finetuningSDA/1/params_pretrain.txt') + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(min_file=nb_file),max_finetune_epoch_NIST,ind_test=1,special=1) + + + if finetune_choice==-1: + print('\nSERIE OF 3 DIFFERENT FINETUNINGS') + print('\n\n\tfinetune with NIST\n\n') + sys.stdout.flush() + optimizer.reload_parameters('/u/pannetis/IFT6266/ift6266/deep/stacked_dae/v_sylvain/ift6266h10_db/pannetis_finetuningSDA/1/params_pretrain.txt') + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(min_file=nb_file),max_finetune_epoch_NIST,ind_test=1) + channel.save() + print('\n\n\tfinetune with P07\n\n') + sys.stdout.flush() + optimizer.reload_parameters('/u/pannetis/IFT6266/ift6266/deep/stacked_dae/v_sylvain/ift6266h10_db/pannetis_finetuningSDA/1/params_pretrain.txt') + optimizer.finetune(datasets.nist_P07(min_file=nb_file),datasets.nist_all(),max_finetune_epoch_P07,ind_test=0) + channel.save() + print('\n\n\tfinetune with NIST (done earlier) followed by P07 (written here)\n\n') + sys.stdout.flush() + optimizer.reload_parameters('/u/pannetis/IFT6266/ift6266/deep/stacked_dae/v_sylvain/ift6266h10_db/pannetis_finetuningSDA/1/params_finetune_NIST.txt') + optimizer.finetune(datasets.nist_P07(min_file=nb_file),datasets.nist_all(),max_finetune_epoch_P07,ind_test=20) + channel.save() + print('\n\n\tfinetune with NIST only on the logistic regression on top.\n\ + All hidden units output are input of the logistic regression\n\n') + sys.stdout.flush() + optimizer.reload_parameters('/u/pannetis/IFT6266/ift6266/deep/stacked_dae/v_sylvain/ift6266h10_db/pannetis_finetuningSDA/1/params_pretrain.txt') + optimizer.finetune(datasets.nist_all(),datasets.nist_P07(min_file=nb_file),max_finetune_epoch_NIST,ind_test=1,special=1) + channel.save() + + channel.save() + + return channel.COMPLETE + +# These Series objects are used to save various statistics +# during the training. +def create_series(num_hidden_layers): + + # Replace series we don't want to save with DummySeries, e.g. 
+ # series['training_error'] = DummySeries() + + series = {} + + basedir = os.getcwd() + + h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w") + + # reconstruction + reconstruction_base = \ + ErrorSeries(error_name="reconstruction_error", + table_name="reconstruction_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Reconstruction error (mean over "+str(REDUCE_EVERY)+" minibatches)") + series['reconstruction_error'] = \ + AccumulatorSeriesWrapper(base_series=reconstruction_base, + reduce_every=REDUCE_EVERY) + + # train + training_base = \ + ErrorSeries(error_name="training_error", + table_name="training_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Training error (mean over "+str(REDUCE_EVERY)+" minibatches)") + series['training_error'] = \ + AccumulatorSeriesWrapper(base_series=training_base, + reduce_every=REDUCE_EVERY) + + # valid and test are not accumulated/mean, saved directly + series['validation_error'] = \ + ErrorSeries(error_name="validation_error", + table_name="validation_error", + hdf5_file=h5f, + index_names=('epoch','minibatch')) + + series['test_error'] = \ + ErrorSeries(error_name="test_error", + table_name="test_error", + hdf5_file=h5f, + index_names=('epoch','minibatch')) + + param_names = [] + for i in range(num_hidden_layers): + param_names += ['layer%d_W'%i, 'layer%d_b'%i, 'layer%d_bprime'%i] + param_names += ['logreg_layer_W', 'logreg_layer_b'] + + # comment out series we don't want to save + series['params'] = SharedParamsStatisticsWrapper( + new_group_name="params", + base_group="/", + arrays_names=param_names, + hdf5_file=h5f, + index_names=('epoch',)) + + return series + +# Perform insertion into the Postgre DB based on combination +# of hyperparameter values above +# (see comment for produit_cartesien_jobs() to know how it works) +def jobman_insert_nist(): + jobs = produit_cartesien_jobs(JOB_VALS) + + db = jobman.sql.db(JOBDB) + for job in jobs: + job.update({jobman.sql.EXPERIMENT: EXPERIMENT_PATH}) + jobman.sql.insert_dict(job, db) + + print "inserted" + +if __name__ == '__main__': + + args = sys.argv[1:] + + #if len(args) > 0 and args[0] == 'load_nist': + # test_load_nist() + + if len(args) > 0 and args[0] == 'jobman_insert': + jobman_insert_nist() + + elif len(args) > 0 and args[0] == 'test_jobman_entrypoint': + chanmock = DD({'COMPLETE':0,'save':(lambda:None)}) + jobman_entrypoint(DD(DEFAULT_HP_NIST), chanmock) + + else: + print "Bad arguments" + diff -r 6f606b359df3 -r a9af079892ce deep/stacked_dae/v_sylvain/sgd_optimization.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/v_sylvain/sgd_optimization.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,342 @@ +#!/usr/bin/python +# coding: utf-8 + +# Generic SdA optimization loop, adapted from the deeplearning.net tutorial + +import numpy +import theano +import time +import datetime +import theano.tensor as T +import sys +import pickle + +from jobman import DD +import jobman, jobman.sql +from copy import copy + +from stacked_dae import SdA + +from ift6266.utils.seriestables import * + +default_series = { \ + 'reconstruction_error' : DummySeries(), + 'training_error' : DummySeries(), + 'validation_error' : DummySeries(), + 'test_error' : DummySeries(), + 'params' : DummySeries() + } + +def itermax(iter, max): + for i,it in enumerate(iter): + if i >= max: + break + yield it + +class SdaSgdOptimizer: + def __init__(self, dataset, hyperparameters, n_ins, n_outs, + examples_per_epoch, series=default_series, max_minibatches=None): + self.dataset 
= dataset + self.hp = hyperparameters + self.n_ins = n_ins + self.n_outs = n_outs + self.parameters_pre=[] + + self.max_minibatches = max_minibatches + print "SdaSgdOptimizer, max_minibatches =", max_minibatches + + self.ex_per_epoch = examples_per_epoch + self.mb_per_epoch = examples_per_epoch / self.hp.minibatch_size + + self.series = series + + self.rng = numpy.random.RandomState(1234) + + self.init_classifier() + + sys.stdout.flush() + + def init_classifier(self): + print "Constructing classifier" + + # we don't want to save arrays in DD objects, so + # we recreate those arrays here + nhl = self.hp.num_hidden_layers + layers_sizes = [self.hp.hidden_layers_sizes] * nhl + corruption_levels = [self.hp.corruption_levels] * nhl + + # construct the stacked denoising autoencoder class + self.classifier = SdA( \ + batch_size = self.hp.minibatch_size, \ + n_ins= self.n_ins, \ + hidden_layers_sizes = layers_sizes, \ + n_outs = self.n_outs, \ + corruption_levels = corruption_levels,\ + rng = self.rng,\ + pretrain_lr = self.hp.pretraining_lr, \ + finetune_lr = self.hp.finetuning_lr) + + #theano.printing.pydotprint(self.classifier.pretrain_functions[0], "function.graph") + + sys.stdout.flush() + + def train(self): + self.pretrain(self.dataset) + self.finetune(self.dataset) + + def pretrain(self,dataset): + print "STARTING PRETRAINING, time = ", datetime.datetime.now() + sys.stdout.flush() + + un_fichier=int(819200.0/self.hp.minibatch_size) #Number of batches in a P07 batch + + start_time = time.clock() + ## Pre-train layer-wise + for i in xrange(self.classifier.n_layers): + # go through pretraining epochs + for epoch in xrange(self.hp.pretraining_epochs_per_layer): + # go through the training set + batch_index=0 + count=0 + num_files=0 + for x,y in dataset.train(self.hp.minibatch_size): + c = self.classifier.pretrain_functions[i](x) + count +=1 + + self.series["reconstruction_error"].append((epoch, batch_index), c) + batch_index+=1 + + #if batch_index % 100 == 0: + # print "100 batches" + + # useful when doing tests + if self.max_minibatches and batch_index >= self.max_minibatches: + break + + #When we pass through the data only once (the case with P07) + #There is approximately 800*1024=819200 examples per file (1k per example and files are 800M) + if self.hp.pretraining_epochs_per_layer == 1 and count%un_fichier == 0: + print 'Pre-training layer %i, epoch %d, cost '%(i,num_files),c + num_files+=1 + sys.stdout.flush() + self.series['params'].append((num_files,), self.classifier.all_params) + + #When NIST is used + if self.hp.pretraining_epochs_per_layer > 1: + print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),c + sys.stdout.flush() + + self.series['params'].append((epoch,), self.classifier.all_params) + + end_time = time.clock() + + print ('Pretraining took %f minutes' %((end_time-start_time)/60.)) + self.hp.update({'pretraining_time': end_time-start_time}) + + sys.stdout.flush() + + #To be able to load them later for tests on finetune + self.parameters_pre=[copy(x.value) for x in self.classifier.params] + f = open('params_pretrain.txt', 'w') + pickle.dump(self.parameters_pre,f) + f.close() + + + def finetune(self,dataset,dataset_test,num_finetune,ind_test,special=0): + + if special != 0 and special != 1: + sys.exit('Bad value for variable special. 
Must be in {0,1}') + print "STARTING FINETUNING, time = ", datetime.datetime.now() + + minibatch_size = self.hp.minibatch_size + if ind_test == 0 or ind_test == 20: + nom_test = "NIST" + nom_train="P07" + else: + nom_test = "P07" + nom_train = "NIST" + + + # create a function to compute the mistakes that are made by the model + # on the validation set, or testing set + test_model = \ + theano.function( + [self.classifier.x,self.classifier.y], self.classifier.errors) + # givens = { + # self.classifier.x: ensemble_x, + # self.classifier.y: ensemble_y]}) + + validate_model = \ + theano.function( + [self.classifier.x,self.classifier.y], self.classifier.errors) + # givens = { + # self.classifier.x: , + # self.classifier.y: ]}) + + + # early-stopping parameters + patience = 10000 # look as this many examples regardless + patience_increase = 2. # wait this much longer when a new best is + # found + improvement_threshold = 0.995 # a relative improvement of this much is + # considered significant + validation_frequency = min(self.mb_per_epoch, patience/2) + # go through this many + # minibatche before checking the network + # on the validation set; in this case we + # check every epoch + if self.max_minibatches and validation_frequency > self.max_minibatches: + validation_frequency = self.max_minibatches / 2 + + best_params = None + best_validation_loss = float('inf') + test_score = 0. + start_time = time.clock() + + done_looping = False + epoch = 0 + + total_mb_index = 0 + minibatch_index = -1 + + while (epoch < num_finetune) and (not done_looping): + epoch = epoch + 1 + + for x,y in dataset.train(minibatch_size): + minibatch_index += 1 + if special == 0: + cost_ij = self.classifier.finetune(x,y) + elif special == 1: + cost_ij = self.classifier.finetune2(x,y) + total_mb_index += 1 + + self.series["training_error"].append((epoch, minibatch_index), cost_ij) + + if (total_mb_index+1) % validation_frequency == 0: + #minibatch_index += 1 + #The validation set is always NIST + if ind_test == 0: + iter=dataset_test.valid(minibatch_size) + else: + iter = dataset.valid(minibatch_size) + if self.max_minibatches: + iter = itermax(iter, self.max_minibatches) + validation_losses = [validate_model(x,y) for x,y in iter] + this_validation_loss = numpy.mean(validation_losses) + + self.series["validation_error"].\ + append((epoch, minibatch_index), this_validation_loss*100.) + + print('epoch %i, minibatch %i, validation error on %s : %f %%' % \ + (epoch, minibatch_index+1,nom_test, \ + this_validation_loss*100.)) + + + # if we got the best validation score until now + if this_validation_loss < best_validation_loss: + + #improve patience if loss improvement is good enough + if this_validation_loss < best_validation_loss * \ + improvement_threshold : + patience = max(patience, total_mb_index * patience_increase) + + # save best validation score and iteration number + best_validation_loss = this_validation_loss + best_iter = total_mb_index + + # test it on the test set + iter = dataset.test(minibatch_size) + if self.max_minibatches: + iter = itermax(iter, self.max_minibatches) + test_losses = [test_model(x,y) for x,y in iter] + test_score = numpy.mean(test_losses) + + #test it on the second test set + iter2 = dataset_test.test(minibatch_size) + if self.max_minibatches: + iter2 = itermax(iter2, self.max_minibatches) + test_losses2 = [test_model(x,y) for x,y in iter2] + test_score2 = numpy.mean(test_losses2) + + self.series["test_error"].\ + append((epoch, minibatch_index), test_score*100.) 
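# The patience-based early-stopping rule used in this loop, isolated as a plain
# function (a sketch of the same logic, fed one validation loss per validation
# step; the loop above applies it with total_mb_index as the step counter):
def early_stopping_steps(validation_losses, patience=10000, patience_increase=2.,
                         improvement_threshold=0.995, validation_frequency=1000):
    """Return the number of minibatch steps run before patience is exhausted."""
    best = float('inf')
    step = 0
    for loss in validation_losses:
        step += validation_frequency            # one validation every so many minibatches
        if loss < best * improvement_threshold:
            # significant improvement: allow at least twice as many steps as run so far
            patience = max(patience, step * patience_increase)
        if loss < best:
            best = loss
        if patience <= step:
            break
    return step
# Training stops once the step counter overtakes `patience`, which only grows
# when a significantly better validation loss is found.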
+ + print((' epoch %i, minibatch %i, test error on dataset %s (train data) of best ' + 'model %f %%') % + (epoch, minibatch_index+1,nom_train, + test_score*100.)) + + print((' epoch %i, minibatch %i, test error on dataset %s of best ' + 'model %f %%') % + (epoch, minibatch_index+1,nom_test, + test_score2*100.)) + + if patience <= total_mb_index: + done_looping = True + break + + sys.stdout.flush() + + # useful when doing tests + if self.max_minibatches and minibatch_index >= self.max_minibatches: + break + + self.series['params'].append((epoch,), self.classifier.all_params) + + if done_looping == True: #To exit completly the fine-tuning + break + + end_time = time.clock() + self.hp.update({'finetuning_time':end_time-start_time,\ + 'best_validation_error':best_validation_loss,\ + 'test_score':test_score, + 'num_finetuning_epochs':epoch}) + + print(('\nOptimization complete with best validation score of %f %%,' + 'with test performance %f %% on dataset %s ') % + (best_validation_loss * 100., test_score*100.,nom_train)) + print(('The test score on the %s dataset is %f')%(nom_test,test_score2*100.)) + + print ('The finetuning ran for %f minutes' % ((end_time-start_time)/60.)) + + #Save a copy of the parameters in a file to be able to get them in the future + + if special == 1: #To keep a track of the value of the parameters + parameters_finetune=[copy(x.value) for x in self.classifier.params] + f = open('params_finetune_stanford.txt', 'w') + pickle.dump(parameters_finetune,f) + f.close() + + elif ind_test== 0: #To keep a track of the value of the parameters + parameters_finetune=[copy(x.value) for x in self.classifier.params] + f = open('params_finetune_P07.txt', 'w') + pickle.dump(parameters_finetune,f) + f.close() + + elif ind_test== 1: #For the run with 2 finetunes. It will be faster. 
+ parameters_finetune=[copy(x.value) for x in self.classifier.params] + f = open('params_finetune_NIST.txt', 'w') + pickle.dump(parameters_finetune,f) + f.close() + + elif ind_test== 20: #To keep a track of the value of the parameters + parameters_finetune=[copy(x.value) for x in self.classifier.params] + f = open('params_finetune_NIST_then_P07.txt', 'w') + pickle.dump(parameters_finetune,f) + f.close() + + + #Set parameters like they where right after pre-train + def reload_parameters(self,which): + + #self.parameters_pre=pickle.load('params_pretrain.txt') + f = open(which) + self.parameters_pre=pickle.load(f) + f.close() + for idx,x in enumerate(self.parameters_pre): + self.classifier.params[idx].value=copy(x) + + + + + diff -r 6f606b359df3 -r a9af079892ce deep/stacked_dae/v_sylvain/stacked_dae.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/v_sylvain/stacked_dae.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,328 @@ +#!/usr/bin/python +# coding: utf-8 + +import numpy +import theano +import time +import theano.tensor as T +from theano.tensor.shared_randomstreams import RandomStreams +import copy + +from utils import update_locals + +# taken from LeDeepNet/daa.py +# has a special case when taking log(0) (defined =0) +# modified to not take the mean anymore +from theano.tensor.xlogx import xlogx, xlogy0 +# it's target*log(output) +def binary_cross_entropy(target, output, sum_axis=1): + XE = xlogy0(target, output) + xlogy0((1 - target), (1 - output)) + return -T.sum(XE, axis=sum_axis) + +class LogisticRegression(object): + def __init__(self, input, n_in, n_out): + # initialize with 0 the weights W as a matrix of shape (n_in, n_out) + self.W = theano.shared( value=numpy.zeros((n_in,n_out), + dtype = theano.config.floatX) ) + # initialize the baises b as a vector of n_out 0s + self.b = theano.shared( value=numpy.zeros((n_out,), + dtype = theano.config.floatX) ) + # compute vector of class-membership probabilities in symbolic form + self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+self.b) + + # compute prediction as class whose probability is maximal in + # symbolic form + self.y_pred=T.argmax(self.p_y_given_x, axis=1) + + # list of parameters for this layer + self.params = [self.W, self.b] + + + def negative_log_likelihood(self, y): + return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) + + def errors(self, y): + # check if y has same dimension of y_pred + if y.ndim != self.y_pred.ndim: + raise TypeError('y should have the same shape as self.y_pred', + ('y', target.type, 'y_pred', self.y_pred.type)) + + # check if y is of the correct datatype + if y.dtype.startswith('int'): + # the T.neq operator returns a vector of 0s and 1s, where 1 + # represents a mistake in prediction + return T.mean(T.neq(self.y_pred, y)) + else: + raise NotImplementedError() + + +class SigmoidalLayer(object): + def __init__(self, rng, input, n_in, n_out): + self.input = input + + W_values = numpy.asarray( rng.uniform( \ + low = -numpy.sqrt(6./(n_in+n_out)), \ + high = numpy.sqrt(6./(n_in+n_out)), \ + size = (n_in, n_out)), dtype = theano.config.floatX) + self.W = theano.shared(value = W_values) + + b_values = numpy.zeros((n_out,), dtype= theano.config.floatX) + self.b = theano.shared(value= b_values) + + self.output = T.nnet.sigmoid(T.dot(input, self.W) + self.b) + self.params = [self.W, self.b] + + + +class dA(object): + def __init__(self, n_visible= 784, n_hidden= 500, corruption_level = 0.1,\ + input = None, shared_W = None, shared_b = None): + self.n_visible = n_visible + 
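# What the indexing in LogisticRegression.negative_log_likelihood() above computes,
# written out with plain numpy (a sketch; the class itself builds the symbolic
# Theano expression):
import numpy

p_y_given_x = numpy.array([[0.7, 0.2, 0.1],   # softmax output, one row per example
                           [0.1, 0.1, 0.8]])
y = numpy.array([0, 2])                       # correct class index of each example
# log(p)[arange(n), y] selects log p(y_i | x_i) on every row i:
log_p_correct = numpy.log(p_y_given_x)[numpy.arange(y.shape[0]), y]
nll = -numpy.mean(log_p_correct)              # == -(log 0.7 + log 0.8) / 2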
self.n_hidden = n_hidden + + # create a Theano random generator that gives symbolic random values + theano_rng = RandomStreams() + + if shared_W != None and shared_b != None : + self.W = shared_W + self.b = shared_b + else: + # initial values for weights and biases + # note : W' was written as `W_prime` and b' as `b_prime` + + # W is initialized with `initial_W` which is uniformely sampled + # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible) + # the output of uniform if converted using asarray to dtype + # theano.config.floatX so that the code is runable on GPU + initial_W = numpy.asarray( numpy.random.uniform( \ + low = -numpy.sqrt(6./(n_hidden+n_visible)), \ + high = numpy.sqrt(6./(n_hidden+n_visible)), \ + size = (n_visible, n_hidden)), dtype = theano.config.floatX) + initial_b = numpy.zeros(n_hidden, dtype = theano.config.floatX) + + + # theano shared variables for weights and biases + self.W = theano.shared(value = initial_W, name = "W") + self.b = theano.shared(value = initial_b, name = "b") + + + initial_b_prime= numpy.zeros(n_visible) + # tied weights, therefore W_prime is W transpose + self.W_prime = self.W.T + self.b_prime = theano.shared(value = initial_b_prime, name = "b'") + + # if no input is given, generate a variable representing the input + if input == None : + # we use a matrix because we expect a minibatch of several examples, + # each example being a row + self.x = T.dmatrix(name = 'input') + else: + self.x = input + # Equation (1) + # keep 90% of the inputs the same and zero-out randomly selected subset of 10% of the inputs + # note : first argument of theano.rng.binomial is the shape(size) of + # random numbers that it should produce + # second argument is the number of trials + # third argument is the probability of success of any trial + # + # this will produce an array of 0s and 1s where 1 has a + # probability of 1 - ``corruption_level`` and 0 with + # ``corruption_level`` + self.tilde_x = theano_rng.binomial( self.x.shape, 1, 1 - corruption_level, dtype=theano.config.floatX) * self.x + # Equation (2) + # note : y is stored as an attribute of the class so that it can be + # used later when stacking dAs. + self.y = T.nnet.sigmoid(T.dot(self.tilde_x, self.W ) + self.b) + # Equation (3) + #self.z = T.nnet.sigmoid(T.dot(self.y, self.W_prime) + self.b_prime) + # Equation (4) + # note : we sum over the size of a datapoint; if we are using minibatches, + # L will be a vector, with one entry per example in minibatch + #self.L = - T.sum( self.x*T.log(self.z) + (1-self.x)*T.log(1-self.z), axis=1 ) + #self.L = binary_cross_entropy(target=self.x, output=self.z, sum_axis=1) + + # bypassing z to avoid running to log(0) + z_a = T.dot(self.y, self.W_prime) + self.b_prime + log_sigmoid = T.log(1.) - T.log(1.+T.exp(-z_a)) + # log(1-sigmoid(z_a)) + log_1_sigmoid = -z_a - T.log(1.+T.exp(-z_a)) + self.L = -T.sum( self.x * (log_sigmoid) \ + + (1.0-self.x) * (log_1_sigmoid), axis=1 ) + + # I added this epsilon to avoid getting log(0) and 1/0 in grad + # This means conceptually that there'd be no probability of 0, but that + # doesn't seem to me as important (maybe I'm wrong?). + #eps = 0.00000001 + #eps_1 = 1-eps + #self.L = - T.sum( self.x * T.log(eps + eps_1*self.z) \ + # + (1-self.x)*T.log(eps + eps_1*(1-self.z)), axis=1 ) + # note : L is now a vector, where each element is the cross-entropy cost + # of the reconstruction of the corresponding example of the + # minibatch. 
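# A small numpy check of the rewriting above: log(sigmoid(z_a)) and
# log(1 - sigmoid(z_a)) are computed directly from the pre-activation z_a, so the
# reconstruction z = sigmoid(z_a), which can round to exactly 0 or 1 in floating
# point, never appears inside a log. Sketch only; the class uses the symbolic
# Theano equivalents of these formulas.
import numpy

z_a = numpy.array([-30., -1., 0., 1., 30.])             # reconstruction pre-activations
log_sigmoid = -numpy.log(1. + numpy.exp(-z_a))          # log(sigmoid(z_a))
log_1_sigmoid = -z_a - numpy.log(1. + numpy.exp(-z_a))  # log(1 - sigmoid(z_a))

x = numpy.array([1., 1., 0., 0., 1.])                   # toy binary targets (pixels)
per_unit_cost = -(x * log_sigmoid + (1. - x) * log_1_sigmoid)
# Summing such terms over the 1024 pixels of one example gives one entry of self.L.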
We need to compute the average of all these to get + # the cost of the minibatch + self.cost = T.mean(self.L) + + self.params = [ self.W, self.b, self.b_prime ] + + +class SdA(object): + def __init__(self, batch_size, n_ins, + hidden_layers_sizes, n_outs, + corruption_levels, rng, pretrain_lr, finetune_lr): + # Just to make sure those are not modified somewhere else afterwards + hidden_layers_sizes = copy.deepcopy(hidden_layers_sizes) + corruption_levels = copy.deepcopy(corruption_levels) + + update_locals(self, locals()) + + self.layers = [] + self.pretrain_functions = [] + self.params = [] + # MODIF: added this so we also get the b_primes + # (not used for finetuning... still using ".params") + self.all_params = [] + self.n_layers = len(hidden_layers_sizes) + self.logistic_params = [] + + print "Creating SdA with params:" + print "batch_size", batch_size + print "hidden_layers_sizes", hidden_layers_sizes + print "corruption_levels", corruption_levels + print "n_ins", n_ins + print "n_outs", n_outs + print "pretrain_lr", pretrain_lr + print "finetune_lr", finetune_lr + print "----" + + if len(hidden_layers_sizes) < 1 : + raiseException (' You must have at least one hidden layer ') + + + # allocate symbolic variables for the data + #index = T.lscalar() # index to a [mini]batch + self.x = T.matrix('x') # the data is presented as rasterized images + self.y = T.ivector('y') # the labels are presented as 1D vector of + # [int] labels + + for i in xrange( self.n_layers ): + # construct the sigmoidal layer + + # the size of the input is either the number of hidden units of + # the layer below or the input size if we are on the first layer + if i == 0 : + input_size = n_ins + else: + input_size = hidden_layers_sizes[i-1] + + # the input to this layer is either the activation of the hidden + # layer below or the input of the SdA if you are on the first + # layer + if i == 0 : + layer_input = self.x + else: + layer_input = self.layers[-1].output + + layer = SigmoidalLayer(rng, layer_input, input_size, + hidden_layers_sizes[i] ) + # add the layer to the + self.layers += [layer] + self.params += layer.params + + # Construct a denoising autoencoder that shared weights with this + # layer + dA_layer = dA(input_size, hidden_layers_sizes[i], \ + corruption_level = corruption_levels[0],\ + input = layer_input, \ + shared_W = layer.W, shared_b = layer.b) + + self.all_params += dA_layer.params + + # Construct a function that trains this dA + # compute gradients of layer parameters + gparams = T.grad(dA_layer.cost, dA_layer.params) + # compute the list of updates + updates = {} + for param, gparam in zip(dA_layer.params, gparams): + updates[param] = param - gparam * pretrain_lr + + # create a function that trains the dA + update_fn = theano.function([self.x], dA_layer.cost, \ + updates = updates)#, + # givens = { + # self.x : ensemble}) + # collect this function into a list + #update_fn = theano.function([index], dA_layer.cost, \ + # updates = updates, + # givens = { + # self.x : train_set_x[index*batch_size:(index+1)*batch_size] / self.shared_divider}) + # collect this function into a list + self.pretrain_functions += [update_fn] + + + # We now need to add a logistic layer on top of the SDA + self.logLayer = LogisticRegression(\ + input = self.layers[-1].output,\ + n_in = hidden_layers_sizes[-1], n_out = n_outs) + + self.params += self.logLayer.params + self.all_params += self.logLayer.params + # construct a function that implements one step of finetunining + + # compute the cost, defined as the negative 
log likelihood + cost = self.logLayer.negative_log_likelihood(self.y) + # compute the gradients with respect to the model parameters + gparams = T.grad(cost, self.params) + # compute list of updates + updates = {} + for param,gparam in zip(self.params, gparams): + updates[param] = param - gparam*finetune_lr + + self.finetune = theano.function([self.x,self.y], cost, + updates = updates)#, + + # symbolic variable that points to the number of errors made on the + # minibatch given by self.x and self.y + + self.errors = self.logLayer.errors(self.y) + + + #STRUCTURE FOR THE FINETUNING OF THE LOGISTIC REGRESSION ON THE TOP WITH + #ALL HIDDEN LAYERS AS INPUT + + all_h=[] + for i in xrange(self.n_layers): + all_h.append(self.layers[i].output) + self.all_hidden=T.concatenate(all_h,axis=1) + + + self.logLayer2 = LogisticRegression(\ + input = self.all_hidden,\ + n_in = sum(hidden_layers_sizes), n_out = n_outs) + #n_in=hidden_layers_sizes[0],n_out=n_outs) + + #self.logistic_params+= self.logLayer2.params + # construct a function that implements one step of finetunining + + # compute the cost, defined as the negative log likelihood + cost2 = self.logLayer2.negative_log_likelihood(self.y) + # compute the gradients with respect to the model parameters + gparams2 = T.grad(cost2, self.logLayer2.params) + + # compute list of updates + updates2 = {} + for param,gparam in zip(self.logLayer2.params, gparams2): + updates2[param] = param - gparam*finetune_lr + + self.finetune2 = theano.function([self.x,self.y], cost2, + updates = updates2) + + # symbolic variable that points to the number of errors made on the + # minibatch given by self.x and self.y + + self.errors2 = self.logLayer2.errors(self.y) + + +if __name__ == '__main__': + import sys + args = sys.argv[1:] + diff -r 6f606b359df3 -r a9af079892ce deep/stacked_dae/v_sylvain/utils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deep/stacked_dae/v_sylvain/utils.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,69 @@ +#!/usr/bin/python +# coding: utf-8 + +from __future__ import with_statement + +from jobman import DD + +# from pylearn codebase +# useful in __init__(param1, param2, etc.) to save +# values in self.param1, self.param2... just call +# update_locals(self, locals()) +def update_locals(obj, dct): + if 'self' in dct: + del dct['self'] + obj.__dict__.update(dct) + +# from a dictionary of possible values for hyperparameters, e.g. +# hp_values = {'learning_rate':[0.1, 0.01], 'num_layers': [1,2]} +# create a list of other dictionaries representing all the possible +# combinations, thus in this example creating: +# [{'learning_rate': 0.1, 'num_layers': 1}, ...] +# (similarly for combinations (0.1, 2), (0.01, 1), (0.01, 2)) +def produit_cartesien_jobs(val_dict): + job_list = [DD()] + all_keys = val_dict.keys() + + for key in all_keys: + possible_values = val_dict[key] + new_job_list = [] + for val in possible_values: + for job in job_list: + to_insert = job.copy() + to_insert.update({key: val}) + new_job_list.append(to_insert) + job_list = new_job_list + + return job_list + +def test_produit_cartesien_jobs(): + vals = {'a': [1,2], 'b': [3,4,5]} + print produit_cartesien_jobs(vals) + + +# taken from http://stackoverflow.com/questions/276052/how-to-get-current-cpu-and-ram-usage-in-python +"""Simple module for getting amount of memory used by a specified user's +processes on a UNIX system. +It uses UNIX ps utility to get the memory usage for a specified username and +pipe it to awk for summing up per application memory usage and return the total. 
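# Concrete illustration of produit_cartesien_jobs() above, on the same toy grid as
# in its comment (the order of the combinations follows dict iteration order, so it
# is not guaranteed; names prefixed with _example_ exist only for this sketch):
_example_grid = {'learning_rate': [0.1, 0.01], 'num_layers': [1, 2]}
_example_jobs = produit_cartesien_jobs(_example_grid)
assert len(_example_jobs) == 4                                   # 2 x 2 combinations
assert sorted(job['learning_rate'] for job in _example_jobs) == [0.01, 0.01, 0.1, 0.1]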
+Python's Popen() from subprocess module is used for spawning ps and awk. + +""" + +import subprocess + +class MemoryMonitor(object): + + def __init__(self, username): + """Create new MemoryMonitor instance.""" + self.username = username + + def usage(self): + """Return int containing memory used by user's processes.""" + self.process = subprocess.Popen("ps -u %s -o rss | awk '{sum+=$1} END {print sum}'" % self.username, + shell=True, + stdout=subprocess.PIPE, + ) + self.stdout_list = self.process.communicate()[0].split('\n') + return int(self.stdout_list[0]) + diff -r 6f606b359df3 -r a9af079892ce scripts/CalcPropNist.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/CalcPropNist.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,92 @@ +#!/usr/bin/python +# coding: utf-8 + +''' +Script qui calcule la proportion de chiffres, lettres minuscules et lettres majuscules +dans NIST train et NIST test. + +Sylvain Pannetier Lebeuf dans le cadre de IFT6266, hiver 2010 + +''' + +from pylearn.io import filetensor as ft +import matplotlib.pyplot as plt + + +#f1 = open('/home/sylvain/Dropbox/Msc/IFT6266/donnees/all_train_labels.ft') +f1 = open('/data/lisa/data/nist/by_class/all/all_train_labels.ft') +train = ft.read(f1) +#f2 = open('/home/sylvain/Dropbox/Msc/IFT6266/donnees/all_test_labels.ft') +f2 = open('/data/lisa/data/nist/by_class/all/all_test_labels.ft') +test = ft.read(f2) +f1.close() +f2.close() + +#Les 6 variables +train_c=0 +train_min=0 +train_maj=0 + +test_c=0 +test_min=0 +test_maj=0 + +classe=0 #variable utilisee pour voir la classe presentement regardee +#Calcul pour le train_set +for i in xrange(len(train)): + classe=train[i] + if classe < 10: + train_c += 1 + elif classe < 36: + train_maj += 1 + elif classe < 62: + train_min += 1 + +for j in xrange(len(test)): + classe=test[j] + if classe < 10: + test_c += 1 + elif classe < 36: + test_maj += 1 + elif classe < 62: + test_min += 1 +print "Train set:",len(train),"\nchiffres:",float(train_c)/len(train),"\tmajuscules:",\ +float(train_maj)/len(train),"\tminuscules:",float(train_min)/len(train),\ +"\nchiffres:", float(train_c)/len(train),"\tlettres:",float(train_maj+train_min)/len(train) + +print "\nTest set:",len(test),"\nchiffres:",float(test_c)/len(test),"\tmajuscules:",\ +float(test_maj)/len(test),"\tminuscules:",float(test_min)/len(test),\ +"\nchiffres:", float(test_c)/len(test),"\tlettres:",float(test_maj+test_min)/len(test) + + +if test_maj+test_min+test_c != len(test): + print "probleme avec le test, des donnees ne sont pas etiquetees" + +if train_maj+train_min+train_c != len(train): + print "probleme avec le train, des donnees ne sont pas etiquetees" + + +#train set +plt.subplot(211) +plt.hist(train,bins=62) +plt.axis([0, 62,0,40000]) +plt.axvline(x=10, ymin=0, ymax=40000,linewidth=2, color='r') +plt.axvline(x=36, ymin=0, ymax=40000,linewidth=2, color='r') +plt.text(3,36000,'chiffres') +plt.text(18,36000,'majuscules') +plt.text(40,36000,'minuscules') +plt.title('Train set') + +#test set +plt.subplot(212) +plt.hist(test,bins=62) +plt.axis([0, 62,0,7000]) +plt.axvline(x=10, ymin=0, ymax=7000,linewidth=2, color='r') +plt.axvline(x=36, ymin=0, ymax=7000,linewidth=2, color='r') +plt.text(3,6400,'chiffres') +plt.text(18,6400,'majuscules') +plt.text(45,6400,'minuscules') +plt.title('Test set') + +#afficher +plt.show() \ No newline at end of file diff -r 6f606b359df3 -r a9af079892ce scripts/__init__.py diff -r 6f606b359df3 -r a9af079892ce scripts/creer_jeu_occlusion.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/scripts/creer_jeu_occlusion.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,42 @@ +#!/usr/bin/python +# coding: utf-8 + +''' +Sert a creer un petit jeu de donnees afin de pouvoir avoir des fragments +de lettres pour ajouter bruit d'occlusion + +Sylvain Pannetier Lebeuf dans le cadre de IFT6266, hiver 2010 + +''' + +from pylearn.io import filetensor as ft +import pylab +import random as r +from numpy import * + +nombre = 20000 #La grandeur de l'echantillon + +f = open('all_train_data.ft') #Le jeu de donnees est en local. +d = ft.read(f) +f.close() +print len(d) +random.seed(3525) + +echantillon=r.sample(xrange(len(d)),nombre) +nouveau=d[0:nombre] +for i in xrange(nombre): + nouveau[i]=d[echantillon[i]] + + +f2 = open('echantillon_occlusion.ft', 'w') +ft.write(f2,nouveau) +f2.close() + + +##Tester si ca a fonctionne +f3 = open('echantillon_occlusion.ft') + +d2=ft.read(f3) +pylab.imshow(d2[0].reshape((32,32))) +pylab.show() +f3.close() \ No newline at end of file diff -r 6f606b359df3 -r a9af079892ce scripts/deepmlp.py --- a/scripts/deepmlp.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,310 +0,0 @@ -# - -import numpy, cPickle, gzip - - -import theano -import theano.tensor as T - -import time - -import theano.tensor.nnet - -class MLP(object): - """Multi-Layer Perceptron Class - - A multilayer perceptron is a feedforward artificial neural network model - that has one layer or more of hidden units and nonlinear activations. - Intermidiate layers usually have as activation function thanh or the - sigmoid function while the top layer is a softamx layer. - """ - - - - def __init__(self, input, n_in, n_hidden, n_out): - """Initialize the parameters for the multilayer perceptron - - :param input: symbolic variable that describes the input of the - architecture (one minibatch) - - :param n_in: number of input units, the dimension of the space in - which the datapoints lie - - :param n_hidden: List representing the number of units for each - hidden layer - - #:param n_layer: Number of hidden layers - - :param n_out: number of output units, the dimension of the space in - which the labels lie - - """ - - # initialize the parameters theta = (W,b) ; Here W and b are lists - # where W[i] and b[i] represent the parameters and the bias vector - # of the i-th layer. 
- n_layer=len(n_hidden) - W_values=[] - b_values=[] - self.W=[] - self.b=[] - - # We first initialize the matrix W[0] and b[0] that represent the parameters - # from the input to the first hidden layer - W_values.append(numpy.asarray( numpy.random.uniform( \ - low = -numpy.sqrt(6./(n_in+n_hidden[0])), \ - high = numpy.sqrt(6./(n_in+n_hidden[0])), \ - size = (n_in, n_hidden[0])), dtype = theano.config.floatX)) - self.W.append(theano.shared( value = W_values[0] )) - self.b.append(theano.shared( value = numpy.zeros((n_hidden[0],), - dtype= theano.config.floatX))) - - # We initialize the parameters between all consecutive hidden layers - for i in range(1,n_layer): - # Each `W[i]` is initialized with `W_values[i]` which is uniformely sampled - # from -6./sqrt(n_hidden[i]+n_hidden[i+1]) and 6./sqrt(n_hidden[i]+n_hidden[i+1]) - # the output of uniform if converted using asarray to dtype - # theano.config.floatX so that the code is runable on GPU - W_values.append(numpy.asarray( numpy.random.uniform( \ - low = -numpy.sqrt(6./(n_hidden[i-1]+n_hidden[i])), \ - high = numpy.sqrt(6./(n_hidden[i-1]+n_hidden[i])), \ - size = (n_hidden[i-1], n_hidden[i])), dtype = theano.config.floatX)) - self.W.append(theano.shared( value = W_values[i] )) - self.b.append(theano.shared( value = numpy.zeros((n_hidden[i],), - dtype= theano.config.floatX))) - - # We initialize the matrix W[n_layer] and b[n_layer] that represent - # the parameters from the last hidden layer to the output layer using the - # same uniform sampling. - W_values.append(numpy.asarray( numpy.random.uniform( - low = -numpy.sqrt(6./(n_hidden[n_layer-1]+n_out)), \ - high= numpy.sqrt(6./(n_hidden[n_layer-1]+n_out)),\ - size= (n_hidden[n_layer-1], n_out)), dtype = theano.config.floatX)) - self.W.append(theano.shared( value = W_values[n_layer])) - self.b.append(theano.shared( value = numpy.zeros((n_out,), - dtype= theano.config.floatX))) - - # List of the symbolic expressions computing the values each hidden layer - self.hidden = [] - - # Symbolic expression of the first hidden layer - self.hidden.append(T.tanh(T.dot(input, self.W[0])+ self.b[0])) - for i in range(1,n_layer): - # Symbolic expression of the i-th hidden layer - self.hidden.append(T.tanh(T.dot(self.hidden[i-1], self.W[i])+ self.b[i])) - - # symbolic expression computing the values of the top layer - self.p_y_given_x= T.nnet.softmax(T.dot(self.hidden[n_layer-1], self.W[n_layer])+self.b[n_layer]) - - # compute prediction as class whose probability is maximal in - # symbolic form - self.y_pred = T.argmax( self.p_y_given_x, axis =1) - - # L1 norm ; one regularization option is to enforce L1 norm to - # be small - self.L1=abs(self.W[0]).sum() - self.L2_sqr=abs(self.W[0]).sum() - for i in range(1,n_layer+1): - self.L1 += abs(self.W[i]).sum() - # square of L2 norm ; one regularization option is to enforce - # square of L2 norm to be small - for i in range(n_layer+1): - self.L2_sqr += abs(self.W[i]**2).sum() - - def negative_log_likelihood(self, y): - return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]),y]) - - def errors(self, y): - """Return a float representing the number of errors in the minibatch - over the total number of examples of the minibatch - """ - - # check if y has same dimension of y_pred - if y.ndim != self.y_pred.ndim: - raise TypeError('y should have the same shape as self.y_pred', - ('y', target.type, 'y_pred', self.y_pred.type)) - # check if y is of the correct datatype - if y.dtype.startswith('int'): - # the T.neq operator returns a vector of 0s and 1s, where 1 - # 
represents a mistake in prediction - return T.mean(T.neq(self.y_pred, y)) - else: - raise NotImplementedError() -def sgd_optimization_mnist( learning_rate=0.01, L1_reg = 0.00, \ - L2_reg = 0.0001, n_iter=100,n_hidden=[200,100,90,80,70]): - """ - Demonstrate stochastic gradient descent optimization for a multilayer - perceptron - - This is demonstrated on MNIST. - - :param learning_rate: learning rate used (factor for the stochastic - gradient - - :param L1_reg: L1-norm's weight when added to the cost (see - regularization) - - :param L2_reg: L2-norm's weight when added to the cost (see - regularization) - - :param n_iter: maximal number of iterations ot run the optimizer - - """ - - # Load the dataset - f = gzip.open('mnist.pkl.gz','rb') - train_set, valid_set, test_set = cPickle.load(f) - f.close() - - # make minibatches of size 20 - batch_size = 20 # sized of the minibatch - - # Dealing with the training set - # get the list of training images (x) and their labels (y) - (train_set_x, train_set_y) = train_set - - # initialize the list of training minibatches with empty list - train_batches = [] - for i in xrange(0, len(train_set_x), batch_size): - # add to the list of minibatches the minibatch starting at - # position i, ending at position i+batch_size - # a minibatch is a pair ; the first element of the pair is a list - # of datapoints, the second element is the list of corresponding - # labels - train_batches = train_batches + \ - [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])] - - # Dealing with the validation set - (valid_set_x, valid_set_y) = valid_set - # initialize the list of validation minibatches - valid_batches = [] - for i in xrange(0, len(valid_set_x), batch_size): - valid_batches = valid_batches + \ - [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])] - - # Dealing with the testing set - (test_set_x, test_set_y) = test_set - # initialize the list of testing minibatches - test_batches = [] - for i in xrange(0, len(test_set_x), batch_size): - test_batches = test_batches + \ - [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])] - - - ishape = (28,28) # this is the size of MNIST images - - # allocate symbolic variables for the data - x = T.fmatrix() # the data is presented as rasterized images - y = T.lvector() # the labels are presented as 1D vector of - # [long int] labels - - # construct the logistic regression class - classifier = MLP( input=x.reshape((batch_size,28*28)),\ - n_in=28*28, n_hidden=n_hidden, n_out=10) - - # the cost we minimize during training is the negative log likelihood of - # the model plus the regularization terms (L1 and L2); cost is expressed - # here symbolically - cost = classifier.negative_log_likelihood(y) \ - + L1_reg * classifier.L1 \ - + L2_reg * classifier.L2_sqr - - # compiling a theano function that computes the mistakes that are made by - # the model on a minibatch - test_model = theano.function([x,y], classifier.errors(y)) - g_W=[] - g_b=[] - # compute the gradient of cost with respect to theta = (W1, b1, W2, b2) - for i in range(len(n_hidden)+1): - g_W.append(T.grad(cost, classifier.W[i])) - g_b.append(T.grad(cost, classifier.b[i])) - - - # specify how to update the parameters of the model as a dictionary - updates={} - for i in range(len(n_hidden)+1): - updates[classifier.W[i]]= classifier.W[i] - learning_rate*g_W[i] - updates[classifier.b[i]]= classifier.b[i] - learning_rate*g_b[i] - # compiling a theano function `train_model` that returns the cost, but in - # the same time updates the parameter of the model 
based on the rules - # defined in `updates` - train_model = theano.function([x, y], cost, updates = updates ) - n_minibatches = len(train_batches) - - # early-stopping parameters - patience = 10000 # look as this many examples regardless - patience_increase = 2 # wait this much longer when a new best is - # found - improvement_threshold = 0.995 # a relative improvement of this much is - # considered significant - validation_frequency = n_minibatches # go through this many - # minibatche before checking the network - # on the validation set; in this case we - # check every epoch - - - best_params = None - best_validation_loss = float('inf') - best_iter = 0 - test_score = 0. - start_time = time.clock() - # have a maximum of `n_iter` iterations through the entire dataset - for iter in xrange(n_iter* n_minibatches): - - # get epoch and minibatch index - epoch = iter / n_minibatches - minibatch_index = iter % n_minibatches - - # get the minibatches corresponding to `iter` modulo - # `len(train_batches)` - x,y = train_batches[ minibatch_index ] - cost_ij = train_model(x,y) - - if (iter+1) % validation_frequency == 0: - # compute zero-one loss on validation set - this_validation_loss = 0. - for x,y in valid_batches: - # sum up the errors for each minibatch - this_validation_loss += test_model(x,y) - # get the average by dividing with the number of minibatches - this_validation_loss /= len(valid_batches) - - print('epoch %i, minibatch %i/%i, validation error %f %%' % \ - (epoch, minibatch_index+1, n_minibatches, \ - this_validation_loss*100.)) - - - # if we got the best validation score until now - if this_validation_loss < best_validation_loss: - - #improve patience if loss improvement is good enough - if this_validation_loss < best_validation_loss * \ - improvement_threshold : - patience = max(patience, iter * patience_increase) - - # save best validation score and iteration number - best_validation_loss = this_validation_loss - best_iter = iter - - # test it on the test set - test_score = 0. - for x,y in test_batches: - test_score += test_model(x,y) - test_score /= len(test_batches) - print((' epoch %i, minibatch %i/%i, test error of best ' - 'model %f %%') % - (epoch, minibatch_index+1, n_minibatches, - test_score*100.)) - - if patience <= iter : - break - - end_time = time.clock() - print(('Optimization complete. 
Best validation score of %f %% ' - 'obtained at iteration %i, with test performance %f %%') % - (best_validation_loss * 100., best_iter, test_score*100.)) - print ('The code ran for %f minutes' % ((end_time-start_time)/60.)) - #test on NIST (you need pylearn and access to NIST to do that) -if __name__ == '__main__': - sgd_optimization_mnist() - diff -r 6f606b359df3 -r a9af079892ce scripts/fonts_test.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/fonts_test.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,19 @@ +#!/usr/bin/python + +import os +import ImageFont, ImageDraw, Image + +dir1 = "/data/lisa/data/ift6266h10/allfonts/" +#dir1 = "/Tmp/allfonts/" + +img = Image.new("L", (132,132)) +draw = ImageDraw.Draw(img) +L = [chr(ord('0')+x) for x in range(10)] + [chr(ord('A')+x) for x in range(26)] + [chr(ord('a')+x) for x in range(26)] + +for f in os.listdir(dir1): + try: + font = ImageFont.truetype(dir1+f, 25) + for l in L: + draw.text((60,60), l, font=font, fill="white") + except: + print dir1+f diff -r 6f606b359df3 -r a9af079892ce scripts/imgbg_test.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/imgbg_test.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,15 @@ +#!/usr/bin/python + +import Image, cPickle + +f=open('/Tmp/image_net/filelist.pkl') +image_files = cPickle.load(f) +f.close() + +for i in range(len(image_files)): + filename = '/Tmp/image_net/' + image_files[i] + try: + image = Image.open(filename).convert('L') + except: + print filename + diff -r 6f606b359df3 -r a9af079892ce scripts/launch_generate100.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/launch_generate100.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,26 @@ +#!/usr/bin/env python + +import os +dir1 = "/data/lisa/data/ift6266h10/" + +mach = ["maggie16.iro.umontreal.ca,zappa8@iro.umontreal.ca"] + +#test and valid sets +for i,s in enumerate(['valid','test']): + for j,c in enumerate([0.3,0.5,0.7,1]): + l = str(c).replace('.','') + os.system("dbidispatch --condor --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/P%s_%s_data.ft -p %sdata/P%s_%s_params -x %sdata/P%s_%s_labels.ft -f %s%s_data.ft -l %s%s_labels.ft -c %socr_%s_data.ft -d %socr_%s_labels.ft -m %s -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s %d -y %d" % (mach, dir1, l, s, dir1, l, s, dir1, l, s, dir1, s, dir1, s, dir1, s, dir1, s, c ,[20000,80000][i], 200+i*4+j)) + +#P07 +for i in range(100): + os.system("dbidispatch --condor --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/P07_train%d_data.ft -p %sdata/P07_train%d_params -x %sdata/P07_train%d_labels.ft -f %strain_data.ft -l %strain_labels.ft -c %socr_train_data.ft -d %socr_train_labels.ft -m 0.7 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s 819200 -y %d" % (mach, dir1, i, dir1, i, dir1, i, dir1, dir1, dir1, dir1, 100+i)) + +#PNIST07 +for i in range(100): + os.system("dbidispatch --condor --os=fc4,fc7,fc9 --machine=%s ./run_pipeline.sh -o %sdata/PNIST07_train%d_data.ft -p %sdata/PNIST07_train%d_params -x %sdata/PNIST07_train%d_labels.ft -f %strain_data.ft -l %strain_labels.ft -c %socr_train_data.ft -d %socr_train_labels.ft -m 0.7 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s 819200 -y %d -t %d" % (mach, dir1, i, dir1, i, dir1, i, dir1, dir1, dir1, dir1, 100+i,1)) + + + +#P07 +#for i in [90,94]:#[2,10,13,15,20,49,68,82,86,90,94]: + #os.system("dbidispatch --condor --mem=3900 --os=fc4,fc7,fc9 --machine=maggie16.iro.umontreal.ca --machine=maggie15.iro.umontreal.ca --machine=zappa8@iro.umontreal.ca ./run_pipeline.sh -o %sdata2/P07_train%d_data.ft -p %sdata2/P07_train%d_params -x %sdata2/P07_train%d_labels.ft 
-f %strain_data.ft -l %strain_labels.ft -c %socr_train_data.ft -d %socr_train_labels.ft -m 0.7 -z 0.1 -a 0.1 -b 0.25 -g 0.25 -s 819200 -y %d" % (dir1, i, dir1, i, dir1, i, dir1, dir1, dir1, dir1,100+i)) diff -r 6f606b359df3 -r a9af079892ce scripts/nist_divide.py --- a/scripts/nist_divide.py Wed Feb 10 11:15:04 2010 -0500 +++ b/scripts/nist_divide.py Mon Mar 29 17:42:44 2010 -0400 @@ -3,8 +3,8 @@ ''' creation des ensembles train, valid et test NIST pur ensemble test est pris tel quel -ensemble valid est trainorig[:20000] -ensemble train est trainorig[20000:] +ensemble valid est trainorig[:80000] +ensemble train est trainorig[80000:] trainorig est deja shuffled ''' @@ -20,16 +20,16 @@ f = open(dir1 + "/all_train_data.ft") d = ft.read(f) f = open(dir2 + "valid_data.ft", 'wb') -ft.write(f, d[:20000]) +ft.write(f, d[:80000]) f = open(dir2 + "train_data.ft", 'wb') -ft.write(f, d[20000:]) +ft.write(f, d[80000:]) f = open(dir1 + "/all_train_labels.ft") d = ft.read(f) f = open(dir2 + "valid_labels.ft", 'wb') -ft.write(f, d[:20000]) +ft.write(f, d[:80000]) f = open(dir2 + "train_labels.ft", 'wb') -ft.write(f, d[20000:]) +ft.write(f, d[80000:]) for i in ["train", "valid", "test"]: os.chmod(dir2 + i + "_data.ft", 0744) diff -r 6f606b359df3 -r a9af079892ce scripts/nist_read2.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/nist_read2.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,28 @@ +#!/usr/bin/env python + +from pylearn.io import filetensor as ft +import pylab, numpy + +datapath = '/data/lisa/data/ift6266h10/train_' + +f = open(datapath+'data.ft') +d = ft.read(f) + +f = open(datapath+'labels.ft') +labels = ft.read(f) + +def label2chr(l): + if l<10: + return chr(l + ord('0')) + elif l<36: + return chr(l-10 + ord('A')) + else: + return chr(l-36 + ord('a')) + +for i in range(min(d.shape[0],30)): + pylab.figure() + pylab.title(label2chr(labels[i])) + pylab.imshow(d[i].reshape((32,32))/255., pylab.matplotlib.cm.Greys_r, interpolation='nearest') + +pylab.show() + diff -r 6f606b359df3 -r a9af079892ce scripts/ocr_divide.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/ocr_divide.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,40 @@ +#!/usr/bin/env python + +''' +creation des ensembles train, valid et test OCR +ensemble valid est trainorig[:80000] +ensemble test est trainorig[80000:160000] +ensemble train est trainorig[160000:] +trainorig est deja shuffled +''' + +from pylearn.io import filetensor as ft +import numpy, os + +dir1 = '/data/lisa/data/ocr_breuel/filetensor/' +dir2 = "/data/lisa/data/ift6266h10/" + +f = open(dir1 + 'unlv-corrected-2010-02-01-shuffled.ft') +d = ft.read(f) +f = open(dir2 + "ocr_valid_data.ft", 'wb') +ft.write(f, d[:80000]) +f = open(dir2 + "ocr_test_data.ft", 'wb') +ft.write(f, d[80000:160000]) +f = open(dir2 + "ocr_train_data.ft", 'wb') +ft.write(f, d[160000:]) + +f = open(dir1 + 'unlv-corrected-2010-02-01-labels-shuffled.ft') +d = ft.read(f) +f = open(dir2 + "ocr_valid_labels.ft", 'wb') +ft.write(f, d[:80000]) +f = open(dir2 + "ocr_test_labels.ft", 'wb') +ft.write(f, d[80000:160000]) +f = open(dir2 + "ocr_train_labels.ft", 'wb') +ft.write(f, d[160000:]) + +for i in ["train", "valid", "test"]: + os.chmod(dir2 + "ocr_" + i + "_data.ft", 0744) + os.chmod(dir2 + "ocr_" + i + "_labels.ft", 0744) + + + diff -r 6f606b359df3 -r a9af079892ce scripts/run_pipeline.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/run_pipeline.sh Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,20 @@ +#!/bin/bash + +# This is one _ugly_ hack, but I couldn't figure out how +# to 
cleanly pass command line options to the script if +# invoking using the "gimp --batch < script.py" syntax + +# Basically I create a temp file, put the args into it, +# then the script gets the filename and reads back the +# args + +export PIPELINE_ARGS_TMPFILE=`mktemp` + +for arg in "$@" +do + echo $arg >> $PIPELINE_ARGS_TMPFILE +done + +gimp -i --batch-interpreter python-fu-eval --batch - < ../data_generation/pipeline/pipeline.py + + diff -r 6f606b359df3 -r a9af079892ce scripts/setup_batches.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/setup_batches.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,177 @@ +# -*- coding: utf-8 -*- + +import random +from numpy import * +from pylearn.io import filetensor as ft + +class Batches(): + def __init__(self): + data_path = '/data/lisa/data/nist/by_class/' + + digits_train_data = 'digits/digits_train_data.ft' + digits_train_labels = 'digits/digits_train_labels.ft' + digits_test_data = 'digits/digits_test_data.ft' + digits_test_labels = 'digits/digits_test_labels.ft' + + lower_train_data = 'lower/lower_train_data.ft' + lower_train_labels = 'lower/lower_train_labels.ft' + #upper_train_data = 'upper/upper_train_data.ft' + #upper_train_labels = 'upper/upper_train_labels.ft' + + print 'Opening data...' + + f_digits_train_data = open(data_path + digits_train_data) + f_digits_train_labels = open(data_path + digits_train_labels) + f_digits_test_data = open(data_path + digits_test_data) + f_digits_test_labels = open(data_path + digits_test_labels) + + f_lower_train_data = open(data_path + lower_train_data) + f_lower_train_labels = open(data_path + lower_train_labels) + #f_upper_train_data = open(data_path + upper_train_data) + #f_upper_train_labels = open(data_path + upper_train_labels) + + self.raw_digits_train_data = ft.read(f_digits_train_data) + self.raw_digits_train_labels = ft.read(f_digits_train_labels) + self.raw_digits_test_data = ft.read(f_digits_test_data) + self.raw_digits_test_labels = ft.read(f_digits_test_labels) + + self.raw_lower_train_data = ft.read(f_lower_train_data) + self.raw_lower_train_labels = ft.read(f_lower_train_labels) + #self.raw_upper_train_data = ft.read(f_upper_train_data) + #self.raw_upper_train_labels = ft.read(f_upper_train_labels) + + f_digits_train_data.close() + f_digits_train_labels.close() + f_digits_test_data.close() + f_digits_test_labels.close() + + f_lower_train_data.close() + f_lower_train_labels.close() + #f_upper_train_data.close() + #f_upper_train_labels.close() + + print 'Data opened' + + def set_batches(self, start_ratio = -1, end_ratio = -1, batch_size = 20, verbose = False): + self.batch_size = batch_size + + digits_train_size = len(self.raw_digits_train_labels) + digits_test_size = len(self.raw_digits_test_labels) + + lower_train_size = len(self.raw_lower_train_labels) + #upper_train_size = len(self.raw_upper_train_labels) + + if verbose == True: + print 'digits_train_size = %d' %digits_train_size + print 'digits_test_size = %d' %digits_test_size + print 'lower_train_size = %d' %lower_train_size + #print 'upper_train_size = %d' %upper_train_size + + # define main and other datasets + raw_main_train_data = self.raw_digits_train_data + raw_other_train_data = self.raw_lower_train_labels + raw_test_data = self.raw_digits_test_data + + raw_main_train_labels = self.raw_digits_train_labels + raw_other_train_labels = self.raw_lower_train_labels + raw_test_labels = self.raw_digits_test_labels + + main_train_size = len(raw_main_train_data) + other_train_size = len(raw_other_train_data) + test_size = 
len(raw_test_labels) + test_size = int(test_size/batch_size) + test_size *= batch_size + validation_size = test_size + + # default ratio is actual ratio + if start_ratio == -1: + self.start_ratio = float(main_train_size) / float(main_train_size + other_train_size) + else: + self.start_ratio = start_ratio + + if start_ratio == -1: + self.end_ratio = float(main_train_size) / float(main_train_size + other_train_size) + else: + self.end_ratio = end_ratio + + if verbose == True: + print 'start_ratio = %f' %self.start_ratio + print 'end_ratio = %f' %self.end_ratio + + i_main = 0 + i_other = 0 + i_batch = 0 + + # compute the number of batches given start and end ratios + n_main_batch = (main_train_size - batch_size * (self.end_ratio - self.start_ratio) / 2 ) / (batch_size * (self.start_ratio + (self.end_ratio - self.start_ratio) / 2)) + n_other_batch = (other_train_size - batch_size * (self.end_ratio - self.start_ratio) / 2 ) / (batch_size - batch_size * (self.start_ratio + (self.end_ratio - self.start_ratio) / 2)) + n_batches = min([n_main_batch, n_other_batch]) + + # train batches + self.train_batches = [] + + # as long as we have data left in main and other, we create batches + while i_main < main_train_size - batch_size - test_size and i_other < other_train_size - batch_size: + + ratio = self.start_ratio + i_batch * (self.end_ratio - self.start_ratio) / n_batches + batch_data = raw_main_train_data[0:self.batch_size] + batch_labels = raw_main_train_labels[0:self.batch_size] + + for i in xrange(0, self.batch_size): # randomly choose between main and other, given the current ratio + rnd = random.randint(0, 100) + + if rnd < 100 * ratio: + batch_data[i] = raw_main_train_data[i_main] + batch_labels[i] = raw_main_train_labels[i_main] + i_main += 1 + else: + batch_data[i] = raw_other_train_data[i_other] + batch_labels[i] = raw_other_train_labels[i_other] - 26 #to put values between 10 and 35 for lower case + i_other += 1 + + self.train_batches = self.train_batches + \ + [(batch_data, batch_labels)] + i_batch += 1 + + offset = i_main + + # test batches + self.test_batches = [] + for i in xrange(0, test_size, batch_size): + self.test_batches = self.test_batches + \ + [(raw_test_data[i:i+batch_size], raw_test_labels[i:i+batch_size])] + + # validation batches + self.validation_batches = [] + for i in xrange(0, test_size, batch_size): + self.validation_batches = self.validation_batches + \ + [(raw_main_train_data[offset+i:offset+i+batch_size], raw_main_train_labels[offset+i:offset+i+batch_size])] + + if verbose == True: + print 'n_main = %d' %i_main + print 'n_other = %d' %i_other + print 'nb_train_batches = %d / %d' %(i_batch,n_batches) + print 'offset = %d' %offset + + def get_train_batches(self): + return self.train_batches + + def get_test_batches(self): + return self.test_batches + + def get_validation_batches(self): + return self.validation_batches + + def test_set_batches(self, intervall = 1000): + for i in xrange(0, len(self.train_batches) - self.batch_size, intervall): + n_main = 0 + + for j in xrange(0, self.batch_size): + if self.train_batches[i][1][j] < 10: + n_main +=1 + print 'ratio batch %d : %f' %(i,float(n_main) / float(self.batch_size)) + +if __name__ == '__main__': + batches = Batches() + batches.set_batches(0.5,1, 20, True) + batches.test_set_batches() diff -r 6f606b359df3 -r a9af079892ce test.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,30 @@ +import doctest, sys, pkgutil + +def runTests(): + import ift6266 + for (_, name, 
ispkg) in pkgutil.walk_packages(ift6266.__path__, ift6266.__name__+'.'): + if not ispkg: + if name.startswith('ift6266.scripts.') or \ + name.startswith('ift6266.data_generation.transformations.pycaptcha.') or \ + name in ['ift6266.test', + 'ift6266.data_generation.transformations.testmod', + 'ift6266.data_generation.transformations.gimp_script']: + continue + test(name) + +def test(name): + import ift6266 + predefs = ift6266.__dict__ + options = doctest.ELLIPSIS or doctest.DONT_ACCEPT_TRUE_FOR_1 + print "Testing:", name + __import__(name) + doctest.testmod(sys.modules[name], extraglobs=predefs, optionflags=options) + +if __name__ == '__main__': + if len(sys.argv) > 1: + for mod in sys.argv[1:]: + if mod.endswith('.py'): + mod = mod[:-3] + test(mod) + else: + runTests() diff -r 6f606b359df3 -r a9af079892ce transformations/BruitGauss.py --- a/transformations/BruitGauss.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,127 +0,0 @@ -#!/usr/bin/python -# coding: utf-8 - -''' -Ajout de bruit gaussien dans les donnees. A chaque iteration, un bruit poivre -et sel est ajoute, puis un lissage gaussien autour de ce point est ajoute. -On fait un nombre d'iteration = 1024*complexity/25 ce qui equivaud -a complexity/25 des points qui recoivent le centre du noyau gaussien. -Il y en a beaucoup moins que le bruit poivre et sel, car la transformation -est plutôt aggressive et touche beaucoup de pixels autour du centre - -La grandeur de la gaussienne ainsi que son ecart type sont definit par complexity -et par une composante aleatoire normale. - -Ce fichier prend pour acquis que les images sont donnees une a la fois -sous forme de numpy.array de 1024 (32 x 32) valeurs entre 0 et 1. - -Sylvain Pannetier Lebeuf dans le cadre de IFT6266, hiver 2010 - -''' - -import numpy -import random -import scipy -from scipy import ndimage - -class BruitGauss(): - - def __init__(self): - self.proportion_bruit=0.1 #Le pourcentage des pixels qui seront bruites - self.nb_chng=10 #Le nombre de pixels changes. 
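# Note on the option flags in test.py above: doctest option flags are integer
# bit masks, so `doctest.ELLIPSIS or doctest.DONT_ACCEPT_TRUE_FOR_1` evaluates
# to just ELLIPSIS. A minimal sketch of the bitwise combination, assuming both
# flags were meant to be active:
import doctest
options = doctest.ELLIPSIS | doctest.DONT_ACCEPT_TRUE_FOR_1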
Seulement pour fin de calcul - self.sigma_gauss=3.0 #L'ecart type du noyau gaussien - self.grandeur=7 #Largeur de la fenetre gaussienne - - def get_settings_names(self): - return ['proportion_bruit','sigma_gauss','grandeur'] - - def regenerate_parameters(self, complexity): - self.proportion_bruit = float(complexity)/25 - self.nb_chng=int(1024*self.proportion_bruit) - if float(complexity) > 0: - self.sigma_gauss=max(0,numpy.random.normal(complexity*5,complexity)) - self.grandeur=int(min(31,max(1,8*complexity*numpy.random.normal(1,float(complexity)/2)))) - else: - self.sigma_gauss = 0 - self.grandeur=1 - #Un peu de paranoia ici, mais on ne sait jamais - - if self.grandeur%2 == 0: - self.grandeur+=1 #Toujours un nombre impair, plus simple plus tard - return self._get_current_parameters() - - def _get_current_parameters(self): - return [self.proportion_bruit,self.sigma_gauss,self.grandeur] - - - def transform_image(self, image): - image=image.reshape((32,32)) - - #creation du noyau gaussien - gauss=numpy.zeros((self.grandeur,self.grandeur)) - x0 = y0 = self.grandeur/2 - for i in xrange(0,self.grandeur): - for j in xrange(0,self.grandeur): - gauss[i,j]=numpy.exp(-4*numpy.log(2) * ((i-x0)**2 + (j-y0)**2) / self.sigma_gauss**2) - #pylab.contour(gauss) - #pylab.show() #Pour voir si la gaussienne est bien comme desiree - - #Chaque tour dans la boucle ajoute un pointpoivre et sel, puis - #y ajoute un bruit gaussien autour afin d'avoir de la correlation dans - #les points - - for i in xrange(0,self.nb_chng): - x_bruit=int(numpy.random.randint(0,32)) - y_bruit=int(numpy.random.randint(0,32)) - - image[x_bruit,y_bruit]=max(0,min(1,numpy.random.normal(0.4,self.proportion_bruit*20))) - - bord = int((self.grandeur-1)/2) - #Faire le "smooting" - for x in xrange(0,self.grandeur): - for y in xrange(0,self.grandeur): - #pour etre certain de ne pas changer le vide - if x_bruit-bord+x < 0: - continue - if y_bruit-bord+y < 0: - continue - if x_bruit-bord+x > 31: - continue - if y_bruit-bord+y > 31: - continue - image[x_bruit-bord+x,y_bruit-bord+y]=max(image[x_bruit-bord+x,y_bruit-bord+y],gauss[x,y]*image[x_bruit,y_bruit]) - #image[x_bruit-bord+x,y_bruit-bord+y]=min(1,image[x_bruit-bord+x,y_bruit-bord+y]*(1+gauss[x,y])) - #Cette derniere ligne n'est pas très interessante. Elle ajoute le bruit - #plutot que de prendre le max entre la valeur presente et le bruit. Ca rend l'image un peu - #chaostique, pas une bonne idee - - return image - -#---TESTS--- - -def _load_image(): - f = open('/home/sylvain/Dropbox/Msc/IFT6266/donnees/lower_test_data.ft') #Le jeu de donnees est en local. - d = ft.read(f) - w=numpy.asarray(d[0]) - return (w/255.0).astype('float') - -def _test(complexite): - img=_load_image() - transfo = BruitGauss() - pylab.imshow(img.reshape((32,32))) - pylab.show() - print transfo.get_settings_names() - print transfo.regenerate_parameters(complexite) - - img_trans=transfo.transform_image(img) - - pylab.imshow(img_trans.reshape((32,32))) - pylab.show() - - -if __name__ == '__main__': - from pylearn.io import filetensor as ft - import pylab - _test(0.5) - - diff -r 6f606b359df3 -r a9af079892ce transformations/DistorsionGauss.py --- a/transformations/DistorsionGauss.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ -#!/usr/bin/python -# coding: utf-8 - -''' -Ajout d'une composante aleatoire dans chaque pixel de l'image. 
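# A minimal vectorized sketch of the Gaussian kernel built in
# BruitGauss.transform_image above (same formula as the nested i/j loop;
# 'grandeur' and 'sigma_gauss' follow the class attributes, and sigma_gauss > 0
# is assumed):
import numpy

def gaussian_kernel(grandeur, sigma_gauss):
    x0 = y0 = grandeur // 2
    i, j = numpy.ogrid[0:grandeur, 0:grandeur]
    return numpy.exp(-4 * numpy.log(2) * ((i - x0) ** 2 + (j - y0) ** 2)
                     / sigma_gauss ** 2)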
-C'est une distorsion gaussienne de moyenne 0 et d'écart type complexity/10 - -Sylvain Pannetier Lebeuf dans le cadre de IFT6266, hiver 2010 - -''' - -import numpy -import random - -class DistorsionGauss(): - - def __init__(self): - self.ecart_type=0.1 #L'ecart type de la gaussienne - - def get_settings_names(self): - return ['ecart_type'] - - def regenerate_parameters(self, complexity): - self.ecart_type=float(complexity)/10 - return self._get_current_parameters() - - def _get_current_parameters(self): - return [] - - def get_parameters_determined_by_complexity(self, complexity): - return [float(complexity)/10] - - def transform_image(self, image): - image=image.reshape(1024,1) - aleatoire=numpy.zeros((1024,1)).astype('float32') - for i in xrange(0,1024): - aleatoire[i]=float(random.gauss(0,self.ecart_type)) - image=image+aleatoire - - #Ramener tout entre 0 et 1 - if numpy.min(image) < 0: - image-=numpy.min(image) - if numpy.max(image) > 1: - image/=numpy.max(image) - - return image.reshape(32,32) - - -#---TESTS--- - -def _load_image(): - f = open('/home/sylvain/Dropbox/Msc/IFT6266/donnees/lower_test_data.ft') #Le jeu de donnees est en local. - d = ft.read(f) - w=numpy.asarray(d[random.randint(0,100)]) - return (w/255.0).astype('float') - -def _test(complexite): - img=_load_image() - transfo = DistorsionGauss() - pylab.imshow(img.reshape((32,32))) - pylab.show() - print transfo.get_settings_names() - print transfo.regenerate_parameters(complexite) - - img_trans=transfo.transform_image(img) - - pylab.imshow(img_trans.reshape((32,32))) - pylab.show() - - -if __name__ == '__main__': - from pylearn.io import filetensor as ft - import pylab - for i in xrange(0,5): - _test(1) - - diff -r 6f606b359df3 -r a9af079892ce transformations/Occlusion.py --- a/transformations/Occlusion.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,138 +0,0 @@ -#!/usr/bin/python -# coding: utf-8 - -''' -Ajout de bruit d'occlusion dans l'image originale. - -Le bruit provient d'un echantillon pris dans la seconde image puis rajoutee a -gauche ou a droite de l'image originale. De plus, il se peut aussi que le -bruit soit rajoute sur l'image originale, mais en plus pâle. - -Il faut s'assurer d'avoir le fichier echantillon_occlusion.ft (voir "files" sur assembla) -dans le repertoire ou est effectuee l'execution de Occlusion.py. Sinon, -simplement changer le "path" sur la ligne 46 - -Sylvain Pannetier Lebeuf dans le cadre de IFT6266, hiver 2010 - -''' - - -import numpy -import scipy -import pylab -from pylearn.io import filetensor as ft - -class Occlusion(): - - def __init__(self): - #Ces 4 variables representent la taille du "crop" sur l'image2 - #Ce "crop" est pris a partie de image1[15,15], le milieu de l'image1 - self.haut=2 - self.bas=2 - self.gauche=2 - self.droite=2 - - #Ces deux variables representent le deplacement en x et y par rapport - #au milieu du bord gauche ou droit - self.x_arrivee=0 - self.y_arrivee=0 - - #Cette variable =1 si l'image est mise a gauche et -1 si a droite - #et =0 si au centre, mais plus pale - self.endroit=-1 - - #Cette variable determine l'opacite de l'ajout dans le cas ou on est au milieu - self.opacite=0.5 #C'est completement arbitraire. Possible de le changer si voulu - - f3 = open('/data/lisa/data/ift6266h10/echantillon_occlusion.ft') #Doit etre sur le reseau DIRO. 
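# A short sketch of what DistorsionGauss.transform_image above computes: one
# N(0, ecart_type) draw per pixel, then a rescale back into [0, 1].
# numpy.random.normal fills the whole 1024-vector in one call instead of
# looping over random.gauss.
import numpy

def add_gaussian_noise(image, ecart_type):
    noisy = image.reshape(1024) + numpy.random.normal(0., ecart_type, 1024)
    if noisy.min() < 0:
        noisy -= noisy.min()
    if noisy.max() > 1:
        noisy /= noisy.max()
    return noisy.reshape(32, 32)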
- #Il faut arranger le path sinon - w=ft.read(f3) - f3.close() - - self.longueur=len(w) - self.d=(w.astype('float'))/255 - - - def get_settings_names(self): - return ['haut','bas','gauche','droite','x_arrivee','y_arrivee','endroit','rajout'] - - def regenerate_parameters(self, complexity): - self.haut=min(15,int(numpy.abs(numpy.random.normal(int(7*complexity),2)))) - self.bas=min(15,int(numpy.abs(numpy.random.normal(int(7*complexity),2)))) - self.gauche=min(15,int(numpy.abs(numpy.random.normal(int(7*complexity),2)))) - self.droite=min(15,int(numpy.abs(numpy.random.normal(int(7*complexity),2)))) - if self.haut+self.bas+self.gauche+self.droite==0: #Tres improbable - self.haut=1 - self.bas=1 - self.gauche=1 - self.droite=1 - - #Ces deux valeurs seront controlees afin d'etre certain de ne pas depasser - self.x_arrivee=int(numpy.abs(numpy.random.normal(0,2))) #Complexity n'entre pas en jeu, pas besoin - self.y_arrivee=int(numpy.random.normal(0,3)) - - self.rajout=numpy.random.randint(0,self.longueur) #les bouts de quelle lettre - - self.endroit=numpy.random.randint(-1,2) - #####Revoir si ces trois dernieres variables sont determinees ici ! - return self._get_current_parameters() - - def _get_current_parameters(self): - return [self.haut,self.bas,self.gauche,self.droite,self.x_arrivee,self.y_arrivee,self.endroit,self.rajout] - - - def transform_image(self, image): - #Attrapper le bruit d'occlusion - bruit=self.d[self.rajout].reshape((32,32))[15-self.haut:15+self.bas+1,15-self.gauche:15+self.droite+1] - - if self.x_arrivee+self.gauche+self.droite>32: - self.endroit*=-1 #On change de bord et on colle sur le cote - self.x_arrivee=0 - if self.y_arrivee-self.haut <-16: - self.y_arrivee=self.haut-16#On colle le morceau en haut - if self.y_arrivee+self.bas > 15: - self.y_arrivee=15-self.bas #On colle le morceau en bas - - if self.endroit==-1: #a gauche - image[(16+self.y_arrivee-self.haut):(16+self.y_arrivee+self.bas+1),(self.x_arrivee):(self.x_arrivee+self.gauche+self.droite+1)]=image[(16+self.y_arrivee-self.haut):(16+self.y_arrivee+self.bas+1),(self.x_arrivee):(self.x_arrivee+self.gauche+self.droite+1)]+bruit - - elif self.endroit==1: #a droite - image[(16+self.y_arrivee-self.haut):(16+self.y_arrivee+self.bas+1),(31-self.x_arrivee-self.gauche-self.droite):(31-self.x_arrivee+1)]=image[(16+self.y_arrivee-self.haut):(16+self.y_arrivee+self.bas+1),(31-self.x_arrivee-self.gauche-self.droite):(31-self.x_arrivee+1)]+bruit - - elif self.endroit==0: #au milieu - image[(16-self.haut):(16+self.bas+1),(16-self.gauche):(16+self.droite+1)]=image[(16-self.haut):(16+self.bas+1),(16-self.gauche):(16+self.droite+1)]+(bruit*self.opacite) - - #renormaliser le tout. Toutes les entrees doivent etre entre 0 et 1 - for ii in xrange(0,32): - for jj in xrange(0,32): - image[ii,jj]=min(1,image[ii,jj]) - - return image - -#---TESTS--- - -def _load_image(): - f = open('/home/sylvain/Dropbox/Msc/IFT6266/donnees/lower_test_data.ft') #Le jeu de donnees est en local. 
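# The renormalization loop at the end of Occlusion.transform_image above is an
# elementwise minimum with 1; a one-line equivalent sketch:
import numpy

def clamp_to_unit(image):
    # keep every pixel in [0, 1] after the occlusion patch has been added
    return numpy.minimum(image, 1.0)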
- d = ft.read(f) - w=numpy.asarray(d[numpy.random.randint(0,50)]) - return (w/255.0).astype('float') - -def _test(complexite): - - transfo = Occlusion() - for i in xrange(0,20): - img = _load_image() - pylab.imshow(img.reshape((32,32))) - pylab.show() - print transfo.get_settings_names() - print transfo.regenerate_parameters(complexite) - - img_trans=transfo.transform_image(img.reshape((32,32))) - - pylab.imshow(img_trans.reshape((32,32))) - pylab.show() - - -if __name__ == '__main__': - import pylab - _test(0.5) \ No newline at end of file diff -r 6f606b359df3 -r a9af079892ce transformations/PermutPixel.py --- a/transformations/PermutPixel.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,96 +0,0 @@ -#!/usr/bin/python -# coding: utf-8 - -''' -Un echange de pixels est effectue entre certain pixels choisit aleatoirement -et un de ses 4 voisins, tout aussi choisi aleatoirement. - -Le nombre de pixels permutes est definit pas complexity*1024 - -Sylvain Pannetier Lebeuf dans le cadre de IFT6266, hiver 2010 - -''' - -import numpy -import random - -class PermutPixel(): - - def __init__(self): - self.nombre=10 #Le nombre de pixels a permuter - self.proportion=0.3 - - def get_settings_names(self): - return ['nombre'] - - def regenerate_parameters(self, complexity): - self.proportion=float(complexity) - self.nombre=int(256*self.proportion)*4 #Par multiple de 4 (256=1024/4) - return self._get_current_parameters() - - def _get_current_parameters(self): - return [] - - def get_parameters_determined_by_complexity(self, complexity): - return [int(complexity*256)*4] - - def transform_image(self, image): - image=image.reshape(1024,1) - temp=0 #variable temporaire - #constitution de l'echantillon - echantillon=random.sample(xrange(0,1024),self.nombre) - for i in xrange(0,self.nombre,4): - #gauche - if echantillon[i] > 0: - temp=image[echantillon[i]-1] - image[echantillon[i]-1]=image[echantillon[i]] - image[echantillon[i]]=temp - #droite - if echantillon[i+1] < 1023: - temp=image[echantillon[i+1]+1] - image[echantillon[i+1]+1]=image[echantillon[i+1]] - image[echantillon[i+1]]=temp - #haut - if echantillon[i+2] > 31: - temp=image[echantillon[i+2]-32] - image[echantillon[i+2]-32]=image[echantillon[i+2]] - image[echantillon[i+2]]=temp - #bas - if echantillon[i+3] < 992: - temp=image[echantillon[i+3]+32] - image[echantillon[i+3]+32]=image[echantillon[i+3]] - image[echantillon[i+3]]=temp - - - return image.reshape((32,32)) - - -#---TESTS--- - -def _load_image(): - f = open('/home/sylvain/Dropbox/Msc/IFT6266/donnees/lower_test_data.ft') #Le jeu de donnees est en local. - d = ft.read(f) - w=numpy.asarray(d[random.randint(0,100)]) - return (w/255.0).astype('float') - -def _test(complexite): - img=_load_image() - transfo = PermutPixel() - pylab.imshow(img.reshape((32,32))) - pylab.show() - print transfo.get_settings_names() - print transfo.regenerate_parameters(complexite) - - img_trans=transfo.transform_image(img) - - pylab.imshow(img_trans.reshape((32,32))) - pylab.show() - - -if __name__ == '__main__': - from pylearn.io import filetensor as ft - import pylab - for i in xrange(0,5): - _test(0.5) - - diff -r 6f606b359df3 -r a9af079892ce transformations/PoivreSel.py --- a/transformations/PoivreSel.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,76 +0,0 @@ -#!/usr/bin/python -# coding: utf-8 - -''' -Ajout de bruit poivre et sel dans les donnees. 
Le bruit est distribue de facon -aleatoire tire d'une uniforme tout comme la clarte des bites changees. - -La proportion de bites aleatoires est definit par complexity/5. -Lorsque cette valeur est a 1 ==> Plus reconnaissable et 0 ==> Rien ne se passe - -Ce fichier prend pour acquis que les images sont donnees une a la fois -sous forme de numpy.array de 1024 (32 x 32) valeurs entre 0 et 1. - -Sylvain Pannetier Lebeuf dans le cadre de IFT6266, hiver 2010 - -''' - -import numpy -import random - -class PoivreSel(): - - def __init__(self): - self.proportion_bruit=0.1 #Le pourcentage des pixels qui seront bruites - self.nb_chng=10 #Le nombre de pixels changes. Seulement pour fin de calcul - - def get_settings_names(self): - return ['proportion_bruit'] - - def regenerate_parameters(self, complexity): - self.proportion_bruit = float(complexity)/5 - self.nb_chng=int(1024*self.proportion_bruit) - return self._get_current_parameters() - - def _get_current_parameters(self): - return [] - - def get_parameters_determined_by_complexity(self, complexity): - return [self.proportion_bruit] - - def transform_image(self, image): - image=image.reshape(1024,1) - changements=random.sample(xrange(numpy.size(image)),self.nb_chng) #Les pixels qui seront changes - for j in xrange(0,self.nb_chng): - image[changements[j]]=numpy.random.random() #On determine les nouvelles valeurs des pixels changes - return image.reshape(32,32) - - -#---TESTS--- - -def _load_image(): - f = open('/home/sylvain/Dropbox/Msc/IFT6266/donnees/lower_test_data.ft') #Le jeu de donnees est en local. - d = ft.read(f) - w=numpy.asarray(d[0]) - return (w/255.0).astype('float') - -def _test(complexite): - img=_load_image() - transfo = PoivreSel() - pylab.imshow(img.reshape((32,32))) - pylab.show() - print transfo.get_settings_names() - print transfo.regenerate_parameters(complexite) - - img_trans=transfo.transform_image(img) - - pylab.imshow(img_trans.reshape((32,32))) - pylab.show() - - -if __name__ == '__main__': - from pylearn.io import filetensor as ft - import pylab - _test(0.5) - - diff -r 6f606b359df3 -r a9af079892ce transformations/Rature.py --- a/transformations/Rature.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,139 +0,0 @@ -#!/usr/bin/python -# coding: utf-8 - -''' -Ajout de rature sur le caractère. La rature peut etre horizontale, verticale -(dans ces deux cas, l'amplacement de la bande est aleatoire) ou sur la diagonale -(et anti-diagonale). - -La largeur de la bande ainsi que sa clarté sont definies a l'aide de complexity -et d'une composante aleatoire. -clarte: 0=blanc et 1=noir - -Ce fichier prend pour acquis que les images sont donnees une a la fois -sous forme de numpy.array de 1024 (32 x 32) valeurs entre 0 et 1. - -Sylvain Pannetier Lebeuf dans le cadre de IFT6266, hiver 2010 - -''' - -import numpy -import random - -class Rature(): - - def __init__(self): - self.largeur=2 #Largeur de la bande - self.deplacement=0 #Deplacement par rapport au milieu - self.orientation=0 #0=horizontal, 1=vertical, 2=oblique - self.clarte=0.5 #Clarte de la ligne appliquee - self.faire=1 #Si ==1, on applique une rature - - def get_settings_names(self): - return ['orientation','deplacement','clarte','faire'] - - def regenerate_parameters(self, complexity): - #Il faut choisir parmis vertical, horizontal et diagonal. - #La methode n'est pas exacte, mais un peu plus rapide que generer un int. 
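# A compact sketch of the PoivreSel.transform_image step above: nb_chng
# distinct pixel positions (as with random.sample), each reset to a uniform
# value in [0, 1):
import numpy

def poivre_sel(image, nb_chng):
    flat = image.reshape(1024)
    positions = numpy.random.permutation(1024)[:nb_chng]
    flat[positions] = numpy.random.random(nb_chng)
    return flat.reshape(32, 32)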
- #Complexity n'a rien a voir avec ce choix - - choix=numpy.random.random() - - if choix <0.34: - self.orientation=0 - elif choix <0.67: - self.orientation=1 - else: - self.orientation=2 - if float(complexity) > 0: - self.largeur=min(32,max(1,int(numpy.ceil(complexity*5)*numpy.random.normal(1,float(complexity)/2)))) - self.clarte=min(1,max(0,complexity*numpy.random.normal(1,float(complexity)/2))) - self.faire=numpy.random.binomial(1,float(complexity)) - else: - self.largeur=0 - self.clarte=0 - self.faire=0 #On ne fait rien !!! - - return self._get_current_parameters() - - def _get_current_parameters(self): - return [self.orientation,self.largeur,self.clarte,self.faire] - - def transform_image(self, image): - if self.faire == 0: - return image - - if self.orientation == 0: - return self._horizontal(image) - elif self.orientation == 1: - return self._vertical(image) - else: - return self._oblique(image) - - def _horizontal(self,image): - self.deplacement=numpy.random.normal(0,5) - #On s'assure de rester dans l'image - if self.deplacement < -16: #Si on recule trop - self.deplacement = -16 - if self.deplacement+self.largeur > 16: #Si on avance trop - self.deplacement=16-self.largeur - for i in xrange(0,self.largeur): - for j in xrange(0,32): - image[i+15+self.deplacement,j]=min(1,max(image[i+15+self.deplacement,j],self.clarte)) - return image - - def _vertical(self,image): - self.deplacement=numpy.random.normal(0,5) - #On s'assure de rester dans l'image - if self.deplacement < -16: #Si on recule trop - self.deplacement = -16 - if self.deplacement+self.largeur > 16: #Si on avance trop - self.deplacement=16-self.largeur - for i in xrange(0,self.largeur): - for j in xrange(0,32): - image[j,i+15+self.deplacement]=min(1,max(image[j,i+15+self.deplacement],self.clarte)) - return image - - def _oblique(self,image): - decision=numpy.random.random() - D=numpy.zeros((32,32)) #La matrice qui sera additionnee - for i in xrange(int(-numpy.floor(self.largeur/2)),int(numpy.ceil((self.largeur+1)/2))): - D+=numpy.eye(32,32,i) - if decision<0.5: #On met tout sur l'anti-diagonale - D = D[:,::-1] - D*=self.clarte - for i in xrange(0,32): - for j in xrange(0,32): - image[i,j]=min(1,max(image[i,j],D[i,j])) - return image - - -#---TESTS--- - -def _load_image(): - f = open('/home/sylvain/Dropbox/Msc/IFT6266/donnees/lower_test_data.ft') #Le jeu de donnees est en local. 
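# A tiny illustration of the diagonal mask built in Rature._oblique above: for
# largeur=3 the band is the sum of the three central diagonals, and the
# anti-diagonal variant is a left-right flip; the mask is then scaled by
# clarte and combined with the image by an elementwise max.
import numpy

D = sum(numpy.eye(32, 32, k) for k in (-1, 0, 1))  # 3-pixel-wide diagonal band
anti_D = D[:, ::-1]                                 # same band on the anti-diagonal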
- d = ft.read(f) - w=numpy.asarray(d[1]) - return (w/255.0).astype('float') - -def _test(complexite): - img=_load_image() - transfo = Rature() - pylab.imshow(img.reshape((32,32))) - pylab.show() - print transfo.get_settings_names() - print transfo.regenerate_parameters(complexite) - img=img.reshape((32,32)) - - img_trans=transfo.transform_image(img) - - pylab.imshow(img_trans.reshape((32,32))) - pylab.show() - - -if __name__ == '__main__': - from pylearn.io import filetensor as ft - import pylab - _test(0.8) - - diff -r 6f606b359df3 -r a9af079892ce transformations/add_background_image.py --- a/transformations/add_background_image.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,103 +0,0 @@ -#!/usr/bin/python -# -*- coding: iso-8859-1 -*- - -''' - Implementation of random background adding to a specific image - - Author: Guillaume Sicard -''' - -import sys, os, random -import cPickle -import Image, numpy - -class AddBackground(): - def __init__(self, threshold = 128): - self.h = 32 - self.w = 32 - self.threshold = threshold; - self.bg_image_file = '/data/lisa/data/ift6266h10/image_net/' - f=open(self.bg_image_file+'filelist.pkl') - self.image_files = cPickle.load(f) - f.close() - - # get threshold value - def get_settings_names(self): - return [str(self.threshold)] - - # no need, except for testmod.py - def regenerate_parameters(self, complexity): - value = random.gauss(0, 0.5*complexity) - return [value] - - # load an image - def load_image(self,filename): - image = Image.open(filename).convert('L') - image = numpy.asarray(image) - image = (image / 255.0).astype(numpy.float32) - return image - - # save an image - def save_image(self,array, filename): - image = (array * 255.0).astype('int') - image = Image.fromarray(image) - if (filename != ''): - image.save(filename) - else: - image.show() - - # make a random 32x32 crop of an image - def rand_crop(self,image): - i_w, i_h = image.shape - x, y = random.randint(0, i_w - self.w), random.randint(0, i_h - self.h) - return image[x:x + self.w, y:y + self.h] - - # select a random background image from "bg_image_file" and crops it - def rand_bg_image(self): - i = random.randint(0, len(self.image_files) - 1) - - image = self.load_image(self.bg_image_file + self.image_files[i]) - self.bg_image = self.rand_crop(image) - - # set "bg_image" as background to "image", based on a pixels threshold - def set_bg(self,image): - b = (image < self.threshold / 255.0).astype(numpy.float32) - return b * self.bg_image + ( 1 - b) * image - - # transform an image file and return an array - def transform_image_from_file(self, filename): - self.rand_bg_image() - image = self.load_image(filename) - image = self.set_bg(image) - return image - - # standard array to array transform - def transform_image(self, image): - self.rand_bg_image() - image = self.set_bg(image) - return image - - # test method - def test(self,filename): - import time - - sys.stdout.write('Starting addBackground test : loading image') - sys.stdout.flush() - - image = self.load_image(filename) - - t = 0 - n = 500 - for i in range(n): - t0 = time.time() - image2 = self.transform_image(image) - t = ( i * t + (time.time() - t0) ) / (i + 1) - sys.stdout.write('.') - sys.stdout.flush() - - print "Done!\nAverage time : " + str(1000 * t) + " ms" - -if __name__ == '__main__': - - myAddBackground = AddBackground() - myAddBackground.test('./images/0-LiberationSans-Italic.ttf.jpg') diff -r 6f606b359df3 -r a9af079892ce transformations/affine_transform.py --- 
a/transformations/affine_transform.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,90 +0,0 @@ -#!/usr/bin/python -# coding: utf-8 - -''' -Simple implementation of random affine transformations based on the Python -Imaging Module affine transformations. - - -Author: Razvan Pascanu -''' - -import numpy, Image - - - -class AffineTransformation(): - def __init__( self, complexity = .5): - self.shape = (32,32) - self.rng = numpy.random.RandomState() - self.complexity = complexity - params = self.rng.uniform(size=6) -.5 - self.a = 1. + params[0]*.4*complexity - self.b = 0. + params[1]*.4*complexity - self.c = params[2]*8.*complexity - self.d = 0. + params[3]*.4*complexity - self.e = 1. + params[4]*.4*complexity - self.f = params[5]*8.*complexity - - - def _get_current_parameters(self): - return [self.a, self.b, self.c, self.d, self.e, self.f] - - def get_settings_names(self): - return ['a','b','c','d','e','f'] - - def regenerate_parameters(self, complexity): - # generate random affine transformation - # a point (x',y') of the new image corresponds to (x,y) of the old - # image where : - # x' = params[0]*x + params[1]*y + params[2] - # y' = params[3]*x + params[4]*y _ params[5] - - # the ranges are set manually as to look acceptable - - self.complexity = complexity - params = self.rng.uniform(size=6) -.5 - self.a = 1. + params[0]*.4*complexity - self.b = 0. + params[1]*.4*complexity - self.c = params[2]*8.*complexity - self.d = 0. + params[3]*.4*complexity - self.e = 1. + params[4]*.4*complexity - self.f = params[5]*8.*complexity - return self._get_current_parameters() - - - - - def transform_image(self,NIST_image): - - im = Image.fromarray( \ - numpy.asarray(\ - NIST_image.reshape(self.shape)*255.0, dtype='uint8')) - nwim = im.transform( (32,32), Image.AFFINE, [self.a,self.b,self.c,self.d,self.e,self.f]) - return numpy.asarray(nwim)/255.0 - - - -if __name__ =='__main__': - print 'random test' - - from pylearn.io import filetensor as ft - import pylab - - datapath = '/data/lisa/data/nist/by_class/' - - f = open(datapath+'digits/digits_train_data.ft') - d = ft.read(f) - f.close() - - - transformer = AffineTransformation() - id = numpy.random.randint(30) - - pylab.figure() - pylab.imshow(d[id].reshape((32,32))) - pylab.figure() - pylab.imshow(transformer.transform_image(d[id]).reshape((32,32))) - - pylab.show() - diff -r 6f606b359df3 -r a9af079892ce transformations/contrast.py --- a/transformations/contrast.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,137 +0,0 @@ -#!/usr/bin/python -# coding: utf-8 - -''' -Simple implementation of random contrast. This always switch half the time the polarity. -then it decides of a random contrast dependant of the complexity, the mean of the maximum and minimum -pixel value stays 0 (to avoid import bias change between exemples). 
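# A short usage sketch for the Image.transform call in
# AffineTransformation.transform_image above. With Image.AFFINE, the six
# coefficients map each *output* pixel (x, y) to the input position
# (a*x + b*y + c, d*x + e*y + f); the identity is [1, 0, 0, 0, 1, 0].
import numpy, Image

im = Image.fromarray(numpy.zeros((32, 32), dtype='uint8'))
shifted = im.transform((32, 32), Image.AFFINE, [1., 0., 2., 0., 1., 0.])
out = numpy.asarray(shifted) / 255.0  # c=2: each output pixel samples the input 2 px to its right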
- -Author: Xavier Glorot -''' - -import numpy as N -import copy - - -class Contrast(): - def __init__(self,complexity = 1): - #---------- private attributes - self.__nx__ = 32 #xdim of the images - self.__ny__ = 32 #ydim of the images - self.__Pinvert__ = 0.5 #probability to switch polarity - self.__mincontrast__ = 0.15 - self.__resolution__ = 256 - self.__rangecontrastres__ = self.__resolution__ - N.int(self.__mincontrast__*self.__resolution__) - #------------------------------------------------ - - #---------- generation parameters - self.regenerate_parameters(complexity) - #------------------------------------------------ - - def _get_current_parameters(self): - return [self.invert,self.contrast] - - def get_settings_names(self): - return ['invert','contrast'] - - def regenerate_parameters(self, complexity): - self.invert = (N.random.uniform() < self.__Pinvert__) - self.contrast = self.__resolution__ - N.random.randint(1 + self.__rangecontrastres__ * complexity) - return self._get_current_parameters() - - def transform_1_image(self,image): #the real transformation method - maxi = image.max() - mini = image.min() - if self.invert: - newimage = 1 - (self.__resolution__- self.contrast) / (2 * float(self.__resolution__)) -\ - (image - mini) / float(maxi - mini) * self.contrast / float(self.__resolution__) - else: - newimage = (self.__resolution__- self.contrast) / (2 * float(self.__resolution__)) +\ - (image - mini) / float(maxi - mini) * self.contrast / float(self.__resolution__) - if image.dtype == 'uint8': - return N.asarray(newimage*255,dtype='uint8') - else: - return N.asarray(newimage,dtype=image.dtype) - - def transform_image(self,image): #handling different format - if image.shape == (self.__nx__,self.__ny__): - return self.transform_1_image(image) - if image.ndim == 3: - newimage = copy.copy(image) - for i in range(image.shape[0]): - newimage[i,:,:] = self.transform_1_image(image[i,:,:]) - return newimage - if image.ndim == 2 and image.shape != (self.__nx__,self.__ny__): - newimage = N.reshape(image,(image.shape[0],self.__nx__,self.__ny__)) - for i in range(image.shape[0]): - newimage[i,:,:] = self.transform_1_image(newimage[i,:,:]) - return N.reshape(newimage,image.shape) - if image.ndim == 1: - newimage = N.reshape(image,(self.__nx__,self.__ny__)) - newimage = self.transform_1_image(newimage) - return N.reshape(newimage,image.shape) - assert False #should never go there - - - - -#test on NIST (you need pylearn and access to NIST to do that) - -if __name__ == '__main__': - - from pylearn.io import filetensor as ft - import copy - import pygame - import time - datapath = '/data/lisa/data/nist/by_class/' - f = open(datapath+'digits/digits_train_data.ft') - d = ft.read(f) - - pygame.surfarray.use_arraytype('numpy') - - pygame.display.init() - screen = pygame.display.set_mode((8*2*32,8*32),0,8) - anglcolorpalette=[(x,x,x) for x in xrange(0,256)] - screen.set_palette(anglcolorpalette) - - MyContrast = Contrast() - - debut=time.time() - MyContrast.transform_image(d) - fin=time.time() - print '------------------------------------------------' - print d.shape[0],' images transformed in :', fin-debut, ' seconds' - print '------------------------------------------------' - print (fin-debut)/d.shape[0]*1000000,' microseconds per image' - print '------------------------------------------------' - print MyContrast.get_settings_names() - print MyContrast._get_current_parameters() - print MyContrast.regenerate_parameters(0) - print MyContrast.regenerate_parameters(0.5) - print 
MyContrast.regenerate_parameters(1) - for i in range(10000): - a=d[i,:] - b=N.asarray(N.reshape(a,(32,32))).T - - new=pygame.surfarray.make_surface(b) - new=pygame.transform.scale2x(new) - new=pygame.transform.scale2x(new) - new=pygame.transform.scale2x(new) - new.set_palette(anglcolorpalette) - screen.blit(new,(0,0)) - - print MyContrast.get_settings_names(), MyContrast.regenerate_parameters(1) - c=MyContrast.transform_image(a) - b=N.asarray(N.reshape(c,(32,32))).T - - new=pygame.surfarray.make_surface(b) - new=pygame.transform.scale2x(new) - new=pygame.transform.scale2x(new) - new=pygame.transform.scale2x(new) - new.set_palette(anglcolorpalette) - screen.blit(new,(8*32,0)) - - pygame.display.update() - raw_input('Press Enter') - - pygame.display.quit() diff -r 6f606b359df3 -r a9af079892ce transformations/filetensor.py --- a/transformations/filetensor.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,232 +0,0 @@ -""" -Read and write the matrix file format described at -U{http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html} - -The format is for dense tensors: - - - magic number indicating type and endianness - 4bytes - - rank of tensor - int32 - - dimensions - int32, int32, int32, ... - - - -The number of dimensions and rank is slightly tricky: - - for scalar: rank=0, dimensions = [1, 1, 1] - - for vector: rank=1, dimensions = [?, 1, 1] - - for matrix: rank=2, dimensions = [?, ?, 1] - -For rank >= 3, the number of dimensions matches the rank exactly. - - -@todo: add complex type support - -""" -import sys -import numpy - -def _prod(lst): - p = 1 - for l in lst: - p *= l - return p - -_magic_dtype = { - 0x1E3D4C51 : ('float32', 4), - #0x1E3D4C52 : ('packed matrix', 0), #what is a packed matrix? - 0x1E3D4C53 : ('float64', 8), - 0x1E3D4C54 : ('int32', 4), - 0x1E3D4C55 : ('uint8', 1), - 0x1E3D4C56 : ('int16', 2), - } -_dtype_magic = { - 'float32': 0x1E3D4C51, - #'packed matrix': 0x1E3D4C52, - 'float64': 0x1E3D4C53, - 'int32': 0x1E3D4C54, - 'uint8': 0x1E3D4C55, - 'int16': 0x1E3D4C56 - } - -def _read_int32(f): - """unpack a 4-byte integer from the current position in file f""" - s = f.read(4) - s_array = numpy.fromstring(s, dtype='int32') - return s_array.item() - -def _read_header(f, debug=False): - """ - :returns: data type, element size, rank, shape, size - """ - #what is the data type of this matrix? - #magic_s = f.read(4) - #magic = numpy.fromstring(magic_s, dtype='int32') - magic = _read_int32(f) - magic_t, elsize = _magic_dtype[magic] - if debug: - print 'header magic', magic, magic_t, elsize - if magic_t == 'packed matrix': - raise NotImplementedError('packed matrix not supported') - - #what is the rank of the tensor? - ndim = _read_int32(f) - if debug: print 'header ndim', ndim - - #what are the dimensions of the tensor? - dim = numpy.fromfile(f, dtype='int32', count=max(ndim,3))[:ndim] - dim_size = _prod(dim) - if debug: print 'header dim', dim, dim_size - - return magic_t, elsize, ndim, dim, dim_size - -class arraylike(object): - """Provide an array-like interface to the filetensor in f. - - The rank parameter to __init__ controls how this object interprets the underlying tensor. - Its behaviour should be clear from the following example. - Suppose the underlying tensor is MxNxK. - - - If rank is 0, self[i] will be a scalar and len(self) == M*N*K. - - - If rank is 1, self[i] is a vector of length K, and len(self) == M*N. - - - If rank is 3, self[i] is a 3D tensor of size MxNxK, and len(self)==1. 
- - - If rank is 5, self[i] is a 5D tensor of size 1x1xMxNxK, and len(self) == 1. - - - :note: Objects of this class generally require exclusive use of the underlying file handle, because - they call seek() every time you access an element. - """ - - f = None - """File-like object""" - - magic_t = None - """numpy data type of array""" - - elsize = None - """number of bytes per scalar element""" - - ndim = None - """Rank of underlying tensor""" - - dim = None - """tuple of array dimensions (aka shape)""" - - dim_size = None - """number of scalars in the tensor (prod of dim)""" - - f_start = None - """The file position of the first element of the tensor""" - - readshape = None - """tuple of array dimensions of the block that we read""" - - readsize = None - """number of elements we must read for each block""" - - def __init__(self, f, rank=0, debug=False): - self.f = f - self.magic_t, self.elsize, self.ndim, self.dim, self.dim_size = _read_header(f,debug) - self.f_start = f.tell() - - if rank <= self.ndim: - self.readshape = tuple(self.dim[self.ndim-rank:]) - else: - self.readshape = tuple(self.dim) - - #self.readshape = tuple(self.dim[self.ndim-rank:]) if rank <= self.ndim else tuple(self.dim) - - if rank <= self.ndim: - padding = tuple() - else: - padding = (1,) * (rank - self.ndim) - - #padding = tuple() if rank <= self.ndim else (1,) * (rank - self.ndim) - self.returnshape = padding + self.readshape - self.readsize = _prod(self.readshape) - if debug: print 'READ PARAM', self.readshape, self.returnshape, self.readsize - - def __len__(self): - return _prod(self.dim[:self.ndim-len(self.readshape)]) - - def __getitem__(self, idx): - if idx >= len(self): - raise IndexError(idx) - self.f.seek(self.f_start + idx * self.elsize * self.readsize) - return numpy.fromfile(self.f, - dtype=self.magic_t, - count=self.readsize).reshape(self.returnshape) - - -# -# TODO: implement item selection: -# e.g. load('some mat', subtensor=(:6, 2:5)) -# -# This function should be memory efficient by: -# - allocating an output matrix at the beginning -# - seeking through the file, reading subtensors from multiple places -def read(f, subtensor=None, debug=False): - """Load all or part of file 'f' into a numpy ndarray - - @param f: file from which to read - @type f: file-like object - - If subtensor is not None, it should be like the argument to - numpy.ndarray.__getitem__. The following two expressions should return - equivalent ndarray objects, but the one on the left may be faster and more - memory efficient if the underlying file f is big. - - read(f, subtensor) <===> read(f)[*subtensor] - - Support for subtensors is currently spotty, so check the code to see if your - particular type of subtensor is supported. - - """ - magic_t, elsize, ndim, dim, dim_size = _read_header(f,debug) - f_start = f.tell() - - rval = None - if subtensor is None: - rval = numpy.fromfile(f, dtype=magic_t, count=_prod(dim)).reshape(dim) - elif isinstance(subtensor, slice): - if subtensor.step not in (None, 1): - raise NotImplementedError('slice with step', subtensor.step) - if subtensor.start not in (None, 0): - bytes_per_row = _prod(dim[1:]) * elsize - f.seek(f_start + subtensor.start * bytes_per_row) - dim[0] = min(dim[0], subtensor.stop) - subtensor.start - rval = numpy.fromfile(f, dtype=magic_t, count=_prod(dim)).reshape(dim) - else: - raise NotImplementedError('subtensor access not written yet:', subtensor) - - return rval - -def write(f, mat): - """Write a numpy.ndarray to file. 
- - @param f: file into which to write - @type f: file-like object - - @param mat: array to write to file - @type mat: numpy ndarray or compatible - - """ - def _write_int32(f, i): - i_array = numpy.asarray(i, dtype='int32') - if 0: print 'writing int32', i, i_array - i_array.tofile(f) - - try: - _write_int32(f, _dtype_magic[str(mat.dtype)]) - except KeyError: - raise TypeError('Invalid ndarray dtype for filetensor format', mat.dtype) - - _write_int32(f, len(mat.shape)) - shape = mat.shape - if len(shape) < 3: - shape = list(shape) + [1] * (3 - len(shape)) - if 0: print 'writing shape =', shape - for sh in shape: - _write_int32(f, sh) - mat.tofile(f) - diff -r 6f606b359df3 -r a9af079892ce transformations/gimp_script.py --- a/transformations/gimp_script.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -''' -Filtres GIMP sous Python -Auteur: Nicolas Boulanger-Lewandowski -Date: Hiver 2010 - -run with: gimp -i --batch-interpreter python-fu-eval --batch - < gimp_script.py -end with: pdb.gimp_quit(0) - -Implémente le motionblur et le pinch -''' - -from gimpfu import * -import numpy - -img = gimp.Image(32, 32, GRAY) -img.disable_undo() -layer1 = gimp.Layer(img, "layer1", 32, 32, GRAY_IMAGE, 100, NORMAL_MODE) -img.add_layer(layer1, 0) -dest_rgn = layer1.get_pixel_rgn(0, 0, 32, 32, True) - -def setpix(image): - dest_rgn[:,:] = (image.T*255).astype(numpy.uint8).tostring() - layer1.flush() - layer1.update(0, 0, 32, 32) - -def getpix(): - return numpy.fromstring(dest_rgn[:,:], 'UInt8').astype(numpy.float32).reshape((32,32)).T / 255.0 - -class GIMP1(): - def get_settings_names(self): - return ['mblur_length', 'mblur_angle', 'pinch'] - - def regenerate_parameters(self, complexity): - if complexity: - self.mblur_length = abs(int(round(numpy.random.normal(0, 3*complexity)))) - else: - self.mblur_length = 0 - self.mblur_angle = int(round(numpy.random.uniform(0,360))) - self.pinch = numpy.random.uniform(-complexity, 0.7*complexity) - - return [self.mblur_length, self.mblur_angle, self.pinch] - - def transform_image(self, image): - if self.mblur_length or self.pinch: - setpix(image) - if self.mblur_length: - pdb.plug_in_mblur(img, layer1, 0, self.mblur_length, self.mblur_angle, 0, 0) - if self.pinch: - pdb.plug_in_whirl_pinch(img, layer1, 0.0, self.pinch, 1.0) - image = getpix() - - return image - -# test -if __name__ == '__main__': - import Image - im = numpy.asarray(Image.open("a.bmp").convert("L")) / 255.0 - - test = GIMP1() - print test.get_settings_names(), '=', test.regenerate_parameters(1) - #for i in range(1000): - im = test.transform_image(im) - - import pylab - pylab.imshow(im, pylab.matplotlib.cm.Greys_r) - pylab.show() - - pdb.gimp_quit(0) diff -r 6f606b359df3 -r a9af079892ce transformations/image_tiling.py --- a/transformations/image_tiling.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,86 +0,0 @@ -""" -Illustrate filters (or data) in a grid of small image-shaped tiles. 
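# A tiny round-trip sketch for the filetensor read/write pair defined above
# ('example.ft' is only a placeholder name; any array whose dtype appears in
# _dtype_magic works):
import numpy

mat = numpy.arange(12, dtype='float32').reshape(3, 4)
f = open('example.ft', 'wb')
write(f, mat)
f.close()
f = open('example.ft', 'rb')
assert (read(f) == mat).all()
f.close()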
- -Note: taken from the pylearn codebase on Feb 4, 2010 (fsavard) -""" - -import numpy -from PIL import Image - -def scale_to_unit_interval(ndar,eps=1e-8): - ndar = ndar.copy() - ndar -= ndar.min() - ndar *= 1.0 / (ndar.max()+eps) - return ndar - -def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0,0), - scale_rows_to_unit_interval=True, - output_pixel_vals=True - ): - """ - Transform an array with one flattened image per row, into an array in which images are - reshaped and layed out like tiles on a floor. - - This function is useful for visualizing datasets whose rows are images, and also columns of - matrices for transforming those rows (such as the first layer of a neural net). - - :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can be 2-D ndarrays or None - :param X: a 2-D array in which every row is a flattened image. - :type img_shape: tuple; (height, width) - :param img_shape: the original shape of each image - :type tile_shape: tuple; (rows, cols) - :param tile_shape: the number of images to tile (rows, cols) - - :returns: array suitable for viewing as an image. (See:`PIL.Image.fromarray`.) - :rtype: a 2-d array with same dtype as X. - - """ - assert len(img_shape) == 2 - assert len(tile_shape) == 2 - assert len(tile_spacing) == 2 - - out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp - in zip(img_shape, tile_shape, tile_spacing)] - - if isinstance(X, tuple): - assert len(X) == 4 - if output_pixel_vals: - out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype='uint8') - else: - out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype=X.dtype) - - #colors default to 0, alpha defaults to 1 (opaque) - if output_pixel_vals: - channel_defaults = [0,0,0,255] - else: - channel_defaults = [0.,0.,0.,1.] - - for i in xrange(4): - if X[i] is None: - out_array[:,:,i] = numpy.zeros(out_shape, - dtype='uint8' if output_pixel_vals else out_array.dtype - )+channel_defaults[i] - else: - out_array[:,:,i] = tile_raster_images(X[i], img_shape, tile_shape, tile_spacing, scale_rows_to_unit_interval, output_pixel_vals) - return out_array - - else: - H, W = img_shape - Hs, Ws = tile_spacing - - out_array = numpy.zeros(out_shape, dtype='uint8' if output_pixel_vals else X.dtype) - for tile_row in xrange(tile_shape[0]): - for tile_col in xrange(tile_shape[1]): - if tile_row * tile_shape[1] + tile_col < X.shape[0]: - if scale_rows_to_unit_interval: - this_img = scale_to_unit_interval(X[tile_row * tile_shape[1] + tile_col].reshape(img_shape)) - else: - this_img = X[tile_row * tile_shape[1] + tile_col].reshape(img_shape) - out_array[ - tile_row * (H+Hs):tile_row*(H+Hs)+H, - tile_col * (W+Ws):tile_col*(W+Ws)+W - ] \ - = this_img * (255 if output_pixel_vals else 1) - return out_array - - diff -r 6f606b359df3 -r a9af079892ce transformations/local_elastic_distortions.py --- a/transformations/local_elastic_distortions.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,453 +0,0 @@ -#!/usr/bin/python -# coding: utf-8 - -''' -Implementation of elastic distortions as described in -Simard, Steinkraus, Platt, "Best Practices for Convolutional - Neural Networks Applied to Visual Document Analysis", 2003 - -Author: François Savard -Date: Fall 2009, revised Winter 2010 - -Usage: create the Distorter with proper alpha, sigma etc. - Then each time you want to change the distortion field applied, - call regenerate_field(). 
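# A short usage sketch for tile_raster_images above: lay the first 100 rows of
# a (n_examples, 1024) array out as a 10x10 grid of 32x32 tiles
# ('tiles_preview.png' is a placeholder output name).
import numpy
from PIL import Image

X = numpy.random.rand(100, 1024).astype('float32')  # stand-in for real data
tiles = tile_raster_images(X, img_shape=(32, 32), tile_shape=(10, 10),
                           tile_spacing=(1, 1))
Image.fromarray(tiles).save('tiles_preview.png')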
- - (The point behind this is that regeneration takes some time, - so we better reuse the fields a few times) -''' - -import sys -import math -import numpy -import numpy.random -import scipy.signal # convolve2d - -_TEST_DIR = "/u/savardf/ift6266/debug_images/" - -def _raw_zeros(size): - return [[0 for i in range(size[1])] for j in range(size[0])] - -class ElasticDistortionParams(): - def __init__(self, image_size=(32,32), alpha=0.0, sigma=0.0): - self.image_size = image_size - self.alpha = alpha - self.sigma = sigma - - h,w = self.image_size - - self.matrix_tl_corners_rows = _raw_zeros((h,w)) - self.matrix_tl_corners_cols = _raw_zeros((h,w)) - - self.matrix_tr_corners_rows = _raw_zeros((h,w)) - self.matrix_tr_corners_cols = _raw_zeros((h,w)) - - self.matrix_bl_corners_rows = _raw_zeros((h,w)) - self.matrix_bl_corners_cols = _raw_zeros((h,w)) - - self.matrix_br_corners_rows = _raw_zeros((h,w)) - self.matrix_br_corners_cols = _raw_zeros((h,w)) - - # those will hold the precomputed ratios for - # bilinear interpolation - self.matrix_tl_multiply = numpy.zeros((h,w)) - self.matrix_tr_multiply = numpy.zeros((h,w)) - self.matrix_bl_multiply = numpy.zeros((h,w)) - self.matrix_br_multiply = numpy.zeros((h,w)) - - def alpha_sigma(self): - return [self.alpha, self.sigma] - -class LocalElasticDistorter(): - def __init__(self, image_size=(32,32)): - self.image_size = image_size - - self.current_complexity_10 = 0 - self.current_complexity = 0 - - # number of precomputed fields - # (principle: as complexity doesn't change often, we can - # precompute a certain number of fields for a given complexity, - # each with its own parameters. That way, we have good - # randomization, but we're much faster). - self.to_precompute_per_complexity = 50 - - # Both use ElasticDistortionParams - self.current_params = None - self.precomputed_params = [[] for i in range(10)] - - # - self.kernel_size = None - self.kernel = None - - # set some defaults - self.regenerate_parameters(0.0) - - def get_settings_names(self): - return ['alpha', 'sigma'] - - def _floor_complexity(self, complexity): - return self._to_complexity_10(complexity) / 10.0 - - def _to_complexity_10(self, complexity): - return min(9, max(0, int(complexity * 10))) - - def regenerate_parameters(self, complexity): - complexity_10 = self._to_complexity_10(complexity) - - if complexity_10 != self.current_complexity_10: - self.current_complexity_10 = complexity_10 - self.current_complexity = self._floor_complexity(complexity) - - if len(self.precomputed_params[complexity_10]) <= self.to_precompute_per_complexity: - # not yet enough params generated, produce one more - # and append to list - new_params = self._initialize_new_params() - new_params = self._generate_fields(new_params) - self.current_params = new_params - self.precomputed_params[complexity_10].append(new_params) - else: - # if we have enough precomputed fields, just select one - # at random and set parameters to match what they were - # when the field was generated - idx = numpy.random.randint(0, len(self.precomputed_params[complexity_10])) - self.current_params = self.precomputed_params[complexity_10][idx] - - # don't return anything, to avoid storing deterministic parameters - return [] # self.current_params.alpha_sigma() - - def get_parameters_determined_by_complexity(self, complexity): - tmp_params = self._initialize_new_params(_floor_complexity(complexity)) - return tmp_params.alpha_sigma() - - # adapted from http://blenderartists.org/forum/showthread.php?t=163361 - def _gen_gaussian_kernel(self, 
sigma): - # the kernel size can change DRAMATICALLY the time - # for the blur operation... so even though results are better - # with a bigger kernel, we need to compromise here - # 1*s is very different from 2*s, but there's not much difference - # between 2*s and 4*s - ks = self.kernel_size - s = sigma - target_ks = (1.5*s, 1.5*s) - if not ks is None and ks[0] == target_ks[0] and ks[1] == target_ks[1]: - # kernel size is good, ok, no need to regenerate - return - self.kernel_size = target_ks - h,w = self.kernel_size - a,b = h/2.0, w/2.0 - y,x = numpy.ogrid[0:w, 0:h] - gauss = numpy.exp(-numpy.square((x-a)/s))*numpy.exp(-numpy.square((y-b)/s)) - # Normalize so we don't reduce image intensity - self.kernel = gauss/gauss.sum() - - def _gen_distortion_field(self, params): - self._gen_gaussian_kernel(params.sigma) - - # we add kernel_size on all four sides so blurring - # with the kernel produces a smoother result on borders - ks0 = self.kernel_size[0] - ks1 = self.kernel_size[1] - sz0 = self.image_size[1] + ks0 - sz1 = self.image_size[0] + ks1 - field = numpy.random.uniform(-1.0, 1.0, (sz0, sz1)) - field = scipy.signal.convolve2d(field, self.kernel, mode='same') - - # crop only image_size in the middle - field = field[ks0:ks0+self.image_size[0], ks1:ks1+self.image_size[1]] - - return params.alpha * field - - - def _initialize_new_params(self, complexity=None): - if not complexity: - complexity = self.current_complexity - - params = ElasticDistortionParams(self.image_size) - - # pour faire progresser la complexité un peu plus vite - # tout en gardant les extrêmes de 0.0 et 1.0 - complexity = complexity ** (1./3.) - - # the smaller the alpha, the closest the pixels are fetched - # a max of 10 is reasonable - params.alpha = complexity * 10.0 - - # the bigger the sigma, the smoother is the distortion - # max of 1 is "reasonable", but produces VERY noisy results - # And the bigger the sigma, the bigger the blur kernel, and the - # slower the field generation, btw. - params.sigma = 10.0 - (7.0 * complexity) - - return params - - def _generate_fields(self, params): - ''' - Here's how the code works: - - We first generate "distortion fields" for x and y with these steps: - - Uniform noise over [-1, 1] in a matrix of size (h,w) - - Blur with a Gaussian kernel of spread sigma - - Multiply by alpha - - Then (conceptually) to compose the distorted image, we loop over each pixel - of the new image and use the corresponding x and y distortions - (from the matrices generated above) to identify pixels - of the old image from which we fetch color data. As the - coordinates are not integer, we interpolate between the - 4 nearby pixels (top left, top right etc.). - - That's just conceptually. Here I'm using matrix operations - to speed up the computation. I first identify the 4 nearby - pixels in the old image for each pixel in the distorted image. - I can then use them as "fancy indices" to extract the proper - pixels for each new pixel. - - Then I multiply those extracted nearby points by precomputed - ratios for the bilinear interpolation. 
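# A minimal worked version of the bilinear combination described above, for a
# single fetched coordinate (y + distort_y, x + distort_x), without the
# out-of-bounds handling that the precomputed matrices add:
import math

def bilinear_fetch(image, ty, tx):
    top, left = int(math.floor(ty)), int(math.floor(tx))
    y_ratio = 1.0 - (ty - top)   # weight of the top row
    x_ratio = 1.0 - (tx - left)  # weight of the left column
    return (image[top, left]         * x_ratio         * y_ratio +
            image[top, left + 1]     * (1.0 - x_ratio) * y_ratio +
            image[top + 1, left]     * x_ratio         * (1.0 - y_ratio) +
            image[top + 1, left + 1] * (1.0 - x_ratio) * (1.0 - y_ratio))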
- ''' - - p = params - - dist_fields = [None, None] - dist_fields[0] = self._gen_distortion_field(params) - dist_fields[1] = self._gen_distortion_field(params) - - #pylab.imshow(dist_fields[0]) - #pylab.show() - - # regenerate distortion index matrices - # "_rows" are row indices - # "_cols" are column indices - # (separated due to the way fancy indexing works in numpy) - h,w = p.image_size - - for y in range(h): - for x in range(w): - distort_x = dist_fields[0][y,x] - distort_y = dist_fields[1][y,x] - - # the "target" is the coordinate we fetch color data from - # (in the original image) - # target_left and _top are the rounded coordinate on the - # left/top of this target (float) coordinate - target_pixel = (y+distort_y, x+distort_x) - - target_left = int(math.floor(x + distort_x)) - target_top = int(math.floor(y + distort_y)) - - index_tl = [target_top, target_left] - index_tr = [target_top, target_left+1] - index_bl = [target_top+1, target_left] - index_br = [target_top+1, target_left+1] - - # x_ratio is the ratio of importance of left pixels - # y_ratio is the """" of top pixels - # (in bilinear combination) - y_ratio = 1.0 - (target_pixel[0] - target_top) - x_ratio = 1.0 - (target_pixel[1] - target_left) - - # We use a default background color of 0 for displacements - # outside of boundaries of the image. - - # if top left outside bounds - if index_tl[0] < 0 or index_tl[0] >= h or index_tl[1] < 0 or index_tl[1] >= w: - p.matrix_tl_corners_rows[y][x] = 0 - p.matrix_tl_corners_cols[y][x] = 0 - p.matrix_tl_multiply[y,x] = 0 - else: - p.matrix_tl_corners_rows[y][x] = index_tl[0] - p.matrix_tl_corners_cols[y][x] = index_tl[1] - p.matrix_tl_multiply[y,x] = x_ratio*y_ratio - - # if top right outside bounds - if index_tr[0] < 0 or index_tr[0] >= h or index_tr[1] < 0 or index_tr[1] >= w: - p.matrix_tr_corners_rows[y][x] = 0 - p.matrix_tr_corners_cols[y][x] = 0 - p.matrix_tr_multiply[y,x] = 0 - else: - p.matrix_tr_corners_rows[y][x] = index_tr[0] - p.matrix_tr_corners_cols[y][x] = index_tr[1] - p.matrix_tr_multiply[y,x] = (1.0-x_ratio)*y_ratio - - # if bottom left outside bounds - if index_bl[0] < 0 or index_bl[0] >= h or index_bl[1] < 0 or index_bl[1] >= w: - p.matrix_bl_corners_rows[y][x] = 0 - p.matrix_bl_corners_cols[y][x] = 0 - p.matrix_bl_multiply[y,x] = 0 - else: - p.matrix_bl_corners_rows[y][x] = index_bl[0] - p.matrix_bl_corners_cols[y][x] = index_bl[1] - p.matrix_bl_multiply[y,x] = x_ratio*(1.0-y_ratio) - - # if bottom right outside bounds - if index_br[0] < 0 or index_br[0] >= h or index_br[1] < 0 or index_br[1] >= w: - p.matrix_br_corners_rows[y][x] = 0 - p.matrix_br_corners_cols[y][x] = 0 - p.matrix_br_multiply[y,x] = 0 - else: - p.matrix_br_corners_rows[y][x] = index_br[0] - p.matrix_br_corners_cols[y][x] = index_br[1] - p.matrix_br_multiply[y,x] = (1.0-x_ratio)*(1.0-y_ratio) - - # not really necessary, but anyway - return p - - def transform_image(self, image): - p = self.current_params - - # index pixels to get the 4 corners for bilinear combination - tl_pixels = image[p.matrix_tl_corners_rows, p.matrix_tl_corners_cols] - tr_pixels = image[p.matrix_tr_corners_rows, p.matrix_tr_corners_cols] - bl_pixels = image[p.matrix_bl_corners_rows, p.matrix_bl_corners_cols] - br_pixels = image[p.matrix_br_corners_rows, p.matrix_br_corners_cols] - - # bilinear ratios, elemwise multiply - tl_pixels = numpy.multiply(tl_pixels, p.matrix_tl_multiply) - tr_pixels = numpy.multiply(tr_pixels, p.matrix_tr_multiply) - bl_pixels = numpy.multiply(bl_pixels, p.matrix_bl_multiply) - br_pixels = 
numpy.multiply(br_pixels, p.matrix_br_multiply) - - # sum to finish bilinear combination - return numpy.sum([tl_pixels,tr_pixels,bl_pixels,br_pixels], axis=0).astype(numpy.float32) - -# TESTS ---------------------------------------------------------------------- - -def _load_image(filepath): - _RGB_TO_GRAYSCALE = [0.3, 0.59, 0.11, 0.0] - img = Image.open(filepath) - img = numpy.asarray(img) - if len(img.shape) > 2: - img = (img * _RGB_TO_GRAYSCALE).sum(axis=2) - return (img / 255.0).astype('float') - -def _specific_test(): - imgpath = os.path.join(_TEST_DIR, "d.png") - img = _load_image(imgpath) - dist = LocalElasticDistorter((32,32)) - print dist.regenerate_parameters(0.5) - img = dist.transform_image(img) - print dist.get_parameters_determined_by_complexity(0.4) - pylab.imshow(img) - pylab.show() - -def _complexity_tests(): - imgpath = os.path.join(_TEST_DIR, "d.png") - dist = LocalElasticDistorter((32,32)) - orig_img = _load_image(imgpath) - html_content = '''Original:
    ''' - for complexity in numpy.arange(0.0, 1.1, 0.1): - html_content += '
    Complexity: ' + str(complexity) + '
    ' - for i in range(10): - t1 = time.time() - dist.regenerate_parameters(complexity) - t2 = time.time() - print "diff", t2-t1 - img = dist.transform_image(orig_img) - filename = "complexity_" + str(complexity) + "_" + str(i) + ".png" - new_path = os.path.join(_TEST_DIR, filename) - _save_image(img, new_path) - html_content += '' - html_content += "" - html_file = open(os.path.join(_TEST_DIR, "complexity.html"), "w") - html_file.write(html_content) - html_file.close() - -def _complexity_benchmark(): - imgpath = os.path.join(_TEST_DIR, "d.png") - dist = LocalElasticDistorter((32,32)) - orig_img = _load_image(imgpath) - - for cpx in (0.21, 0.35): - # time the first 10 - t1 = time.time() - for i in range(10): - dist.regenerate_parameters(cpx) - img = dist.transform_image(orig_img) - t2 = time.time() - - print "first 10, total = ", t2-t1, ", avg=", (t2-t1)/10 - - # time the next 40 - t1 = time.time() - for i in range(40): - dist.regenerate_parameters(cpx) - img = dist.transform_image(orig_img) - t2 = time.time() - - print "next 40, total = ", t2-t1, ", avg=", (t2-t1)/40 - - # time the next 50 - t1 = time.time() - for i in range(50): - dist.regenerate_parameters(cpx) - img = dist.transform_image(orig_img) - t2 = time.time() - - print "next 50, total = ", t2-t1, ", avg=", (t2-t1)/50 - - # time the next 1000 - t1 = time.time() - for i in range(1000): - dist.regenerate_parameters(cpx) - img = dist.transform_image(orig_img) - t2 = time.time() - - print "next 1000, total = ", t2-t1, ", avg=", (t2-t1)/1000 - - # time the next 1000 with old complexity - t1 = time.time() - for i in range(1000): - dist.regenerate_parameters(0.21) - img = dist.transform_image(orig_img) - t2 = time.time() - - print "next 1000, total = ", t2-t1, ", avg=", (t2-t1)/1000 - - - - -def _save_image(img, path): - img2 = Image.fromarray((img * 255).astype('uint8'), "L") - img2.save(path) - -# TODO: reformat to follow new class... 
it function of complexity now -''' -def _distorter_tests(): - #import pylab - #pylab.imshow(img) - #pylab.show() - - for letter in ("d", "a", "n", "o"): - img = _load_image("tests/" + letter + ".png") - for alpha in (1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0): - for sigma in (1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0): - id = LocalElasticDistorter((32,32)) - img2 = id.distort_image(img) - img2 = Image.fromarray((img2 * 255).astype('uint8'), "L") - img2.save("tests/"+letter+"_alpha"+str(alpha)+"_sigma"+str(sigma)+".png") -''' - -def _benchmark(): - img = _load_image("tests/d.png") - dist = LocalElasticDistorter((32,32)) - dist.regenerate_parameters(0.0) - import time - t1 = time.time() - for i in range(10000): - if i % 1000 == 0: - print "-" - dist.distort_image(img) - t2 = time.time() - print "t2-t1", t2-t1 - print "avg", 10000/(t2-t1) - -if __name__ == '__main__': - import time - import pylab - import Image - import os.path - #_distorter_tests() - #_benchmark() - #_specific_test() - #_complexity_tests() - _complexity_benchmark() - - - diff -r 6f606b359df3 -r a9af079892ce transformations/pipeline.py --- a/transformations/pipeline.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,365 +0,0 @@ -#!/usr/bin/python -# coding: utf-8 - -from __future__ import with_statement - -# This is intended to be run as a GIMP script -#from gimpfu import * - -import sys, os, getopt -import numpy -import filetensor as ft -import random - -# To debug locally, also call with -s 100 (to stop after ~100) -# (otherwise we allocate all needed memory, might be loonnng and/or crash -# if, lucky like me, you have an age-old laptop creaking from everywhere) -DEBUG = False -DEBUG_X = False -if DEBUG: - DEBUG_X = False # Debug under X (pylab.show()) - -DEBUG_IMAGES_PATH = None -if DEBUG: - # UNTESTED YET - # To avoid loading NIST if you don't have it handy - # (use with debug_images_iterator(), see main()) - # To use NIST, leave as = None - DEBUG_IMAGES_PATH = None#'/home/francois/Desktop/debug_images' - -# Directory where to dump images to visualize results -# (create it, otherwise it'll crash) -DEBUG_OUTPUT_DIR = 'debug_out' - -DEFAULT_NIST_PATH = '/data/lisa/data/ift6266h10/train_data.ft' -DEFAULT_LABEL_PATH = '/data/lisa/data/ift6266h10/train_labels.ft' -DEFAULT_OCR_PATH = '/data/lisa/data/ocr_breuel/filetensor/unlv-corrected-2010-02-01-shuffled.ft' -DEFAULT_OCRLABEL_PATH = '/data/lisa/data/ocr_breuel/filetensor/unlv-corrected-2010-02-01-labels-shuffled.ft' -ARGS_FILE = os.environ['PIPELINE_ARGS_TMPFILE'] - -if DEBUG_X: - import pylab - pylab.ion() - -#from add_background_image import AddBackground -#from affine_transform import AffineTransformation -from PoivreSel import PoivreSel -from thick import Thick -#from BruitGauss import BruitGauss -#from gimp_script import GIMPTransformation -#from Rature import Rature -from contrast import Contrast -from local_elastic_distortions import LocalElasticDistorter -from slant import Slant - -if DEBUG: - from visualizer import Visualizer - # Either put the visualizer as in the MODULES_INSTANCES list - # after each module you want to visualize, or in the - # AFTER_EACH_MODULE_HOOK list (but not both, it's redundant) - VISUALIZER = Visualizer(to_dir=DEBUG_OUTPUT_DIR, on_screen=False) - -MODULE_INSTANCES = [LocalElasticDistorter()] - -# These should have a "after_transform_callback(self, image)" method -# (called after each call to transform_image in a module) -AFTER_EACH_MODULE_HOOK = [] -if DEBUG: - AFTER_EACH_MODULE_HOOK = [VISUALIZER] - -# 
These should have a "end_transform_callback(self, final_image" method -# (called after all modules have been called) -END_TRANSFORM_HOOK = [] -if DEBUG: - END_TRANSFORM_HOOK = [VISUALIZER] - -class Pipeline(): - def __init__(self, modules, num_img, image_size=(32,32)): - self.modules = modules - self.num_img = num_img - self.num_params_stored = 0 - self.image_size = image_size - - self.init_memory() - - def init_num_params_stored(self): - # just a dummy call to regenerate_parameters() to get the - # real number of params (only those which are stored) - self.num_params_stored = 0 - for m in self.modules: - self.num_params_stored += len(m.regenerate_parameters(0.0)) - - def init_memory(self): - self.init_num_params_stored() - - total = self.num_img - num_px = self.image_size[0] * self.image_size[1] - - self.res_data = numpy.empty((total, num_px), dtype=numpy.uint8) - # +1 to store complexity - self.params = numpy.empty((total, self.num_params_stored+1)) - self.res_labels = numpy.empty(total, dtype=numpy.int32) - - def run(self, img_iterator, complexity_iterator): - img_size = self.image_size - - should_hook_after_each = len(AFTER_EACH_MODULE_HOOK) != 0 - should_hook_at_the_end = len(END_TRANSFORM_HOOK) != 0 - - for img_no, (img, label) in enumerate(img_iterator): - sys.stdout.flush() - complexity = complexity_iterator.next() - - global_idx = img_no - - img = img.reshape(img_size) - - param_idx = 1 - # store complexity along with other params - self.params[global_idx, 0] = complexity - for mod in self.modules: - # This used to be done _per batch_, - # ie. out of the "for img" loop - p = mod.regenerate_parameters(complexity) - self.params[global_idx, param_idx:param_idx+len(p)] = p - param_idx += len(p) - - img = mod.transform_image(img) - - if should_hook_after_each: - for hook in AFTER_EACH_MODULE_HOOK: - hook.after_transform_callback(img) - - self.res_data[global_idx] = \ - img.reshape((img_size[0] * img_size[1],))*255 - self.res_labels[global_idx] = label - - if should_hook_at_the_end: - for hook in END_TRANSFORM_HOOK: - hook.end_transform_callback(img) - - def write_output(self, output_file_path, params_output_file_path, labels_output_file_path): - with open(output_file_path, 'wb') as f: - ft.write(f, self.res_data) - - numpy.save(params_output_file_path, self.params) - - with open(labels_output_file_path, 'wb') as f: - ft.write(f, self.res_labels) - - -############################################################################## -# COMPLEXITY ITERATORS -# They're called once every img, to get the complexity to use for that img -# they must be infinite (should never throw StopIteration when calling next()) - -# probability of generating 0 complexity, otherwise -# uniform over 0.0-max_complexity -def range_complexity_iterator(probability_zero, max_complexity): - assert max_complexity <= 1.0 - n = numpy.random.uniform(0.0, 1.0) - while True: - if n < probability_zero: - yield 0.0 - else: - yield numpy.random.uniform(0.0, max_complexity) - -############################################################################## -# DATA ITERATORS -# They can be used to interleave different data sources etc. 
- -''' -# Following code (DebugImages and iterator) is untested - -def load_image(filepath): - _RGB_TO_GRAYSCALE = [0.3, 0.59, 0.11, 0.0] - img = Image.open(filepath) - img = numpy.asarray(img) - if len(img.shape) > 2: - img = (img * _RGB_TO_GRAYSCALE).sum(axis=2) - return (img / 255.0).astype('float') - -class DebugImages(): - def __init__(self, images_dir_path): - import glob, os.path - self.filelist = glob.glob(os.path.join(images_dir_path, "*.png")) - -def debug_images_iterator(debug_images): - for path in debug_images.filelist: - yield load_image(path) -''' - -class NistData(): - def __init__(self, nist_path, label_path, ocr_path, ocrlabel_path): - self.train_data = open(nist_path, 'rb') - self.train_labels = open(label_path, 'rb') - self.dim = tuple(ft._read_header(self.train_data)[3]) - # in order to seek to the beginning of the file - self.train_data.close() - self.train_data = open(nist_path, 'rb') - self.ocr_data = open(ocr_path, 'rb') - self.ocr_labels = open(ocrlabel_path, 'rb') - -def nist_supp_iterator(nist, prob_font, prob_captcha, prob_ocr, num_img): - img = ft.read(nist.train_data).astype(numpy.float32)/255 - labels = ft.read(nist.train_labels) - if prob_ocr: - ocr_img = ft.read(nist.ocr_data).astype(numpy.float32)/255 - ocr_labels = ft.read(nist.ocr_labels) - - for i in xrange(num_img): - r = numpy.random.rand() - if r <= prob_font: - pass #get font - elif r <= prob_font + prob_captcha: - pass #get captcha - elif r <= prob_font + prob_captcha + prob_ocr: - j = numpy.random.randint(len(ocr_labels)) - yield ocr_img[j], ocr_labels[j] - else: - j = numpy.random.randint(len(labels)) - yield img[j], labels[j] - - -# Mostly for debugging, for the moment, just to see if we can -# reload the images and parameters. -def reload(output_file_path, params_output_file_path): - images_ft = open(output_file_path, 'rb') - images_ft_dim = tuple(ft._read_header(images_ft)[3]) - - print "Images dimensions: ", images_ft_dim - - params = numpy.load(params_output_file_path) - - print "Params dimensions: ", params.shape - print params - - -############################################################################## -# MAIN - -def usage(): - print ''' -Usage: run_pipeline.sh [-m ...] [-z ...] [-o ...] [-p ...] - -m, --max-complexity: max complexity to generate for an image - -z, --probability-zero: probability of using complexity=0 for an image - -o, --output-file: full path to file to use for output of images - -p, --params-output-file: path to file to output params to - -x, --labels-output-file: path to file to output labels to - -f, --data-file: path to filetensor (.ft) data file (NIST) - -l, --label-file: path to filetensor (.ft) labels file (NIST labels) - -c, --ocr-file: path to filetensor (.ft) data file (OCR) - -d, --ocrlabel-file: path to filetensor (.ft) labels file (OCR labels) - -a, --prob-font: probability of using a raw font image - -b, --prob-captcha: probability of using a captcha image - -e, --prob-ocr: probability of using an ocr image - ''' - -# See run_pipeline.py -def get_argv(): - with open(ARGS_FILE) as f: - args = [l.rstrip() for l in f.readlines()] - return args - -# Might be called locally or through dbidispatch. In all cases it should be -# passed to the GIMP executable to be able to use GIMP filters. 
-# Ex: -def _main(): - #global DEFAULT_NIST_PATH, DEFAULT_LABEL_PATH, DEFAULT_OCR_PATH, DEFAULT_OCRLABEL_PATH - #global getopt, get_argv - - max_complexity = 0.5 # default - probability_zero = 0.1 # default - output_file_path = None - params_output_file_path = None - labels_output_file_path = None - nist_path = DEFAULT_NIST_PATH - label_path = DEFAULT_LABEL_PATH - ocr_path = DEFAULT_OCR_PATH - ocrlabel_path = DEFAULT_OCRLABEL_PATH - prob_font = 0.0 - prob_captcha = 0.0 - prob_ocr = 0.0 - stop_after = None - reload_mode = False - - try: - opts, args = getopt.getopt(get_argv(), "rm:z:o:p:x:s:f:l:c:d:a:b:e:", ["reload","max-complexity=", "probability-zero=", "output-file=", "params-output-file=", "labels-output-file=", "stop-after=", "data-file=", "label-file=", "ocr-file=", "ocrlabel-file=", "prob-font=", "prob-captcha=", "prob-ocr="]) - except getopt.GetoptError, err: - # print help information and exit: - print str(err) # will print something like "option -a not recognized" - usage() - pdb.gimp_quit(0) - sys.exit(2) - - for o, a in opts: - if o in ('-m', '--max-complexity'): - max_complexity = float(a) - assert max_complexity >= 0.0 and max_complexity <= 1.0 - elif o in ('-r', '--reload'): - reload_mode = True - elif o in ("-z", "--probability-zero"): - probability_zero = float(a) - assert probability_zero >= 0.0 and probability_zero <= 1.0 - elif o in ("-o", "--output-file"): - output_file_path = a - elif o in ('-p', "--params-output-file"): - params_output_file_path = a - elif o in ('-x', "--labels-output-file"): - labels_output_file_path = a - elif o in ('-s', "--stop-after"): - stop_after = int(a) - elif o in ('-f', "--data-file"): - nist_path = a - elif o in ('-l', "--label-file"): - label_path = a - elif o in ('-c', "--ocr-file"): - ocr_path = a - elif o in ('-d', "--ocrlabel-file"): - ocrlabel_path = a - elif o in ('-a', "--prob-font"): - prob_font = float(a) - elif o in ('-b', "--prob-captcha"): - prob_captcha = float(a) - elif o in ('-e', "--prob-ocr"): - prob_ocr = float(a) - else: - assert False, "unhandled option" - - if output_file_path == None or params_output_file_path == None or labels_output_file_path == None: - print "Must specify the three output files." 
- usage() - pdb.gimp_quit(0) - sys.exit(2) - - if reload_mode: - reload(output_file_path, params_output_file_path) - else: - if DEBUG_IMAGES_PATH: - ''' - # This code is yet untested - debug_images = DebugImages(DEBUG_IMAGES_PATH) - num_img = len(debug_images.filelist) - pl = Pipeline(modules=MODULE_INSTANCES, num_img=num_img, image_size=(32,32)) - img_it = debug_images_iterator(debug_images) - ''' - else: - nist = NistData(nist_path, label_path, ocr_path, ocrlabel_path) - num_img = 819200 # 800 Mb file - if stop_after: - num_img = stop_after - pl = Pipeline(modules=MODULE_INSTANCES, num_img=num_img, image_size=(32,32)) - img_it = nist_supp_iterator(nist, prob_font, prob_captcha, prob_ocr, num_img) - - cpx_it = range_complexity_iterator(probability_zero, max_complexity) - pl.run(img_it, cpx_it) - pl.write_output(output_file_path, params_output_file_path, labels_output_file_path) - -_main() - -if DEBUG_X: - pylab.ioff() - pylab.show() - -pdb.gimp_quit(0) - diff -r 6f606b359df3 -r a9af079892ce transformations/run_pipeline.sh --- a/transformations/run_pipeline.sh Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,20 +0,0 @@ -#!/bin/bash - -# This is one _ugly_ hack, but I couldn't figure out how -# to cleanly pass command line options to the script if -# invoking using the "gimp --batch < script.py" syntax - -# Basically I create a temp file, put the args into it, -# then the script gets the filename and reads back the -# args - -export PIPELINE_ARGS_TMPFILE=`mktemp` - -for arg in "$@" -do - echo $arg >> $PIPELINE_ARGS_TMPFILE -done - -gimp -i --batch-interpreter python-fu-eval --batch - < pipeline.py - - diff -r 6f606b359df3 -r a9af079892ce transformations/slant.py --- a/transformations/slant.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,104 +0,0 @@ -#!/usr/bin/python -# coding: utf-8 - -''' -Author: Youssouf - -this module add a slant effect to the image. - -To obtain the slant effect, each row of the array is shifted proportionately by a step controlled by the complexity. 
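In other words, row i is circularly shifted by roughly i * slant pixels. A minimal equivalent of that per-row shift, ignoring the centering correction applied afterwards in transform_image, is sketched below (editor's illustration; slant_rows is a hypothetical name):

import numpy

def slant_rows(image, slant):
    out = image.copy()
    for i in range(image.shape[0]):
        # wrap-around shift of row i by round(i * slant) pixels
        out[i] = numpy.roll(image[i], int(round(i * slant)))
    return out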
- -''' - -import numpy - - -class Slant(): - def __init__(self, complexity=1): - #---------- private attributes - self.direction = 1 - self.angle = 0 - - #---------- generation parameters - self.regenerate_parameters(complexity) - #------------------------------------------------ - - def _get_current_parameters(self): - return [self.angle, self.direction] - - def get_settings_names(self): - return ['angle', 'direction'] - - def regenerate_parameters(self, complexity): - self.angle = numpy.random.uniform(0.0, complexity) - P = numpy.random.uniform() - self.direction = 1; - if P < 0.5: - self.direction = -1; - return self._get_current_parameters() - - - def transform_image(self,image): - if self.angle == 0: - return image - - ysize, xsize = image.shape - slant = self.direction*self.angle - - output = image.copy() - - # shift all the rows - for i in range(ysize): - line = image[i] - delta = round((i*slant)) % xsize - line1 = line[:xsize-delta] - line2 = line[xsize-delta:xsize] - - output[i][delta:xsize] = line1 - output[i][0:delta] = line2 - - - #correction to center the image - correction = (self.direction)*round(self.angle*ysize/2) - correction = (xsize - correction) % xsize - - # center the region - line1 = output[0:ysize,0:xsize-correction].copy() - line2 = output[0:ysize,xsize-correction:xsize].copy() - output[0:ysize,correction:xsize] = line1 - output[0:ysize,0:correction] = line2 - - - return output - - -# Test function -# Load an image in local and create several samples of the effect on the -# original image with different parameter. All the samples are saved in a single image, the 1st image being the original. - -def test_slant(): - import scipy - img_name = "test_img/mnist_0.png" - dest_img_name = "test_img/slanted.png" - nb_samples = 10 - im = Image.open(img_name) - im = im.convert("L") - image = numpy.asarray(im) - - image_final = image - slant = Slant() - for i in range(nb_samples): - slant.regenerate_parameters(1) - image_slant = slant.transform_image(image) - image_final = scipy.hstack((image_final,image_slant)) - - im = Image.fromarray(image_final.astype('uint8'), "L") - im.save(dest_img_name) - -# Test -if __name__ == '__main__': - import sys, os, fnmatch - import Image - - test_slant() - diff -r 6f606b359df3 -r a9af079892ce transformations/testmod.py --- a/transformations/testmod.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,130 +0,0 @@ -# This script is to test your modules to see if they conform to the module API -# defined on the wiki. 
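For quick reference, the contract exercised by this script boils down to three methods. The skeleton below is an editor's summary of the checks performed further down (the parameter name and distribution are made up), not an official definition of the API:

import random, numpy

class ConformingModule(object):
    def get_settings_names(self):
        # names of the stored parameters, as strings
        return ['shift']

    def regenerate_parameters(self, complexity):
        # draw new values from a distribution controlled by complexity in [0, 1]
        # and return them in the same order as the names above
        self._shift = random.gauss(0.0, 0.5 * complexity)
        return [self._shift]

    def transform_image(self, image):
        # take a (32, 32) float32 array with values in [0, 1] and return an
        # array with the same shape, dtype and value range
        return numpy.clip(image + self._shift, 0.0, 1.0).astype(numpy.float32)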
-import random, numpy, gc, time, math - -# this is an example module that does stupid image value shifting - -class DummyModule(object): - def get_settings_names(self): - return ['value'] - - def regenerate_parameters(self, complexity): - self._value = random.gauss(0, 0.5*complexity) - return [self._value] - - def transform_image(self, image): - return numpy.clip(image+self._value, 0, 1) - -#import - -# instanciate your class here (rather than DummyModule) -mod = DummyModule() - -def error(msg): - print "ERROR:", msg - sys.exit(1) - -def warn(msg): - print "WARNING:", msg - -def timeit(f, lbl): - - gc.disable() - t = time.time() - f() - est = time.time() - t - gc.enable() - - loops = max(1, int(10**math.floor(math.log(10/est, 10)))) - - gc.disable() - t = time.time() - for _ in xrange(loops): - f() - - print lbl, "(", loops, "loops ):", (time.time() - t)/loops, "s" - gc.enable() - -######################## -# get_settings_names() # -######################## - -print "Testing get_settings_names()" - -names = mod.get_settings_names() - -if type(names) is not list: - error("Must return a list") - -if not all(type(e) is str for e in names): - warn("The elements of the list should be strings") - -########################### -# regenerate_parameters() # -########################### - -print "Testing regenerate_parameters()" - -params = mod.regenerate_parameters(0.2) - -if type(params) is not list: - error("Must return a list") - -if len(params) != len(names): - error("the returned parameter list must have the same length as the number of parameters") - -params2 = mod.regenerate_parameters(0.2) -if len(names) != 0 and params == params2: - error("the complexity parameter determines the distribution of the parameters, not their value") - -mod.regenerate_parameters(0.0) -mod.regenerate_parameters(1.0) - -mod.regenerate_parameters(0.5) - -##################### -# transform_image() # -##################### - -print "Testing transform_image()" - -imgr = numpy.random.random_sample((32, 32)).astype(numpy.float32) -img1 = numpy.ones((32, 32), dtype=numpy.float32) -img0 = numpy.zeros((32, 32), dtype=numpy.float32) - -resr = mod.transform_image(imgr) - -if type(resr) is not numpy.ndarray: - error("Must return an ndarray") - -if resr.shape != (32, 32): - error("Must return 32x32 array") - -if resr.dtype != numpy.float32: - error("Must return float32 array") - -res1 = mod.transform_image(img1) -res0 = mod.transform_image(img0) - -if res1.max() > 1.0 or res0.max() > 1.0: - error("Must keep array values between 0 and 1") - -if res1.min() < 0.0 or res0.min() < 0.0: - error("Must keep array values between 0 and 1") - -mod.regenerate_parameters(0.0) -mod.transform_image(imgr) -mod.regenerate_parameters(1.0) -mod.transform_image(imgr) - -print "Bonus Stage: timings" - -timeit(lambda: None, "empty") -timeit(lambda: mod.regenerate_parameters(0.5), "regenerate_parameters()") -timeit(lambda: mod.transform_image(imgr), "tranform_image()") - -def f(): - mod.regenerate_parameters(0.2) - mod.transform_image(imgr) - -timeit(f, "regen and transform") diff -r 6f606b359df3 -r a9af079892ce transformations/testtransformations.py --- a/transformations/testtransformations.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,85 +0,0 @@ -#!/usr/bin/env python - - -from pylearn.io import filetensor as ft -import copy -import pygame -import time -import numpy as N - -#from gimpfu import * - - -from PoivreSel import PoivreSel -from thick import Thick -from BruitGauss import BruitGauss -from 
DistorsionGauss import DistorsionGauss -from PermutPixel import PermutPixel -#from gimp_script import GIMPTransformation -from Rature import Rature -from contrast import Contrast -from local_elastic_distortions import LocalElasticDistorter -from slant import Slant -#from Occlusion import Occlusion -from add_background_image import AddBackground -from affine_transform import AffineTransformation - -###---------------------order of transformation module -MODULE_INSTANCES = [Thick(),Slant(),AffineTransformation(), LocalElasticDistorter(), PermutPixel(), Rature(), BruitGauss(),PoivreSel(), Contrast()] - -###---------------------complexity associated to each of them -complexity = [0.6,0.7,0.5,0.4,0.1,0.5,0.03,0.03,0.5] - - -nbmodule = len(MODULE_INSTANCES) - -datapath = '/data/lisa/data/nist/by_class/' -f = open(datapath+'lower/lower_train_data.ft') -d = ft.read(f) - -d = d[0:1000,:]/255.0 - -pygame.surfarray.use_arraytype('numpy') - -pygame.display.init() -screen = pygame.display.set_mode((4*(nbmodule+1)*32,4*32+20),0,8) -anglcolorpalette=[(x,x,x) for x in xrange(0,256)] -screen.set_palette(anglcolorpalette) - -pygame.font.init() - -for i in range(10000): - a=d[i,:] - b=N.asarray(N.reshape(a,(32,32))) - c=N.asarray(N.reshape(a*255.0,(32,32))).T - new=pygame.surfarray.make_surface(c) - new=pygame.transform.scale2x(new) - new=pygame.transform.scale2x(new) - new.set_palette(anglcolorpalette) - screen.blit(new,(0,0)) - - offset = 4*32 - ct = 0 - for j in MODULE_INSTANCES: - #max dilation - - #random - print j.get_settings_names(), j.regenerate_parameters(complexity[ct]) - - b=j.transform_image(b) - c=N.asarray(b*255).T - - new=pygame.surfarray.make_surface(c) - new=pygame.transform.scale2x(new) - new=pygame.transform.scale2x(new) - new.set_palette(anglcolorpalette) - screen.blit(new,(offset,0)) - font = pygame.font.SysFont('liberationserif',18) - text = font.render(j.__module__,0,(255,255,255),(0,0,0)) - screen.blit(text,(offset,4*32)) - offset += 4*32 - ct+=1 - pygame.display.update() - raw_input('Press Enter') - -pygame.display.quit() diff -r 6f606b359df3 -r a9af079892ce transformations/thick.py --- a/transformations/thick.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,198 +0,0 @@ -#!/usr/bin/python -# coding: utf-8 - -''' -Simple implementation of random thickness deformation using morphological -operation of scipy. -Only one morphological operation applied (dilation or erosion), the kernel is random -out of a list of 12 symmetric kernels. (only 5 to be chosen for erosion because it can -hurt the recognizability of the charater and 12 for dilation). 
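Concretely, the transformation reduces to a single call to scipy's grey-scale morphology with a randomly chosen structuring element, along the lines of this sketch (editor's illustration, assuming a 0-1 float image; the cross-shaped element is one of the twelve listed below):

import numpy
import scipy.ndimage.morphology as morphology

img = numpy.random.random_sample((32, 32)).astype(numpy.float32)
cross = numpy.asarray([[0, 1, 0],
                       [1, 1, 1],
                       [0, 1, 0]])
# thicker strokes: grey dilation; thinner strokes: grey erosion
thicker = morphology.grey_dilation(img, size=cross.shape, structure=cross)
thinner = morphology.grey_erosion(img, size=cross.shape, structure=cross)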
- -Author: Xavier Glorot - -''' - -import scipy.ndimage.morphology -import numpy as N - - -class Thick(): - def __init__(self,complexity = 1): - #---------- private attributes - self.__nx__ = 32 #xdim of the images - self.__ny__ = 32 #ydim of the images - self.__erodemax__ = 9 #nb of index max of erode structuring elements - self.__dilatemax__ = 9 #nb of index max of dilation structuring elements - self.__structuring_elements__ = [N.asarray([[1,1]]),N.asarray([[1],[1]]),\ - N.asarray([[1,1],[1,1]]),N.asarray([[0,1,0],[1,1,1],[0,1,0]]),\ - N.asarray([[1,1,1],[1,1,1]]),N.asarray([[1,1],[1,1],[1,1]]),\ - N.asarray([[1,1,1],[1,1,1],[1,1,1]]),\ - N.asarray([[1,1,1,1],[1,1,1,1],[1,1,1,1]]),\ - N.asarray([[1,1,1],[1,1,1],[1,1,1],[1,1,1]]),\ - N.asarray([[0,0,1,0,0],[0,1,1,1,0],[1,1,1,1,1],[0,1,1,1,0],[0,0,1,0,0]]),\ - N.asarray([[1,1,1,1],[1,1,1,1]]),N.asarray([[1,1],[1,1],[1,1],[1,1]])] - #------------------------------------------------ - - #---------- generation parameters - self.regenerate_parameters(complexity) - #------------------------------------------------ - - def _get_current_parameters(self): - return [self.thick_param] - - def get_settings_names(self): - return ['thick_param'] - - def regenerate_parameters(self, complexity): - self.erodenb = N.ceil(complexity * self.__erodemax__) - self.dilatenb = N.ceil(complexity * self.__dilatemax__) - self.Perode = self.erodenb / (self.dilatenb + self.erodenb + 1.0) - self.Pdilate = self.dilatenb / (self.dilatenb + self.erodenb + 1.0) - assert (self.Perode + self.Pdilate <= 1) & (self.Perode + self.Pdilate >= 0) - assert (complexity >= 0) & (complexity <= 1) - P = N.random.uniform() - if P>1-(self.Pdilate+self.Perode): - if P>1-(self.Pdilate+self.Perode)+self.Perode: - self.meth = 1 - self.nb=N.random.randint(self.dilatenb) - else: - self.meth = -1 - self.nb=N.random.randint(self.erodenb) - else: - self.meth = 0 - self.nb = -1 - self.thick_param = self.meth*self.nb - return self._get_current_parameters() - - def transform_1_image(self,image): #the real transformation method - if self.meth!=0: - maxi = float(N.max(image)) - mini = float(N.min(image)) - - imagenorm=image/maxi - - if self.meth==1: - trans=scipy.ndimage.morphology.grey_dilation\ - (imagenorm,size=self.__structuring_elements__[self.nb].shape,structure=self.__structuring_elements__[self.nb]) - else: - trans=scipy.ndimage.morphology.grey_erosion\ - (imagenorm,size=self.__structuring_elements__[self.nb].shape,structure=self.__structuring_elements__[self.nb]) - - #------renormalizing - maxit = N.max(trans) - minit = N.min(trans) - trans= N.asarray((trans - (minit+mini)) / (maxit - (minit+mini)) * maxi,dtype=image.dtype) - #-------- - return trans - else: - return image - - def transform_image(self,image): #handling different format - if image.shape == (self.__nx__,self.__ny__): - return self.transform_1_image(image) - if image.ndim == 3: - newimage = copy.copy(image) - for i in range(image.shape[0]): - newimage[i,:,:] = self.transform_1_image(image[i,:,:]) - return newimage - if image.ndim == 2 and image.shape != (self.__nx__,self.__ny__): - newimage = N.reshape(image,(image.shape[0],self.__nx__,self.__ny__)) - for i in range(image.shape[0]): - newimage[i,:,:] = self.transform_1_image(newimage[i,:,:]) - return N.reshape(newimage,image.shape) - if image.ndim == 1: - newimage = N.reshape(image,(self.__nx__,self.__ny__)) - newimage = self.transform_1_image(newimage) - return N.reshape(newimage,image.shape) - assert False #should never go there - - - - -#test on NIST (you need pylearn and 
access to NIST to do that) - -if __name__ == '__main__': - - from pylearn.io import filetensor as ft - import copy - import pygame - import time - datapath = '/data/lisa/data/nist/by_class/' - f = open(datapath+'digits/digits_train_data.ft') - d = ft.read(f) - - pygame.surfarray.use_arraytype('numpy') - - pygame.display.init() - screen = pygame.display.set_mode((8*4*32,8*32),0,8) - anglcolorpalette=[(x,x,x) for x in xrange(0,256)] - screen.set_palette(anglcolorpalette) - - MyThick = Thick() - - #debut=time.time() - #MyThick.transform_image(d) - #fin=time.time() - #print '------------------------------------------------' - #print d.shape[0],' images transformed in :', fin-debut, ' seconds' - #print '------------------------------------------------' - #print (fin-debut)/d.shape[0]*1000000,' microseconds per image' - #print '------------------------------------------------' - #print MyThick.get_settings_names() - #print MyThick._get_current_parameters() - #print MyThick.regenerate_parameters(0) - #print MyThick.regenerate_parameters(0.5) - #print MyThick.regenerate_parameters(1) - for i in range(10000): - a=d[i,:] - b=N.asarray(N.reshape(a,(32,32))).T - - new=pygame.surfarray.make_surface(b) - new=pygame.transform.scale2x(new) - new=pygame.transform.scale2x(new) - new=pygame.transform.scale2x(new) - new.set_palette(anglcolorpalette) - screen.blit(new,(0,0)) - - #max dilation - MyThick.meth=1 - MyThick.nb=MyThick.__dilatemax__ - c=MyThick.transform_image(a) - b=N.asarray(N.reshape(c,(32,32))).T - - new=pygame.surfarray.make_surface(b) - new=pygame.transform.scale2x(new) - new=pygame.transform.scale2x(new) - new=pygame.transform.scale2x(new) - new.set_palette(anglcolorpalette) - screen.blit(new,(8*32,0)) - - #max erosion - MyThick.meth=-1 - MyThick.nb=MyThick.__erodemax__ - c=MyThick.transform_image(a) - b=N.asarray(N.reshape(c,(32,32))).T - - new=pygame.surfarray.make_surface(b) - new=pygame.transform.scale2x(new) - new=pygame.transform.scale2x(new) - new=pygame.transform.scale2x(new) - new.set_palette(anglcolorpalette) - screen.blit(new,(8*2*32,0)) - - #random - print MyThick.get_settings_names(), MyThick.regenerate_parameters(1) - c=MyThick.transform_image(a) - b=N.asarray(N.reshape(c,(32,32))).T - - new=pygame.surfarray.make_surface(b) - new=pygame.transform.scale2x(new) - new=pygame.transform.scale2x(new) - new=pygame.transform.scale2x(new) - new.set_palette(anglcolorpalette) - screen.blit(new,(8*3*32,0)) - - pygame.display.update() - raw_input('Press Enter') - - pygame.display.quit() diff -r 6f606b359df3 -r a9af079892ce transformations/ttf2jpg.py --- a/transformations/ttf2jpg.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,121 +0,0 @@ -#!/usr/bin/python -# -*- coding: iso-8859-1 -*- - -''' - Implementation of font image generator - download fonts from http://www.dafont.com for exemple - - Author: Guillaume Sicard -''' - -import sys, os, fnmatch, random -import Image, ImageFont, ImageDraw, numpy - -class ttf2jpg(): - def __init__(self, font_file = ''): - self.w = 32 - self.h = 32 - self.font_dir = '/usr/share/fonts/truetype/ttf-liberation/' - self.font_file = font_file - self.image_dir = './images/' - self.pattern = '*.ttf' - self.char_list = [] - for i in range(0,26): - self.char_list.append(chr(ord('a') + i) ) - for i in range(0,26): - self.char_list.append(chr(ord('A') + i) ) - for i in range(0,10): - self.char_list.append(chr(ord('0') + i) ) - - # get font name - def get_settings_names(self): - return [self.font_file] - - # save an image - def 
save_image(self,array, filename = ''): - image = (array * 255.0).astype('int') - image = Image.fromarray(image).convert('L') - if (filename != ''): - image.save(filename) - else: - image.show() - - # set a random font for character generation - def set_random_font(self): - files = os.listdir(self.font_dir) - font_files = fnmatch.filter(files, self.pattern) - i = random.randint(0, len(font_files) - 1) - self.font_file = self.font_dir + font_files[i] - - # return a picture array of "text" with font "font_file" - def create_image(self, text): - # create a w x h black picture, and a drawing space - image = Image.new('L', (self.w, self.h), 'Black') - draw = ImageDraw.Draw(image) - - # load the font with the right size - font = ImageFont.truetype(self.font_file, 28) - d_w,d_h = draw.textsize(text, font=font) - - # write text and aligns it - draw.text(((32 - d_w) / 2, ((32 - d_h) / 2)), text, font=font, fill='White') - - image = numpy.asarray(image) - image = (image / 255.0).astype(numpy.float32) - - return image - - # write all the letters and numbers into pictures - def process_font(self): - for i in range(0, len(self.char_list) ): - image = self.create_image(self.char_list[i]) - self.save_image(image, self.image_dir + self.char_list[i] + '-' + os.path.basename(self.font_file) + '.jpg') - sys.stdout.write('.') - sys.stdout.flush() - return (len(self.char_list)) - - # generate the character from the font_file and returns a numpy array - def generate_image_from_char(self, character, font_file = ''): - if (font_file != ''): - self.font_file = font_file - - return self.create_image(character) - - # generate random character from random font file as a numpy array - def generate_image(self): - self.set_random_font() - i = random.randint(0, len(self.char_list) - 1) - return self.generate_image_from_char(self.char_list[i]) - - # test method, create character images for all fonts in "font_dir" in dir "image_dir" - def test(self): - import time - - # look for ttf files - files = os.listdir(self.font_dir) - font_files = fnmatch.filter(files, self.pattern) - - # create "image_dir" if it doesn't exist - if not os.path.isdir(self.image_dir): - os.mkdir(self.image_dir) - - sys.stdout.write( str(len(font_files)) + ' fonts found, generating jpg images in folder ' + self.image_dir ) - sys.stdout.flush() - - # main loop - t = time.time() - n = 0 - - for font_file in font_files: - self.font_file = self.font_dir + font_file - n += self.process_font() - t = time.time() - t - - sys.stdout.write('\nall done!\n' + str(n) + ' images generated in ' + str(t) + 's (average : ' + str(1000 * t / n) + ' ms/im)\n') - -if __name__ == '__main__': - - myttf2jpg = ttf2jpg() - #myttf2jpg.test() - image = myttf2jpg.generate_image() - myttf2jpg.save_image(image, '') \ No newline at end of file diff -r 6f606b359df3 -r a9af079892ce transformations/visualizer.py --- a/transformations/visualizer.py Wed Feb 10 11:15:04 2010 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ -#!/usr/bin/python - -import numpy -import Image -from image_tiling import tile_raster_images -import pylab -import time - -class Visualizer(): - def __init__(self, num_columns=10, image_size=(32,32), to_dir=None, on_screen=False): - self.list = [] - self.image_size = image_size - self.num_columns = num_columns - - self.on_screen = on_screen - self.to_dir = to_dir - - self.cur_grid_image = None - - self.cur_index = 0 - - def visualize_stop_and_flush(self): - self.make_grid_image() - - if self.on_screen: - self.visualize() - if self.to_dir: - 
self.dump_to_disk() - - self.stop_and_wait() - self.flush() - - self.cur_index += 1 - - def make_grid_image(self): - num_rows = len(self.list) / self.num_columns - if len(self.list) % self.num_columns != 0: - num_rows += 1 - grid_shape = (num_rows, self.num_columns) - self.cur_grid_image = tile_raster_images(numpy.array(self.list), self.image_size, grid_shape, tile_spacing=(5,5), output_pixel_vals=False) - - def visualize(self): - pylab.imshow(self.cur_grid_image) - pylab.draw() - - def dump_to_disk(self): - gi = Image.fromarray((self.cur_grid_image * 255).astype('uint8'), "L") - gi.save(self.to_dir + "/grid_" + str(self.cur_index) + ".png") - - def stop_and_wait(self): - # can't raw_input under gimp, so sleep) - print "New image generated, sleeping 5 secs" - time.sleep(5) - - def flush(self): - self.list = [] - - def get_parameters_names(self): - return [] - - def regenerate_parameters(self): - return [] - - def after_transform_callback(self, image): - self.transform_image(image) - - def end_transform_callback(self, final_image): - self.visualize_stop_and_flush() - - def transform_image(self, image): - sz = self.image_size - self.list.append(image.copy().reshape((sz[0] * sz[1]))) - diff -r 6f606b359df3 -r a9af079892ce utils/__init__.py diff -r 6f606b359df3 -r a9af079892ce utils/scalar_series/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/scalar_series/__init__.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,2 @@ +from series import BaseSeries, AccumulatorSeries, SeriesContainer, BasicStatsSeries, SeriesMultiplexer, SeriesList, ParamsArrayStats + diff -r 6f606b359df3 -r a9af079892ce utils/scalar_series/series.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/scalar_series/series.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,311 @@ +#!/usr/bin/python +# coding: utf-8 + +from __future__ import with_statement + +import sys +import os +import os.path +import array + +# for BasicStatsSeries +import numpy + +# To access .value if necessary +import theano.tensor.sharedvar + +''' +* TODO: add xy series +* TODO: add graph() for base and accumulator +* TODO: flush_every for BaseStatsSeries +* TODO: warn when Mux append() is called with a nonexisting name +* SeriesContainers are also series, albeit with more complex elements appended +* Each series has a "name" which corresponds in some way to the directory or file in which it's saved +''' + +# Simple class to append numbers and flush them to a file once in a while +class BaseSeries(): + # for types, see http://docs.python.org/library/array.html + def __init__(self, name, directory, type='f', flush_every=1): + self.type = type + self.flush_every = flush_every + + if not name or not directory: + raise Exception("name and directory must be provided (strings)") + + self.directory = directory + self.name = name + + if name and directory: + self.filepath = os.path.join(directory, name) + + self._array = array.array(type) + # stores the length not stored in file, waiting to be flushed + self._buffered = 0 + + def append(self, newitem): + self._array.append(newitem) + + self._buffered += 1 + if self._buffered >= self.flush_every: + self.flush() + + def append_list(self, items): + self._array.fromlist(items) + self._buffered += len(items) + if self._buffered >= self.flush_every: + self.flush() + + def flush(self): + if self._buffered == 0: + return + with open(self.filepath, "wb") as f: + s = self._array[-self._buffered:].tostring() + f.write(s) + + def tolist(self): + return self._array.tolist() + + def load_from_file(self): + if 
not self.filepath: + raise Exception("No name/directory provided") + + self._array = array.array(self.type) + self._buffered = 0 + + statinfo = os.stat(self.filepath) + size = statinfo.st_size + num_items = size / self._array.itemsize + + with open(self.filepath, "rb") as f: + self._array.fromfile(f, num_items) + +class AccumulatorSeries(BaseSeries): + ''' + reduce_every: group (sum or mean) the last "reduce_every" items whenever we have enough + and create a new item added to the real, saved array + (if elements remain at the end, less then "reduce_every", they'll be discarded on program close) + flush_every: this is for items of the real, saved array, not in terms of number of calls to "append" + ''' + def __init__(self, reduce_every, + name, directory, flush_every=1, + mean=False): + BaseSeries.__init__(self, name=name, directory=directory, type='f', flush_every=flush_every) + self.reduce_every = reduce_every + self._accumulator = 0.0 + self._num_accumulated = 0 + self.use_mean = mean + + @classmethod + def series_constructor(cls, reduce_every, mean=False): + def cstr(name, directory, flush_every=1): + return cls(reduce_every=reduce_every, mean=mean, name=name, directory=directory, flush_every=flush_every) + return cstr + + def append(self, item): + self._accumulator += item + self._num_accumulated += 1 + if self._num_accumulated >= self.reduce_every: + n = self._accumulator + if self.use_mean: + n = n / self.reduce_every + BaseSeries.append(self, n) + + self._num_accumulated = 0 + self._accumulator = 0.0 + + def append_list(self, items): + for i in items: + self.append(i) + +class SeriesContainer(): + def __init__(self, parent_directory, name, + series_constructor=BaseSeries): + self.parent_directory = parent_directory + self.name = name + + if not parent_directory or not name: + raise Exception("parent_directory and name must be provided (strings)") + + self.directory_path = os.path.join(parent_directory, name) + + self.series_constructor = series_constructor + + # attempt to create directory for series + if not os.path.isdir(self.directory_path): + os.mkdir(self.directory_path) + + def graph(self): + pass + +class BasicStatsSeries(SeriesContainer): + def __init__(self, parent_directory, name, series_constructor=BaseSeries, + mean=True, minmax=True, std=True): + SeriesContainer.__init__(self, parent_directory=parent_directory, name=name, series_constructor=series_constructor) + + self.save_mean = mean + self.save_minmax = minmax + self.save_std = std + + self.create_series() + + @classmethod + def series_constructor(cls, mean=True, minmax=True, std=True): + def cstr(name, directory, flush_every=1): + return cls(name=name, parent_directory=directory, + mean=mean, minmax=minmax, std=std) + return cstr + + + def create_series(self): + if self.save_mean: + self.means = self.series_constructor(name="mean", directory=self.directory_path) + + if self.save_minmax: + self.mins = self.series_constructor(name="min", directory=self.directory_path) + self.maxes = self.series_constructor(name="max", directory=self.directory_path) + + if self.save_std: + self.stds = self.series_constructor(name="std", directory=self.directory_path) + + def append(self, array): + # TODO: shouldn't this be the job of the caller? 
(at least ParamsArraySeries) + if isinstance(array, theano.tensor.sharedvar.TensorSharedVariable): + array = array.value + + if self.save_mean: + n = numpy.mean(array) + self.means.append(n) + if self.save_minmax: + n = numpy.min(array) + self.mins.append(n) + n = numpy.max(array) + self.maxes.append(n) + if self.save_std: + n = numpy.std(array) + self.stds.append(n) + + def load_from_file(self): + self.load_from_directory() + + def load_from_directory(self): + if self.save_mean: + self.means.load_from_file() + + if self.save_minmax: + self.mins.load_from_file() + self.maxes.load_from_file() + + if self.save_std: + self.stds.load_from_file() + + def graph(self, xes=None): + import pylab + + if self.save_minmax: + mn = numpy.array(self.mins.tolist()) + mx = numpy.array(self.maxes.tolist()) + if self.save_mean: + y = numpy.array(self.means.tolist()) + else: + y = (mn+mx) / 2 + + above_y = mx - y + below_y = y - mn + + if not xes: + xes = numpy.arange(len(y)) + + pylab.errorbar(x=xes, y=y, yerr=[below_y, above_y]) + + elif self.save_mean: + y = numpy.array(self.means.tolist()) + if not xes: + xes = numpy.arange(len(y)) + + pylab.plot(x=xes, y=y) + + +class SeriesMultiplexer(): + def __init__(self): + self._series_dict = {} + self._warned_for = {} + + def append(self, series_name, item): + # if we don't have the series, just don't do anything + if self._series_dict.has_key(series_name): + s = self._series_dict[series_name] + s.append(item) + elif not self._warned_for.has_key(series_name): + print "WARNING: SeriesMultiplexer called with unknown name ", series_name + self._warned_for[series_name] = 1 + + def append_list(self, series_name, items): + if self._series_dict.has_key(series_name): + s = self._series_dict[series_name] + s.append_list(items) + elif not self._warned_for.has_key(series_name): + print "WARNING: SeriesMultiplexer called with unknown name ", series_name + self._warned_for[series_name] = 1 + + def add_series(self, series): + if self._series_dict.has_key(series.name): + raise Exception("A series with such a name already exists") + self._series_dict[series.name] = series + +class SeriesList(): + def __init__(self, num_elements, name, directory, series_constructor=BaseSeries): + self._subseries = [None] * num_elements + self.name = name + + for i in range(num_elements): + newname = name + "." 
+ str(i) + self._subseries[i] = series_constructor(name=newname, directory=directory) + + def load_from_files(self): + self.load_from_file() + + def load_from_file(self): + for s in self._subseries: + s.load_from_file() + + # no "append_list", this would get confusing + def append(self, list_of_items): + if len(list_of_items) != len(self._subseries): + raise Exception("bad number of items, expected " + str(len(self._subseries)) + ", got " + str(len(list_of_items))) + for i in range(len(list_of_items)): + self._subseries[i].append(list_of_items[i]) + + +# Just a shortcut +class ParamsArrayStats(SeriesList): + def __init__(self, num_params_arrays, name, directory): + cstr = BasicStatsSeries.series_constructor() + + SeriesList.__init__(self, num_elements=num_params_arrays, + name=name, directory=directory, + series_constructor=cstr) + +# ------------------------ +# Utilities to work with the series files from the command line + +# "dumpf" +def dump_floats_file(filepath): + print "Floats dump of ", filepath + with open(filepath, "rb") as f: + s = os.stat(filepath) + size = s.st_size + num = size / 4 + a = array.array('f') + a.fromfile(f, num) + print a.tolist() + +if __name__ == '__main__': + args = sys.argv[1:] + + if len(args) == 2 and args[0] == "dumpf": + file = args[1] + dump_floats_file(file) + else: + print "Bad arguments" + diff -r 6f606b359df3 -r a9af079892ce utils/scalar_series/test_series.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/scalar_series/test_series.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,197 @@ +#!/usr/bin/python +# coding: utf-8 + +import sys +import tempfile +import os.path +import os + +import numpy + +from series import BaseSeries, AccumulatorSeries, SeriesContainer, BasicStatsSeries, SeriesMultiplexer, SeriesList, ParamsArrayStats + + +BASEDIR = tempfile.mkdtemp() + +def tempname(): + file = tempfile.NamedTemporaryFile(dir=BASEDIR) + filepath = file.name + return os.path.split(filepath) + +def tempdir(): + wholepath = os.path.dirname(tempfile.mkdtemp(dir=BASEDIR)) + # split again, interpreting the last directory as a filename + return os.path.split(wholepath) + +def tempseries(type='f', flush_every=1): + dir, filename = tempname() + + s = BaseSeries(name=filename, directory=dir, type=type, flush_every=flush_every) + + return s + +def test_Series_storeload(): + s = tempseries() + + s.append(12.0) + s.append_list([13.0,14.0,15.0]) + + s2 = BaseSeries(name=s.name, directory=s.directory, flush_every=15) + # also test if elements stored before load_from_file (and before a flush) + # are deleted (or array is restarted from scratch... 
both work) + s2.append(10.0) + s2.append_list([30.0,40.0]) + s2.load_from_file() + + assert s2.tolist() == [12.0,13.0,14.0,15.0] + + +def test_AccumulatorSeries_mean(): + dir, filename = tempname() + + s = AccumulatorSeries(reduce_every=15, mean=True, name=filename, directory=dir) + + for i in range(50): + s.append(i) + + assert s.tolist() == [7.0,22.0,37.0] + +def test_BasicStatsSeries_commoncase(): + a1 = numpy.arange(25).reshape((5,5)) + a2 = numpy.arange(40).reshape((8,5)) + + parent_dir, dir = tempdir() + + bss = BasicStatsSeries(parent_directory=parent_dir, name=dir) + + bss.append(a1) + bss.append(a2) + + assert bss.means.tolist() == [12.0, 19.5] + assert bss.mins.tolist() == [0.0, 0.0] + assert bss.maxes.tolist() == [24.0, 39.0] + assert (bss.stds.tolist()[0] - 7.211102) < 1e-3 + assert (bss.stds.tolist()[1] - 11.54339) < 1e-3 + + # try to reload + + bss2 = BasicStatsSeries(parent_directory=parent_dir, name=dir) + bss2.load_from_directory() + + assert bss2.means.tolist() == [12.0, 19.5] + assert bss2.mins.tolist() == [0.0, 0.0] + assert bss2.maxes.tolist() == [24.0, 39.0] + assert (bss2.stds.tolist()[0] - 7.211102) < 1e-3 + assert (bss2.stds.tolist()[1] - 11.54339) < 1e-3 + +def test_BasicStatsSeries_reload(): + a1 = numpy.arange(25).reshape((5,5)) + a2 = numpy.arange(40).reshape((8,5)) + + parent_dir, dir = tempdir() + + bss = BasicStatsSeries(parent_directory=parent_dir, name=dir) + + bss.append(a1) + bss.append(a2) + + # try to reload + + bss2 = BasicStatsSeries(parent_directory=parent_dir, name=dir) + bss2.load_from_directory() + + assert bss2.means.tolist() == [12.0, 19.5] + assert bss2.mins.tolist() == [0.0, 0.0] + assert bss2.maxes.tolist() == [24.0, 39.0] + assert (bss2.stds.tolist()[0] - 7.211102) < 1e-3 + assert (bss2.stds.tolist()[1] - 11.54339) < 1e-3 + + +def test_BasicStatsSeries_withaccumulator(): + a1 = numpy.arange(25).reshape((5,5)) + a2 = numpy.arange(40).reshape((8,5)) + a3 = numpy.arange(20).reshape((4,5)) + a4 = numpy.arange(48).reshape((6,8)) + + parent_dir, dir = tempdir() + + sc = AccumulatorSeries.series_constructor(reduce_every=2, mean=False) + + bss = BasicStatsSeries(parent_directory=parent_dir, name=dir, series_constructor=sc) + + bss.append(a1) + bss.append(a2) + bss.append(a3) + bss.append(a4) + + assert bss.means.tolist() == [31.5, 33.0] + +def test_SeriesList_withbasicstats(): + dir = tempfile.mkdtemp(dir=BASEDIR) + + bscstr = BasicStatsSeries.series_constructor() + + slist = SeriesList(num_elements=5, name="foo", directory=dir, series_constructor=bscstr) + + for i in range(10): # 10 elements in each list + curlist = [] + for j in range(5): # 5 = num_elements, ie. number of list to append to + dist = numpy.arange(i*j, i*j+10) + curlist.append(dist) + slist.append(curlist) + + slist2 = SeriesList(num_elements=5, name="foo", directory=dir, series_constructor=bscstr) + + slist2.load_from_files() + + l1 = slist2._subseries[0].means.tolist() + l2 = slist2._subseries[4].means.tolist() + + print l1 + print l2 + + assert l1 == [4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5] + assert l2 == [4.5, 8.5, 12.5, 16.5, 20.5, 24.5, 28.5, 32.5, 36.5, 40.5] + +# same test as above, just with the shortcut +def test_ParamsArrayStats_reload(): + dir = tempfile.mkdtemp(dir=BASEDIR) + + slist = ParamsArrayStats(5, name="foo", directory=dir) + + for i in range(10): # 10 elements in each list + curlist = [] + for j in range(5): # 5 = num_elements, ie. 
number of list to append to + dist = numpy.arange(i*j, i*j+10) + curlist.append(dist) + slist.append(curlist) + + slist2 = ParamsArrayStats(5, name="foo", directory=dir) + + slist2.load_from_files() + + l1 = slist2._subseries[0].means.tolist() + l2 = slist2._subseries[4].means.tolist() + + print l1 + print l2 + + assert l1 == [4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5] + assert l2 == [4.5, 8.5, 12.5, 16.5, 20.5, 24.5, 28.5, 32.5, 36.5, 40.5] + + +def manual_BasicStatsSeries_graph(): + parent_dir, dir = tempdir() + + bss = BasicStatsSeries(parent_directory=parent_dir, name=dir) + + for i in range(50): + bss.append(1.0/numpy.arange(i*5, i*5+5)) + + bss.graph() + +#if __name__ == '__main__': +# import pylab +# manual_BasicStatsSeries_graph() +# pylab.show() + diff -r 6f606b359df3 -r a9af079892ce utils/seriestables/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/seriestables/__init__.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,2 @@ +from series import ErrorSeries, BasicStatisticsSeries, AccumulatorSeriesWrapper, SeriesArrayWrapper, SharedParamsStatisticsWrapper, DummySeries + diff -r 6f606b359df3 -r a9af079892ce utils/seriestables/series.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/seriestables/series.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,605 @@ +import tables + +import numpy +import time + +############################################################################## +# Utility functions to create IsDescription objects (pytables data types) + +''' +The way these "IsDescription constructor" work is simple: write the +code as if it were in a file, then exec()ute it, leaving us with +a local-scoped LocalDescription which may be used to call createTable. + +It's a small hack, but it's necessary as the names of the columns +are retrieved based on the variable name, which we can't programmatically set +otherwise. +''' + +def _get_description_timestamp_cpuclock_columns(store_timestamp, store_cpuclock, pos=0): + toexec = "" + + if store_timestamp: + toexec += "\ttimestamp = tables.Time32Col(pos="+str(pos)+")\n" + pos += 1 + + if store_cpuclock: + toexec += "\tcpuclock = tables.Float64Col(pos="+str(pos)+")\n" + pos += 1 + + return toexec, pos + +def _get_description_n_ints(int_names, int_width=64, pos=0): + """ + Begins construction of a class inheriting from IsDescription + to construct an HDF5 table with index columns named with int_names. + + See Series().__init__ to see how those are used. + """ + int_constructor = "tables.Int64Col" + if int_width == 32: + int_constructor = "tables.Int32Col" + elif not int_width in (32, 64): + raise "int_width must be left unspecified, or should equal 32 or 64" + + toexec = "" + + for n in int_names: + toexec += "\t" + n + " = " + int_constructor + "(pos=" + str(pos) + ")\n" + pos += 1 + + return toexec, pos + +def _get_description_with_n_ints_n_floats(int_names, float_names, + int_width=64, float_width=32, + store_timestamp=True, store_cpuclock=True): + """ + Constructs a class to be used when constructing a table with PyTables. + + This is useful to construct a series with an index with multiple levels. + E.g. if you want to index your "validation error" with "epoch" first, then + "minibatch_index" second, you'd use two "int_names". + + Parameters + ---------- + int_names : tuple of str + Names of the int (e.g. index) columns + float_names : tuple of str + Names of the float (e.g. error) columns + int_width : {'32', '64'} + Type of ints. + float_width : {'32', '64'} + Type of floats. 
+ store_timestamp : bool + See __init__ of Series + store_cpuclock : bool + See __init__ of Series + + Returns + ------- + A class object, to pass to createTable() + """ + + toexec = "class LocalDescription(tables.IsDescription):\n" + + toexec_, pos = _get_description_timestamp_cpuclock_columns(store_timestamp, store_cpuclock) + toexec += toexec_ + + toexec_, pos = _get_description_n_ints(int_names, int_width=int_width, pos=pos) + toexec += toexec_ + + float_constructor = "tables.Float32Col" + if float_width == 64: + float_constructor = "tables.Float64Col" + elif not float_width in (32, 64): + raise "float_width must be left unspecified, or should equal 32 or 64" + + for n in float_names: + toexec += "\t" + n + " = " + float_constructor + "(pos=" + str(pos) + ")\n" + pos += 1 + + exec(toexec) + + return LocalDescription + +############################################################################## +# Series classes + +# Shortcut to allow passing a single int as index, instead of a tuple +def _index_to_tuple(index): + if type(index) == tuple: + return index + + if type(index) == list: + index = tuple(index) + return index + + try: + if index % 1 > 0.001 and index % 1 < 0.999: + raise + idx = long(index) + return (idx,) + except: + raise TypeError("index must be a tuple of integers, or at least a single integer") + +class Series(): + """ + Base Series class, with minimal arguments and type checks. + + Yet cannot be used by itself (it's append() method raises an error) + """ + + def __init__(self, table_name, hdf5_file, index_names=('epoch',), + title="", hdf5_group='/', + store_timestamp=True, store_cpuclock=True): + """Basic arguments each Series must get. + + Parameters + ---------- + table_name : str + Name of the table to create under group "hd5_group" (other + parameter). No spaces, ie. follow variable naming restrictions. + hdf5_file : open HDF5 file + File opened with openFile() in PyTables (ie. return value of + openFile). + index_names : tuple of str + Columns to use as index for elements in the series, other + example would be ('epoch', 'minibatch'). This would then allow + you to call append(index, element) with index made of two ints, + one for epoch index, one for minibatch index in epoch. + title : str + Title to attach to this table as metadata. Can contain spaces + and be longer then the table_name. + hdf5_group : str + Path of the group (kind of a file) in the HDF5 file under which + to create the table. + store_timestamp : bool + Whether to create a column for timestamps and store them with + each record. + store_cpuclock : bool + Whether to create a column for cpu clock and store it with + each record. + """ + + ######################################### + # checks + + if type(table_name) != str: + raise TypeError("table_name must be a string") + if table_name == "": + raise ValueError("table_name must not be empty") + + if not isinstance(hdf5_file, tables.file.File): + raise TypeError("hdf5_file must be an open HDF5 file (use tables.openFile)") + #if not ('w' in hdf5_file.mode or 'a' in hdf5_file.mode): + # raise ValueError("hdf5_file must be opened in write or append mode") + + if type(index_names) != tuple: + raise TypeError("index_names must be a tuple of strings." + \ + "If you have only one element in the tuple, don't forget " +\ + "to add a comma, e.g. 
('epoch',).") + for name in index_names: + if type(name) != str: + raise TypeError("index_names must only contain strings, but also"+\ + "contains a "+str(type(name))+".") + + if type(title) != str: + raise TypeError("title must be a string, even if empty") + + if type(hdf5_group) != str: + raise TypeError("hdf5_group must be a string") + + if type(store_timestamp) != bool: + raise TypeError("store_timestamp must be a bool") + + if type(store_cpuclock) != bool: + raise TypeError("store_timestamp must be a bool") + + ######################################### + + self.table_name = table_name + self.hdf5_file = hdf5_file + self.index_names = index_names + self.title = title + self.hdf5_group = hdf5_group + + self.store_timestamp = store_timestamp + self.store_cpuclock = store_cpuclock + + def append(self, index, element): + raise NotImplementedError + + def _timestamp_cpuclock(self, newrow): + if self.store_timestamp: + newrow["timestamp"] = time.time() + + if self.store_cpuclock: + newrow["cpuclock"] = time.clock() + +class DummySeries(): + """ + To put in a series dictionary instead of a real series, to do nothing + when we don't want a given series to be saved. + + E.g. if we'd normally have a "training_error" series in a dictionary + of series, the training loop would have something like this somewhere: + + series["training_error"].append((15,), 20.0) + + but if we don't want to save the training errors this time, we simply + do + + series["training_error"] = DummySeries() + """ + def append(self, index, element): + pass + +class ErrorSeries(Series): + """ + Most basic Series: saves a single float (called an Error as this is + the most common use case I foresee) along with an index (epoch, for + example) and timestamp/cpu.clock for each of these floats. + """ + + def __init__(self, error_name, table_name, + hdf5_file, index_names=('epoch',), + title="", hdf5_group='/', + store_timestamp=True, store_cpuclock=True): + """ + For most parameters, see Series.__init__ + + Parameters + ---------- + error_name : str + In the HDF5 table, column name for the error float itself. + """ + + # most type/value checks are performed in Series.__init__ + Series.__init__(self, table_name, hdf5_file, index_names, title, + store_timestamp=store_timestamp, + store_cpuclock=store_cpuclock) + + if type(error_name) != str: + raise TypeError("error_name must be a string") + if error_name == "": + raise ValueError("error_name must not be empty") + + self.error_name = error_name + + self._create_table() + + def _create_table(self): + table_description = _get_description_with_n_ints_n_floats( \ + self.index_names, (self.error_name,), + store_timestamp=self.store_timestamp, + store_cpuclock=self.store_cpuclock) + + self._table = self.hdf5_file.createTable(self.hdf5_group, + self.table_name, + table_description, + title=self.title) + + + def append(self, index, error): + """ + Parameters + ---------- + index : tuple of int + Following index_names passed to __init__, e.g. (12, 15) if + index_names were ('epoch', 'minibatch_size'). + A single int (not tuple) is acceptable if index_names has a single + element. + An array will be casted to a tuple, as a convenience. + + error : float + Next error in the series. + """ + index = _index_to_tuple(index) + + if len(index) != len(self.index_names): + raise ValueError("index provided does not have the right length (expected " \ + + str(len(self.index_names)) + " got " + str(len(index))) + + # other checks are implicit when calling newrow[..] 
=, + # which should throw an error if not of the right type + + newrow = self._table.row + + # Columns for index in table are based on index_names + for col_name, value in zip(self.index_names, index): + newrow[col_name] = value + newrow[self.error_name] = error + + # adds timestamp and cpuclock to newrow if necessary + self._timestamp_cpuclock(newrow) + + newrow.append() + + self.hdf5_file.flush() + +# Does not inherit from Series because it does not itself need to +# access the hdf5_file and does not need a series_name (provided +# by the base_series.) +class AccumulatorSeriesWrapper(): + ''' + Wraps a Series by accumulating objects passed its Accumulator.append() + method and "reducing" (e.g. calling numpy.mean(list)) once in a while, + every "reduce_every" calls in fact. + ''' + + def __init__(self, base_series, reduce_every, reduce_function=numpy.mean): + """ + Parameters + ---------- + base_series : Series + This object must have an append(index, value) function. + + reduce_every : int + Apply the reduction function (e.g. mean()) every time we get this + number of elements. E.g. if this is 100, then every 100 numbers + passed to append(), we'll take the mean and call append(this_mean) + on the BaseSeries. + + reduce_function : function + Must take as input an array of "elements", as passed to (this + accumulator's) append(). Basic case would be to take an array of + floats and sum them into one float, for example. + """ + self.base_series = base_series + self.reduce_function = reduce_function + self.reduce_every = reduce_every + + self._buffer = [] + + + def append(self, index, element): + """ + Parameters + ---------- + index : tuple of int + The index used is the one of the last element reduced. E.g. if + you accumulate over the first 1000 minibatches, the index + passed to the base_series.append() function will be 1000. + A single int (not tuple) is acceptable if index_names has a single + element. + An array will be casted to a tuple, as a convenience. + + element : float + Element that will be accumulated. + """ + self._buffer.append(element) + + if len(self._buffer) == self.reduce_every: + reduced = self.reduce_function(self._buffer) + self.base_series.append(index, reduced) + self._buffer = [] + + # The >= case should never happen, except if lists + # were appended by accessing _buffer externally (when it's + # intended to be private), which should be a red flag. + assert len(self._buffer) < self.reduce_every + +# Outside of class to fix an issue with exec in Python 2.6. +# My sorries to the god of pretty code. +def _BasicStatisticsSeries_construct_table_toexec(index_names, store_timestamp, store_cpuclock): + toexec = "class LocalDescription(tables.IsDescription):\n" + + toexec_, pos = _get_description_timestamp_cpuclock_columns(store_timestamp, store_cpuclock) + toexec += toexec_ + + toexec_, pos = _get_description_n_ints(index_names, pos=pos) + toexec += toexec_ + + toexec += "\tmean = tables.Float32Col(pos=" + str(pos) + ")\n" + toexec += "\tmin = tables.Float32Col(pos=" + str(pos+1) + ")\n" + toexec += "\tmax = tables.Float32Col(pos=" + str(pos+2) + ")\n" + toexec += "\tstd = tables.Float32Col(pos=" + str(pos+3) + ")\n" + + # This creates "LocalDescription", which we may then use + exec(toexec) + + return LocalDescription + +# Defaults functions for BasicStatsSeries. These can be replaced. 
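#################################################
# [Editor's sketch, not part of the original changeset] A minimal usage
# example of the ErrorSeries and AccumulatorSeriesWrapper classes defined
# above, assuming series.py is importable as `series`. The file name, loop
# bounds and the fake error values are invented for illustration only.
import numpy
import tables
from series import ErrorSeries, AccumulatorSeriesWrapper

h5f = tables.openFile("training_series.h5", "w")

# One column per index name; a row is only written on every 10th append,
# carrying the index of that 10th append.
raw_errors = ErrorSeries(error_name="train_error",
                         table_name="train_error",
                         hdf5_file=h5f,
                         index_names=('epoch', 'minibatch'))
mean_errors = AccumulatorSeriesWrapper(base_series=raw_errors,
                                       reduce_every=10,
                                       reduce_function=numpy.mean)

for epoch in range(2):
    for minibatch in range(20):
        fake_error = 1.0 / (1 + epoch * 20 + minibatch)
        mean_errors.append((epoch, minibatch), fake_error)

h5f.close()
#################################################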
+_basic_stats_functions = {'mean': lambda(x): numpy.mean(x), + 'min': lambda(x): numpy.min(x), + 'max': lambda(x): numpy.max(x), + 'std': lambda(x): numpy.std(x)} + +class BasicStatisticsSeries(Series): + + def __init__(self, table_name, hdf5_file, + stats_functions=_basic_stats_functions, + index_names=('epoch',), title="", hdf5_group='/', + store_timestamp=True, store_cpuclock=True): + """ + For most parameters, see Series.__init__ + + Parameters + ---------- + series_name : str + Not optional here. Will be prepended with "Basic statistics for " + + stats_functions : dict, optional + Dictionary with a function for each key "mean", "min", "max", + "std". The function must take whatever is passed to append(...) + and return a single number (float). + """ + + # Most type/value checks performed in Series.__init__ + Series.__init__(self, table_name, hdf5_file, index_names, title, + store_timestamp=store_timestamp, + store_cpuclock=store_cpuclock) + + if type(hdf5_group) != str: + raise TypeError("hdf5_group must be a string") + + if type(stats_functions) != dict: + # just a basic check. We'll suppose caller knows what he's doing. + raise TypeError("stats_functions must be a dict") + + self.hdf5_group = hdf5_group + + self.stats_functions = stats_functions + + self._create_table() + + def _create_table(self): + table_description = \ + _BasicStatisticsSeries_construct_table_toexec( \ + self.index_names, + self.store_timestamp, self.store_cpuclock) + + self._table = self.hdf5_file.createTable(self.hdf5_group, + self.table_name, table_description) + + def append(self, index, array): + """ + Parameters + ---------- + index : tuple of int + Following index_names passed to __init__, e.g. (12, 15) + if index_names were ('epoch', 'minibatch_size') + A single int (not tuple) is acceptable if index_names has a single + element. + An array will be casted to a tuple, as a convenience. + + array + Is of whatever type the stats_functions passed to + __init__ can take. Default is anything numpy.mean(), + min(), max(), std() can take. + """ + index = _index_to_tuple(index) + + if len(index) != len(self.index_names): + raise ValueError("index provided does not have the right length (expected " \ + + str(len(self.index_names)) + " got " + str(len(index))) + + newrow = self._table.row + + for col_name, value in zip(self.index_names, index): + newrow[col_name] = value + + newrow["mean"] = self.stats_functions['mean'](array) + newrow["min"] = self.stats_functions['min'](array) + newrow["max"] = self.stats_functions['max'](array) + newrow["std"] = self.stats_functions['std'](array) + + self._timestamp_cpuclock(newrow) + + newrow.append() + + self.hdf5_file.flush() + +class SeriesArrayWrapper(): + """ + Simply redistributes any number of elements to sub-series to respective + append()s. + + To use if you have many elements to append in similar series, e.g. if you + have an array containing [train_error, valid_error, test_error], and 3 + corresponding series, this allows you to simply pass this array of 3 + values to append() instead of passing each element to each individual + series in turn. + """ + + def __init__(self, base_series_list): + """ + Parameters + ---------- + base_series_list : array or tuple of Series + You must have previously created and configured each of those + series, then put them in an array. This array must follow the + same order as the array passed as ``elements`` parameter of + append(). 
+ """ + self.base_series_list = base_series_list + + def append(self, index, elements): + """ + Parameters + ---------- + index : tuple of int + See for example ErrorSeries.append() + + elements : array or tuple + Array or tuple of elements that will be passed down to + the base_series passed to __init__, in the same order. + """ + if len(elements) != len(self.base_series_list): + raise ValueError("not enough or too much elements provided (expected " \ + + str(len(self.base_series_list)) + " got " + str(len(elements))) + + for series, el in zip(self.base_series_list, elements): + series.append(index, el) + +class SharedParamsStatisticsWrapper(SeriesArrayWrapper): + ''' + Save mean, min/max, std of shared parameters place in an array. + + Here "shared" means "theano.shared", which means elements of the + array will have a .value to use for numpy.mean(), etc. + + This inherits from SeriesArrayWrapper, which provides the append() + method. + ''' + + def __init__(self, arrays_names, new_group_name, hdf5_file, + base_group='/', index_names=('epoch',), title="", + store_timestamp=True, store_cpuclock=True): + """ + For other parameters, see Series.__init__ + + Parameters + ---------- + array_names : array or tuple of str + Name of each array, in order of the array passed to append(). E.g. + ('layer1_b', 'layer1_W', 'layer2_b', 'layer2_W') + + new_group_name : str + Name of a new HDF5 group which will be created under base_group to + store the new series. + + base_group : str + Path of the group under which to create the new group which will + store the series. + + title : str + Here the title is attached to the new group, not a table. + + store_timestamp : bool + Here timestamp and cpuclock are stored in *each* table + + store_cpuclock : bool + Here timestamp and cpuclock are stored in *each* table + """ + + # most other checks done when calling BasicStatisticsSeries + if type(new_group_name) != str: + raise TypeError("new_group_name must be a string") + if new_group_name == "": + raise ValueError("new_group_name must not be empty") + + base_series_list = [] + + new_group = hdf5_file.createGroup(base_group, new_group_name, title=title) + + stats_functions = {'mean': lambda(x): numpy.mean(x.value), + 'min': lambda(x): numpy.min(x.value), + 'max': lambda(x): numpy.max(x.value), + 'std': lambda(x): numpy.std(x.value)} + + for name in arrays_names: + base_series_list.append( + BasicStatisticsSeries( + table_name=name, + hdf5_file=hdf5_file, + index_names=index_names, + stats_functions=stats_functions, + hdf5_group=new_group._v_pathname, + store_timestamp=store_timestamp, + store_cpuclock=store_cpuclock)) + + SeriesArrayWrapper.__init__(self, base_series_list) + + diff -r 6f606b359df3 -r a9af079892ce utils/seriestables/test_series.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/seriestables/test_series.py Mon Mar 29 17:42:44 2010 -0400 @@ -0,0 +1,311 @@ +import tempfile + +import numpy +import numpy.random + +from jobman import DD + +import tables + +from series import * +import series + +################################################# +# Utils + +def compare_floats(f1,f2): + if f1-f2 < 1e-3: + return True + return False + +def compare_lists(it1, it2, floats=False): + if len(it1) != len(it2): + return False + + for el1, el2 in zip(it1, it2): + if floats: + if not compare_floats(el1,el2): + return False + elif el1 != el2: + return False + + return True + +################################################# +# Basic Series class tests + +def test_Series_types(): + pass + 
+################################################# +# ErrorSeries tests + +def test_ErrorSeries_common_case(h5f=None): + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + validation_error = series.ErrorSeries(error_name="validation_error", table_name="validation_error", + hdf5_file=h5f, index_names=('epoch','minibatch'), + title="Validation error indexed by epoch and minibatch") + + # (1,1), (1,2) etc. are (epoch, minibatch) index + validation_error.append((1,1), 32.0) + validation_error.append((1,2), 30.0) + validation_error.append((2,1), 28.0) + validation_error.append((2,2), 26.0) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + table = h5f.getNode('/', 'validation_error') + + assert compare_lists(table.cols.epoch[:], [1,1,2,2]) + assert compare_lists(table.cols.minibatch[:], [1,2,1,2]) + assert compare_lists(table.cols.validation_error[:], [32.0, 30.0, 28.0, 26.0]) + +def test_ErrorSeries_no_index(h5f=None): + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + validation_error = series.ErrorSeries(error_name="validation_error", + table_name="validation_error", + hdf5_file=h5f, + # empty tuple + index_names=tuple(), + title="Validation error with no index") + + # (1,1), (1,2) etc. are (epoch, minibatch) index + validation_error.append(tuple(), 32.0) + validation_error.append(tuple(), 30.0) + validation_error.append(tuple(), 28.0) + validation_error.append(tuple(), 26.0) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + table = h5f.getNode('/', 'validation_error') + + assert compare_lists(table.cols.validation_error[:], [32.0, 30.0, 28.0, 26.0]) + assert not ("epoch" in dir(table.cols)) + +def test_ErrorSeries_notimestamp(h5f=None): + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + validation_error = series.ErrorSeries(error_name="validation_error", table_name="validation_error", + hdf5_file=h5f, index_names=('epoch','minibatch'), + title="Validation error indexed by epoch and minibatch", + store_timestamp=False) + + # (1,1), (1,2) etc. are (epoch, minibatch) index + validation_error.append((1,1), 32.0) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + table = h5f.getNode('/', 'validation_error') + + assert compare_lists(table.cols.epoch[:], [1]) + assert not ("timestamp" in dir(table.cols)) + assert "cpuclock" in dir(table.cols) + +def test_ErrorSeries_nocpuclock(h5f=None): + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + validation_error = series.ErrorSeries(error_name="validation_error", table_name="validation_error", + hdf5_file=h5f, index_names=('epoch','minibatch'), + title="Validation error indexed by epoch and minibatch", + store_cpuclock=False) + + # (1,1), (1,2) etc. 
are (epoch, minibatch) index + validation_error.append((1,1), 32.0) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + table = h5f.getNode('/', 'validation_error') + + assert compare_lists(table.cols.epoch[:], [1]) + assert not ("cpuclock" in dir(table.cols)) + assert "timestamp" in dir(table.cols) + +def test_AccumulatorSeriesWrapper_common_case(h5f=None): + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + validation_error = ErrorSeries(error_name="accumulated_validation_error", + table_name="accumulated_validation_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Validation error, summed every 3 minibatches, indexed by epoch and minibatch") + + accumulator = AccumulatorSeriesWrapper(base_series=validation_error, + reduce_every=3, reduce_function=numpy.sum) + + # (1,1), (1,2) etc. are (epoch, minibatch) index + accumulator.append((1,1), 32.0) + accumulator.append((1,2), 30.0) + accumulator.append((2,1), 28.0) + accumulator.append((2,2), 26.0) + accumulator.append((3,1), 24.0) + accumulator.append((3,2), 22.0) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + table = h5f.getNode('/', 'accumulated_validation_error') + + assert compare_lists(table.cols.epoch[:], [2,3]) + assert compare_lists(table.cols.minibatch[:], [1,2]) + assert compare_lists(table.cols.accumulated_validation_error[:], [90.0,72.0], floats=True) + +def test_BasicStatisticsSeries_common_case(h5f=None): + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + stats_series = BasicStatisticsSeries(table_name="b_vector_statistics", + hdf5_file=h5f, index_names=('epoch','minibatch'), + title="Basic statistics for b vector indexed by epoch and minibatch") + + # (1,1), (1,2) etc. 
are (epoch, minibatch) index + stats_series.append((1,1), [0.15, 0.20, 0.30]) + stats_series.append((1,2), [-0.18, 0.30, 0.58]) + stats_series.append((2,1), [0.18, -0.38, -0.68]) + stats_series.append((2,2), [0.15, 0.02, 1.9]) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + table = h5f.getNode('/', 'b_vector_statistics') + + assert compare_lists(table.cols.epoch[:], [1,1,2,2]) + assert compare_lists(table.cols.minibatch[:], [1,2,1,2]) + assert compare_lists(table.cols.mean[:], [0.21666667, 0.23333333, -0.29333332, 0.69], floats=True) + assert compare_lists(table.cols.min[:], [0.15000001, -0.18000001, -0.68000001, 0.02], floats=True) + assert compare_lists(table.cols.max[:], [0.30, 0.58, 0.18, 1.9], floats=True) + assert compare_lists(table.cols.std[:], [0.06236095, 0.31382939, 0.35640177, 0.85724366], floats=True) + +def test_SharedParamsStatisticsWrapper_commoncase(h5f=None): + import numpy.random + + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + stats = SharedParamsStatisticsWrapper(new_group_name="params", base_group="/", + arrays_names=('b1','b2','b3'), hdf5_file=h5f, + index_names=('epoch','minibatch')) + + b1 = DD({'value':numpy.random.rand(5)}) + b2 = DD({'value':numpy.random.rand(5)}) + b3 = DD({'value':numpy.random.rand(5)}) + stats.append((1,1), [b1,b2,b3]) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + b1_table = h5f.getNode('/params', 'b1') + b3_table = h5f.getNode('/params', 'b3') + + assert b1_table.cols.mean[0] - numpy.mean(b1.value) < 1e-3 + assert b3_table.cols.mean[0] - numpy.mean(b3.value) < 1e-3 + assert b1_table.cols.min[0] - numpy.min(b1.value) < 1e-3 + assert b3_table.cols.min[0] - numpy.min(b3.value) < 1e-3 + +def test_SharedParamsStatisticsWrapper_notimestamp(h5f=None): + import numpy.random + + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + stats = SharedParamsStatisticsWrapper(new_group_name="params", base_group="/", + arrays_names=('b1','b2','b3'), hdf5_file=h5f, + index_names=('epoch','minibatch'), + store_timestamp=False) + + b1 = DD({'value':numpy.random.rand(5)}) + b2 = DD({'value':numpy.random.rand(5)}) + b3 = DD({'value':numpy.random.rand(5)}) + stats.append((1,1), [b1,b2,b3]) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + b1_table = h5f.getNode('/params', 'b1') + b3_table = h5f.getNode('/params', 'b3') + + assert b1_table.cols.mean[0] - numpy.mean(b1.value) < 1e-3 + assert b3_table.cols.mean[0] - numpy.mean(b3.value) < 1e-3 + assert b1_table.cols.min[0] - numpy.min(b1.value) < 1e-3 + assert b3_table.cols.min[0] - numpy.min(b3.value) < 1e-3 + + assert not ('timestamp' in dir(b1_table.cols)) + +def test_get_desc(): + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + desc = series._get_description_with_n_ints_n_floats(("col1","col2"), ("col3","col4")) + + mytable = h5f.createTable('/', 'mytable', desc) + + # just make sure the columns are there... otherwise this will throw an exception + mytable.cols.col1 + mytable.cols.col2 + mytable.cols.col3 + mytable.cols.col4 + + try: + # this should fail... 
LocalDescription must be local to get_desc_etc + test = LocalDescription + assert False + except NameError: + assert True + + assert True + +def test_index_to_tuple_floaterror(): + try: + series._index_to_tuple(5.1) + assert False + except TypeError: + assert True + +def test_index_to_tuple_arrayok(): + tpl = series._index_to_tuple([1,2,3]) + assert type(tpl) == tuple and tpl[1] == 2 and tpl[2] == 3 + +def test_index_to_tuple_intbecomestuple(): + tpl = series._index_to_tuple(32) + + assert type(tpl) == tuple and tpl == (32,) + +def test_index_to_tuple_longbecomestuple(): + tpl = series._index_to_tuple(928374928374928L) + + assert type(tpl) == tuple and tpl == (928374928374928L,) + +if __name__ == '__main__': + import tempfile + test_get_desc() + test_ErrorSeries_common_case() + test_BasicStatisticsSeries_common_case() + test_AccumulatorSeriesWrapper_common_case() + test_SharedParamsStatisticsWrapper_commoncase()
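#################################################
# [Editor's addendum, not part of the original changeset] An end-to-end
# usage sketch combining ErrorSeries and SharedParamsStatisticsWrapper the
# way a training script might, assuming series.py is importable as
# `series`. The file name, parameter names, shapes and loop bounds are
# invented; any object exposing a .value ndarray (e.g. a theano.shared
# variable of that era, here faked by _FakeShared) can be passed to append().

import numpy
import tables
from series import ErrorSeries, SharedParamsStatisticsWrapper

class _FakeShared(object):
    """Minimal stand-in for a theano.shared variable: only holds .value."""
    def __init__(self, value):
        self.value = value

h5f = tables.openFile("experiment_series.h5", "w")

valid_error = ErrorSeries(error_name="validation_error",
                          table_name="validation_error",
                          hdf5_file=h5f, index_names=('epoch',))

param_stats = SharedParamsStatisticsWrapper(arrays_names=('layer1_W', 'layer1_b'),
                                            new_group_name="params",
                                            hdf5_file=h5f,
                                            index_names=('epoch',))

layer1_W = _FakeShared(numpy.random.rand(20, 10))
layer1_b = _FakeShared(numpy.zeros(10))

for epoch in range(3):
    # ... one epoch of training would update layer1_W.value / layer1_b.value ...
    valid_error.append((epoch,), 1.0 / (epoch + 1))
    param_stats.append((epoch,), [layer1_W, layer1_b])

h5f.close()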