comparison code_tutoriel/dae.py @ 0:fda5f787baa6

commit initial
author Dumitru Erhan <dumitru.erhan@gmail.com>
date Thu, 21 Jan 2010 11:26:43 -0500
1 """
2 This tutorial introduces denoising auto-encoders using Theano.
3
4 Denoising autoencoders can be used as building blocks for deep networks.
5 They are based on auto-encoders as the ones used in Bengio et al. 2007.
6 An autoencoder takes an input x and first maps it to a hidden representation
7 y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting
8 latent representation y is then mapped back to a "reconstructed" vector
9 z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight
10 matrix W' can optionally be constrained such that W' = W^T, in which case
11 the autoencoder is said to have tied weights. The network is trained such
12 that to minimize the reconstruction error (the error between x and z).
13
14 For the denosing autoencoder, during training, first x is corrupted into
15 \tilde{x}, where \tilde{x} is a partially destroyed version of x by means
16 of a stochastic mapping. Afterwards y is computed as before (using
17 \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction
18 error is now measured between z and the uncorrupted input x, which is
19 computed as the cross-entropy :
20 - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)]
21
22 For X iteration of the main program loop it takes *** minutes on an
23 Intel Core i7 and *** minutes on GPU (NVIDIA GTX 285 graphics processor).
24
25
26 References :
27 - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and
28 Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103,
29 2008
30 - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise
31 Training of Deep Networks, Advances in Neural Information Processing
32 Systems 19, 2007
33
34 """

import numpy
from theano import tensor
from theano.compile.sandbox import shared, pfunc
from theano.compile.sandbox.shared_randomstreams import RandomStreams
from theano.tensor import nnet
import pylearn.datasets.MNIST


try:
    # this tells Theano to use the GPU if possible
    from theano.sandbox.cuda import use
    use()
except Exception, e:
    print('Warning: Attempt to use GPU resulted in error "%s"' % str(e))

def load_mnist_batches(batch_size):
    """
    Load MNIST and split it into minibatches of size `batch_size`.

    We should remove the dependency on pylearn.datasets.MNIST and maybe
    provide a pickled version of the dataset instead.
    """
    mnist = pylearn.datasets.MNIST.train_valid_test()
    train_batches = [(mnist.train.x[i:i+batch_size], mnist.train.y[i:i+batch_size])
                     for i in xrange(0, len(mnist.train.x), batch_size)]
    valid_batches = [(mnist.valid.x[i:i+batch_size], mnist.valid.y[i:i+batch_size])
                     for i in xrange(0, len(mnist.valid.x), batch_size)]
    test_batches = [(mnist.test.x[i:i+batch_size], mnist.test.y[i:i+batch_size])
                    for i in xrange(0, len(mnist.test.x), batch_size)]
    return train_batches, valid_batches, test_batches
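

# A hedged sketch of an alternative loader that would drop the pylearn
# dependency, as suggested in the docstring above. It assumes a gzipped
# pickle file holding the three splits as
# ((train_x, train_y), (valid_x, valid_y), (test_x, test_y)); the file
# layout and this helper are assumptions, not part of the original tutorial.
def load_mnist_batches_from_pickle(path, batch_size):
    import cPickle
    import gzip
    f = gzip.open(path, 'rb')
    (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = cPickle.load(f)
    f.close()

    def make_batches(x, y):
        return [(x[i:i+batch_size], y[i:i+batch_size])
                for i in xrange(0, len(x), batch_size)]

    return (make_batches(train_x, train_y),
            make_batches(valid_x, valid_y),
            make_batches(test_x, test_y))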


class DAE(object):
    """Denoising Auto-Encoder class

    A denoising auto-encoder tries to reconstruct the input from a corrupted
    version of it, by first projecting the corrupted input into a latent
    space and then reprojecting it back into the input space. Please refer
    to Vincent et al., 2008 for more details. If x is the input, then
    equation (1) computes a partially destroyed version of x by means of a
    stochastic mapping q_D. Equation (2) computes the projection of the
    input into the latent space. Equation (3) computes the reconstruction
    of the input, while equation (4) computes the reconstruction error.

    .. latex-eqn:
        \tilde{x} ~ q_D(\tilde{x}|x)                                     (1)
        y = s(W \tilde{x} + b)                                           (2)
        z = s(W' y + b')                                                 (3)
        L(x,z) = - \sum_{k=1}^d [x_k \log z_k + (1-x_k) \log(1-z_k)]     (4)

    Tricks and rules of thumb for the DAE:
      - the learning rate should be explored on a logarithmic scale ...
    """

    def __init__(self, n_visible=784, n_hidden=500, lr=1e-1, input=None):
        """
        Initialize the DAE class by specifying the number of visible units
        (the dimension d of the input), the number of hidden units (the
        dimension d' of the latent or hidden space), an initial value for
        the learning rate, and by giving a symbolic description of the
        input. Such a symbolic description is not needed for a standalone
        DAE and can therefore be ignored. This feature is useful when
        stacking DAEs, since the input of intermediate layers can be
        symbolically described in terms of the hidden units of the previous
        layer. See the tutorial on SDAE for more details.

        :param n_visible: number of visible units
        :param n_hidden: number of hidden units
        :param lr: an initial value for the learning rate
        :param input: a symbolic description of the input or None
        """
        self.n_visible = n_visible
        self.n_hidden = n_hidden

        # create a Theano random generator that gives symbolic random values
        theano_rng = RandomStreams(seed=1234)
        # create a numpy random generator
        numpy_rng = numpy.random.RandomState(seed=52432)

        # initial values for weights and biases
        # note : W' was written as W_prime and b' as b_prime
        initial_W = numpy_rng.uniform(size=(n_visible, n_hidden))
        # transform W such that all values are between -.01 and .01
        initial_W = (initial_W*2.0 - 1.0)*.01
        initial_b = numpy.zeros(n_hidden)
        initial_W_prime = numpy_rng.uniform(size=(n_hidden, n_visible))
        # transform W_prime such that all values are between -.01 and .01
        initial_W_prime = (initial_W_prime*2.0 - 1.0)*.01
        initial_b_prime = numpy.zeros(n_visible)

        # theano shared variables for weights and biases
        self.W = shared(value=initial_W, name="W")
        self.b = shared(value=initial_b, name="b")
        self.W_prime = shared(value=initial_W_prime, name="W'")
        self.b_prime = shared(value=initial_b_prime, name="b'")

        # theano shared variable for the learning rate
        self.lr = shared(value=lr, name="learning_rate")

        # if no input is given, generate a variable representing the input
        if input is None:
            # we use a matrix because we expect a minibatch of several
            # examples, each example being a row
            x = tensor.dmatrix(name='input')
        else:
            x = input
        # Equation (1)
        # note : the first argument of theano_rng.binomial is the shape (size)
        #        of the random numbers it should produce,
        #        the second argument is the number of trials,
        #        the third argument is the probability of success of any trial
        #
        # this will produce an array of 0s and 1s where 1 has a
        # probability of 0.9 and 0 has a probability of 0.1
        tilde_x = theano_rng.binomial(x.shape, 1, 0.9) * x
        # Equation (2)
        # note : y is stored as an attribute of the class so that it can be
        #        used later when stacking DAEs.
        self.y = nnet.sigmoid(tensor.dot(tilde_x, self.W) + self.b)
        # Equation (3)
        z = nnet.sigmoid(tensor.dot(self.y, self.W_prime) + self.b_prime)
        # Equation (4)
        L = - tensor.sum(x*tensor.log(z) + (1-x)*tensor.log(1-z), axis=1)
        # note : L is now a vector, where each element is the cross-entropy
        #        cost of the reconstruction of the corresponding example of
        #        the minibatch. We need to sum all of these to get the cost
        #        of the minibatch
        cost = tensor.sum(L)
        # parameters with respect to which we need to compute the gradient
        self.params = [self.W, self.b, self.W_prime, self.b_prime]
        # use theano automatic differentiation to get the gradients
        gW, gb, gW_prime, gb_prime = tensor.grad(cost, self.params)
        # update the parameters in the direction opposite to the gradient,
        # scaled by the learning rate
        updated_W = self.W - gW * self.lr
        updated_b = self.b - gb * self.lr
        updated_W_prime = self.W_prime - gW_prime * self.lr
        updated_b_prime = self.b_prime - gb_prime * self.lr

        # compile the functions that evaluate the symbolic description of
        # one update step and of the cost alone
        self.update = pfunc(params=[x], outputs=cost,
                            updates={self.W: updated_W,
                                     self.b: updated_b,
                                     self.W_prime: updated_W_prime,
                                     self.b_prime: updated_b_prime})
        self.get_cost = pfunc(params=[x], outputs=cost)


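# A minimal usage sketch of the DAE class on random data. This helper is not
# part of the original tutorial; it only assumes Theano is available and
# performs a single update step on a small random minibatch, printing the
# (stochastic) reconstruction cost before and after the step.
def _sanity_check_dae():
    rng = numpy.random.RandomState(0)
    x = rng.uniform(size=(16, 784))          # one toy minibatch of 16 examples
    dae = DAE(n_visible=784, n_hidden=50, lr=1e-2)
    cost_before = dae.get_cost(x)
    dae.update(x)                            # one gradient step
    cost_after = dae.get_cost(x)
    print('sanity check: cost %f -> %f' % (cost_before, cost_after))

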
def train_DAE_mnist():
    """
    Trains a DAE on the MNIST dataset (http://yann.lecun.com/exdb/mnist)
    """

    # load the dataset as minibatches
    train_batches, valid_batches, test_batches = load_mnist_batches(batch_size=16)

    # Create a denoising auto-encoder with 28*28 = 784 input units and 500
    # units in the hidden (latent) layer; the learning rate is set to 1e-2
    dae = DAE(n_visible=784, n_hidden=500, lr=1e-2)

    # Number of iterations (epochs) to run
    n_iter = 30
    best_valid_score = float('inf')
    test_score = float('inf')
    for i in xrange(n_iter):
        # train once over the dataset
        for x, y in train_batches:
            cost = dae.update(x)

        # compute the validation error
        valid_cost = 0.
        for x, y in valid_batches:
            valid_cost = valid_cost + dae.get_cost(x)
        valid_cost = valid_cost / len(valid_batches)
        print('epoch %i, validation reconstruction error %f' % (i, valid_cost))

        if valid_cost < best_valid_score:
            best_valid_score = valid_cost
            # compute the test error of the new best model
            test_score = 0.
            for x, y in test_batches:
                test_score = test_score + dae.get_cost(x)
            test_score = test_score / len(test_batches)
            print('epoch %i, test error of best model %f' % (i, test_score))

    print('Optimization done. Best validation score %f, test performance %f' %
          (best_valid_score, test_score))


if __name__ == "__main__":
    train_DAE_mnist()