"""
This tutorial introduces denoising auto-encoders using Theano.

Denoising autoencoders can be used as building blocks for deep networks.
They are based on auto-encoders such as those used in Bengio et al. 2007.
An autoencoder takes an input x and first maps it to a hidden representation
y = f_{\theta}(x) = s(Wx + b), parameterized by \theta = {W, b}. The
resulting latent representation y is then mapped back to a "reconstructed"
vector z \in [0,1]^d in input space, z = g_{\theta'}(y) = s(W'y + b'). The
weight matrix W' can optionally be constrained such that W' = W^T, in which
case the autoencoder is said to have tied weights. The network is trained
to minimize the reconstruction error (the error between x and z).

For the denoising autoencoder, during training, x is first corrupted into
\tilde{x}, a partially destroyed version of x obtained by means of a
stochastic mapping. Afterwards y is computed as before (using \tilde{x}),
y = s(W\tilde{x} + b), and z as s(W'y + b'). The reconstruction error is
now measured between z and the uncorrupted input x, and is computed as the
cross-entropy:

    - \sum_{k=1}^d [ x_k \log z_k + (1-x_k) \log(1-z_k) ]
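
As an illustrative sketch in plain numpy (not the Theano code below; the
0.9 keep-probability and the weight ranges match those used below, and
tied weights W' = W^T are assumed), one corruption/reconstruction step and
its error could be computed as:

    import numpy
    rng = numpy.random.RandomState(0)
    s = lambda a: 1. / (1. + numpy.exp(-a))       # the sigmoid s(.)
    x = rng.uniform(size=784)                     # a toy input in [0,1]^d
    W = rng.uniform(low=-.01, high=.01, size=(784, 500))
    b = numpy.zeros(500)
    b_prime = numpy.zeros(784)
    tilde_x = rng.binomial(1, 0.9, size=784) * x  # corruption q_D
    y = s(numpy.dot(tilde_x, W) + b)              # hidden representation
    z = s(numpy.dot(y, W.T) + b_prime)            # reconstruction (tied weights)
    L = -numpy.sum(x * numpy.log(z) + (1 - x) * numpy.log(1 - z))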

For X iterations of the main program loop it takes *** minutes on an
Intel Core i7 and *** minutes on a GPU (NVIDIA GTX 285 graphics processor).

References:
  - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and
    Composing Robust Features with Denoising Autoencoders, ICML'08,
    1096-1103, 2008.
  - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise
    Training of Deep Networks, Advances in Neural Information Processing
    Systems 19, 2007.
"""

import numpy
from theano import tensor
from theano.compile.sandbox import shared, pfunc
from theano.compile.sandbox.shared_randomstreams import RandomStreams
from theano.tensor import nnet
import pylearn.datasets.MNIST


try:
    # this tells Theano to use the GPU if possible
    from theano.sandbox.cuda import use
    use()
except Exception, e:
    print('Warning: Attempt to use GPU resulted in error "%s"' % str(e))


def load_mnist_batches(batch_size):
    """
    Load MNIST and split each of its train/valid/test sets into
    minibatches of size `batch_size`.

    TODO: we should remove the dependency on pylearn.datasets.MNIST and
    maybe provide a pickled version of the dataset.
    """
    mnist = pylearn.datasets.MNIST.train_valid_test()
    train_batches = [(mnist.train.x[i:i + batch_size], mnist.train.y[i:i + batch_size])
                     for i in xrange(0, len(mnist.train.x), batch_size)]
    valid_batches = [(mnist.valid.x[i:i + batch_size], mnist.valid.y[i:i + batch_size])
                     for i in xrange(0, len(mnist.valid.x), batch_size)]
    test_batches = [(mnist.test.x[i:i + batch_size], mnist.test.y[i:i + batch_size])
                    for i in xrange(0, len(mnist.test.x), batch_size)]
    return train_batches, valid_batches, test_batches


class DAE(object):
    """Denoising Auto-Encoder class

    A denoising autoencoder tries to reconstruct the input from a corrupted
    version of it, by first projecting it into a latent space and then
    reprojecting it back into the input space. Please refer to Vincent et
    al., 2008 for more details. If x is the input, then equation (1)
    computes a partially destroyed version of x by means of a stochastic
    mapping q_D. Equation (2) computes the projection of the input into the
    latent space, equation (3) computes the reconstruction of the input,
    and equation (4) computes the reconstruction error.

    .. latex-eqn:
        \tilde{x} ~ q_D(\tilde{x}|x)                                  (1)
        y = s(W \tilde{x} + b)                                        (2)
        z = s(W' y + b')                                              (3)
        L(x,z) = - \sum_{k=1}^d [x_k \log z_k + (1-x_k) \log(1-z_k)]  (4)

    Tricks and rules of thumb for DAEs:
      - the learning rate should be searched on a logarithmic scale ...
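
    A minimal usage sketch (mirroring train_DAE_mnist below; `batches` is
    assumed to be a list of (x, y) minibatch pairs):

        dae = DAE(n_visible=784, n_hidden=500, lr=1e-2)
        for x, y in batches:
            cost = dae.update(x)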
    """

    def __init__(self, n_visible=784, n_hidden=500, lr=1e-1, input=None):
        """
        Initialize the DAE class by specifying the number of visible units
        (the dimension d of the input), the number of hidden units (the
        dimension d' of the latent or hidden space), an initial value for
        the learning rate, and, optionally, a symbolic description of the
        input. The symbolic description can be ignored for a standalone
        DAE; it is useful when stacking DAEs, since the input of an
        intermediate layer can then be described symbolically in terms of
        the hidden units of the previous layer. See the tutorial on SDAE
        for more details.

        :param n_visible: number of visible units
        :param n_hidden: number of hidden units
        :param lr: an initial value for the learning rate
        :param input: a symbolic description of the input, or None
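
        For example, when stacking DAEs (a sketch only, with illustrative
        layer sizes; see the SDAE tutorial for the full treatment):

            dae1 = DAE(n_visible=784, n_hidden=500)
            # the second layer reads the first layer's hidden units
            dae2 = DAE(n_visible=500, n_hidden=250, input=dae1.y)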
        """
        self.n_visible = n_visible
        self.n_hidden = n_hidden

        # create a Theano random generator that gives symbolic random values
        theano_rng = RandomStreams(seed=1234)
        # create a numpy random generator
        numpy_rng = numpy.random.RandomState(seed=52432)

        # initial values for weights and biases
        # note : W' is written as W_prime and b' as b_prime
        initial_W = numpy_rng.uniform(size=(n_visible, n_hidden))
        # transform W such that all values are between -.01 and .01
        initial_W = (initial_W * 2.0 - 1.0) * .01
        initial_b = numpy.zeros(n_hidden)
        initial_W_prime = numpy_rng.uniform(size=(n_hidden, n_visible))
        # transform W_prime such that all values are between -.01 and .01
        initial_W_prime = (initial_W_prime * 2.0 - 1.0) * .01
        initial_b_prime = numpy.zeros(n_visible)

        # theano shared variables for weights and biases
        self.W = shared(value=initial_W, name="W")
        self.b = shared(value=initial_b, name="b")
        self.W_prime = shared(value=initial_W_prime, name="W'")
        self.b_prime = shared(value=initial_b_prime, name="b'")

        # theano shared variable for the learning rate
        self.lr = shared(value=lr, name="learning_rate")

        # if no input is given, generate a variable representing the input
        if input is None:
            # we use a matrix because we expect a minibatch of several
            # examples, each example being a row
            x = tensor.dmatrix(name='input')
        else:
            x = input
        # Equation (1)
        # note : the first argument of theano_rng.binomial is the shape
        #        (size) of the random numbers to produce, the second
        #        argument is the number of trials, and the third argument
        #        is the probability of success of each trial
        #
        # this produces an array of 0s and 1s, where each entry is 1 with
        # probability 0.9 and 0 with probability 0.1
        tilde_x = theano_rng.binomial(x.shape, 1, 0.9) * x
        # Equation (2)
        # note : y is stored as an attribute of the class so that it can be
        #        used later when stacking DAEs
        self.y = nnet.sigmoid(tensor.dot(tilde_x, self.W) + self.b)
        # Equation (3)
        z = nnet.sigmoid(tensor.dot(self.y, self.W_prime) + self.b_prime)
        # Equation (4)
        L = -tensor.sum(x * tensor.log(z) + (1 - x) * tensor.log(1 - z), axis=1)
        # note : L is now a vector, where each element is the cross-entropy
        #        cost of the reconstruction of the corresponding example of
        #        the minibatch; we sum them all to get the cost of the
        #        minibatch
        cost = tensor.sum(L)
        # parameters with respect to which we need to compute the gradient
        self.params = [self.W, self.b, self.W_prime, self.b_prime]
        # use theano's automatic differentiation to get the gradients
        gW, gb, gW_prime, gb_prime = tensor.grad(cost, self.params)
        # take a step against the gradient, scaled by the learning rate
        updated_W = self.W - gW * self.lr
        updated_b = self.b - gb * self.lr
        updated_W_prime = self.W_prime - gW_prime * self.lr
        updated_b_prime = self.b_prime - gb_prime * self.lr

        # compile the function that evaluates the symbolic description of
        # one update step
        self.update = pfunc(params=[x], outputs=cost,
                            updates={self.W: updated_W,
                                     self.b: updated_b,
                                     self.W_prime: updated_W_prime,
                                     self.b_prime: updated_b_prime})
        self.get_cost = pfunc(params=[x], outputs=cost)


def train_DAE_mnist():
    """
    Trains a DAE on the MNIST dataset (http://yann.lecun.com/exdb/mnist).
    """

    # load the dataset as minibatches
    train_batches, valid_batches, test_batches = load_mnist_batches(batch_size=16)

    # create a denoising auto-encoder with 28*28 = 784 input units and 500
    # units in the hidden (latent) layer; the learning rate is set to 1e-2
    dae = DAE(n_visible=784, n_hidden=500, lr=1e-2)

    # number of iterations (epochs) to run
    n_iter = 30
    best_valid_score = float('inf')
    test_score = float('inf')
    for i in xrange(n_iter):
        # train once over the dataset
        for x, y in train_batches:
            cost = dae.update(x)

        # compute the validation error
        valid_cost = 0.
        for x, y in valid_batches:
            valid_cost = valid_cost + dae.get_cost(x)
        valid_cost = valid_cost / len(valid_batches)
        print('epoch %i, validation reconstruction error %f' % (i, valid_cost))

        if valid_cost < best_valid_score:
            best_valid_score = valid_cost
            # compute the test error of the best model so far
            test_score = 0.
            for x, y in test_batches:
                test_score = test_score + dae.get_cost(x)
            test_score = test_score / len(test_batches)
            print('epoch %i, test error of best model %f' % (i, test_score))

    print('Optimization done. Best validation score %f, test performance %f' %
          (best_valid_score, test_score))


if __name__ == "__main__":
    train_DAE_mnist()