Mercurial > ift6266
comparison code_tutoriel/dae.py @ 0:fda5f787baa6
commit initial
author | Dumitru Erhan <dumitru.erhan@gmail.com> |
---|---|
date | Thu, 21 Jan 2010 11:26:43 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:fda5f787baa6 |
---|---|
1 """ | |
2 This tutorial introduces denoising auto-encoders using Theano. | |
3 | |
4 Denoising autoencoders can be used as building blocks for deep networks. | |
5 They are based on auto-encoders such as the ones used in Bengio et al. 2007. | |
6 An autoencoder takes an input x and first maps it to a hidden representation | |
7 y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting | |
8 latent representation y is then mapped back to a "reconstructed" vector | |
9 z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight | |
10 matrix W' can optionally be constrained such that W' = W^T, in which case | |
11 the autoencoder is said to have tied weights. The network is trained | |
12 to minimize the reconstruction error (the error between x and z). | |
13 | |
14 For the denoising autoencoder, during training, first x is corrupted into | |
15 \tilde{x}, where \tilde{x} is a partially destroyed version of x by means | |
16 of a stochastic mapping. Afterwards y is computed as before (using | |
17 \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction | |
18 error is now measured between z and the uncorrupted input x, which is | |
19 computed as the cross-entropy : | |
20 - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)] | |
21 | |
22 For X iterations of the main program loop it takes *** minutes on an | |
23 Intel Core i7 and *** minutes on GPU (NVIDIA GTX 285 graphics processor). | |
24 | |
25 | |
26 References : | |
27 - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and | |
28 Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103, | |
29 2008 | |
30 - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise | |
31 Training of Deep Networks, Advances in Neural Information Processing | |
32 Systems 19, 2007 | |
33 | |
34 """ | |
35 | |
36 import numpy | |
37 from theano import tensor | |
38 from theano.compile.sandbox import shared, pfunc | |
39 from theano.compile.sandbox.shared_randomstreams import RandomStreams | |
40 from theano.tensor import nnet | |
41 import pylearn.datasets.MNIST | |
42 | |
43 | |
try:
    # This tells theano to use the GPU if possible.
    from theano.sandbox.cuda import use
    use()
except Exception as e:
    # Best-effort: any failure here (missing CUDA backend, no device, ...)
    # is reported and the script carries on without the GPU.
    # `except ... as e` is valid on Python 2.6+ and Python 3, unlike the
    # Python-2-only `except Exception, e` form.
    print('Warning: Attempt to use GPU resulted in error "%s"' % str(e))
50 | |
51 | |
def load_mnist_batches(batch_size):
    """
    Load MNIST through pylearn and cut each split into minibatches.

    We should remove the dependency on pylearn.datasets.MNIST .. and maybe
    provide a pickled version of the dataset..

    :param batch_size: number of examples per minibatch; the last minibatch
                       of a split holds whatever examples are left over
    :return: a tuple ``(train_batches, valid_batches, test_batches)``, each
             a list of ``(x, y)`` pairs sliced from the corresponding split
    """
    mnist = pylearn.datasets.MNIST.train_valid_test()

    def make_batches(split):
        # Slice inputs and targets with the same window so they stay aligned.
        return [(split.x[i:i + batch_size], split.y[i:i + batch_size])
                for i in range(0, len(split.x), batch_size)]

    train_batches = make_batches(mnist.train)
    valid_batches = make_batches(mnist.valid)
    test_batches = make_batches(mnist.test)
    return train_batches, valid_batches, test_batches
65 | |
66 | |
67 | |
68 | |
class DAE():
    r"""Denoising Auto-Encoder class

    A denoising autoencoder tries to reconstruct the input from a corrupted
    version of it by projecting it first in a latent space and reprojecting
    it in the input space. Please refer to Vincent et al., 2008 for more
    details. If x is the input then equation (1) computes a partially
    destroyed version of x by means of a stochastic mapping q_D. Equation (2)
    computes the projection of the input into the latent space. Equation (3)
    computes the reconstruction of the input, while equation (4) computes the
    reconstruction error.

    .. latex-eqn:
        \tilde{x} ~ q_D(\tilde{x}|x)                                     (1)
        y = s(W \tilde{x} + b)                                           (2)
        z = s(W' y + b')                                                 (3)
        L(x,z) = -sum_{k=1}^d [x_k \log z_k + (1-x_k) \log( 1-z_k)]      (4)

    Tricks and thumbrules for DAE
        - learning rate should be used in a logarithmic scale ...
    """

    @staticmethod
    def _small_uniform(rng, shape, scale=.01):
        """Draw an array of `shape` uniformly from [-scale, scale)."""
        return (rng.uniform(size=shape) * 2.0 - 1.0) * scale

    def __init__(self, n_visible=784, n_hidden=500, lr=1e-1, input=None,
                 corruption_level=0.1):
        """
        Initialize the DAE class by specifying the number of visible units
        (the dimension d of the input), the number of hidden units (the
        dimension d' of the latent or hidden space), an initial value for the
        learning rate and by giving a symbolic description of the input. Such
        a symbolic description is of no importance for the simple DAE and
        therefore can be ignored. This feature is useful when stacking DAEs,
        since the input of intermediate layers can be symbolically described
        in terms of the hidden units of the previous layer. See the tutorial
        on SDAE for more details.

        :param n_visible: number of visible units
        :param n_hidden: number of hidden units
        :param lr: an initial value for the learning rate
        :param input: a symbolic description of the input or None
        :param corruption_level: probability (a plain number in [0, 1]) that
                                 an input component is zeroed out; the
                                 default 0.1 keeps each component with
                                 probability 0.9
        :raises ValueError: if corruption_level is outside [0, 1]
        """
        if not 0.0 <= corruption_level <= 1.0:
            raise ValueError(
                'corruption_level must be in [0, 1], got %r'
                % (corruption_level,))

        self.n_visible = n_visible
        self.n_hidden = n_hidden

        # create a Theano random generator that gives symbolic random values
        theano_rng = RandomStreams(seed=1234)
        # create a numpy random generator
        numpy_rng = numpy.random.RandomState(seed=52432)

        # initial values for weights and biases
        # note : W' was written as W_prime and b' as b_prime
        # W is drawn before W_prime so the stream of random numbers is
        # consumed in a fixed order; all values lie between -.01 and .01
        initial_W = self._small_uniform(numpy_rng, (n_visible, n_hidden))
        initial_b = numpy.zeros(n_hidden)
        initial_W_prime = self._small_uniform(numpy_rng,
                                              (n_hidden, n_visible))
        initial_b_prime = numpy.zeros(n_visible)

        # theano shared variables for weights and biases
        self.W = shared(value=initial_W, name="W")
        self.b = shared(value=initial_b, name="b")
        self.W_prime = shared(value=initial_W_prime, name="W'")
        self.b_prime = shared(value=initial_b_prime, name="b'")

        # theano shared variable for the learning rate
        self.lr = shared(value=lr, name="learning_rate")

        # if no input is given generate a variable representing the input
        # (identity test: `input` may be a symbolic variable, so it must not
        # be compared to None with ==)
        if input is None:
            # we use a matrix because we expect a minibatch of several
            # examples, each example being a row
            x = tensor.dmatrix(name='input')
        else:
            x = input

        # Equation (1)
        # note : first argument of theano_rng.binomial is the shape (size)
        #        of random numbers that it should produce
        #        second argument is the number of trials
        #        third argument is the probability of success of any trial
        #
        #        this will produce an array of 0s and 1s where 1 has a
        #        probability of 1 - corruption_level (0.9 by default) and
        #        0 has a probability of corruption_level (0.1 by default)
        tilde_x = theano_rng.binomial(x.shape, 1,
                                      1.0 - corruption_level) * x
        # Equation (2)
        # note : y is stored as an attribute of the class so that it can be
        #        used later when stacking DAEs.
        self.y = nnet.sigmoid(tensor.dot(tilde_x, self.W) + self.b)
        # Equation (3)
        z = nnet.sigmoid(tensor.dot(self.y, self.W_prime) + self.b_prime)
        # Equation (4)
        L = - tensor.sum(x * tensor.log(z) + (1 - x) * tensor.log(1 - z),
                         axis=1)
        # note : L is now a vector, where each element is the cross-entropy
        #        cost of the reconstruction of the corresponding example of
        #        the minibatch. We need to sum all these to get the cost of
        #        the minibatch
        cost = tensor.sum(L)
        # parameters with respect to which we need to compute the gradient
        self.params = [self.W, self.b, self.W_prime, self.b_prime]
        # use theano automatic differentiation to get the gradients
        gW, gb, gW_prime, gb_prime = tensor.grad(cost, self.params)
        # update the parameters in the direction of the gradient using the
        # learning rate
        updated_W = self.W - gW * self.lr
        updated_b = self.b - gb * self.lr
        updated_W_prime = self.W_prime - gW_prime * self.lr
        updated_b_prime = self.b_prime - gb_prime * self.lr

        # defining the function that evaluates the symbolic description of
        # one update step; it returns the minibatch cost
        self.update = pfunc(params=[x], outputs=cost, updates={
            self.W: updated_W,
            self.b: updated_b,
            self.W_prime: updated_W_prime,
            self.b_prime: updated_b_prime})
        # note : get_cost evaluates the same `cost` expression as update
        #        (built from the corrupted input tilde_x) but applies no
        #        parameter updates
        self.get_cost = pfunc(params=[x], outputs=cost)
185 | |
186 | |
187 | |
188 | |
189 | |
190 | |
191 | |
192 | |
193 | |
194 | |
195 | |
def train_DAE_mnist(batch_size=16, n_hidden=500, lr=1e-2, n_iter=30):
    """
    Trains a DAE on the MNIST dataset (http://yann.lecun.com/exdb/mnist)

    After every epoch the mean per-minibatch reconstruction cost on the
    validation set is printed; whenever it improves, the same quantity is
    computed and printed for the test set.

    :param batch_size: number of examples per minibatch
    :param n_hidden: number of units in the hidden (latent) layer
    :param lr: learning rate handed to the DAE
    :param n_iter: number of iterations (epochs) to run
    """

    # load dataset as batches
    train_batches, valid_batches, test_batches = load_mnist_batches(
        batch_size=batch_size)

    # Create a denoising auto-encoder with 28*28 = 784 input units, and
    # n_hidden (500 by default) units in the hidden layer (latent layer);
    # the learning rate defaults to 1e-2
    dae = DAE(n_visible=784, n_hidden=n_hidden, lr=lr)

    def mean_cost(batches):
        # Average of dae.get_cost over the minibatches; the labels that
        # come with each minibatch are not used.
        total = 0.
        for x, _ in batches:
            total = total + dae.get_cost(x)
        return total / len(batches)

    best_valid_score = float('inf')
    test_score = float('inf')
    for i in range(n_iter):
        # train once over the dataset
        for x, _ in train_batches:
            dae.update(x)

        # compute validation error
        valid_cost = mean_cost(valid_batches)
        print('epoch %i, validation reconstruction error %f ' % (i, valid_cost))

        if valid_cost < best_valid_score:
            best_valid_score = valid_cost
            # compute test error !?
            test_score = mean_cost(test_batches)
            print('epoch %i, test error of best model %f' % (i, test_score))

    print('Optimization done. Best validation score %f, test performance %f' %
          (best_valid_score, test_score))


if __name__ == "__main__":
    train_DAE_mnist()
240 |