Mercurial > ift6266

comparison code_tutoriel/dA.py @ 165:4bc5eeec6394
(comparing 164:e3de934a98b6 with 165:4bc5eeec6394)
Updating the tutorial code to the latest revisions.

author: Dumitru Erhan <dumitru.erhan@gmail.com>
date:   Fri, 26 Feb 2010 13:55:27 -0500
1 """ | |
2 This tutorial introduces denoising auto-encoders (dA) using Theano. | |
3 | |
4 Denoising autoencoders are the building blocks for SdA. | |
5 They are based on auto-encoders as the ones used in Bengio et al. 2007. | |
6 An autoencoder takes an input x and first maps it to a hidden representation | |
7 y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting | |
8 latent representation y is then mapped back to a "reconstructed" vector | |
9 z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight | |
10 matrix W' can optionally be constrained such that W' = W^T, in which case | |
11 the autoencoder is said to have tied weights. The network is trained such | |
12 that to minimize the reconstruction error (the error between x and z). | |
13 | |
14 For the denosing autoencoder, during training, first x is corrupted into | |
15 \tilde{x}, where \tilde{x} is a partially destroyed version of x by means | |
16 of a stochastic mapping. Afterwards y is computed as before (using | |
17 \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction | |
18 error is now measured between z and the uncorrupted input x, which is | |
19 computed as the cross-entropy : | |
20 - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)] | |
21 | |
22 | |
23 References : | |
24 - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and | |
25 Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103, | |
26 2008 | |
27 - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise | |
28 Training of Deep Networks, Advances in Neural Information Processing | |
29 Systems 19, 2007 | |
30 | |
31 """ | |

import numpy, time, cPickle, gzip

import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

from logistic_sgd import load_data
from utils import tile_raster_images

import PIL.Image


class dA(object):
    """Denoising Auto-Encoder class (dA)

    A denoising autoencoder tries to reconstruct the input from a corrupted
    version of it by first projecting it into a latent space and then
    reprojecting it back into the input space. Please refer to Vincent et
    al., 2008 for more details. If x is the input, then equation (1) computes
    a partially destroyed version of x by means of a stochastic mapping q_D.
    Equation (2) computes the projection of the input into the latent space.
    Equation (3) computes the reconstruction of the input, while equation (4)
    computes the reconstruction error.

    .. math::

        \tilde{x} \sim q_D(\tilde{x}|x)                                   (1)

        y = s(W \tilde{x} + b)                                            (2)

        z = s(W' y + b')                                                  (3)

        L(x,z) = -\sum_{k=1}^d [x_k \log z_k + (1-x_k) \log(1-z_k)]       (4)

    """

    def __init__(self, numpy_rng, theano_rng=None, input=None,
                 n_visible=784, n_hidden=500,
                 W=None, bhid=None, bvis=None):
        """
        Initialize the dA class by specifying the number of visible units (the
        dimension d of the input), the number of hidden units (the dimension
        d' of the latent or hidden space) and the corruption level. The
        constructor also receives symbolic variables for the input, weights
        and biases. Such symbolic variables are useful when, for example, the
        input is the result of some computation, or when weights are shared
        between the dA and an MLP layer. When dealing with SdAs this always
        happens: the dA on layer 2 gets as input the output of the dA on
        layer 1, and the weights of the dA are used in the second stage of
        training to construct an MLP.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to generate
                          weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `numpy_rng`

        :type input: theano.tensor.TensorType
        :param input: a symbolic description of the input, or None for a
                      standalone dA

        :type n_visible: int
        :param n_visible: number of visible units

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type W: theano.tensor.TensorType
        :param W: Theano variable pointing to a set of weights that should be
                  shared between the dA and another architecture; if the dA
                  should be standalone set this to None

        :type bhid: theano.tensor.TensorType
        :param bhid: Theano variable pointing to a set of bias values (for
                     hidden units) that should be shared between the dA and
                     another architecture; if the dA should be standalone set
                     this to None

        :type bvis: theano.tensor.TensorType
        :param bvis: Theano variable pointing to a set of bias values (for
                     visible units) that should be shared between the dA and
                     another architecture; if the dA should be standalone set
                     this to None


        """
        self.n_visible = n_visible
        self.n_hidden = n_hidden

        # create a Theano random generator that gives symbolic random values
        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # note : W' was written as `W_prime` and b' as `b_prime`
        if not W:
            # W is initialized with `initial_W`, which is uniformly sampled
            # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible);
            # the output of uniform is converted using asarray to dtype
            # theano.config.floatX so that the code is runnable on GPU
            initial_W = numpy.asarray(numpy_rng.uniform(
                      low  = -numpy.sqrt(6. / (n_hidden + n_visible)),
                      high =  numpy.sqrt(6. / (n_hidden + n_visible)),
                      size = (n_visible, n_hidden)), dtype=theano.config.floatX)
            W = theano.shared(value=initial_W, name='W')

        if not bvis:
            bvis = theano.shared(value=numpy.zeros(n_visible,
                                         dtype=theano.config.floatX))

        if not bhid:
            bhid = theano.shared(value=numpy.zeros(n_hidden,
                                        dtype=theano.config.floatX))


        self.W = W
        # b corresponds to the bias of the hidden units
        self.b = bhid
        # b_prime corresponds to the bias of the visible units
        self.b_prime = bvis
        # tied weights, therefore W_prime is W transpose
        self.W_prime = self.W.T
        self.theano_rng = theano_rng
        # if no input is given, generate a variable representing the input
        if input is None:
            # we use a matrix because we expect a minibatch of several
            # examples, each example being a row
            self.x = T.dmatrix(name='input')
        else:
            self.x = input

        self.params = [self.W, self.b, self.b_prime]

    def get_corrupted_input(self, input, corruption_level):
        """ This function keeps ``1 - corruption_level`` entries of the input
        unchanged and zeroes out a randomly selected subset of size
        ``corruption_level``.
        Note : the first argument of theano_rng.binomial is the shape (size)
               of the random numbers it should produce;
               the second argument is the number of trials;
               the third argument is the probability of success of any trial.

               This produces an array of 0s and 1s, where 1 occurs with
               probability ``1 - corruption_level`` and 0 with probability
               ``corruption_level``.
        """
        return self.theano_rng.binomial(size=input.shape, n=1,
                                        prob=1 - corruption_level) * input


    def get_hidden_values(self, input):
        """ Computes the values of the hidden layer """
        return T.nnet.sigmoid(T.dot(input, self.W) + self.b)

    def get_reconstructed_input(self, hidden):
        """ Computes the reconstructed input given the values of the hidden
        layer """
        return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime)

    def get_cost_updates(self, corruption_level, learning_rate):
        """ This function computes the cost and the updates for one training
        step of the dA """

        tilde_x = self.get_corrupted_input(self.x, corruption_level)
        y = self.get_hidden_values(tilde_x)
        z = self.get_reconstructed_input(y)
        # note : we sum over the size of a datapoint; if we are using
        #        minibatches, L will be a vector, with one entry per example
        #        in the minibatch
        L = -T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1)
        # note : L is now a vector, where each element is the cross-entropy
        #        cost of the reconstruction of the corresponding example of
        #        the minibatch. We need to compute the average of all these
        #        to get the cost of the minibatch
        cost = T.mean(L)

        # compute the gradients of the cost of the `dA` with respect
        # to its parameters
        gparams = T.grad(cost, self.params)
        # generate the dictionary of updates
        updates = {}
        for param, gparam in zip(self.params, gparams):
            updates[param] = param - learning_rate * gparam

        return (cost, updates)


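# A minimal sketch of how the class above is used (the `test_dA` function
# below does the same thing on MNIST, with minibatches and two corruption
# levels). The names `rng`, `x` and `train_step` are only illustrative:
#
#   rng = numpy.random.RandomState(123)
#   x = T.matrix('x')
#   da = dA(numpy_rng=rng, input=x, n_visible=784, n_hidden=500)
#   cost, updates = da.get_cost_updates(corruption_level=0.3,
#                                       learning_rate=0.1)
#   train_step = theano.function([x], cost, updates=updates)
#   # each call train_step(batch) performs one SGD step on W, b and b_prime
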
def test_dA(learning_rate=0.1, training_epochs=15, dataset='mnist.pkl.gz'):

    """
    This demo is tested on MNIST

    :type learning_rate: float
    :param learning_rate: learning rate used for training the denoising
                          autoencoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type dataset: string
    :param dataset: path to the pickled dataset

    """
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]

    batch_size = 20    # size of the minibatch

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.value.shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')      # the data is presented as rasterized images

    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x,
            n_visible=28 * 28, n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.,
                                        learning_rate=learning_rate)


    train_da = theano.function([index], cost, updates=updates,
        givens={x: train_set_x[index * batch_size: (index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through the training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()

    training_time = (end_time - start_time)

    print ('Training took %f minutes' % (training_time / 60.))

    image = PIL.Image.fromarray(tile_raster_images(X=da.W.value.T,
                img_shape=(28, 28), tile_shape=(10, 10),
                tile_spacing=(1, 1)))
    image.save('filters_corruption_0.png')

    #####################################
    # BUILDING THE MODEL CORRUPTION 30% #
    #####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x,
            n_visible=28 * 28, n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.3,
                                        learning_rate=learning_rate)


    train_da = theano.function([index], cost, updates=updates,
        givens={x: train_set_x[index * batch_size: (index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through the training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()

    training_time = (end_time - start_time)

    print ('Training took %f minutes' % (training_time / 60.))

    image = PIL.Image.fromarray(tile_raster_images(X=da.W.value.T,
                img_shape=(28, 28), tile_shape=(10, 10),
                tile_spacing=(1, 1)))
    image.save('filters_corruption_30.png')


if __name__ == '__main__':
    test_dA()