comparison code_tutoriel/dA.py @ 165:4bc5eeec6394

Updating the tutorial code to the latest revisions.
author Dumitru Erhan <dumitru.erhan@gmail.com>
date Fri, 26 Feb 2010 13:55:27 -0500
1 """
2 This tutorial introduces denoising auto-encoders (dA) using Theano.
3
4 Denoising autoencoders are the building blocks for SdA.
5 They are based on auto-encoders as the ones used in Bengio et al. 2007.
6 An autoencoder takes an input x and first maps it to a hidden representation
7 y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting
8 latent representation y is then mapped back to a "reconstructed" vector
9 z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight
10 matrix W' can optionally be constrained such that W' = W^T, in which case
11 the autoencoder is said to have tied weights. The network is trained such
12 that to minimize the reconstruction error (the error between x and z).
13
14 For the denosing autoencoder, during training, first x is corrupted into
15 \tilde{x}, where \tilde{x} is a partially destroyed version of x by means
16 of a stochastic mapping. Afterwards y is computed as before (using
17 \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction
18 error is now measured between z and the uncorrupted input x, which is
19 computed as the cross-entropy :
20 - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)]
21
22
23 References :
24 - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and
25 Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103,
26 2008
27 - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise
28 Training of Deep Networks, Advances in Neural Information Processing
29 Systems 19, 2007
30
31 """

import numpy, time, cPickle, gzip

import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

from logistic_sgd import load_data
from utils import tile_raster_images

import PIL.Image

class dA(object):
    """Denoising Auto-Encoder class (dA)

    A denoising autoencoder tries to reconstruct the input from a corrupted
    version of it, by first projecting the input into a latent space and
    then reprojecting it back into the input space. Please refer to
    Vincent et al., 2008 for more details. If x is the input, then equation
    (1) computes a partially destroyed version of x by means of a
    stochastic mapping q_D. Equation (2) computes the projection of the
    input into the latent space. Equation (3) computes the reconstruction
    of the input, while equation (4) computes the reconstruction error.

    .. math::

        \tilde{x} ~ q_D(\tilde{x}|x)                                     (1)

        y = s(W \tilde{x} + b)                                           (2)

        z = s(W' y + b')                                                 (3)

        L(x, z) = -\sum_{k=1}^d [x_k \log z_k + (1-x_k) \log(1-z_k)]     (4)

    """

    def __init__(self, numpy_rng, theano_rng=None, input=None,
                 n_visible=784, n_hidden=500,
                 W=None, bhid=None, bvis=None):
        """
        Initialize the dA class by specifying the number of visible units
        (the dimension d of the input), the number of hidden units (the
        dimension d' of the latent or hidden space) and the corruption
        level. The constructor also receives symbolic variables for the
        input, weights and biases. Such symbolic variables are useful when,
        for example, the input is the result of some computations, or when
        the weights are shared between the dA and an MLP layer. When
        dealing with SdAs this always happens: the dA on layer 2 gets as
        input the output of the dA on layer 1, and the weights of the dA
        are used in the second stage of training to construct an MLP.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to generate
                          weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given, one
                           is generated based on a seed drawn from
                           `numpy_rng`

        :type input: theano.tensor.TensorType
        :param input: a symbolic description of the input, or None for a
                      standalone dA

        :type n_visible: int
        :param n_visible: number of visible units

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type W: theano.tensor.TensorType
        :param W: Theano variable pointing to a set of weights that should
                  be shared between the dA and another architecture; if the
                  dA should be standalone, set this to None

        :type bhid: theano.tensor.TensorType
        :param bhid: Theano variable pointing to a set of bias values (for
                     hidden units) that should be shared between the dA and
                     another architecture; if the dA should be standalone,
                     set this to None

        :type bvis: theano.tensor.TensorType
        :param bvis: Theano variable pointing to a set of bias values (for
                     visible units) that should be shared between the dA
                     and another architecture; if the dA should be
                     standalone, set this to None


        """
        self.n_visible = n_visible
        self.n_hidden = n_hidden

        # create a Theano random generator that gives symbolic random values
        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        # note : W' was written as `W_prime` and b' as `b_prime`
        if not W:
            # W is initialized with `initial_W`, which is uniformly sampled
            # from -6./sqrt(n_visible+n_hidden) and 6./sqrt(n_hidden+n_visible)
            # the output of uniform is converted using asarray to dtype
            # theano.config.floatX so that the code is runnable on GPU
            initial_W = numpy.asarray(numpy_rng.uniform(
                low=-numpy.sqrt(6. / (n_hidden + n_visible)),
                high=numpy.sqrt(6. / (n_hidden + n_visible)),
                size=(n_visible, n_hidden)), dtype=theano.config.floatX)
            W = theano.shared(value=initial_W, name='W')

        if not bvis:
            bvis = theano.shared(value=numpy.zeros(n_visible,
                                                   dtype=theano.config.floatX))

        if not bhid:
            bhid = theano.shared(value=numpy.zeros(n_hidden,
                                                   dtype=theano.config.floatX))

        self.W = W
        # b corresponds to the bias of the hidden units
        self.b = bhid
        # b_prime corresponds to the bias of the visible units
        self.b_prime = bvis
        # tied weights, therefore W_prime is simply W transposed
        self.W_prime = self.W.T
        self.theano_rng = theano_rng
        # if no input is given, generate a variable representing the input
        if input is None:
            # we use a matrix because we expect a minibatch of several
            # examples, each example being a row
            self.x = T.dmatrix(name='input')
        else:
            self.x = input

        # W_prime is not listed here: with tied weights the gradient with
        # respect to W' flows through self.W
        self.params = [self.W, self.b, self.b_prime]

    def get_corrupted_input(self, input, corruption_level):
        """ This function keeps ``1 - corruption_level`` entries of the
        inputs the same and zeroes out a randomly selected subset of size
        ``corruption_level``
        Note : the first argument of theano_rng.binomial is the shape (size)
               of the random numbers that it should produce,
               the second argument is the number of trials, and
               the third argument is the probability of success of any trial

               this will produce an array of 0s and 1s, where 1 has a
               probability of 1 - ``corruption_level`` and 0 has a
               probability of ``corruption_level``
        """
        # note : newer Theano versions name the probability argument `p`
        #        (some older ones used `prob`)
        return self.theano_rng.binomial(size=input.shape, n=1,
                                        p=1 - corruption_level) * input
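    # For example, with corruption_level = 0.3 each entry of the binomial
    # mask is 1 with probability 0.7 and 0 otherwise, so on average 30% of
    # the input entries are zeroed. The mask is a symbolic random variable,
    # so it is resampled every time the compiled expression is evaluated.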

    def get_hidden_values(self, input):
        """ Computes the values of the hidden layer """
        return T.nnet.sigmoid(T.dot(input, self.W) + self.b)

    def get_reconstructed_input(self, hidden):
        """ Computes the reconstructed input given the values of the
        hidden layer """
        return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime)
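    # Because the reconstruction goes through a sigmoid, z lies in (0,1)^d,
    # which keeps the cross-entropy cost below well-defined (both log z
    # and log(1-z) are finite).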

    def get_cost_updates(self, corruption_level, learning_rate):
        """ This function computes the cost and the updates for one
        training step of the dA """

        tilde_x = self.get_corrupted_input(self.x, corruption_level)
        y = self.get_hidden_values(tilde_x)
        z = self.get_reconstructed_input(y)
        # note : we sum over the size of a datapoint; if we are using
        #        minibatches, L will be a vector, with one entry per
        #        example in the minibatch
        L = -T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1)
        # note : L is now a vector, where each element is the cross-entropy
        #        cost of the reconstruction of the corresponding example of
        #        the minibatch. We need to compute the average of all these
        #        to get the cost of the minibatch
        cost = T.mean(L)

        # compute the gradients of the cost of the `dA` with respect
        # to its parameters
        gparams = T.grad(cost, self.params)
        # generate the list of updates
        updates = {}
        for param, gparam in zip(self.params, gparams):
            updates[param] = param - learning_rate * gparam

        return (cost, updates)
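    # `updates` maps each shared parameter to its gradient-descent update
    # expression; passing this dictionary to theano.function (as done in
    # test_dA below) makes every call to the compiled function perform one
    # step of stochastic gradient descent.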


def test_dA(learning_rate=0.1, training_epochs=15, dataset='mnist.pkl.gz'):
    """
    This demo is tested on MNIST

    :type learning_rate: float
    :param learning_rate: learning rate used for training the denoising
                          autoencoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type dataset: string
    :param dataset: path to the pickled dataset

    """
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]

    batch_size = 20    # size of the minibatch

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.value.shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')      # the data is presented as rasterized images

    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x,
            n_visible=28 * 28, n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.,
                                        learning_rate=learning_rate)

    train_da = theano.function([index], cost, updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through the training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()

    training_time = (end_time - start_time)

    print ('Training took %f minutes' % (training_time / 60.))

    image = PIL.Image.fromarray(tile_raster_images(X=da.W.value.T,
                                img_shape=(28, 28), tile_shape=(10, 10),
                                tile_spacing=(1, 1)))
    image.save('filters_corruption_0.png')

    #####################################
    # BUILDING THE MODEL CORRUPTION 30% #
    #####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x,
            n_visible=28 * 28, n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.3,
                                        learning_rate=learning_rate)

    train_da = theano.function([index], cost, updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through the training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()

    training_time = (end_time - start_time)

    print ('Training took %f minutes' % (training_time / 60.))

    image = PIL.Image.fromarray(tile_raster_images(X=da.W.value.T,
                                img_shape=(28, 28), tile_shape=(10, 10),
                                tile_spacing=(1, 1)))
    image.save('filters_corruption_30.png')


if __name__ == '__main__':
    test_dA()
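
# A minimal usage sketch (assuming `mnist.pkl.gz` is reachable by
# `load_data`, e.g. in the current directory):
#
#     python dA.py
#
# This trains the two models above and writes the learned filters to
# filters_corruption_0.png and filters_corruption_30.png in the current
# directory.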