Mercurial > ift6266
comparison scripts/deepmlp.py @ 21:afdd41db8152
Initial commit of the multiple hidden layer perceptron
author      Owner <salahmeister@gmail.com>
date        Thu, 28 Jan 2010 23:03:44 -0600
parents
children    cb47cbc95a21
comparison of revisions 20:1e9525aba832 and 21:afdd41db8152

import numpy, cPickle, gzip

import theano
import theano.tensor as T

import time

import theano.tensor.nnet

class MLP(object):
    """Multi-Layer Perceptron class

    A multilayer perceptron is a feedforward artificial neural network model
    with one or more layers of hidden units and nonlinear activations.
    Intermediate layers usually use tanh or the sigmoid function as their
    activation, while the top layer is a softmax layer. (A commented
    instantiation sketch follows the class definition.)
    """

    def __init__(self, input, n_in, n_hidden, n_out):
        """Initialize the parameters of the multilayer perceptron.

        :param input: symbolic variable that describes the input of the
        architecture (one minibatch)

        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :param n_hidden: list giving the number of units in each hidden
        layer; its length determines the number of hidden layers

        :param n_out: number of output units, the dimension of the space in
        which the labels lie
        """

        # initialize the parameters theta = (W, b); here W and b are lists
        # where W[i] and b[i] are the weight matrix and the bias vector
        # of the i-th layer
        n_layer = len(n_hidden)
        W_values = []
        b_values = []
        self.W = []
        self.b = []

        # first initialize the matrix W[0] and the vector b[0], the
        # parameters from the input to the first hidden layer
        W_values.append(numpy.asarray(numpy.random.uniform(
            low=-numpy.sqrt(6. / (n_in + n_hidden[0])),
            high=numpy.sqrt(6. / (n_in + n_hidden[0])),
            size=(n_in, n_hidden[0])), dtype=theano.config.floatX))
        self.W.append(theano.shared(value=W_values[0]))
        self.b.append(theano.shared(value=numpy.zeros((n_hidden[0],),
            dtype=theano.config.floatX)))

        # initialize the parameters between all consecutive hidden layers
        for i in range(1, n_layer):
            # Each W[i] is initialized with W_values[i], sampled uniformly
            # from -sqrt(6./(n_hidden[i-1]+n_hidden[i])) to
            # sqrt(6./(n_hidden[i-1]+n_hidden[i])); the output of uniform is
            # converted with asarray to dtype theano.config.floatX so that
            # the code is runnable on GPU
            W_values.append(numpy.asarray(numpy.random.uniform(
                low=-numpy.sqrt(6. / (n_hidden[i - 1] + n_hidden[i])),
                high=numpy.sqrt(6. / (n_hidden[i - 1] + n_hidden[i])),
                size=(n_hidden[i - 1], n_hidden[i])), dtype=theano.config.floatX))
            self.W.append(theano.shared(value=W_values[i]))
            self.b.append(theano.shared(value=numpy.zeros((n_hidden[i],),
                dtype=theano.config.floatX)))

        # initialize the matrix W[n_layer] and the vector b[n_layer], the
        # parameters from the last hidden layer to the output layer, using
        # the same uniform sampling
        W_values.append(numpy.asarray(numpy.random.uniform(
            low=-numpy.sqrt(6. / (n_hidden[n_layer - 1] + n_out)),
            high=numpy.sqrt(6. / (n_hidden[n_layer - 1] + n_out)),
            size=(n_hidden[n_layer - 1], n_out)), dtype=theano.config.floatX))
        self.W.append(theano.shared(value=W_values[n_layer]))
        self.b.append(theano.shared(value=numpy.zeros((n_out,),
            dtype=theano.config.floatX)))

        # list of the symbolic expressions computing the values of each
        # hidden layer
        self.hidden = []

        # symbolic expression of the first hidden layer
        self.hidden.append(T.tanh(T.dot(input, self.W[0]) + self.b[0]))
        for i in range(1, n_layer):
            # symbolic expression of the i-th hidden layer
            self.hidden.append(T.tanh(T.dot(self.hidden[i - 1], self.W[i]) + self.b[i]))

        # symbolic expression computing the values of the top layer
        self.p_y_given_x = T.nnet.softmax(T.dot(self.hidden[n_layer - 1], self.W[n_layer]) + self.b[n_layer])

        # compute the prediction as the class whose probability is maximal,
        # in symbolic form
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

        # L1 norm ; one regularization option is to enforce the L1 norm to
        # be small
        self.L1 = abs(self.W[0]).sum()
        for i in range(1, n_layer + 1):
            self.L1 += abs(self.W[i]).sum()

        # square of L2 norm ; one regularization option is to enforce the
        # square of the L2 norm to be small
        self.L2_sqr = (self.W[0] ** 2).sum()
        for i in range(1, n_layer + 1):
            self.L2_sqr += (self.W[i] ** 2).sum()

    def negative_log_likelihood(self, y):
        """Return the mean negative log-likelihood of the prediction of this
        model given the true labels `y`.
        """
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch.
        """

        # check if y has the same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type))
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()
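

# The commented sketch below is illustrative only and is not part of the
# training script: it shows how the MLP class above might be instantiated on
# its own. The names `toy_x`, `toy_y` and `toy_net`, and the 784-500-300-10
# architecture, are hypothetical examples. It is kept commented out so that
# importing this module does not build the extra graph.
#
#   toy_x = T.fmatrix()        # symbolic minibatch of flattened 28*28 images
#   toy_y = T.lvector()        # symbolic vector of integer class labels
#   toy_net = MLP(input=toy_x, n_in=28*28, n_hidden=[500, 300], n_out=10)
#   toy_cost = toy_net.negative_log_likelihood(toy_y) + 0.0001 * toy_net.L2_sqr
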
def sgd_optimization_mnist(learning_rate=0.01, L1_reg=0.00,
        L2_reg=0.0001, n_iter=100, n_hidden=[200, 100, 90, 80, 70]):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron.

    This is demonstrated on MNIST.

    :param learning_rate: learning rate used (factor for the stochastic
    gradient)

    :param L1_reg: L1-norm's weight when added to the cost (see
    regularization)

    :param L2_reg: L2-norm's weight when added to the cost (see
    regularization)

    :param n_iter: maximal number of iterations to run the optimizer

    :param n_hidden: list giving the number of units in each hidden layer

    A commented alternative invocation is given at the end of the file.
    """

    # Load the dataset
    f = gzip.open('mnist.pkl.gz', 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    # make minibatches of size 20
    batch_size = 20    # size of the minibatch

    # Dealing with the training set
    # get the list of training images (x) and their labels (y)
    (train_set_x, train_set_y) = train_set

    # initialize the list of training minibatches with empty list
    train_batches = []
    for i in xrange(0, len(train_set_x), batch_size):
        # add to the list of minibatches the minibatch starting at
        # position i, ending at position i+batch_size
        # a minibatch is a pair ; the first element of the pair is a list
        # of datapoints, the second element is the list of corresponding
        # labels
        train_batches = train_batches + \
            [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]

    # Dealing with the validation set
    (valid_set_x, valid_set_y) = valid_set
    # initialize the list of validation minibatches
    valid_batches = []
    for i in xrange(0, len(valid_set_x), batch_size):
        valid_batches = valid_batches + \
            [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]

    # Dealing with the testing set
    (test_set_x, test_set_y) = test_set
    # initialize the list of testing minibatches
    test_batches = []
    for i in xrange(0, len(test_set_x), batch_size):
        test_batches = test_batches + \
            [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]

    ishape = (28, 28)    # this is the size of MNIST images

    # allocate symbolic variables for the data
    x = T.fmatrix()    # the data is presented as rasterized images
    y = T.lvector()    # the labels are presented as a 1D vector of
                       # [long int] labels

    # construct the MLP class
    classifier = MLP(input=x.reshape((batch_size, 28*28)),
        n_in=28*28, n_hidden=n_hidden, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); the cost is
    # expressed here symbolically
    cost = classifier.negative_log_likelihood(y) \
        + L1_reg * classifier.L1 \
        + L2_reg * classifier.L2_sqr

    # compiling a theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function([x, y], classifier.errors(y))

    g_W = []
    g_b = []
    # compute the gradients of the cost with respect to theta = (W, b), i.e.
    # with respect to every weight matrix W[i] and bias vector b[i]
    for i in range(len(n_hidden) + 1):
        g_W.append(T.grad(cost, classifier.W[i]))
        g_b.append(T.grad(cost, classifier.b[i]))

    # specify how to update the parameters of the model as a dictionary
    updates = {}
    for i in range(len(n_hidden) + 1):
        updates[classifier.W[i]] = classifier.W[i] - learning_rate * g_W[i]
        updates[classifier.b[i]] = classifier.b[i] - learning_rate * g_b[i]

    # compiling a theano function `train_model` that returns the cost and at
    # the same time updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function([x, y], cost, updates=updates)
    n_minibatches = len(train_batches)

    # early-stopping parameters
    patience = 10000    # look at this many examples regardless
    patience_increase = 2    # wait this much longer when a new best is
                             # found
    improvement_threshold = 0.995    # a relative improvement of this much is
                                     # considered significant
    validation_frequency = n_minibatches    # go through this many
                                            # minibatches before checking the
                                            # network on the validation set;
                                            # in this case we check every epoch

    best_params = None
    best_validation_loss = float('inf')
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    # have a maximum of `n_iter` iterations through the entire dataset
    for iter in xrange(n_iter * n_minibatches):

        # get epoch and minibatch index
        epoch = iter / n_minibatches
        minibatch_index = iter % n_minibatches

        # get the minibatch corresponding to `iter` modulo
        # `len(train_batches)`
        x, y = train_batches[minibatch_index]
        cost_ij = train_model(x, y)

        if (iter + 1) % validation_frequency == 0:
            # compute zero-one loss on the validation set
            this_validation_loss = 0.
            for x, y in valid_batches:
                # sum up the errors for each minibatch
                this_validation_loss += test_model(x, y)
            # get the average by dividing by the number of minibatches
            this_validation_loss /= len(valid_batches)

            print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                (epoch, minibatch_index + 1, n_minibatches, \
                this_validation_loss * 100.))

            # if we got the best validation score until now
            if this_validation_loss < best_validation_loss:

                # improve patience if loss improvement is good enough
                if this_validation_loss < best_validation_loss * \
                        improvement_threshold:
                    patience = max(patience, iter * patience_increase)

                # save best validation score and iteration number
                best_validation_loss = this_validation_loss
                best_iter = iter

                # test it on the test set
                test_score = 0.
                for x, y in test_batches:
                    test_score += test_model(x, y)
                test_score /= len(test_batches)
                print(('     epoch %i, minibatch %i/%i, test error of best '
                       'model %f %%') %
                      (epoch, minibatch_index + 1, n_minibatches,
                       test_score * 100.))

        if patience <= iter:
            break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter, test_score * 100.))
    print('The code ran for %f minutes' % ((end_time - start_time) / 60.))


# test on NIST (you need pylearn and access to NIST to do that)
if __name__ == '__main__':
    sgd_optimization_mnist()
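
# Illustrative alternative invocation, not part of the original commit: a
# shallower two-hidden-layer network with a stronger L2 penalty. The values
# below are arbitrary examples and assume the same `mnist.pkl.gz` file in the
# working directory; uncomment and use in place of the default call above.
#
#   sgd_optimization_mnist(learning_rate=0.01, L1_reg=0.00, L2_reg=0.001,
#                          n_iter=50, n_hidden=[500, 500])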