comparison scripts/deepmlp.py @ 21:afdd41db8152

Initial commit of the multiple hidden layer perceptron
author Owner <salahmeister@gmail.com>
date Thu, 28 Jan 2010 23:03:44 -0600
parents
children cb47cbc95a21
#

import numpy, cPickle, gzip


import theano
import theano.tensor as T

import time

import theano.tensor.nnet
class MLP(object):
    """Multi-Layer Perceptron Class

    A multilayer perceptron is a feedforward artificial neural network model
    that has one or more layers of hidden units and nonlinear activations.
    Intermediate layers usually use tanh or the sigmoid function as their
    activation, while the top layer is a softmax layer.
    """
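    # Schematically, with L = len(n_hidden) hidden layers, the class builds
    # the symbolic graph
    #     h_1    = tanh(x W[0] + b[0])
    #     h_i    = tanh(h_{i-1} W[i-1] + b[i-1])    for i = 2 .. L
    #     p(y|x) = softmax(h_L W[L] + b[L])
    # which is just a compact restatement of the expressions constructed in
    # __init__ below.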


    def __init__(self, input, n_in, n_hidden, n_out):
        """Initialize the parameters of the multilayer perceptron

        :param input: symbolic variable that describes the input of the
        architecture (one minibatch)

        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :param n_hidden: list giving the number of units of each hidden
        layer; its length determines the number of hidden layers

        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        """

        # initialize the parameters theta = (W, b); here W and b are lists
        # where W[i] and b[i] are the weight matrix and the bias vector
        # of the i-th layer
        n_layer = len(n_hidden)
        W_values = []
        b_values = []
        self.W = []
        self.b = []

        # We first initialize the matrix W[0] and the vector b[0] that hold
        # the parameters from the input to the first hidden layer
        W_values.append(numpy.asarray( numpy.random.uniform( \
              low  = -numpy.sqrt(6./(n_in+n_hidden[0])), \
              high =  numpy.sqrt(6./(n_in+n_hidden[0])), \
              size = (n_in, n_hidden[0])), dtype = theano.config.floatX))
        self.W.append(theano.shared( value = W_values[0] ))
        self.b.append(theano.shared( value = numpy.zeros((n_hidden[0],),
                                             dtype = theano.config.floatX)))
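        # The +/- sqrt(6./(fan_in+fan_out)) bound above (reused for every
        # layer below) is the usual heuristic for tanh units: it keeps the
        # initial pre-activations small, so the units start out in the
        # roughly linear part of tanh rather than saturated.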

        # We then initialize the parameters between all consecutive hidden layers
        for i in range(1, n_layer):
            # Each `W[i]` is initialized with `W_values[i]`, sampled uniformly
            # between -sqrt(6./(n_hidden[i-1]+n_hidden[i])) and
            # sqrt(6./(n_hidden[i-1]+n_hidden[i])); the output of uniform is
            # converted using asarray to dtype theano.config.floatX so that
            # the code is runnable on GPU
            W_values.append(numpy.asarray( numpy.random.uniform( \
                  low  = -numpy.sqrt(6./(n_hidden[i-1]+n_hidden[i])), \
                  high =  numpy.sqrt(6./(n_hidden[i-1]+n_hidden[i])), \
                  size = (n_hidden[i-1], n_hidden[i])), dtype = theano.config.floatX))
            self.W.append(theano.shared( value = W_values[i] ))
            self.b.append(theano.shared( value = numpy.zeros((n_hidden[i],),
                                                 dtype = theano.config.floatX)))

        # Finally we initialize the matrix W[n_layer] and the vector b[n_layer]
        # that hold the parameters from the last hidden layer to the output
        # layer, using the same uniform sampling.
        W_values.append(numpy.asarray( numpy.random.uniform(
              low  = -numpy.sqrt(6./(n_hidden[n_layer-1]+n_out)), \
              high =  numpy.sqrt(6./(n_hidden[n_layer-1]+n_out)), \
              size = (n_hidden[n_layer-1], n_out)), dtype = theano.config.floatX))
        self.W.append(theano.shared( value = W_values[n_layer] ))
        self.b.append(theano.shared( value = numpy.zeros((n_out,),
                                             dtype = theano.config.floatX)))

        # List of the symbolic expressions computing the values of each hidden layer
        self.hidden = []

        # Symbolic expression of the first hidden layer
        self.hidden.append(T.tanh(T.dot(input, self.W[0]) + self.b[0]))
        for i in range(1, n_layer):
            # Symbolic expression of the i-th hidden layer
            self.hidden.append(T.tanh(T.dot(self.hidden[i-1], self.W[i]) + self.b[i]))

        # symbolic expression computing the values of the top (softmax) layer
        self.p_y_given_x = T.nnet.softmax(T.dot(self.hidden[n_layer-1], self.W[n_layer]) + self.b[n_layer])

        # compute the prediction as the class whose probability is maximal,
        # in symbolic form
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

        # L1 norm ; one regularization option is to enforce the L1 norm to
        # be small
        self.L1 = abs(self.W[0]).sum()
        # square of L2 norm ; one regularization option is to enforce the
        # square of the L2 norm to be small
        self.L2_sqr = (self.W[0]**2).sum()
        for i in range(1, n_layer+1):
            self.L1     += abs(self.W[i]).sum()
            self.L2_sqr += (self.W[i]**2).sum()

    def negative_log_likelihood(self, y):
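        # Mean negative log-likelihood of the targets y under the model.
        # T.arange(y.shape[0]) enumerates the rows of the minibatch, so the
        # indexing below picks out, for each example, the log-probability
        # assigned to its correct label.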
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch
        """

        # check if y has the same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type))
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()
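

# A minimal usage sketch of the class above (illustrative only; the variable
# names and layer sizes here are arbitrary, not part of this script):
#
#     x = T.fmatrix()
#     y = T.lvector()
#     clf  = MLP(input=x, n_in=28*28, n_hidden=[500, 300], n_out=10)
#     loss = theano.function([x, y], clf.negative_log_likelihood(y))
#     err  = theano.function([x, y], clf.errors(y))
#
# sgd_optimization_mnist below builds this kind of graph, adds the L1/L2
# regularization terms and compiles the SGD updates.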

def sgd_optimization_mnist( learning_rate=0.01, L1_reg = 0.00, \
                            L2_reg = 0.0001, n_iter=100, n_hidden=[200,100,90,80,70]):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    This is demonstrated on MNIST.

    :param learning_rate: learning rate used (factor for the stochastic
    gradient)

    :param L1_reg: L1-norm's weight when added to the cost (see
    regularization)

    :param L2_reg: L2-norm's weight when added to the cost (see
    regularization)

    :param n_iter: maximal number of iterations to run the optimizer

    :param n_hidden: list giving the number of units of each hidden layer

    """

    # Load the dataset
    f = gzip.open('mnist.pkl.gz', 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    # make minibatches of size 20
    batch_size = 20    # size of the minibatch

    # Dealing with the training set
    # get the list of training images (x) and their labels (y)
    (train_set_x, train_set_y) = train_set

    # initialize the list of training minibatches with the empty list
    train_batches = []
    for i in xrange(0, len(train_set_x), batch_size):
        # add to the list of minibatches the minibatch starting at
        # position i and ending at position i+batch_size;
        # a minibatch is a pair: the first element of the pair is a list
        # of datapoints, the second element is the list of corresponding
        # labels
        train_batches = train_batches + \
               [(train_set_x[i:i+batch_size], train_set_y[i:i+batch_size])]
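    # With the standard mnist.pkl.gz of the Deep Learning Tutorials (50000
    # training images of 28*28 pixels) each entry should then look like
    #     train_batches[0][0].shape == (20, 784)   # pixel values
    #     train_batches[0][1].shape == (20,)       # integer labels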

    # Dealing with the validation set
    (valid_set_x, valid_set_y) = valid_set
    # initialize the list of validation minibatches
    valid_batches = []
    for i in xrange(0, len(valid_set_x), batch_size):
        valid_batches = valid_batches + \
               [(valid_set_x[i:i+batch_size], valid_set_y[i:i+batch_size])]

    # Dealing with the testing set
    (test_set_x, test_set_y) = test_set
    # initialize the list of testing minibatches
    test_batches = []
    for i in xrange(0, len(test_set_x), batch_size):
        test_batches = test_batches + \
               [(test_set_x[i:i+batch_size], test_set_y[i:i+batch_size])]


    ishape = (28,28)     # this is the size of MNIST images

    # allocate symbolic variables for the data
    x = T.fmatrix()  # the data is presented as rasterized images
    y = T.lvector()  # the labels are presented as a 1D vector of
                     # [long int] labels

    # construct the MLP classifier
    classifier = MLP( input=x.reshape((batch_size, 28*28)), \
                      n_in=28*28, n_hidden=n_hidden, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); the cost is
    # expressed here symbolically
    cost = classifier.negative_log_likelihood(y) \
         + L1_reg * classifier.L1 \
         + L2_reg * classifier.L2_sqr
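    # i.e. for parameters theta = (W, b) the objective being minimized is
    #     cost(theta) = NLL(theta) + L1_reg * sum_i ||W[i]||_1
    #                              + L2_reg * sum_i ||W[i]||_2^2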

    # compiling a theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function([x, y], classifier.errors(y))

    g_W = []
    g_b = []
    # compute the gradients of the cost with respect to
    # theta = (W[0], b[0], ..., W[n_layer], b[n_layer])
    for i in range(len(n_hidden)+1):
        g_W.append(T.grad(cost, classifier.W[i]))
        g_b.append(T.grad(cost, classifier.b[i]))


    # specify how to update the parameters of the model as a dictionary
    updates = {}
    for i in range(len(n_hidden)+1):
        updates[classifier.W[i]] = classifier.W[i] - learning_rate*g_W[i]
        updates[classifier.b[i]] = classifier.b[i] - learning_rate*g_b[i]
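    # i.e. each call to `train_model` below performs one plain gradient
    # descent step on every layer:
    #     W[i] <- W[i] - learning_rate * d cost / d W[i]
    #     b[i] <- b[i] - learning_rate * d cost / d b[i]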

    # compiling a theano function `train_model` that returns the cost and at
    # the same time updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function([x, y], cost, updates = updates)
    n_minibatches = len(train_batches)

    # early-stopping parameters
    patience = 10000              # look at this many examples regardless
    patience_increase = 2         # wait this much longer when a new best is
                                  # found
    improvement_threshold = 0.995 # a relative improvement of this much is
                                  # considered significant
    validation_frequency = n_minibatches # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch
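    # With these settings training always looks at a minimum of `patience`
    # minibatches, and whenever the validation error drops below
    # improvement_threshold times the previous best, the patience horizon is
    # pushed out to iter * patience_increase (see the loop below).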


    best_params = None
    best_validation_loss = float('inf')
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    # have a maximum of `n_iter` iterations through the entire dataset
    for iter in xrange(n_iter * n_minibatches):

        # get epoch and minibatch index
        epoch = iter / n_minibatches
        minibatch_index = iter % n_minibatches

        # get the minibatch corresponding to `iter` modulo
        # `len(train_batches)`
        x, y = train_batches[minibatch_index]
        cost_ij = train_model(x, y)

        if (iter+1) % validation_frequency == 0:
            # compute zero-one loss on the validation set
            this_validation_loss = 0.
            for x, y in valid_batches:
                # sum up the errors for each minibatch
                this_validation_loss += test_model(x, y)
            # get the average by dividing by the number of minibatches
            this_validation_loss /= len(valid_batches)

            print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                (epoch, minibatch_index+1, n_minibatches, \
                 this_validation_loss*100.))


            # if we got the best validation score until now
            if this_validation_loss < best_validation_loss:

                # improve patience if loss improvement is good enough
                if this_validation_loss < best_validation_loss * \
                       improvement_threshold :
                    patience = max(patience, iter * patience_increase)

                # save best validation score and iteration number
                best_validation_loss = this_validation_loss
                best_iter = iter

                # test it on the test set
                test_score = 0.
                for x, y in test_batches:
                    test_score += test_model(x, y)
                test_score /= len(test_batches)
                print(('     epoch %i, minibatch %i/%i, test error of best '
                       'model %f %%') %
                          (epoch, minibatch_index+1, n_minibatches,
                           test_score*100.))

        if patience <= iter :
            break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
                 (best_validation_loss * 100., best_iter, test_score*100.))
    print('The code ran for %f minutes' % ((end_time-start_time)/60.))


# test on NIST (you need pylearn and access to NIST to do that)
if __name__ == '__main__':
    sgd_optimization_mnist()