comparison code_tutoriel/logistic_sgd.py @ 0:fda5f787baa6

commit initial
author Dumitru Erhan <dumitru.erhan@gmail.com>
date Thu, 21 Jan 2010 11:26:43 -0500
parents
children bcc87d3e33a3
1 """
2 This tutorial introduces logistic regression using Theano and stochastic
3 gradient descent.
4
5 Logistic regression is a probabilistic, linear classifier. It is parametrized
6 by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
7 done by projecting data points onto a set of hyperplanes, the distance to
8 which is used to determine a class membership probability.
9
10 Mathematically, this can be written as:
11
12 .. math::
13 P(Y=i|x, W,b) &= softmax_i(W x + b) \\
14 &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}
15
16
17 The output of the model or prediction is then done by taking the argmax of
18 the vector whose i'th element is P(Y=i|x).
19
20 .. math::
21
22 y_{pred} = argmax_i P(Y=i|x,W,b)
23
24
25 This tutorial presents a stochastic gradient descent optimization method
26 suitable for large datasets, and a conjugate gradient optimization method
27 that is suitable for smaller datasets.
28
29
30 References:
31
32 - textbooks: "Pattern Recognition and Machine Learning" -
33 Christopher M. Bishop, section 4.3.2
34
35
36 """
__docformat__ = 'restructuredtext en'


import numpy, cPickle, gzip

import time

import theano
import theano.tensor as T

import theano.tensor.nnet

class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
        """ Initialize the parameters of the logistic regression

        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # initialize the weights W as a matrix of zeros of shape (n_in, n_out)
        self.W = theano.shared(value=numpy.zeros((n_in, n_out),
                                                 dtype=theano.config.floatX))
        # initialize the biases b as a vector of n_out zeros
        self.b = theano.shared(value=numpy.zeros((n_out,),
                                                 dtype=theano.config.floatX))

        # compute the vector of class-membership probabilities in symbolic
        # form
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)

        # compute the prediction as the class whose probability is maximal,
        # in symbolic form
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

    def negative_log_likelihood(self, y):
        """Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        .. math::

            \ell (\theta=\{W,b\}, \mathcal{D}) = -\frac{1}{|\mathcal{D}|}
            \sum_{i=0}^{|\mathcal{D}|-1} \log P(Y=y^{(i)}|x^{(i)}, W, b)

        :param y: a vector that gives, for each example, the correct label

        Note: we use the mean instead of the sum so that
              the learning rate is less dependent on the batch size
        """
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

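    # A tiny NumPy illustration (hypothetical values) of the indexing trick
    # used above: T.log(p)[T.arange(y.shape[0]), y] selects, for each row i,
    # the log-probability assigned to the correct class y[i]:
    #
    #   log_p = numpy.log([[0.7, 0.2, 0.1],
    #                      [0.1, 0.8, 0.1]])
    #   y = numpy.array([0, 1])
    #   nll = -numpy.mean(log_p[numpy.arange(2), y])  # -(log 0.7 + log 0.8)/2
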
    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        divided by the total number of examples in the minibatch, i.e. the
        zero-one loss over the size of the minibatch.
        """

        # check if y has the same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                            ('y', y.type, 'y_pred', self.y_pred.type))
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()
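
    # Worked example (hypothetical values): with y_pred = [3, 1, 4] and
    # y = [3, 2, 4], T.neq(y_pred, y) gives [0, 1, 0], so the mean zero-one
    # loss returned above is 1/3.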


def sgd_optimization_mnist(learning_rate=0.01, n_iter=100):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model.

    This is demonstrated on MNIST.

    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :param n_iter: maximal number of iterations (epochs) to run the optimizer

    """

    # Load the dataset
    f = gzip.open('mnist.pkl.gz', 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()
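    # (this assumes the pickled MNIST dataset `mnist.pkl.gz` -- a gzipped
    # pickle holding three (x, y) pairs for train/valid/test -- is present
    # in the working directory)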

    # make minibatches of size 20
    batch_size = 20    # size of the minibatch

    # Dealing with the training set
    # get the list of training images (x) and their labels (y)
    (train_set_x, train_set_y) = train_set
    # initialize the list of training minibatches with an empty list
    train_batches = []
    for i in xrange(0, len(train_set_x), batch_size):
        # add to the list of minibatches the minibatch starting at
        # position i and ending at position i + batch_size; a minibatch is
        # a pair whose first element is a list of datapoints and whose
        # second element is the list of corresponding labels
        train_batches = train_batches + \
            [(train_set_x[i:i + batch_size], train_set_y[i:i + batch_size])]

    # Dealing with the validation set
    (valid_set_x, valid_set_y) = valid_set
    # initialize the list of validation minibatches
    valid_batches = []
    for i in xrange(0, len(valid_set_x), batch_size):
        valid_batches = valid_batches + \
            [(valid_set_x[i:i + batch_size], valid_set_y[i:i + batch_size])]

    # Dealing with the testing set
    (test_set_x, test_set_y) = test_set
    # initialize the list of testing minibatches
    test_batches = []
    for i in xrange(0, len(test_set_x), batch_size):
        test_batches = test_batches + \
            [(test_set_x[i:i + batch_size], test_set_y[i:i + batch_size])]
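
    # The three loops above could equivalently be written as list
    # comprehensions, e.g. (a sketch with identical behaviour):
    #
    #   train_batches = [(train_set_x[i:i + batch_size],
    #                     train_set_y[i:i + batch_size])
    #                    for i in xrange(0, len(train_set_x), batch_size)]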

    ishape = (28, 28)  # this is the size of MNIST images

    # allocate symbolic variables for the data
    x = T.fmatrix()  # the data is presented as rasterized images
    y = T.lvector()  # the labels are presented as a 1D vector of
                     # [long int] labels

    # construct the logistic regression class
    classifier = LogisticRegression(
        input=x.reshape((batch_size, 28 * 28)), n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log-likelihood of
    # the model, in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compile a Theano function that computes the mistakes made by the model
    # on a minibatch
    test_model = theano.function([x, y], classifier.errors(y))

    # compute the gradient of the cost with respect to theta = (W, b)
    g_W = T.grad(cost, classifier.W)
    g_b = T.grad(cost, classifier.b)

    # specify how to update the parameters of the model as a dictionary
    updates = {classifier.W: classifier.W - learning_rate * g_W,
               classifier.b: classifier.b - learning_rate * g_b}
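
    # i.e. the usual gradient-descent step, for learning rate eta:
    #
    #   W <- W - eta * d cost / d W
    #   b <- b - eta * d cost / d b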

    # compile a Theano function `train_model` that returns the cost and at
    # the same time updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function([x, y], cost, updates=updates)

    n_minibatches = len(train_batches)  # number of minibatches

    # early-stopping parameters
    patience = 5000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = n_minibatches  # go through this many minibatches
                                          # before checking the network on
                                          # the validation set; in this case
                                          # we check every epoch
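
    # Example of the patience mechanism (hypothetical numbers): with
    # patience = 5000 and patience_increase = 2, a sufficiently large
    # improvement found at iteration 4000 raises patience to
    # max(5000, 4000 * 2) = 8000, so training runs for at least 4000 more
    # minibatch updates before early stopping can trigger.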

    best_params = None
    best_validation_loss = float('inf')
    test_score = 0.
    start_time = time.clock()
    # run at most `n_iter` passes through the entire training set
    for iter in xrange(n_iter * n_minibatches):

        # get the epoch and minibatch index
        epoch = iter / n_minibatches
        minibatch_index = iter % n_minibatches

        # get the minibatch corresponding to `iter` modulo
        # `len(train_batches)`
        x, y = train_batches[minibatch_index]
        cost_ij = train_model(x, y)

        if (iter + 1) % validation_frequency == 0:
            # compute the zero-one loss on the validation set
            this_validation_loss = 0.
            for x, y in valid_batches:
                # sum up the errors for each minibatch
                this_validation_loss += test_model(x, y)
            # get the average by dividing by the number of minibatches
            this_validation_loss /= len(valid_batches)

            print('epoch %i, minibatch %i/%i, validation error %f %%' %
                  (epoch, minibatch_index + 1, n_minibatches,
                   this_validation_loss * 100.))

            # if we got the best validation score so far
            if this_validation_loss < best_validation_loss:
                # improve patience if the loss improvement is good enough
                if this_validation_loss < best_validation_loss * \
                        improvement_threshold:
                    patience = max(patience, iter * patience_increase)

                best_validation_loss = this_validation_loss
                # test it on the test set

                test_score = 0.
                for x, y in test_batches:
                    test_score += test_model(x, y)
                test_score /= len(test_batches)
                print((' epoch %i, minibatch %i/%i, test error of best '
                       'model %f %%') %
                      (epoch, minibatch_index + 1, n_minibatches,
                       test_score * 100.))

        if patience <= iter:
            break

    end_time = time.clock()
    print(('Optimization complete with best validation score of %f %%, '
           'with test performance %f %%') %
          (best_validation_loss * 100., test_score * 100.))
    print('The code ran for %f minutes' % ((end_time - start_time) / 60.))


if __name__ == '__main__':
    sgd_optimization_mnist()
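# To run this script (assuming Python 2 with Theano installed and
# mnist.pkl.gz in the working directory):
#
#   python logistic_sgd.py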