comparison code_tutoriel/logistic_cg.py @ 165:4bc5eeec6394

Updating the tutorial code to the latest revisions.
author Dumitru Erhan <dumitru.erhan@gmail.com>
date Fri, 26 Feb 2010 13:55:27 -0500
"""
This tutorial introduces logistic regression using Theano and conjugate
gradient descent.

Logistic regression is a probabilistic, linear classifier. It is parametrized
by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
done by projecting data points onto a set of hyperplanes, the distance to
which is used to determine a class membership probability.

Mathematically, this can be written as:

.. math::
  P(Y=i|x, W,b) &= softmax_i(W x + b) \\
                &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}


The model's prediction is the class whose probability is maximal, i.e. the
argmax of the vector whose i'th element is :math:`P(Y=i|x)`:

.. math::

  y_{pred} = argmax_i P(Y=i|x,W,b)


This tutorial presents a stochastic gradient descent optimization method
suitable for large datasets, and a conjugate gradient optimization method
suitable for smaller datasets.


References:

    - textbooks: "Pattern Recognition and Machine Learning" -
                 Christopher M. Bishop, section 4.3.2


"""
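# A quick numerical illustration of the prediction rule above (a sketch with
# made-up values for `W`, `b` and `x`; not part of the tutorial's pipeline):
#
#   >>> import numpy
#   >>> W = numpy.array([[0.1, -0.2], [0.3, 0.4]])   # shape (n_in=2, n_out=2)
#   >>> b = numpy.array([0.0, 0.2])
#   >>> x = numpy.array([1.0, 2.0])                  # a single data point
#   >>> scores = numpy.dot(x, W) + b                 # W x + b, shape (n_out,)
#   >>> p_y_given_x = numpy.exp(scores) / numpy.exp(scores).sum()  # softmax
#   >>> y_pred = p_y_given_x.argmax()                # predicted class index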
__docformat__ = 'restructuredtext en'


import numpy, time, cPickle, gzip

import theano
import theano.tensor as T


class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """


    def __init__(self, input, n_in, n_out):
        """ Initialize the parameters of the logistic regression

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoint lies

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the target lies

        """

        # initialize theta = (W,b) with 0s; W gets the shape (n_in, n_out),
        # while b is a vector of n_out elements, making theta a vector of
        # n_in*n_out + n_out elements
        self.theta = theano.shared(value=numpy.zeros(n_in * n_out + n_out,
                                                     dtype=theano.config.floatX))
        # W is represented by the first n_in*n_out elements of theta
        self.W = self.theta[0:n_in * n_out].reshape((n_in, n_out))
        # b is the rest (last n_out elements)
        self.b = self.theta[n_in * n_out:n_in * n_out + n_out]
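        # (Illustration only, not used below: with hypothetical sizes n_in=3
        # and n_out=2, theta holds 3*2 + 2 = 8 values; the first 6 are
        # reshaped into the 3x2 matrix W and the last 2 form b:
        #     theta = numpy.arange(8.)
        #     W = theta[0:6].reshape((3, 2))
        #     b = theta[6:8]
        # Packing every parameter into one flat vector is what lets a generic
        # optimizer such as scipy.optimize.fmin_cg update the whole model
        # through a single 1-D array.)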

        # compute vector of class-membership probabilities in symbolic form
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)

        # compute prediction as class whose probability is maximal in
        # symbolic form
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)


    def negative_log_likelihood(self, y):
        """Return the mean negative log-likelihood of the prediction of this
        model under a given target distribution, i.e. :math:`-\ell` where

        .. math::

            \ell (\theta=\{W,b\}, \mathcal{D}) = \frac{1}{|\mathcal{D}|}
                \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W, b))

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label
        """
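        # y.shape[0] is (symbolically) the number of examples n in the
        # minibatch.  T.log(self.p_y_given_x) is a matrix of log-probabilities
        # with one row per example and one column per class, so indexing it
        # with [T.arange(n), y] picks out, for each example i, the
        # log-probability of its correct label y[i]; T.mean averages these
        # and the leading minus sign turns the mean log-likelihood into a
        # loss to be minimized.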
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])


    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example
                  the correct label
        """

        # check if y has the same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                            ('y', y.type, 'y_pred', self.y_pred.type))
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()



def cg_optimization_mnist(n_epochs=50, mnist_pkl_gz='mnist.pkl.gz'):
    """Demonstrate conjugate gradient optimization of a log-linear model

    This is demonstrated on MNIST.

    :type n_epochs: int
    :param n_epochs: number of epochs to run the optimizer

    :type mnist_pkl_gz: string
    :param mnist_pkl_gz: the path of the mnist training file from
                         http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """
    #############
    # LOAD DATA #
    #############
    print '... loading data'

    # Load the dataset
    f = gzip.open(mnist_pkl_gz, 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    def shared_dataset(data_xy):
        """ Function that loads the dataset into shared variables

        The reason we store our dataset in shared variables is to allow
        Theano to copy it into the GPU memory (when code is run on GPU).
        Since copying data into the GPU is slow, copying a minibatch every
        time one is needed (the default behaviour if the data is not in a
        shared variable) would lead to a large decrease in performance.
        """
        data_x, data_y = data_xy
        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX))
        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
        # When storing data on the GPU it has to be stored as floats,
        # therefore we will store the labels as ``floatX`` as well
        # (``shared_y`` does exactly that). But during our computations
        # we need them as ints (we use the labels as indices, and if they
        # were floats that wouldn't make sense), so instead of returning
        # ``shared_y`` we cast it to int. This little hack lets us get
        # around the issue.
        return shared_x, T.cast(shared_y, 'int32')
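    # Note: ``shared_y`` above is a float shared variable; ``T.cast`` returns
    # a symbolic integer view of it, not a new shared variable.  The labels
    # therefore stay stored as floats and are only converted to ints inside
    # the computation graph.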

    test_set_x, test_set_y = shared_dataset(test_set)
    valid_set_x, valid_set_y = shared_dataset(valid_set)
    train_set_x, train_set_y = shared_dataset(train_set)

    batch_size = 600    # size of the minibatch

    n_train_batches = train_set_x.value.shape[0] / batch_size
    n_valid_batches = valid_set_x.value.shape[0] / batch_size
    n_test_batches = test_set_x.value.shape[0] / batch_size
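    # (With the standard mnist.pkl.gz split of 50000/10000/10000 examples and
    # Python 2 integer division, this gives 83, 16 and 16 whole minibatches;
    # any leftover examples at the end of each set are simply not used.)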

    ishape = (28, 28)   # this is the size of MNIST images
    n_in = 28 * 28      # number of input units
    n_out = 10          # number of output units


    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    minibatch_offset = T.lscalar()  # offset to the start of a [mini]batch
    x = T.matrix()   # the data is presented as rasterized images
    y = T.ivector()  # the labels are presented as 1D vector of
                     # [int] labels


    # construct the logistic regression class
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y).mean()
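    # (negative_log_likelihood already returns a scalar averaged over the
    # minibatch, so the extra .mean() above is harmless: the mean of a 0-d
    # tensor is the same value.)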

    # compile a theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function([minibatch_offset], classifier.errors(y),
            givens={
                x: test_set_x[minibatch_offset:minibatch_offset + batch_size],
                y: test_set_y[minibatch_offset:minibatch_offset + batch_size]})
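    # The ``givens`` dictionary substitutes, at every call, the slice of the
    # shared dataset starting at ``minibatch_offset`` for the symbolic inputs
    # x and y; e.g. test_model(0) evaluates the error rate on the first
    # batch_size test examples without copying them into the function call
    # (and, when running on a GPU, without transferring them from the device).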

    validate_model = theano.function([minibatch_offset], classifier.errors(y),
            givens={
                x: valid_set_x[minibatch_offset:minibatch_offset + batch_size],
                y: valid_set_y[minibatch_offset:minibatch_offset + batch_size]})

    # compile a theano function that returns the cost of a minibatch
    batch_cost = theano.function([minibatch_offset], cost,
            givens={
                x: train_set_x[minibatch_offset:minibatch_offset + batch_size],
                y: train_set_y[minibatch_offset:minibatch_offset + batch_size]})


    # compile a theano function that returns the gradient of the minibatch
    # cost with respect to theta
    batch_grad = theano.function([minibatch_offset], T.grad(cost, classifier.theta),
            givens={
                x: train_set_x[minibatch_offset:minibatch_offset + batch_size],
                y: train_set_y[minibatch_offset:minibatch_offset + batch_size]})

    # creates a function that computes the average cost on the training set
    def train_fn(theta_value):
        classifier.theta.value = theta_value
        train_losses = [batch_cost(i * batch_size)
                        for i in xrange(n_train_batches)]
        return numpy.mean(train_losses)

    # creates a function that computes the average gradient of the cost with
    # respect to theta
    def train_fn_grad(theta_value):
        classifier.theta.value = theta_value
        grad = batch_grad(0)
        for i in xrange(1, n_train_batches):
            grad += batch_grad(i * batch_size)
        return grad / n_train_batches
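    # Both helpers take the model parameters as a single flat numpy vector,
    # which is exactly the interface scipy's optimizers expect; e.g. (sketch,
    # with 28*28*10 + 10 = 7850 parameters for this model):
    #     train_fn(numpy.zeros(7850, dtype=theano.config.floatX))
    #     train_fn_grad(numpy.zeros(7850, dtype=theano.config.floatX))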

    validation_scores = [float('inf'), 0]

    # creates the validation function
    def callback(theta_value):
        classifier.theta.value = theta_value
        # compute the validation loss
        validation_losses = [validate_model(i * batch_size)
                             for i in xrange(n_valid_batches)]
        this_validation_loss = numpy.mean(validation_losses)
        print('validation error %f %%' % (this_validation_loss * 100.,))

        # check if it is better than the best validation score obtained so far
        if this_validation_loss < validation_scores[0]:
            # if so, replace the old one, and compute the score on the
            # testing dataset
            validation_scores[0] = this_validation_loss
            test_losses = [test_model(i * batch_size)
                           for i in xrange(n_test_batches)]
            validation_scores[1] = numpy.mean(test_losses)

    ###############
    # TRAIN MODEL #
    ###############

    # using scipy conjugate gradient optimizer
    import scipy.optimize
    print ("Optimizing using scipy.optimize.fmin_cg...")
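    # fmin_cg minimizes ``f`` (here the average training cost) starting from
    # the flat parameter vector ``x0``, using ``fprime`` for gradients; the
    # ``callback`` is invoked once per iteration with the current parameter
    # vector (we use it to track validation/test error), ``disp=0`` silences
    # scipy's own convergence messages, and ``maxiter`` caps the number of
    # conjugate gradient iterations, each of which sweeps the full training
    # set at least once.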
    start_time = time.clock()
    best_w_b = scipy.optimize.fmin_cg(
            f=train_fn,
            x0=numpy.zeros((n_in + 1) * n_out, dtype=x.dtype),
            fprime=train_fn_grad,
            callback=callback,
            disp=0,
            maxiter=n_epochs)
    end_time = time.clock()
    print(('Optimization complete with best validation score of %f %%, with '
           'test performance %f %%') %
          (validation_scores[0] * 100., validation_scores[1] * 100.))

    print ('The code ran for %f minutes' % ((end_time - start_time) / 60.))


if __name__ == '__main__':
    cg_optimization_mnist()