comparison code_tutoriel/logistic_cg.py @ 0:fda5f787baa6

commit initial
author Dumitru Erhan <dumitru.erhan@gmail.com>
date Thu, 21 Jan 2010 11:26:43 -0500
1 """
2 This tutorial introduces logistic regression using Theano and conjugate
3 gradient descent.
4
5 Logistic regression is a probabilistic, linear classifier. It is parametrized
6 by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
7 done by projecting data points onto a set of hyperplanes, the distance to
8 which is used to determine a class membership probability.
9
10 Mathematically, this can be written as:
11
12 .. math::
13 P(Y=i|x, W,b) &= softmax_i(W x + b) \\
14 &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}
15
16
17 The output of the model or prediction is then done by taking the argmax of
18 the vector whose i'th element is P(Y=i|x).
19
20 .. math::
21
22 y_{pred} = argmax_i P(Y=i|x,W,b)
23
24
25 This tutorial presents a stochastic gradient descent optimization method
26 suitable for large datasets, and a conjugate gradient optimization method
27 that is suitable for smaller datasets.
28
29
30 References:
31
32 - textbooks: "Pattern Recognition and Machine Learning" -
33 Christopher M. Bishop, section 4.3.2
34
35
36 """
__docformat__ = 'restructuredtext en'


import numpy, cPickle, gzip

import time

import theano
import theano.tensor as T
import theano.tensor.nnet


class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
        """ Initialize the parameters of the logistic regression

        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :param n_in: number of input units, the dimension of the space in
                     which the datapoint lies

        :param n_out: number of output units, the dimension of the space in
                      which the target lies

        """

        # initialize theta = (W,b) with 0s; W gets the shape (n_in, n_out),
        # while b is a vector of n_out elements, making theta a vector of
        # n_in*n_out + n_out elements
        self.theta = theano.shared(value=numpy.zeros(n_in * n_out + n_out))
        # W is represented by the first n_in*n_out elements of theta
        self.W = self.theta[0:n_in * n_out].reshape((n_in, n_out))
        # b is the rest (last n_out elements)
        self.b = self.theta[n_in * n_out:n_in * n_out + n_out]

        # compute vector of class-membership probabilities in symbolic form
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)

        # compute prediction as class whose probability is maximal in
        # symbolic form
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
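        # Keeping W and b as slices of one flat vector matters later on:
        # scipy.optimize.fmin_cg optimizes a single 1-D numpy array, so all
        # parameters live in self.theta and W/b are just symbolic views of
        # it.  For MNIST (n_in=784, n_out=10) the layout is
        #     theta[0:7840]    -> W, reshaped to (784, 10)
        #     theta[7840:7850] -> b, the 10 biases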

    def negative_log_likelihood(self, y):
        """Return the mean negative log-likelihood of the prediction of this
        model under a given target distribution.

        .. math::

            \ell(\theta=\{W,b\}, \mathcal{D}) =
                \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
                \log(P(Y=y^{(i)} | x^{(i)}, W, b))

        The returned value is :math:`-\ell`, the mean negative log-likelihood
        over the minibatch.

        :param y: corresponds to a vector that gives for each example the
                  correct label
        """
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
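        # T.log(self.p_y_given_x) is a (batch_size, n_classes) matrix of
        # log-probabilities; indexing it with [T.arange(y.shape[0]), y]
        # selects, for each row i, the entry in column y[i], i.e. the
        # log-probability the model assigns to the correct class.  In plain
        # numpy this would read log_p[numpy.arange(batch_size), y].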

    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch
        """

        # check if y has the same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                            ('y', y.type, 'y_pred', self.y_pred.type))
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()
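        # Example: for y_pred = [3, 1, 8] and y = [3, 7, 8], T.neq yields
        # [0, 1, 0], so the returned error rate is 1/3.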


def cg_optimization_mnist(n_iter=50):
    """Demonstrate conjugate gradient optimization of a log-linear model

    This is demonstrated on MNIST.

    :param n_iter: number of iterations to run the optimizer

    """

    # Load the dataset
    f = gzip.open('mnist.pkl.gz', 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    # make minibatches of size 20
    batch_size = 20    # size of the minibatch

    # Dealing with the training set
    # get the list of training images (x) and their labels (y)
    (train_set_x, train_set_y) = train_set
    # initialize the list of training minibatches with empty list
    train_batches = []
    for i in xrange(0, len(train_set_x), batch_size):
        # add to the list of minibatches the minibatch starting at
        # position i, ending at position i+batch_size
        # a minibatch is a pair; the first element of the pair is a list
        # of datapoints, the second element is the list of corresponding
        # labels
        train_batches = train_batches + \
            [(train_set_x[i:i + batch_size], train_set_y[i:i + batch_size])]

    # Dealing with the validation set
    (valid_set_x, valid_set_y) = valid_set
    # initialize the list of validation minibatches
    valid_batches = []
    for i in xrange(0, len(valid_set_x), batch_size):
        valid_batches = valid_batches + \
            [(valid_set_x[i:i + batch_size], valid_set_y[i:i + batch_size])]

    # Dealing with the testing set
    (test_set_x, test_set_y) = test_set
    # initialize the list of testing minibatches
    test_batches = []
    for i in xrange(0, len(test_set_x), batch_size):
        test_batches = test_batches + \
            [(test_set_x[i:i + batch_size], test_set_y[i:i + batch_size])]
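    # Each minibatch is a pair of arrays: batch_size rasterized images of
    # 28*28 = 784 pixels each, and the batch_size matching integer labels.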

    ishape = (28, 28)  # this is the size of MNIST images
    n_in = 28 * 28     # number of input units
    n_out = 10         # number of output units
    # allocate symbolic variables for the data
    x = T.fmatrix()    # the data is presented as rasterized images
    y = T.lvector()    # the labels are presented as 1D vector of
                       # [long int] labels

    # construct the logistic regression class
    classifier = LogisticRegression(
        input=x.reshape((batch_size, 28 * 28)), n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y).mean()

    # compile a theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function([x, y], classifier.errors(y))
    # compile a theano function that returns the gradient of the minibatch
    # with respect to theta
    batch_grad = theano.function([x, y], T.grad(cost, classifier.theta))
    # compile a theano function that returns the cost of a minibatch
    batch_cost = theano.function([x, y], cost)
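    # batch_cost(x, y) returns the scalar mean NLL of one minibatch,
    # batch_grad(x, y) its gradient as a flat vector with the same layout
    # (and length) as theta, and test_model(x, y) the fraction of
    # misclassified examples; the wrappers below average these quantities
    # over the full datasets for scipy's optimizer.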

    # creates a function that computes the average cost on the training set
    def train_fn(theta_value):
        classifier.theta.value = theta_value
        cost = 0.
        for x, y in train_batches:
            cost += batch_cost(x, y)
        return cost / len(train_batches)

    # creates a function that computes the average gradient of cost with
    # respect to theta
    def train_fn_grad(theta_value):
        classifier.theta.value = theta_value
        grad = numpy.zeros(n_in * n_out + n_out)
        for x, y in train_batches:
            grad += batch_grad(x, y)
        return grad / len(train_batches)
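    # train_fn and train_fn_grad have exactly the signatures that
    # scipy.optimize.fmin_cg expects for f and fprime: they take the flat
    # parameter vector, copy it into the shared theta so the compiled Theano
    # functions see the new parameters, and return a float (average cost) or
    # a 1-D gradient array averaged over the training minibatches.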

    validation_scores = [float('inf'), 0]

    # creates the validation function
    def callback(theta_value):
        classifier.theta.value = theta_value
        # compute the validation loss
        this_validation_loss = 0.
        for x, y in valid_batches:
            this_validation_loss += test_model(x, y)

        this_validation_loss /= len(valid_batches)

        print('validation error %f %%' % (this_validation_loss * 100.,))

        # check if it is better than the best validation score obtained so far
        if this_validation_loss < validation_scores[0]:
            # if so, replace the old one, and compute the score on the
            # testing dataset
            validation_scores[0] = this_validation_loss
            test_score = 0.
            for x, y in test_batches:
                test_score += test_model(x, y)
            validation_scores[1] = test_score / len(test_batches)
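    # fmin_cg invokes callback(theta) once after every CG iteration, so the
    # function above performs the model selection: it tracks the lowest
    # validation error seen so far and records the test error of that same
    # model.  validation_scores is a two-element list rather than two local
    # variables because Python 2 closures cannot rebind names in the
    # enclosing scope (there is no `nonlocal`).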

    # using scipy conjugate gradient optimizer
    import scipy.optimize
    print("Optimizing using scipy.optimize.fmin_cg...")
    start_time = time.clock()
    best_w_b = scipy.optimize.fmin_cg(
        f=train_fn,
        x0=numpy.zeros((n_in + 1) * n_out, dtype=x.dtype),
        fprime=train_fn_grad,
        callback=callback,
        disp=0,
        maxiter=n_iter)
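    # fmin_cg starts from the all-zero parameter vector x0 of length
    # (n_in + 1) * n_out = 7850 (all weights plus biases) and returns the
    # best flat parameter vector it found; disp=0 silences scipy's own
    # convergence report and maxiter caps the number of CG iterations.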
    end_time = time.clock()
    print(('Optimization complete with best validation score of %f %%, with '
           'test performance %f %%') %
          (validation_scores[0] * 100., validation_scores[1] * 100.))

    print('The code ran for %f minutes' % ((end_time - start_time) / 60.))


if __name__ == '__main__':
    cg_optimization_mnist()