Mercurial > pylearn
comparison mlp.py @ 118:d0a1bd0378c6
Finished draft of OneHiddenLayerNNetClassifier to debut learner.py
author | Yoshua Bengio <bengioy@iro.umontreal.ca> |
---|---|
date | Wed, 07 May 2008 15:07:56 -0400 |
parents | 88257dfedf8c |
children | 2ca8dccba270 |
comparison
equal
deleted
inserted
replaced
111:88257dfedf8c | 118:d0a1bd0378c6 |
---|---|
1 | 1 |
2 from learner import * | 2 from learner import * |
3 from theano import tensor as t | 3 from theano import tensor as t |
4 from theano.scalar import as_scalar | 4 from theano.scalar import as_scalar |
5 from nnet_ops import * | |
5 | 6 |
6 # this is one of the simplest example of learner, and illustrates | 7 # this is one of the simplest example of learner, and illustrates |
7 # the use of theano | 8 # the use of theano |
8 | 9 |
9 | 10 |
80 return self.parameterAttributes() + ["L2_regularizer"] | 81 return self.parameterAttributes() + ["L2_regularizer"] |
81 | 82 |
82 def updateMinibatchInputFields(self): | 83 def updateMinibatchInputFields(self): |
83 return ["input","target"] | 84 return ["input","target"] |
84 | 85 |
85 def updateMinibatchInputAttributes(self): | |
86 return self.parameterAttributes() | |
87 | |
88 def updateMinibatchOutputAttributes(self): | |
89 return self.parameterAttributes() | |
90 | |
91 def updateEndInputAttributes(self): | |
92 return self.parameterAttributes() | |
93 | |
94 def updateEndOutputAttributes(self): | 86 def updateEndOutputAttributes(self): |
95 return ["regularization_term"] | 87 return ["regularization_term"] |
96 | 88 |
89 def lossAttribute(self): | |
90 return "minibatch_criterion" | |
91 | |
97 def defaultOutputFields(self, input_fields): | 92 def defaultOutputFields(self, input_fields): |
98 output_fields = ["output", "output_class",] | 93 output_fields = ["output", "output_class",] |
99 if "target" in input_fields: | 94 if "target" in input_fields: |
100 output_fields += ["class_error", "nll"] | 95 output_fields += ["class_error", "nll"] |
101 return output_fields | 96 return output_fields |
102 | 97 |
98 def __init__(self,n_hidden,n_classes,learning_rate,init_range=1.): | |
99 self._n_outputs = n_classes | |
100 self._n_hidden = n_hidden | |
101 self._init_range = init_range | |
102 self.learning_rate = learning_rate # this is the float | |
103 self._learning_rate = t.scalar('learning_rate') # this is the symbol | |
104 self._input = t.matrix('input') # n_examples x n_inputs | |
105 self._target = t.matrix('target') # n_examples x n_outputs | |
106 self._L2_regularizer = as_scalar(0.,'L2_regularizer') | |
107 self._W1 = t.matrix('W1') | |
108 self._W2 = t.matrix('W2') | |
109 self._b1 = t.row('b1') | |
110 self._b2 = t.row('b2') | |
111 self._regularizer = self._L2_regularizer * (t.dot(self._W1,self._W1) + t.dot(self._W2,self._W2)) | |
112 self._output_activations =self._b2+t.dot(t.tanh(self._b1+t.dot(self._input,self._W1.T)),self._W2.T) | |
113 self._output = t.softmax(self._output_activations) | |
114 self._output_class = t.argmax(self._output,1) | |
115 self._class_error = self._output_class != self._target | |
116 self._nll,self._output = crossentropy_softmax_1hot(self._output_activations,self._target) | |
117 self._minibatch_criterion = self._nll + self._regularizer / t.shape(self._input)[0] | |
118 MinibatchUpdatesTLearner.__init__(self) | |
119 | |
120 def allocate(self,minibatch): | |
121 minibatch_n_inputs = minibatch["input"].shape[1] | |
122 if not self._n_inputs: | |
123 self._n_inputs = minibatch_n_inputs | |
124 self.b1 = numpy.zeros(self._n_hidden) | |
125 self.b2 = numpy.zeros(self._n_outputs) | |
126 self.forget() | |
127 elif self._n_inputs!=minibatch_n_inputs: | |
128 # if the input changes dimension on the fly, we resize and forget everything | |
129 self.forget() | |
130 | |
131 def forget(self): | |
132 if self._n_inputs: | |
133 r = self._init_range/math.sqrt(self._n_inputs) | |
134 self.W1 = numpy.random.uniform(low=-r,high=r, | |
135 size=(self._n_hidden,self._n_inputs)) | |
136 r = self._init_range/math.sqrt(self._n_hidden) | |
137 self.W2 = numpy.random.uniform(low=-r,high=r, | |
138 size=(self._n_outputs,self._n_hidden)) | |
139 self.b1[:]=0 | |
140 self.b2[:]=0 | |
141 | |
142 | |
143 class MLP(MinibatchUpdatesTLearner): | |
144 """ | |
145 Implement a feedforward multi-layer perceptron, with or without L1 and/or L2 regularization. | |
146 | |
147 The predictor parameters are obtained by minibatch/online gradient descent. | |
148 Training can proceed sequentially (with multiple calls to update with | |
149 different disjoint subsets of the training sets). | |
150 | |
151 Hyper-parameters: | |
152 - L1_regularizer | |
153 - L2_regularizer | |
154 - neuron_sparsity_regularizer | |
155 - initial_learning_rate | |
156 - learning_rate_decrease_rate | |
157 - n_hidden_per_layer (a list of integers) | |
158 - activation_function ("sigmoid","tanh", or "ratio") | |
159 | |
160 The output/task type (classification, regression, etc.) is obtained by specializing MLP. | |
161 | |
162 For each (input[t],output[t]) pair in a minibatch,:: | |
163 | |
164 activation[0] = input_t | |
165 for k=1 to n_hidden_layers: | |
166 activation[k]=activation_function(b[k]+ W[k]*activation[k-1]) | |
167 output_t = output_activation_function(b[n_hidden_layers+1]+W[n_hidden_layers+1]*activation[n_hidden_layers]) | |
168 | |
169 and the b and W are obtained by minimizing the following by stochastic minibatch gradient descent:: | |
170 | |
171 L2_regularizer sum_{ijk} W_{kij}^2 + L1_regularizer sum_{kij} |W_{kij}| | |
172 + neuron_sparsity_regularizer sum_{ki} |b_{ki} + infinity| | |
173 - sum_t log P_{output_model}(target_t | output_t) | |
174 | |
175 The fields and attributes expected and produced by use and update are the following: | |
176 | |
177 - Input and output fields (example-wise quantities): | |
178 | |
179 - 'input' (always expected by use and update) | |
180 - 'target' (optionally expected by use and always by update) | |
181 - 'output' (optionally produced by use) | |
182 - error fields produced by sub-class of MLP | |
183 | |
184 - optional attributes (optionally expected as input_dataset attributes) | |
185 (warning, this may be dangerous, the 'use' method will use those provided in the | |
186 input_dataset rather than those learned during 'update'; currently no support | |
187 for providing these to update): | |
188 | |
189 - 'L1_regularizer' | |
190 - 'L2_regularizer' | |
191 - 'b' | |
192 - 'W' | |
193 - 'parameters' = [b[1], W[1], b[2], W[2], ...] | |
194 - 'regularization_term' | |
195 | |
196 """ | |
197 | |
198 def attributeNames(self): | |
199 return ["parameters","b","W","L1_regularizer","L2_regularizer","neuron_sparsity_regularizer","regularization_term"] | |
200 | |
201 def useInputAttributes(self): | |
202 return ["b","W"] | |
203 | |
204 def useOutputAttributes(self): | |
205 return [] | |
206 | |
207 def updateInputAttributes(self): | |
208 return ["b","W","L1_regularizer","L2_regularizer","neuron_sparsity_regularizer"] | |
209 | |
210 def updateMinibatchInputFields(self): | |
211 return ["input","target"] | |
212 | |
213 def updateMinibatchInputAttributes(self): | |
214 return ["b","W"] | |
215 | |
216 def updateMinibatchOutputAttributes(self): | |
217 return ["new_XtX","new_XtY"] | |
218 | |
219 def updateEndInputAttributes(self): | |
220 return ["theta","XtX","XtY"] | |
221 | |
222 def updateEndOutputAttributes(self): | |
223 return ["new_theta","b","W","regularization_term"] # CHECK: WILL b AND W CONTAIN OLD OR NEW THETA? @todo i.e. order of computation = ? | |
224 | |
225 def parameterAttributes(self): | |
226 return ["b","W"] | |
227 | |
228 def defaultOutputFields(self, input_fields): | |
229 output_fields = ["output"] | |
230 if "target" in input_fields: | |
231 output_fields.append("squared_error") | |
232 return output_fields | |
233 | |
103 def __init__(self): | 234 def __init__(self): |
104 self._input = t.matrix('input') # n_examples x n_inputs | 235 self._input = t.matrix('input') # n_examples x n_inputs |
105 self._target = t.matrix('target') # n_examples x n_outputs | 236 self._target = t.matrix('target') # n_examples x n_outputs |
106 self._lambda = as_scalar(0.,'lambda') | 237 self._L2_regularizer = as_scalar(0.,'L2_regularizer') |
107 self._theta = t.matrix('theta') | 238 self._theta = t.matrix('theta') |
108 self._W = self._theta[:,1:] | 239 self._W = self._theta[:,1:] |
109 self._b = self._theta[:,0] | 240 self._b = self._theta[:,0] |
110 self._XtX = t.matrix('XtX') | 241 self._XtX = t.matrix('XtX') |
111 self._XtY = t.matrix('XtY') | 242 self._XtY = t.matrix('XtY') |
112 self._extended_input = t.prepend_one_to_each_row(self._input) | 243 self._extended_input = t.prepend_one_to_each_row(self._input) |
113 self._output = t.dot(self._input,self._W.T) + self._b # (n_examples , n_outputs) matrix | 244 self._output = t.dot(self._input,self._W.T) + self._b # (n_examples , n_outputs) matrix |
114 self._squared_error = t.sum_within_rows(t.sqr(self._output-self._target)) # (n_examples ) vector | 245 self._squared_error = t.sum_within_rows(t.sqr(self._output-self._target)) # (n_examples ) vector |
115 self._regularizer = self._lambda * t.dot(self._W,self._W) | 246 self._regularizer = self._L2_regularizer * t.dot(self._W,self._W) |
116 self._new_XtX = add_inplace(self._XtX,t.dot(self._extended_input.T,self._extended_input)) | 247 self._new_XtX = add_inplace(self._XtX,t.dot(self._extended_input.T,self._extended_input)) |
117 self._new_XtY = add_inplace(self._XtY,t.dot(self._extended_input.T,self._target)) | 248 self._new_XtY = add_inplace(self._XtY,t.dot(self._extended_input.T,self._target)) |
118 self._new_theta = t.solve_inplace(self._theta,self._XtX,self._XtY) | 249 self._new_theta = t.solve_inplace(self._theta,self._XtX,self._XtY) |
119 | 250 |
120 OneShotTLearner.__init__(self) | 251 OneShotTLearner.__init__(self) |
137 if self._n_inputs and self._n_outputs: | 268 if self._n_inputs and self._n_outputs: |
138 self.XtX.resize((1+self._n_inputs,1+self._n_inputs)) | 269 self.XtX.resize((1+self._n_inputs,1+self._n_inputs)) |
139 self.XtY.resize((1+self._n_inputs,self._n_outputs)) | 270 self.XtY.resize((1+self._n_inputs,self._n_outputs)) |
140 self.XtX.data[:,:]=0 | 271 self.XtX.data[:,:]=0 |
141 self.XtY.data[:,:]=0 | 272 self.XtY.data[:,:]=0 |
142 numpy.diag(self.XtX.data)[1:]=self._lambda | 273 numpy.diag(self.XtX.data)[1:]=self.L2_regularizer |
143 | 274 |
144 | |
145 class MLP(MinibatchUpdatesTLearner): | |
146 """ | |
147 Implement a feedforward multi-layer perceptron, with or without L1 and/or L2 regularization. | |
148 | |
149 The predictor parameters are obtained by minibatch/online gradient descent. | |
150 Training can proceed sequentially (with multiple calls to update with | |
151 different disjoint subsets of the training sets). | |
152 | |
153 Hyper-parameters: | |
154 - L1_regularizer | |
155 - L2_regularizer | |
156 - neuron_sparsity_regularizer | |
157 - initial_learning_rate | |
158 - learning_rate_decrease_rate | |
159 - n_hidden_per_layer (a list of integers) | |
160 - activation_function ("sigmoid","tanh", or "ratio") | |
161 | |
162 The output/task type (classification, regression, etc.) is obtained by specializing MLP. | |
163 | |
164 For each (input[t],output[t]) pair in a minibatch,:: | |
165 | |
166 activation[0] = input_t | |
167 for k=1 to n_hidden_layers: | |
168 activation[k]=activation_function(b[k]+ W[k]*activation[k-1]) | |
169 output_t = output_activation_function(b[n_hidden_layers+1]+W[n_hidden_layers+1]*activation[n_hidden_layers]) | |
170 | |
171 and the b and W are obtained by minimizing the following by stochastic minibatch gradient descent:: | |
172 | |
173 L2_regularizer sum_{ijk} W_{kij}^2 + L1_regularizer sum_{kij} |W_{kij}| | |
174 + neuron_sparsity_regularizer sum_{ki} |b_{ki} + infinity| | |
175 - sum_t log P_{output_model}(target_t | output_t) | |
176 | |
177 The fields and attributes expected and produced by use and update are the following: | |
178 | |
179 - Input and output fields (example-wise quantities): | |
180 | |
181 - 'input' (always expected by use and update) | |
182 - 'target' (optionally expected by use and always by update) | |
183 - 'output' (optionally produced by use) | |
184 - error fields produced by sub-class of MLP | |
185 | |
186 - optional attributes (optionally expected as input_dataset attributes) | |
187 (warning, this may be dangerous, the 'use' method will use those provided in the | |
188 input_dataset rather than those learned during 'update'; currently no support | |
189 for providing these to update): | |
190 | |
191 - 'L1_regularizer' | |
192 - 'L2_regularizer' | |
193 - 'b' | |
194 - 'W' | |
195 - 'parameters' = [b[1], W[1], b[2], W[2], ...] | |
196 - 'regularization_term' | |
197 | |
198 """ | |
199 | |
200 def attributeNames(self): | |
201 return ["parameters","b","W","L1_regularizer","L2_regularizer","neuron_sparsity_regularizer","regularization_term"] | |
202 | |
203 def useInputAttributes(self): | |
204 return ["b","W"] | |
205 | |
206 def useOutputAttributes(self): | |
207 return [] | |
208 | |
209 def updateInputAttributes(self): | |
210 return ["b","W","L1_regularizer","L2_regularizer","neuron_sparsity_regularizer"] | |
211 | |
212 def updateMinibatchInputFields(self): | |
213 return ["input","target"] | |
214 | |
215 def updateMinibatchInputAttributes(self): | |
216 return ["b","W"] | |
217 | |
218 def updateMinibatchOutputAttributes(self): | |
219 return ["new_XtX","new_XtY"] | |
220 | |
221 def updateEndInputAttributes(self): | |
222 return ["theta","XtX","XtY"] | |
223 | |
224 def updateEndOutputAttributes(self): | |
225 return ["new_theta","b","W","regularization_term"] # CHECK: WILL b AND W CONTAIN OLD OR NEW THETA? @todo i.e. order of computation = ? | |
226 | |
227 def parameterAttributes(self): | |
228 return ["b","W"] | |
229 | |
230 def defaultOutputFields(self, input_fields): | |
231 output_fields = ["output"] | |
232 if "target" in input_fields: | |
233 output_fields.append("squared_error") | |
234 return output_fields | |
235 | |
236 def __init__(self): | |
237 self._input = t.matrix('input') # n_examples x n_inputs | |
238 self._target = t.matrix('target') # n_examples x n_outputs | |
239 self._lambda = as_scalar(0.,'lambda') | |
240 self._theta = t.matrix('theta') | |
241 self._W = self._theta[:,1:] | |
242 self._b = self._theta[:,0] | |
243 self._XtX = t.matrix('XtX') | |
244 self._XtY = t.matrix('XtY') | |
245 self._extended_input = t.prepend_one_to_each_row(self._input) | |
246 self._output = t.dot(self._input,self._W.T) + self._b # (n_examples , n_outputs) matrix | |
247 self._squared_error = t.sum_within_rows(t.sqr(self._output-self._target)) # (n_examples ) vector | |
248 self._regularizer = self._lambda * t.dot(self._W,self._W) | |
249 self._new_XtX = add_inplace(self._XtX,t.dot(self._extended_input.T,self._extended_input)) | |
250 self._new_XtY = add_inplace(self._XtY,t.dot(self._extended_input.T,self._target)) | |
251 self._new_theta = t.solve_inplace(self._theta,self._XtX,self._XtY) | |
252 | |
253 OneShotTLearner.__init__(self) | |
254 | |
255 def allocate(self,minibatch): | |
256 minibatch_n_inputs = minibatch["input"].shape[1] | |
257 minibatch_n_outputs = minibatch["target"].shape[1] | |
258 if not self._n_inputs: | |
259 self._n_inputs = minibatch_n_inputs | |
260 self._n_outputs = minibatch_n_outputs | |
261 self.XtX = numpy.zeros((1+self._n_inputs,1+self._n_inputs)) | |
262 self.XtY = numpy.zeros((1+self._n_inputs,self._n_outputs)) | |
263 self.theta = numpy.zeros((self._n_outputs,1+self._n_inputs)) | |
264 self.forget() | |
265 elif self._n_inputs!=minibatch_n_inputs or self._n_outputs!=minibatch_n_outputs: | |
266 # if the input or target changes dimension on the fly, we resize and forget everything | |
267 self.forget() | |
268 | |
269 def forget(self): | |
270 if self._n_inputs and self._n_outputs: | |
271 self.XtX.resize((1+self._n_inputs,1+self._n_inputs)) | |
272 self.XtY.resize((1+self._n_inputs,self._n_outputs)) | |
273 self.XtX.data[:,:]=0 | |
274 self.XtY.data[:,:]=0 | |
275 numpy.diag(self.XtX.data)[1:]=self._lambda | |
276 |