comparison mlp.py @ 118:d0a1bd0378c6

Finished draft of OneHiddenLayerNNetClassifier to debut learner.py
author Yoshua Bengio <bengioy@iro.umontreal.ca>
date Wed, 07 May 2008 15:07:56 -0400
parents 88257dfedf8c
children 2ca8dccba270
comparison
equal deleted inserted replaced
111:88257dfedf8c 118:d0a1bd0378c6
1 1
2 from learner import * 2 from learner import *
3 from theano import tensor as t 3 from theano import tensor as t
4 from theano.scalar import as_scalar 4 from theano.scalar import as_scalar
5 from nnet_ops import *
5 6
6 # this is one of the simplest example of learner, and illustrates 7 # this is one of the simplest example of learner, and illustrates
7 # the use of theano 8 # the use of theano
8 9
9 10
80 return self.parameterAttributes() + ["L2_regularizer"] 81 return self.parameterAttributes() + ["L2_regularizer"]
81 82
def updateMinibatchInputFields(self):
    """Minibatch fields consumed by updateMinibatch: the example inputs and targets."""
    return ["input", "target"]
84 85
def updateMinibatchInputAttributes(self):
    """Attributes read by updateMinibatch: exactly the model parameters."""
    params = self.parameterAttributes()
    return params
87
def updateMinibatchOutputAttributes(self):
    """Attributes written by updateMinibatch: the (updated) model parameters."""
    updated = self.parameterAttributes()
    return updated
90
def updateEndInputAttributes(self):
    """Attributes needed at the end of update: the model parameters."""
    return list(self.parameterAttributes())
93
def updateEndOutputAttributes(self):
    """Attribute produced at the end of update: the value of the regularizer."""
    return ["regularization_term"]
96 88
def lossAttribute(self):
    """Name of the attribute used as the training loss."""
    return "minibatch_criterion"
91
def defaultOutputFields(self, input_fields):
    """Default fields to produce: predictions, plus error measures when a target is given."""
    fields = ["output", "output_class"]
    if "target" not in input_fields:
        return fields
    return fields + ["class_error", "nll"]
102 97
def __init__(self, n_hidden, n_classes, learning_rate, init_range=1.):
    """Build the symbolic graph for a one-hidden-layer (tanh) softmax classifier.

    :param n_hidden: number of hidden units
    :param n_classes: number of output classes (softmax size)
    :param learning_rate: gradient step size (a python float)
    :param init_range: scale of the uniform weight initialization
    """
    self._n_outputs = n_classes
    self._n_hidden = n_hidden
    self._init_range = init_range
    # _n_inputs is discovered from the first minibatch in allocate();
    # start it at None so `if not self._n_inputs` does not raise AttributeError.
    self._n_inputs = None
    self.learning_rate = learning_rate  # this is the float
    self._learning_rate = t.scalar('learning_rate')  # this is the symbol
    self._input = t.matrix('input')  # n_examples x n_inputs
    self._target = t.matrix('target')  # n_examples x n_outputs
    self._L2_regularizer = as_scalar(0., 'L2_regularizer')
    self._W1 = t.matrix('W1')
    self._W2 = t.matrix('W2')
    self._b1 = t.row('b1')
    self._b2 = t.row('b2')
    # L2 penalty is the sum of squared weights; the original t.dot(W, W) is a
    # shape-incompatible matrix product (W1 is n_hidden x n_inputs), not a
    # sum of squares.
    self._regularizer = self._L2_regularizer * (t.sum(t.sqr(self._W1)) + t.sum(t.sqr(self._W2)))
    self._output_activations = self._b2 + t.dot(t.tanh(self._b1 + t.dot(self._input, self._W1.T)), self._W2.T)
    # crossentropy_softmax_1hot yields both the nll and the softmax output;
    # the original referenced the undefined `self._output_activation` (typo)
    # and built a redundant t.softmax node that was immediately overwritten.
    self._nll, self._output = crossentropy_softmax_1hot(self._output_activations, self._target)
    self._output_class = t.argmax(self._output, 1)
    self._class_error = self._output_class != self._target
    self._minibatch_criterion = self._nll + self._regularizer / t.shape(self._input)[0]
    MinibatchUpdatesTLearner.__init__(self)
119
def allocate(self, minibatch):
    """(Re)allocate the parameters when the input dimensionality is first seen or changes.

    :param minibatch: mapping with an "input" array of shape (n_examples, n_inputs)
    """
    minibatch_n_inputs = minibatch["input"].shape[1]
    if not self._n_inputs:
        # First call: now that the input size is known, create the biases
        # and initialize all parameters.
        self._n_inputs = minibatch_n_inputs
        self.b1 = numpy.zeros(self._n_hidden)
        self.b2 = numpy.zeros(self._n_outputs)
        self.forget()
    elif self._n_inputs != minibatch_n_inputs:
        # The input changed dimension on the fly: record the new size so
        # that forget() rebuilds W1 with the right shape (the original left
        # self._n_inputs stale here), then reinitialize everything.
        self._n_inputs = minibatch_n_inputs
        self.forget()
130
def forget(self):
    """Reinitialize parameters: weights uniform in +-init_range/sqrt(fan_in), biases zero.

    Does nothing until the input dimension is known (allocate() sets _n_inputs).
    """
    if not self._n_inputs:
        return
    bound = self._init_range / math.sqrt(self._n_inputs)
    self.W1 = numpy.random.uniform(low=-bound, high=bound,
                                   size=(self._n_hidden, self._n_inputs))
    bound = self._init_range / math.sqrt(self._n_hidden)
    self.W2 = numpy.random.uniform(low=-bound, high=bound,
                                   size=(self._n_outputs, self._n_hidden))
    self.b1[:] = 0
    self.b2[:] = 0
141
142
143 class MLP(MinibatchUpdatesTLearner):
144 """
145 Implement a feedforward multi-layer perceptron, with or without L1 and/or L2 regularization.
146
147 The predictor parameters are obtained by minibatch/online gradient descent.
148 Training can proceed sequentially (with multiple calls to update with
149 different disjoint subsets of the training sets).
150
151 Hyper-parameters:
152 - L1_regularizer
153 - L2_regularizer
154 - neuron_sparsity_regularizer
155 - initial_learning_rate
156 - learning_rate_decrease_rate
157 - n_hidden_per_layer (a list of integers)
158 - activation_function ("sigmoid","tanh", or "ratio")
159
160 The output/task type (classification, regression, etc.) is obtained by specializing MLP.
161
162 For each (input[t],output[t]) pair in a minibatch,::
163
164 activation[0] = input_t
165 for k=1 to n_hidden_layers:
166 activation[k]=activation_function(b[k]+ W[k]*activation[k-1])
167 output_t = output_activation_function(b[n_hidden_layers+1]+W[n_hidden_layers+1]*activation[n_hidden_layers])
168
169 and the b and W are obtained by minimizing the following by stochastic minibatch gradient descent::
170
171 L2_regularizer sum_{ijk} W_{kij}^2 + L1_regularizer sum_{kij} |W_{kij}|
172 + neuron_sparsity_regularizer sum_{ki} |b_{ki} + infinity|
173 - sum_t log P_{output_model}(target_t | output_t)
174
175 The fields and attributes expected and produced by use and update are the following:
176
177 - Input and output fields (example-wise quantities):
178
179 - 'input' (always expected by use and update)
180 - 'target' (optionally expected by use and always by update)
181 - 'output' (optionally produced by use)
182 - error fields produced by sub-class of MLP
183
184 - optional attributes (optionally expected as input_dataset attributes)
185 (warning, this may be dangerous, the 'use' method will use those provided in the
186 input_dataset rather than those learned during 'update'; currently no support
187 for providing these to update):
188
189 - 'L1_regularizer'
190 - 'L2_regularizer'
191 - 'b'
192 - 'W'
193 - 'parameters' = [b[1], W[1], b[2], W[2], ...]
194 - 'regularization_term'
195
196 """
197
def attributeNames(self):
    """All learner attributes: parameters, regularization hyper-parameters, and the penalty term."""
    names = ["parameters", "b", "W"]
    names += ["L1_regularizer", "L2_regularizer", "neuron_sparsity_regularizer"]
    names.append("regularization_term")
    return names
200
def useInputAttributes(self):
    """use() reads only the model parameters."""
    return ["b", "W"]
203
def useOutputAttributes(self):
    """use() writes no attributes back."""
    return []
206
def updateInputAttributes(self):
    """update() reads the parameters plus the regularization hyper-parameters."""
    hyper = ["L1_regularizer", "L2_regularizer", "neuron_sparsity_regularizer"]
    return ["b", "W"] + hyper
209
def updateMinibatchInputFields(self):
    """Minibatch fields consumed by updateMinibatch: example inputs and targets."""
    return ["input", "target"]
212
def updateMinibatchInputAttributes(self):
    """Attributes read by updateMinibatch: the model parameters."""
    return ["b", "W"]
215
def updateMinibatchOutputAttributes(self):
    """Attributes written by updateMinibatch: the accumulated sufficient statistics."""
    return ["new_XtX", "new_XtY"]
218
def updateEndInputAttributes(self):
    """Attributes needed to finish update: parameters and the accumulated statistics."""
    return ["theta", "XtX", "XtY"]
221
def updateEndOutputAttributes(self):
    """Attributes produced at the end of update.

    TODO: check whether b and W will contain the old or the new theta
    (i.e. what is the order of computation?).
    """
    return ["new_theta", "b", "W", "regularization_term"]
224
def parameterAttributes(self):
    """The learned parameters: bias vector and weight matrix."""
    return ["b", "W"]
227
def defaultOutputFields(self, input_fields):
    """'output' always; add 'squared_error' when a target is available."""
    fields = ["output"]
    if "target" in input_fields:
        return fields + ["squared_error"]
    return fields
233
def __init__(self):
    """Build the symbolic graph for normal-equation style updates (XtX/XtY accumulation).

    NOTE(review): despite the enclosing class docstring, this body is
    linear-regression scaffolding — confirm against the intended design.
    """
    self._input = t.matrix('input')  # n_examples x n_inputs
    self._target = t.matrix('target')  # n_examples x n_outputs
    self._L2_regularizer = as_scalar(0., 'L2_regularizer')
    self._theta = t.matrix('theta')
    self._W = self._theta[:, 1:]
    self._b = self._theta[:, 0]
    self._XtX = t.matrix('XtX')
    self._XtY = t.matrix('XtY')
    self._extended_input = t.prepend_one_to_each_row(self._input)
    self._output = t.dot(self._input, self._W.T) + self._b  # (n_examples, n_outputs) matrix
    self._squared_error = t.sum_within_rows(t.sqr(self._output - self._target))  # (n_examples,) vector
    # L2 penalty is the sum of squared weights; the original t.dot(W, W) is a
    # shape-incompatible matrix product, not a sum of squares.
    self._regularizer = self._L2_regularizer * t.sum(t.sqr(self._W))
    self._new_XtX = add_inplace(self._XtX, t.dot(self._extended_input.T, self._extended_input))
    self._new_XtY = add_inplace(self._XtY, t.dot(self._extended_input.T, self._target))
    self._new_theta = t.solve_inplace(self._theta, self._XtX, self._XtY)
    # Dimensions are discovered from the first minibatch in allocate();
    # start them at None so `if not self._n_inputs` does not raise AttributeError.
    self._n_inputs = None
    self._n_outputs = None
    OneShotTLearner.__init__(self)
137 if self._n_inputs and self._n_outputs: 268 if self._n_inputs and self._n_outputs:
138 self.XtX.resize((1+self.n_inputs,1+self.n_inputs)) 269 self.XtX.resize((1+self.n_inputs,1+self.n_inputs))
139 self.XtY.resize((1+self.n_inputs,self.n_outputs)) 270 self.XtY.resize((1+self.n_inputs,self.n_outputs))
140 self.XtX.data[:,:]=0 271 self.XtX.data[:,:]=0
141 self.XtY.data[:,:]=0 272 self.XtY.data[:,:]=0
142 numpy.diag(self.XtX.data)[1:]=self.lambda 273 numpy.diag(self.XtX.data)[1:]=self.L2_regularizer
143 274
144
class MLP(MinibatchUpdatesTLearner):
    """
    Implement a feedforward multi-layer perceptron, with or without L1 and/or L2 regularization.

    NOTE(review): the method bodies below are linear-regression scaffolding
    (normal-equation XtX/XtY accumulation) and do not yet match this
    docstring; confirm against the intended MLP design before relying on it.

    The predictor parameters are obtained by minibatch/online gradient descent.
    Training can proceed sequentially (with multiple calls to update with
    different disjoint subsets of the training sets).

    Hyper-parameters:
      - L1_regularizer
      - L2_regularizer
      - neuron_sparsity_regularizer
      - initial_learning_rate
      - learning_rate_decrease_rate
      - n_hidden_per_layer (a list of integers)
      - activation_function ("sigmoid", "tanh", or "ratio")

    The output/task type (classification, regression, etc.) is obtained by specializing MLP.

    For each (input[t],output[t]) pair in a minibatch,::

       activation[0] = input_t
       for k=1 to n_hidden_layers:
          activation[k] = activation_function(b[k] + W[k]*activation[k-1])
       output_t = output_activation_function(b[n_hidden_layers+1] + W[n_hidden_layers+1]*activation[n_hidden_layers])

    and the b and W are obtained by minimizing the following by stochastic minibatch gradient descent::

       L2_regularizer sum_{ijk} W_{kij}^2 + L1_regularizer sum_{kij} |W_{kij}|
       + neuron_sparsity_regularizer sum_{ki} |b_{ki} + infinity|
       - sum_t log P_{output_model}(target_t | output_t)

    The fields and attributes expected and produced by use and update are the following:

     - Input and output fields (example-wise quantities):

       - 'input' (always expected by use and update)
       - 'target' (optionally expected by use and always by update)
       - 'output' (optionally produced by use)
       - error fields produced by sub-class of MLP

     - optional attributes (optionally expected as input_dataset attributes)
       (warning, this may be dangerous, the 'use' method will use those provided in the
       input_dataset rather than those learned during 'update'; currently no support
       for providing these to update):

       - 'L1_regularizer'
       - 'L2_regularizer'
       - 'b'
       - 'W'
       - 'parameters' = [b[1], W[1], b[2], W[2], ...]
       - 'regularization_term'
    """

    def attributeNames(self):
        """All learner attributes: parameters, hyper-parameters, and the penalty term."""
        return ["parameters", "b", "W", "L1_regularizer", "L2_regularizer",
                "neuron_sparsity_regularizer", "regularization_term"]

    def useInputAttributes(self):
        """use() reads only the model parameters."""
        return ["b", "W"]

    def useOutputAttributes(self):
        """use() writes no attributes back."""
        return []

    def updateInputAttributes(self):
        """update() reads the parameters plus the regularization hyper-parameters."""
        return ["b", "W", "L1_regularizer", "L2_regularizer", "neuron_sparsity_regularizer"]

    def updateMinibatchInputFields(self):
        """Minibatch fields consumed by updateMinibatch."""
        return ["input", "target"]

    def updateMinibatchInputAttributes(self):
        """Attributes read by updateMinibatch."""
        return ["b", "W"]

    def updateMinibatchOutputAttributes(self):
        """Attributes written by updateMinibatch: the accumulated statistics."""
        return ["new_XtX", "new_XtY"]

    def updateEndInputAttributes(self):
        """Attributes needed to finish update."""
        return ["theta", "XtX", "XtY"]

    def updateEndOutputAttributes(self):
        """Attributes produced at the end of update.

        TODO: check whether b and W will contain the old or the new theta
        (i.e. what is the order of computation?).
        """
        return ["new_theta", "b", "W", "regularization_term"]

    def parameterAttributes(self):
        """The learned parameters: bias vector and weight matrix."""
        return ["b", "W"]

    def defaultOutputFields(self, input_fields):
        """'output' always; add 'squared_error' when a target is available."""
        output_fields = ["output"]
        if "target" in input_fields:
            output_fields.append("squared_error")
        return output_fields

    def __init__(self):
        """Build the symbolic graph for the normal-equation updates."""
        self._input = t.matrix('input')  # n_examples x n_inputs
        self._target = t.matrix('target')  # n_examples x n_outputs
        # Named L2_regularizer to match attributeNames()/updateInputAttributes()
        # (the old name 'lambda' cannot even be used as a plain attribute).
        self._L2_regularizer = as_scalar(0., 'L2_regularizer')
        self._theta = t.matrix('theta')
        self._W = self._theta[:, 1:]
        self._b = self._theta[:, 0]
        self._XtX = t.matrix('XtX')
        self._XtY = t.matrix('XtY')
        self._extended_input = t.prepend_one_to_each_row(self._input)
        self._output = t.dot(self._input, self._W.T) + self._b  # (n_examples, n_outputs) matrix
        self._squared_error = t.sum_within_rows(t.sqr(self._output - self._target))  # (n_examples,) vector
        # L2 penalty is the sum of squared weights; t.dot(W, W) would be a
        # shape-incompatible matrix product, not a sum of squares.
        self._regularizer = self._L2_regularizer * t.sum(t.sqr(self._W))
        self._new_XtX = add_inplace(self._XtX, t.dot(self._extended_input.T, self._extended_input))
        self._new_XtY = add_inplace(self._XtY, t.dot(self._extended_input.T, self._target))
        self._new_theta = t.solve_inplace(self._theta, self._XtX, self._XtY)
        # Dimensions are discovered from the first minibatch in allocate();
        # start at None so `if not self._n_inputs` does not raise AttributeError.
        self._n_inputs = None
        self._n_outputs = None
        OneShotTLearner.__init__(self)

    def allocate(self, minibatch):
        """Size XtX/XtY/theta from the first minibatch; reset when dimensions change."""
        minibatch_n_inputs = minibatch["input"].shape[1]
        minibatch_n_outputs = minibatch["target"].shape[1]
        if not self._n_inputs:
            self._n_inputs = minibatch_n_inputs
            self._n_outputs = minibatch_n_outputs
            self.XtX = numpy.zeros((1 + self._n_inputs, 1 + self._n_inputs))
            self.XtY = numpy.zeros((1 + self._n_inputs, self._n_outputs))
            self.theta = numpy.zeros((self._n_outputs, 1 + self._n_inputs))
            self.forget()
        elif self._n_inputs != minibatch_n_inputs or self._n_outputs != minibatch_n_outputs:
            # Input or target changed dimension on the fly: record the new
            # sizes so forget() resizes correctly (the original left them
            # stale), then reinitialize everything.
            self._n_inputs = minibatch_n_inputs
            self._n_outputs = minibatch_n_outputs
            self.forget()

    def forget(self):
        """Zero the sufficient statistics; place the L2 penalty on the weight diagonal."""
        if self._n_inputs and self._n_outputs:
            # The original used self.n_inputs/self.n_outputs (attributes that
            # are never set; allocate() sets the underscored names).
            self.XtX.resize((1 + self._n_inputs, 1 + self._n_inputs))
            self.XtY.resize((1 + self._n_inputs, self._n_outputs))
            self.XtX[:, :] = 0
            self.XtY[:, :] = 0
            # The original wrote through numpy.diag(...) (which returns a
            # copy, so the write was lost) and read `self.lambda`, which is
            # a SyntaxError.  Write the diagonal entries directly from the
            # L2_regularizer hyper-parameter, skipping the bias row/column
            # at index 0.
            d = numpy.arange(1, 1 + self._n_inputs)
            self.XtX[d, d] = self.L2_regularizer
276