changeset 111:88257dfedf8c
Added another work in progress, for mlp's
author:   bengioy@bengiomac.local
date:     Wed, 07 May 2008 09:16:04 -0400
parents:  8fa1ef2411a0
children: b6bc1e769b36 d0a1bd0378c6
files:    learner.py linear_regression.py mlp.py
diffstat: 3 files changed, 323 insertions(+), 24 deletions(-)
--- a/learner.py	Tue May 06 22:24:55 2008 -0400
+++ b/learner.py	Wed May 07 09:16:04 2008 -0400
@@ -1,5 +1,6 @@
 from dataset import *
+from compile import Function
 
 
 class Learner(AttributesHolder):
     """Base class for learning algorithms, provides an interface
@@ -84,8 +85,10 @@
         """
         A subset of self.attributeNames() which are the names of attributes
         modified/created by update() in order to do its work.
+
+        By default these are inferred from the various update output attributes:
         """
-        raise AbstractFunction()
+        return ["parameters"] + self.updateMinibatchOutputAttributes() + self.updateEndOutputAttributes()
 
     def useOutputAttributes(self):
         """
@@ -251,7 +254,7 @@
 
         return output_dataset
 
-class OneShotTLearner(TLearner):
+class MinibatchUpdatesTLearner(TLearner):
     """
     This adds to TLearner a
       - updateStart(), updateEnd(), updateMinibatch(minibatch), isLastEpoch():
@@ -262,6 +265,10 @@
     going only once through the training data. For more complicated models, more specialized
     subclasses of TLearner should be used or a learning-algorithm specific update method
     should be defined.
+
+    - a 'parameters' attribute which is a list of parameters (whose names are
+      specified by the user's subclass with the parameterAttributes() method)
+
     """
 
     def __init__(self):
@@ -288,12 +295,16 @@
     def updateEndOutputAttributes(self):
         raise AbstractFunction()
 
-    def updateStart(self): pass
+    def parameterAttributes(self):
+        raise AbstractFunction()
+
+    def updateStart(self): pass
 
     def updateEnd(self):
         self.setAttributes(self.updateEndOutputAttributes(),
                            self.update_end_function
                            (self.names2attributes(self.updateEndInputAttributes())))
+        self.parameters = self.names2attributes(self.parameterAttributes())
 
     def updateMinibatch(self,minibatch):
         # make sure all required fields are allocated and initialized
@@ -331,3 +342,22 @@
         self.updateEnd()
         return self.use
 
+class OnlineGradientBasedTLearner(MinibatchUpdatesTLearner):
+    """
+    Specialization of MinibatchUpdatesTLearner in which the minibatch updates
+    are obtained by performing an online (minibatch-based) gradient step.
+
+    Sub-classes must define the following methods:
+
+    """
+    def __init__(self,truly_online=False):
+        """
+        If truly_online then only one pass is made through the training set passed to update().
+
+        """
+        self.truly_online=truly_online
+
+    def isLastEpoch(self):
+        return self.truly_online
+
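The renamed MinibatchUpdatesTLearner (and its new OnlineGradientBasedTLearner subclass) implies a fixed update protocol: updateStart(), repeated updateMinibatch() passes over the training data until isLastEpoch() says to stop, then updateEnd(), which now also publishes the 'parameters' attribute. Below is a minimal, self-contained sketch of that protocol, not the pylearn code itself; the class name ToyMinibatchLearner, the n_epochs counter, and the exact placement of the loop are illustrative assumptions drawn from the docstrings above.

```python
# Hypothetical illustration of the updateStart/updateMinibatch/isLastEpoch/updateEnd
# protocol described in learner.py; it does not depend on pylearn or theano.

class ToyMinibatchLearner:
    def __init__(self, truly_online=False, n_epochs=5):
        self.truly_online = truly_online
        self.n_epochs = n_epochs
        self.epoch = 0
        self.n_examples_seen = 0

    def updateStart(self):
        self.epoch = 0
        self.n_examples_seen = 0

    def updateMinibatch(self, minibatch):
        # a real subclass would compute gradients or sufficient statistics here
        self.n_examples_seen += len(minibatch)

    def isLastEpoch(self):
        # a truly online learner makes a single pass over the data it is given
        return self.truly_online or self.epoch >= self.n_epochs

    def updateEnd(self):
        # a real subclass would finalize and expose its 'parameters' here
        pass

    def update(self, training_set, minibatch_size=32):
        self.updateStart()
        while True:
            for start in range(0, len(training_set), minibatch_size):
                self.updateMinibatch(training_set[start:start + minibatch_size])
            self.epoch += 1
            if self.isLastEpoch():
                break
        self.updateEnd()
        return self


learner = ToyMinibatchLearner(truly_online=True)
learner.update(list(range(100)))
print(learner.n_examples_seen)  # 100: a single pass, since truly_online=True
```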
--- a/linear_regression.py	Tue May 06 22:24:55 2008 -0400
+++ b/linear_regression.py	Wed May 07 09:16:04 2008 -0400
@@ -1,12 +1,11 @@
 from learner import *
 from theano import tensor as t
-from compile import Function
 from theano.scalar import as_scalar
 
 # this is one of the simplest example of learner, and illustrates
 # the use of theano
 
-class LinearRegression(OneShotTLearner):
+class LinearRegression(MinibatchUpdatesTLearner):
     """
     Implement linear regression, with or without L2 regularization
     (the former is called Ridge Regression and the latter Ordinary Least Squares).
@@ -18,14 +17,13 @@
     of all the training sets passed to update since construction or since
     the last call to forget).
 
-    The L2 regularization coefficient is obtained analytically.
     For each (input[t],output[t]) pair in a minibatch,::
 
        output_t = b + W * input_t
 
     where b and W are obtained by minimizing::
 
-       lambda sum_{ij} W_{ij}^2  + sum_t ||output_t - target_t||^2
+       L2_regularizer sum_{ij} W_{ij}^2  + sum_t ||output_t - target_t||^2
 
     Let X be the whole training set inputs matrix (one input example per row),
     with the first column full of 1's, and Let Y the whole training set
@@ -36,7 +34,7 @@
       XtX * theta[:,i] = XtY[:,i]
 
     where XtX is a (n_inputs+1)x(n_inputs+1) matrix containing X'*X
-    plus lambda on the diagonal except at (0,0),
+    plus L2_regularizer on the diagonal except at (0,0),
     and XtY is a (n_inputs+1)*n_outputs matrix containing X'*Y.
 
     The fields and attributes expected and produced by use and update are the following:
@@ -53,10 +51,10 @@
      input_dataset rather than those learned during 'update'; currently no support
      for providing these to update):
 
-      - 'lambda'
+      - 'L2_regularizer'
       - 'b'
       - 'W'
-      - 'parameters' = (b, W) tuple
+      - 'parameters' = [b, W]
       - 'regularization_term'
       - 'XtX'
       - 'XtY'
@@ -64,7 +62,7 @@
     """
 
     def attributeNames(self):
-        return ["lambda","parameters","b","W","regularization_term","XtX","XtY"]
+        return ["L2_regularizer","parameters","b","W","regularization_term","XtX","XtY"]
 
     def useInputAttributes(self):
         return ["b","W"]
@@ -73,10 +71,7 @@
         return []
 
     def updateInputAttributes(self):
-        return ["lambda","XtX","XtY"]
-
-    def updateOutputAttributes(self):
-        return ["parameters"] + self.updateMinibatchOutputAttributes() + self.updateEndOutputAttributes()
+        return ["L2_regularizer","XtX","XtY"]
 
     def updateMinibatchInputFields(self):
         return ["input","target"]
@@ -93,6 +88,9 @@
     def updateEndOutputAttributes(self):
         return ["new_theta","b","W","regularization_term"] # CHECK: WILL b AND W CONTAIN OLD OR NEW THETA? @todo i.e. order of computation = ?
 
+    def parameterAttributes(self):
+        return ["b","W"]
+
     def defaultOutputFields(self, input_fields):
         output_fields = ["output"]
         if "target" in input_fields:
@@ -102,7 +100,7 @@
     def __init__(self):
         self._input = t.matrix('input') # n_examples x n_inputs
         self._target = t.matrix('target') # n_examples x n_outputs
-        self._lambda = as_scalar(0.,'lambda')
+        self._L2_regularizer = as_scalar(0.,'L2_regularizer')
         self._theta = t.matrix('theta')
         self._W = self._theta[:,1:]
         self._b = self._theta[:,0]
@@ -111,13 +109,12 @@
         self._extended_input = t.prepend_one_to_each_row(self._input)
         self._output = t.dot(self._input,self._W.T) + self._b  # (n_examples , n_outputs) matrix
         self._squared_error = t.sum_within_rows(t.sqr(self._output-self._target)) # (n_examples ) vector
-        self._regularizer = self._lambda * t.dot(self._W,self._W)
+        self._regularizer = self._L2_regularizer * t.dot(self._W,self._W)
         self._new_XtX = add_inplace(self._XtX,t.dot(self._extended_input.T,self._extended_input))
         self._new_XtY = add_inplace(self._XtY,t.dot(self._extended_input.T,self._target))
         self._new_theta = t.solve_inplace(self._theta,self._XtX,self._XtY)
 
         OneShotTLearner.__init__(self)
-        self.allocate()
 
     def allocate(self,minibatch):
         minibatch_n_inputs  = minibatch["input"].shape[1]
@@ -130,7 +127,7 @@
             self.theta = numpy.zeros((self._n_outputs,1+self._n_inputs))
             self.forget()
         elif self._n_inputs!=minibatch_n_inputs or self._n_outputs!=minibatch_n_outputs:
-            # if the input or target changes dimension on the fly, we forget everything
+            # if the input or target changes dimension on the fly, we resize and forget everything
             self.forget()
 
     def forget(self):
@@ -139,9 +136,5 @@
             self.XtY.resize((1+self.n_inputs,self.n_outputs))
             self.XtX.data[:,:]=0
             self.XtY.data[:,:]=0
-            numpy.diag(self.XtX.data)[1:]=self.lambda
+            numpy.diag(self.XtX.data)[1:]=self.L2_regularizer
-
-    def updateEnd(self):
-        TLearner.updateEnd(self)
-        self.parameters = (self.W,self.b)
-
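For reference, the closed-form update that the LinearRegression docstring describes (accumulate X'X and X'Y with a prepended column of 1's, add L2_regularizer to the diagonal except at (0,0), then solve XtX * theta[:,i] = XtY[:,i]) can be sketched directly in NumPy. This is only an illustration outside the Learner/theano framework: the function name ridge_update and the synthetic data are made up, and the minibatch accumulation of XtX/XtY is collapsed into a single batch for brevity.

```python
import numpy as np

def ridge_update(inputs, targets, L2_regularizer=0.1):
    """Solve XtX * theta.T = XtY for theta = [b W], as in the docstring above."""
    n_examples, n_inputs = inputs.shape
    X = np.hstack([np.ones((n_examples, 1)), inputs])   # prepend a column of 1's
    XtX = X.T @ X
    XtY = X.T @ targets
    # L2_regularizer on the diagonal except at (0,0): the bias b is not penalized
    idx = np.arange(1, n_inputs + 1)
    XtX[idx, idx] += L2_regularizer
    theta = np.linalg.solve(XtX, XtY).T                  # (n_outputs, 1+n_inputs)
    b, W = theta[:, 0], theta[:, 1:]
    return b, W

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))
Y = X @ rng.normal(size=(3, 2)) + 0.01 * rng.normal(size=(50, 2))
b, W = ridge_update(X, Y)
print(b.shape, W.shape)   # (2,) (2, 3); predictions are X @ W.T + b
```

Leaving the (0,0) entry unregularized is what the docstring's "except at (0,0)" means: only the weights W, not the bias b, are shrunk by L2_regularizer.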
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mlp.py	Wed May 07 09:16:04 2008 -0400
@@ -0,0 +1,276 @@
+
+from learner import *
+from theano import tensor as t
+from theano.scalar import as_scalar
+
+# this is one of the simplest example of learner, and illustrates
+# the use of theano
+
+
+class OneHiddenLayerNNetClassifier(MinibatchUpdatesTLearner):
+    """
+    Implement a straightforward classical feedforward
+    one-hidden-layer neural net, with L2 regularization.
+
+    The predictor parameters are obtained by minibatch/online gradient descent.
+    Training can proceed sequentially (with multiple calls to update with
+    different disjoint subsets of the training sets).
+
+    Hyper-parameters:
+      - L2_regularizer
+      - learning_rate
+      - n_hidden
+
+    For each (input_t,output_t) pair in a minibatch,::
+
+       output_activations_t = b2+W2*tanh(b1+W1*input_t)
+       output_t = softmax(output_activations_t)
+       output_class_t = argmax(output_activations_t)
+       class_error_t = 1_{output_class_t != target_t}
+       nll_t = -log(output_t[target_t])
+
+    and the training criterion is::
+
+       loss = L2_regularizer*(||W1||^2 + ||W2||^2) + sum_t nll_t
+
+    The parameters are [b1,W1,b2,W2] and are obtained by minimizing the loss by
+    stochastic minibatch gradient descent::
+
+       parameters[i] -= learning_rate * dloss/dparameters[i]
+
+    The fields and attributes expected and produced by use and update are the following:
+
+     - Input and output fields (example-wise quantities):
+
+       - 'input' (always expected by use and update)
+       - 'target' (optionally expected by use and always by update)
+       - 'output' (optionally produced by use)
+       - 'output_class' (optionally produced by use)
+       - 'class_error' (optionally produced by use)
+       - 'nll' (optionally produced by use)
+
+     - optional attributes (optionally expected as input_dataset attributes)
+       (warning, this may be dangerous, the 'use' method will use those provided in the
+       input_dataset rather than those learned during 'update'; currently no support
+       for providing these to update):
+
+       - 'L2_regularizer'
+       - 'b1'
+       - 'W1'
+       - 'b2'
+       - 'W2'
+       - 'parameters' = [b1, W1, b2, W2]
+       - 'regularization_term'
+
+    """
+
+    def attributeNames(self):
+        return ["parameters","b1","W1","b2","W2", "L2_regularizer","regularization_term"]
+
+    def parameterAttributes(self):
+        return ["b1","W1", "b2", "W2"]
+
+    def useInputAttributes(self):
+        return self.parameterAttributes()
+
+    def useOutputAttributes(self):
+        return []
+
+    def updateInputAttributes(self):
+        return self.parameterAttributes() + ["L2_regularizer"]
+
+    def updateMinibatchInputFields(self):
+        return ["input","target"]
+
+    def updateMinibatchInputAttributes(self):
+        return self.parameterAttributes()
+
+    def updateMinibatchOutputAttributes(self):
+        return self.parameterAttributes()
+
+    def updateEndInputAttributes(self):
+        return self.parameterAttributes()
+
+    def updateEndOutputAttributes(self):
+        return ["regularization_term"]
+
+    def defaultOutputFields(self, input_fields):
+        output_fields = ["output", "output_class",]
+        if "target" in input_fields:
+            output_fields += ["class_error", "nll"]
+        return output_fields
+
+    def __init__(self):
+        self._input = t.matrix('input') # n_examples x n_inputs
+        self._target = t.matrix('target') # n_examples x n_outputs
+        self._lambda = as_scalar(0.,'lambda')
+        self._theta = t.matrix('theta')
+        self._W = self._theta[:,1:]
+        self._b = self._theta[:,0]
+        self._XtX = t.matrix('XtX')
+        self._XtY = t.matrix('XtY')
+        self._extended_input = t.prepend_one_to_each_row(self._input)
+        self._output = t.dot(self._input,self._W.T) + self._b  # (n_examples , n_outputs) matrix
+        self._squared_error = t.sum_within_rows(t.sqr(self._output-self._target)) # (n_examples ) vector
+        self._regularizer = self._lambda * t.dot(self._W,self._W)
+        self._new_XtX = add_inplace(self._XtX,t.dot(self._extended_input.T,self._extended_input))
+        self._new_XtY = add_inplace(self._XtY,t.dot(self._extended_input.T,self._target))
+        self._new_theta = t.solve_inplace(self._theta,self._XtX,self._XtY)
+
+        OneShotTLearner.__init__(self)
+
+    def allocate(self,minibatch):
+        minibatch_n_inputs  = minibatch["input"].shape[1]
+        minibatch_n_outputs = minibatch["target"].shape[1]
+        if not self._n_inputs:
+            self._n_inputs = minibatch_n_inputs
+            self._n_outputs = minibatch_n_outputs
+            self.XtX = numpy.zeros((1+self._n_inputs,1+self._n_inputs))
+            self.XtY = numpy.zeros((1+self._n_inputs,self._n_outputs))
+            self.theta = numpy.zeros((self._n_outputs,1+self._n_inputs))
+            self.forget()
+        elif self._n_inputs!=minibatch_n_inputs or self._n_outputs!=minibatch_n_outputs:
+            # if the input or target changes dimension on the fly, we resize and forget everything
+            self.forget()
+
+    def forget(self):
+        if self._n_inputs and self._n_outputs:
+            self.XtX.resize((1+self.n_inputs,1+self.n_inputs))
+            self.XtY.resize((1+self.n_inputs,self.n_outputs))
+            self.XtX.data[:,:]=0
+            self.XtY.data[:,:]=0
+            numpy.diag(self.XtX.data)[1:]=self.lambda
+
+
+class MLP(MinibatchUpdatesTLearner):
+    """
+    Implement a feedforward multi-layer perceptron, with or without L1 and/or L2 regularization.
+
+    The predictor parameters are obtained by minibatch/online gradient descent.
+    Training can proceed sequentially (with multiple calls to update with
+    different disjoint subsets of the training sets).
+
+    Hyper-parameters:
+      - L1_regularizer
+      - L2_regularizer
+      - neuron_sparsity_regularizer
+      - initial_learning_rate
+      - learning_rate_decrease_rate
+      - n_hidden_per_layer (a list of integers)
+      - activation_function ("sigmoid","tanh", or "ratio")
+
+    The output/task type (classification, regression, etc.) is obtained by specializing MLP.
+
+    For each (input[t],output[t]) pair in a minibatch,::
+
+       activation[0] = input_t
+       for k=1 to n_hidden_layers:
+          activation[k]=activation_function(b[k]+ W[k]*activation[k-1])
+       output_t = output_activation_function(b[n_hidden_layers+1]+W[n_hidden_layers+1]*activation[n_hidden_layers])
+
+    and the b and W are obtained by minimizing the following by stochastic minibatch gradient descent::
+
+       L2_regularizer sum_{ijk} W_{kij}^2  + L1_regularizer sum_{kij} |W_{kij}|
+       + neuron_sparsity_regularizer sum_{ki} |b_{ki} + infinity|
+       - sum_t log P_{output_model}(target_t | output_t)
+
+    The fields and attributes expected and produced by use and update are the following:
+
+     - Input and output fields (example-wise quantities):
+
+       - 'input' (always expected by use and update)
+       - 'target' (optionally expected by use and always by update)
+       - 'output' (optionally produced by use)
+       - error fields produced by sub-class of MLP
+
+     - optional attributes (optionally expected as input_dataset attributes)
+       (warning, this may be dangerous, the 'use' method will use those provided in the
+       input_dataset rather than those learned during 'update'; currently no support
+       for providing these to update):
+
+       - 'L1_regularizer'
+       - 'L2_regularizer'
+       - 'b'
+       - 'W'
+       - 'parameters' = [b[1], W[1], b[2], W[2], ...]
+       - 'regularization_term'
+
+    """
+
+    def attributeNames(self):
+        return ["parameters","b","W","L1_regularizer","L2_regularizer","neuron_sparsity_regularizer","regularization_term"]
+
+    def useInputAttributes(self):
+        return ["b","W"]
+
+    def useOutputAttributes(self):
+        return []
+
+    def updateInputAttributes(self):
+        return ["b","W","L1_regularizer","L2_regularizer","neuron_sparsity_regularizer"]
+
+    def updateMinibatchInputFields(self):
+        return ["input","target"]
+
+    def updateMinibatchInputAttributes(self):
+        return ["b","W"]
+
+    def updateMinibatchOutputAttributes(self):
+        return ["new_XtX","new_XtY"]
+
+    def updateEndInputAttributes(self):
+        return ["theta","XtX","XtY"]
+
+    def updateEndOutputAttributes(self):
+        return ["new_theta","b","W","regularization_term"] # CHECK: WILL b AND W CONTAIN OLD OR NEW THETA? @todo i.e. order of computation = ?
+
+    def parameterAttributes(self):
+        return ["b","W"]
+
+    def defaultOutputFields(self, input_fields):
+        output_fields = ["output"]
+        if "target" in input_fields:
+            output_fields.append("squared_error")
+        return output_fields
+
+    def __init__(self):
+        self._input = t.matrix('input') # n_examples x n_inputs
+        self._target = t.matrix('target') # n_examples x n_outputs
+        self._lambda = as_scalar(0.,'lambda')
+        self._theta = t.matrix('theta')
+        self._W = self._theta[:,1:]
+        self._b = self._theta[:,0]
+        self._XtX = t.matrix('XtX')
+        self._XtY = t.matrix('XtY')
+        self._extended_input = t.prepend_one_to_each_row(self._input)
+        self._output = t.dot(self._input,self._W.T) + self._b  # (n_examples , n_outputs) matrix
+        self._squared_error = t.sum_within_rows(t.sqr(self._output-self._target)) # (n_examples ) vector
+        self._regularizer = self._lambda * t.dot(self._W,self._W)
+        self._new_XtX = add_inplace(self._XtX,t.dot(self._extended_input.T,self._extended_input))
+        self._new_XtY = add_inplace(self._XtY,t.dot(self._extended_input.T,self._target))
+        self._new_theta = t.solve_inplace(self._theta,self._XtX,self._XtY)
+
+        OneShotTLearner.__init__(self)
+
+    def allocate(self,minibatch):
+        minibatch_n_inputs  = minibatch["input"].shape[1]
+        minibatch_n_outputs = minibatch["target"].shape[1]
+        if not self._n_inputs:
+            self._n_inputs = minibatch_n_inputs
+            self._n_outputs = minibatch_n_outputs
+            self.XtX = numpy.zeros((1+self._n_inputs,1+self._n_inputs))
+            self.XtY = numpy.zeros((1+self._n_inputs,self._n_outputs))
+            self.theta = numpy.zeros((self._n_outputs,1+self._n_inputs))
+            self.forget()
+        elif self._n_inputs!=minibatch_n_inputs or self._n_outputs!=minibatch_n_outputs:
+            # if the input or target changes dimension on the fly, we resize and forget everything
+            self.forget()
+
+    def forget(self):
+        if self._n_inputs and self._n_outputs:
+            self.XtX.resize((1+self.n_inputs,1+self.n_inputs))
+            self.XtY.resize((1+self.n_inputs,self.n_outputs))
+            self.XtX.data[:,:]=0
+            self.XtY.data[:,:]=0
+            numpy.diag(self.XtX.data)[1:]=self.lambda
+
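The OneHiddenLayerNNetClassifier docstring fixes the forward pass, the nll/class_error fields, and the update rule parameters[i] -= learning_rate * dloss/dparameters[i], while the class bodies above are still copies of the LinearRegression computation graph (the "work in progress" of the commit message). As a plain-NumPy illustration of the intended math only, not of the pylearn/theano API, here is a hedged sketch of one stochastic minibatch gradient step on loss = L2_regularizer*(||W1||^2 + ||W2||^2) + sum_t nll_t; the helper name sgd_minibatch_step and the toy data are assumptions.

```python
import numpy as np

def sgd_minibatch_step(x, target, params, L2_regularizer=1e-4, learning_rate=0.01):
    """One minibatch gradient step on the criterion from the docstring above."""
    b1, W1, b2, W2 = params               # shapes: (n_hid,), (n_hid,n_in), (n_out,), (n_out,n_hid)
    h = np.tanh(x @ W1.T + b1)                              # hidden layer, (n_examples, n_hid)
    a = h @ W2.T + b2                                       # output activations
    a -= a.max(axis=1, keepdims=True)                       # for numerical stability
    p = np.exp(a) / np.exp(a).sum(axis=1, keepdims=True)    # softmax output
    n = x.shape[0]
    nll = -np.log(p[np.arange(n), target])                  # per-example negative log-likelihood
    loss = L2_regularizer * ((W1**2).sum() + (W2**2).sum()) + nll.sum()

    # backpropagation
    d_a = p.copy()
    d_a[np.arange(n), target] -= 1.0                        # dloss/d(output_activations)
    d_W2 = d_a.T @ h + 2 * L2_regularizer * W2
    d_b2 = d_a.sum(axis=0)
    d_h = d_a @ W2
    d_z1 = d_h * (1.0 - h**2)                               # tanh derivative
    d_W1 = d_z1.T @ x + 2 * L2_regularizer * W1
    d_b1 = d_z1.sum(axis=0)

    # parameters[i] -= learning_rate * dloss/dparameters[i]
    for param, grad in zip(params, (d_b1, d_W1, d_b2, d_W2)):
        param -= learning_rate * grad
    return loss

rng = np.random.default_rng(0)
n_in, n_hidden, n_classes = 4, 8, 3
params = [np.zeros(n_hidden), 0.1 * rng.normal(size=(n_hidden, n_in)),
          np.zeros(n_classes), 0.1 * rng.normal(size=(n_classes, n_hidden))]
x = rng.normal(size=(16, n_in))
target = rng.integers(0, n_classes, size=16)
for _ in range(5):
    print(sgd_minibatch_step(x, target, params))   # the printed loss should trend downward
```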