diff mlp.py @ 111:88257dfedf8c

Added another work in progress, for MLPs
author bengioy@bengiomac.local
date Wed, 07 May 2008 09:16:04 -0400
parents
children d0a1bd0378c6
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mlp.py	Wed May 07 09:16:04 2008 -0400
@@ -0,0 +1,276 @@
+
+from learner import *
+from theano import tensor as t
+from theano.scalar import as_scalar
+import numpy
+
+# This is one of the simplest examples of a learner, and illustrates
+# the use of theano.
+
+
+class OneHiddenLayerNNetClassifier(MinibatchUpdatesTLearner):
+    """
+    Implements a straightforward classical feedforward
+    one-hidden-layer neural net, with L2 regularization.
+
+    The predictor parameters are obtained by minibatch/online gradient descent.
+    Training can proceed sequentially (with multiple calls to update with
+    different disjoint subsets of the training set).
+
+    Hyper-parameters:
+      - L2_regularizer
+      - learning_rate
+      - n_hidden
+
+    For each (input_t, target_t) pair in a minibatch::
+
+       output_activations_t = b2+W2*tanh(b1+W1*input_t)
+       output_t = softmax(output_activations_t)
+       output_class_t = argmax(output_activations_t)
+       class_error_t = 1_{output_class_t != target_t}
+       nll_t = -log(output_t[target_t])
+
+    and the training criterion is::
+
+       loss = L2_regularizer*(||W1||^2 + ||W2||^2) + sum_t nll_t
+
+    The parameters are [b1,W1,b2,W2] and are obtained by minimizing the loss by
+    stochastic minibatch gradient descent::
+
+       parameters[i] -= learning_rate * dloss/dparameters[i]
+       
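+    As an illustration only (this is not the class's actual implementation), a
+    minimal numpy sketch of a single stochastic gradient step on one example,
+    with hypothetical shapes (W1: n_hidden x n_inputs, W2: n_outputs x n_hidden,
+    target_t an integer class index) and the L2 penalty applied at every step
+    for simplicity, could look like::
+
+       import numpy
+
+       def sgd_step(input_t, target_t, b1, W1, b2, W2, learning_rate, L2_regularizer):
+           h = numpy.tanh(b1 + numpy.dot(W1, input_t))       # hidden activations
+           a = b2 + numpy.dot(W2, h)                         # output_activations_t
+           output = numpy.exp(a - a.max())
+           output /= output.sum()                            # softmax
+           d_a = output.copy()
+           d_a[target_t] -= 1.0                              # d nll_t / d a
+           d_h = numpy.dot(W2.T, d_a) * (1 - h**2)           # backprop through tanh
+           W2 -= learning_rate * (numpy.outer(d_a, h) + 2*L2_regularizer*W2)
+           b2 -= learning_rate * d_a
+           W1 -= learning_rate * (numpy.outer(d_h, input_t) + 2*L2_regularizer*W1)
+           b1 -= learning_rate * d_h
+           return -numpy.log(output[target_t])               # nll_t
+       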
+    The fields and attributes expected and produced by use and update are the following:
+
+     - Input and output fields (example-wise quantities):
+
+       - 'input' (always expected by use and update)
+       - 'target' (optionally expected by use and always by update)
+       - 'output' (optionally produced by use)
+       - 'output_class' (optionally produced by use)
+       - 'class_error' (optionally produced by use)
+       - 'nll' (optionally produced by use)
+       
+     - optional attributes (optionally expected as input_dataset attributes)
+       (warning: this may be dangerous, since the 'use' method will use the values
+       provided in the input_dataset rather than those learned during 'update';
+       there is currently no support for providing these to 'update'):
+       
+       - 'L2_regularizer'
+       - 'b1' 
+       - 'W1'
+       - 'b2' 
+       - 'W2'
+       - 'parameters' = [b1, W1, b2, W2]
+       - 'regularization_term'
+
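+    A hypothetical usage sketch (the exact dataset interface is an assumption
+    defined by the learner/dataset framework, not by this class; ``training_set``
+    and ``test_set`` stand for any datasets exposing the fields above)::
+
+       learner = OneHiddenLayerNNetClassifier()   # hyper-parameters assumed set elsewhere
+       learner.update(training_set)               # training_set provides 'input' and 'target'
+       predictions = learner.use(test_set)        # produces 'output', 'output_class', ...
+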
+    """
+
+    def attributeNames(self):
+        return ["parameters","b1","W2","b2","W2", "L2_regularizer","regularization_term"]
+
+    def parameterAttributes(self):
+        return ["b1","W1", "b2", "W2"]
+    
+    def useInputAttributes(self):
+        return self.parameterAttributes()
+
+    def useOutputAttributes(self):
+        return []
+
+    def updateInputAttributes(self):
+        return self.parameterAttributes() + ["L2_regularizer"]
+
+    def updateMinibatchInputFields(self):
+        return ["input","target"]
+    
+    def updateMinibatchInputAttributes(self):
+        return self.parameterAttributes()
+    
+    def updateMinibatchOutputAttributes(self):
+        return self.parameterAttributes()
+    
+    def updateEndInputAttributes(self):
+        return self.parameterAttributes()
+
+    def updateEndOutputAttributes(self):
+        return ["regularization_term"]
+
+    def defaultOutputFields(self, input_fields):
+        output_fields = ["output", "output_class",]
+        if "target" in input_fields:
+            output_fields += ["class_error", "nll"]
+        return output_fields
+        
+    def __init__(self):
+        self._n_inputs = None  # set on the first call to allocate()
+        self._n_outputs = None
+        self._input = t.matrix('input') # n_examples x n_inputs
+        self._target = t.matrix('target') # n_examples x n_outputs
+        self._lambda = as_scalar(0.,'lambda')
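+        # The lines below build a linear-regression-style graph (theta = [b|W],
+        # accumulated X'X and X'Y statistics, and a ridge solve); they do not yet
+        # implement the one-hidden-layer network described in the docstring.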
+        self._theta = t.matrix('theta')
+        self._W = self._theta[:,1:] 
+        self._b = self._theta[:,0]
+        self._XtX = t.matrix('XtX')
+        self._XtY = t.matrix('XtY')
+        self._extended_input = t.prepend_one_to_each_row(self._input)
+        self._output = t.dot(self._input,self._W.T) + self._b  # (n_examples , n_outputs) matrix
+        self._squared_error = t.sum_within_rows(t.sqr(self._output-self._target)) # (n_examples ) vector
+        self._regularizer = self._lambda * t.sum(t.sqr(self._W)) # L2 penalty: lambda * ||W||^2
+        self._new_XtX = add_inplace(self._XtX,t.dot(self._extended_input.T,self._extended_input))
+        self._new_XtY = add_inplace(self._XtY,t.dot(self._extended_input.T,self._target))
+        self._new_theta = t.solve_inplace(self._theta,self._XtX,self._XtY)
+
+        MinibatchUpdatesTLearner.__init__(self)
+            
+    def allocate(self,minibatch):
+        minibatch_n_inputs  = minibatch["input"].shape[1]
+        minibatch_n_outputs = minibatch["target"].shape[1]
+        if not self._n_inputs:
+            self._n_inputs = minibatch_n_inputs 
+            self._n_outputs = minibatch_n_outputs
+            self.XtX = numpy.zeros((1+self._n_inputs,1+self._n_inputs))
+            self.XtY = numpy.zeros((1+self._n_inputs,self._n_outputs))
+            self.theta = numpy.zeros((self._n_outputs,1+self._n_inputs))
+            self.forget()
+        elif self._n_inputs!=minibatch_n_inputs or self._n_outputs!=minibatch_n_outputs:
+            # if the input or target changes dimension on the fly, we resize and forget everything
+            self._n_inputs = minibatch_n_inputs
+            self._n_outputs = minibatch_n_outputs
+            self.forget()
+            
+    def forget(self):
+        if self._n_inputs and self._n_outputs:
+            # reset the accumulated statistics; the regularization coefficient goes on
+            # the diagonal of XtX (excluding the bias entry), assuming the numeric
+            # 'L2_regularizer' attribute has been set by the framework
+            self.XtX.resize((1+self._n_inputs,1+self._n_inputs))
+            self.XtY.resize((1+self._n_inputs,self._n_outputs))
+            self.XtX[:,:]=0
+            self.XtY[:,:]=0
+            d = numpy.arange(1,1+self._n_inputs)
+            self.XtX[d,d]=self.L2_regularizer
+
+
+class MLP(MinibatchUpdatesTLearner):
+    """
+    Implements a feedforward multi-layer perceptron, with or without L1 and/or L2 regularization.
+
+    The predictor parameters are obtained by minibatch/online gradient descent.
+    Training can proceed sequentially (with multiple calls to update with
+    different disjoint subsets of the training set).
+
+    Hyper-parameters:
+      - L1_regularizer
+      - L2_regularizer
+      - neuron_sparsity_regularizer
+      - initial_learning_rate
+      - learning_rate_decrease_rate
+      - n_hidden_per_layer (a list of integers)
+      - activation_function ("sigmoid","tanh", or "ratio")
+
+    The output/task type (classification, regression, etc.) is obtained by specializing MLP.
+
+    For each (input[t], target[t]) pair in a minibatch::
+
+       activation[0] = input[t]
+       for k=1 to n_hidden_layers:
+          activation[k]=activation_function(b[k]+ W[k]*activation[k-1])
+       output[t] = output_activation_function(b[n_hidden_layers+1]+W[n_hidden_layers+1]*activation[n_hidden_layers])
+
+    and the b[k] and W[k] are obtained by minimizing the following by stochastic minibatch gradient descent::
+
+       L2_regularizer sum_{kij} W_{kij}^2  + L1_regularizer sum_{kij} |W_{kij}|
+       + neuron_sparsity_regularizer sum_{ki} |b_{ki} + infinity|
+       - sum_t log P_{output_model}(target[t] | output[t])
+
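+    As an illustration only (not this class's implementation), a minimal numpy
+    sketch of the forward pass, with the parameters given as a hypothetical list
+    of (b, W) pairs ordered from the first hidden layer to the output layer,
+    could look like::
+
+       import numpy
+
+       def forward(input_t, layers, activation_function=numpy.tanh,
+                   output_activation_function=lambda a: a):   # identity output by default
+           activation = input_t
+           for b, W in layers[:-1]:                           # hidden layers
+               activation = activation_function(b + numpy.dot(W, activation))
+           b, W = layers[-1]                                  # output layer
+           return output_activation_function(b + numpy.dot(W, activation))
+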
+    The fields and attributes expected and produced by use and update are the following:
+
+     - Input and output fields (example-wise quantities):
+
+       - 'input' (always expected by use and update)
+       - 'target' (optionally expected by use and always by update)
+       - 'output' (optionally produced by use)
+       - error fields produced by sub-class of MLP
+
+     - optional attributes (optionally expected as input_dataset attributes)
+       (warning: this may be dangerous, since the 'use' method will use the values
+       provided in the input_dataset rather than those learned during 'update';
+       there is currently no support for providing these to 'update'):
+       
+       - 'L1_regularizer'
+       - 'L2_regularizer'
+       - 'b' 
+       - 'W'
+       - 'parameters' = [b[1], W[1], b[2], W[2], ...] 
+       - 'regularization_term'
+
+    """
+
+    def attributeNames(self):
+        return ["parameters","b","W","L1_regularizer","L2_regularizer","neuron_sparsity_regularizer","regularization_term"]
+
+    def useInputAttributes(self):
+        return ["b","W"]
+
+    def useOutputAttributes(self):
+        return []
+
+    def updateInputAttributes(self):
+        return ["b","W","L1_regularizer","L2_regularizer","neuron_sparsity_regularizer"]
+
+    def updateMinibatchInputFields(self):
+        return ["input","target"]
+    
+    def updateMinibatchInputAttributes(self):
+        return ["b","W"]
+    
+    def updateMinibatchOutputAttributes(self):
+        return ["new_XtX","new_XtY"]
+    
+    def updateEndInputAttributes(self):
+        return ["theta","XtX","XtY"]
+
+    def updateEndOutputAttributes(self):
+        return ["new_theta","b","W","regularization_term"] # CHECK: WILL b AND W CONTAIN OLD OR NEW THETA? @todo i.e. order of computation = ?
+
+    def parameterAttributes(self):
+        return ["b","W"]
+    
+    def defaultOutputFields(self, input_fields):
+        output_fields = ["output"]
+        if "target" in input_fields:
+            output_fields.append("squared_error")
+        return output_fields
+        
+    def __init__(self):
+        self._n_inputs = None  # set on the first call to allocate()
+        self._n_outputs = None
+        self._input = t.matrix('input') # n_examples x n_inputs
+        self._target = t.matrix('target') # n_examples x n_outputs
+        self._lambda = as_scalar(0.,'lambda')
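+        # As in OneHiddenLayerNNetClassifier above, the graph below is a
+        # linear-regression-style placeholder, not yet the documented MLP.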
+        self._theta = t.matrix('theta')
+        self._W = self._theta[:,1:] 
+        self._b = self._theta[:,0]
+        self._XtX = t.matrix('XtX')
+        self._XtY = t.matrix('XtY')
+        self._extended_input = t.prepend_one_to_each_row(self._input)
+        self._output = t.dot(self._input,self._W.T) + self._b  # (n_examples , n_outputs) matrix
+        self._squared_error = t.sum_within_rows(t.sqr(self._output-self._target)) # (n_examples ) vector
+        self._regularizer = self._lambda * t.sum(t.sqr(self._W)) # L2 penalty: lambda * ||W||^2
+        self._new_XtX = add_inplace(self._XtX,t.dot(self._extended_input.T,self._extended_input))
+        self._new_XtY = add_inplace(self._XtY,t.dot(self._extended_input.T,self._target))
+        self._new_theta = t.solve_inplace(self._theta,self._XtX,self._XtY)
+
+        MinibatchUpdatesTLearner.__init__(self)
+            
+    def allocate(self,minibatch):
+        minibatch_n_inputs  = minibatch["input"].shape[1]
+        minibatch_n_outputs = minibatch["target"].shape[1]
+        if not self._n_inputs:
+            self._n_inputs = minibatch_n_inputs 
+            self._n_outputs = minibatch_n_outputs
+            self.XtX = numpy.zeros((1+self._n_inputs,1+self._n_inputs))
+            self.XtY = numpy.zeros((1+self._n_inputs,self._n_outputs))
+            self.theta = numpy.zeros((self._n_outputs,1+self._n_inputs))
+            self.forget()
+        elif self._n_inputs!=minibatch_n_inputs or self._n_outputs!=minibatch_n_outputs:
+            # if the input or target changes dimension on the fly, we resize and forget everything
+            self._n_inputs = minibatch_n_inputs
+            self._n_outputs = minibatch_n_outputs
+            self.forget()
+            
+    def forget(self):
+        if self._n_inputs and self._n_outputs:
+            # reset the accumulated statistics; the regularization coefficient goes on
+            # the diagonal of XtX (excluding the bias entry), assuming the numeric
+            # 'L2_regularizer' attribute has been set by the framework
+            self.XtX.resize((1+self._n_inputs,1+self._n_inputs))
+            self.XtY.resize((1+self._n_inputs,self._n_outputs))
+            self.XtX[:,:]=0
+            self.XtY[:,:]=0
+            d = numpy.arange(1,1+self._n_inputs)
+            self.XtX[d,d]=self.L2_regularizer
+