changeset 111:88257dfedf8c

Added another work in progress, for mlp's
author bengioy@bengiomac.local
date Wed, 07 May 2008 09:16:04 -0400
parents 8fa1ef2411a0
children b6bc1e769b36 d0a1bd0378c6
files learner.py linear_regression.py mlp.py
diffstat 3 files changed, 323 insertions(+), 24 deletions(-)
--- a/learner.py	Tue May 06 22:24:55 2008 -0400
+++ b/learner.py	Wed May 07 09:16:04 2008 -0400
@@ -1,5 +1,6 @@
 
 from dataset import *
+from compile import Function
     
 class Learner(AttributesHolder):
     """Base class for learning algorithms, provides an interface
@@ -84,8 +85,10 @@
         """
         A subset of self.attributeNames() which are the names of attributes modified/created by update() in order
         to do its work.
+
+        By default, these are inferred from the various update output attributes.
         """
-        raise AbstractFunction()
+        return ["parameters"] + self.updateMinibatchOutputAttributes() + self.updateEndOutputAttributes()
 
     def useOutputAttributes(self):
         """
@@ -251,7 +254,7 @@
         return output_dataset
 
 
-class OneShotTLearner(TLearner):
+class MinibatchUpdatesTLearner(TLearner):
     """
     This adds to TLearner a 
       - updateStart(), updateEnd(), updateMinibatch(minibatch), isLastEpoch():
@@ -262,6 +265,10 @@
                           going only once through the training data. For more complicated
                           models, more specialized subclasses of TLearner should be used
                           or a learning-algorithm specific update method should be defined.
+
+      - a 'parameters' attribute which is a list of parameters (whose names are
+      specified by the user's subclass with the parameterAttributes() method)
+      
     """
 
     def __init__(self):
@@ -288,12 +295,16 @@
     def updateEndOutputAttributes(self):
         raise AbstractFunction()
 
-    def updateStart(self): pass
+    def parameterAttributes(self):
+        raise AbstractFunction()
+
+    def updateStart(self): pass
 
     def updateEnd(self):
         self.setAttributes(self.updateEndOutputAttributes(),
                            self.update_end_function
                            (self.names2attributes(self.updateEndInputAttributes())))
+        self.parameters = self.names2attributes(self.parameterAttributes())
         
     def updateMinibatch(self,minibatch):
         # make sure all required fields are allocated and initialized
@@ -331,3 +342,22 @@
         self.updateEnd()
         return self.use
 
+class OnlineGradientBasedTLearner(MinibatchUpdatesTLearner):
+    """
+    Specialization of MinibatchUpdatesTLearner in which the minibatch updates
+    are obtained by performing an online (minibatch-based) gradient step.
+
+    Sub-classes must define the following methods:
+    
+    """
+    def __init__(self,truly_online=False):
+        """
+        If truly_online then only one pass is made through the training set passed to update().
+        
+        """
+        self.truly_online=truly_online
+
+    def isLastEpoch(self):
+        return self.truly_online
+
+
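As an illustration of the updateStart()/updateMinibatch()/updateEnd()/isLastEpoch() protocol that MinibatchUpdatesTLearner introduces above, here is a minimal sketch of the driving loop; only the hook names come from the class itself, while the helper name and the dataset's minibatches() iterator are assumptions made for illustration.

    # Hedged sketch of the call order described in the MinibatchUpdatesTLearner
    # docstring; drive_update_sketch and the minibatches() iterator are illustrative.
    def drive_update_sketch(learner, training_set, minibatch_size=32):
        learner.updateStart()
        while True:
            for minibatch in training_set.minibatches(minibatch_size):
                learner.updateMinibatch(minibatch)
            if learner.isLastEpoch():   # True after a single pass for truly-online learners
                break
        learner.updateEnd()             # e.g. solve normal equations, set self.parameters
        return learner.use

OnlineGradientBasedTLearner makes isLastEpoch() return truly_online, so a truly online learner performs exactly one such pass over the data given to update().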
--- a/linear_regression.py	Tue May 06 22:24:55 2008 -0400
+++ b/linear_regression.py	Wed May 07 09:16:04 2008 -0400
@@ -1,12 +1,11 @@
 
 from learner import *
 from theano import tensor as t
-from compile import Function
 from theano.scalar import as_scalar
 
 # this is one of the simplest example of learner, and illustrates
 # the use of theano 
-class LinearRegression(OneShotTLearner):
+class LinearRegression(MinibatchUpdatesTLearner):
     """
     Implement linear regression, with or without L2 regularization
     (the former is called Ridge Regression and the latter Ordinary Least Squares).
@@ -18,14 +17,13 @@
     of all the training sets passed to update since construction or since
     the last call to forget).
 
-    The L2 regularization coefficient is obtained analytically.
     For each (input[t],output[t]) pair in a minibatch,::
     
        output_t = b + W * input_t
 
     where b and W are obtained by minimizing::
 
-       lambda sum_{ij} W_{ij}^2  + sum_t ||output_t - target_t||^2
+       L2_regularizer sum_{ij} W_{ij}^2  + sum_t ||output_t - target_t||^2
 
     Let X be the whole training set inputs matrix (one input example per row),
     with the first column full of 1's, and Let Y the whole training set
@@ -36,7 +34,7 @@
        XtX * theta[:,i] = XtY[:,i]
 
     where XtX is a (n_inputs+1)x(n_inputs+1) matrix containing X'*X
-    plus lambda on the diagonal except at (0,0),
+    plus L2_regularizer on the diagonal except at (0,0),
     and XtY is a (n_inputs+1)*n_outputs matrix containing X'*Y.
 
     The fields and attributes expected and produced by use and update are the following:
@@ -53,10 +51,10 @@
        input_dataset rather than those learned during 'update'; currently no support
        for providing these to update):
        
-       - 'lambda' 
+       - 'L2_regularizer' 
        - 'b' 
        - 'W'
-       - 'parameters' = (b, W) tuple
+       - 'parameters' = [b, W] 
        - 'regularization_term'
        - 'XtX'
        - 'XtY'
@@ -64,7 +62,7 @@
     """
 
     def attributeNames(self):
-        return ["lambda","parameters","b","W","regularization_term","XtX","XtY"]
+        return ["L2_regularizer","parameters","b","W","regularization_term","XtX","XtY"]
 
     def useInputAttributes(self):
         return ["b","W"]
@@ -73,10 +71,7 @@
         return []
 
     def updateInputAttributes(self):
-        return ["lambda","XtX","XtY"]
-
-    def updateOutputAttributes(self):
-        return ["parameters"] + self.updateMinibatchOutputAttributes() + self.updateEndOutputAttributes()
+        return ["L2_regularizer","XtX","XtY"]
 
     def updateMinibatchInputFields(self):
         return ["input","target"]
@@ -93,6 +88,9 @@
     def updateEndOutputAttributes(self):
         return ["new_theta","b","W","regularization_term"] # CHECK: WILL b AND W CONTAIN OLD OR NEW THETA? @todo i.e. order of computation = ?
 
+    def parameterAttributes(self):
+        return ["b","W"]
+    
     def defaultOutputFields(self, input_fields):
         output_fields = ["output"]
         if "target" in input_fields:
@@ -102,7 +100,7 @@
     def __init__(self):
         self._input = t.matrix('input') # n_examples x n_inputs
         self._target = t.matrix('target') # n_examples x n_outputs
-        self._lambda = as_scalar(0.,'lambda')
+        self._L2_regularizer = as_scalar(0.,'L2_regularizer')
         self._theta = t.matrix('theta')
         self._W = self._theta[:,1:] 
         self._b = self._theta[:,0]
@@ -111,13 +109,12 @@
         self._extended_input = t.prepend_one_to_each_row(self._input)
         self._output = t.dot(self._input,self._W.T) + self._b  # (n_examples , n_outputs) matrix
         self._squared_error = t.sum_within_rows(t.sqr(self._output-self._target)) # (n_examples ) vector
-        self._regularizer = self._lambda * t.dot(self._W,self._W)
+        self._regularizer = self._L2_regularizer * t.dot(self._W,self._W)
         self._new_XtX = add_inplace(self._XtX,t.dot(self._extended_input.T,self._extended_input))
         self._new_XtY = add_inplace(self._XtY,t.dot(self._extended_input.T,self._target))
         self._new_theta = t.solve_inplace(self._theta,self._XtX,self._XtY)
 
-        OneShotTLearner.__init__(self)
+        MinibatchUpdatesTLearner.__init__(self)
-        self.allocate()
             
     def allocate(self,minibatch):
         minibatch_n_inputs  = minibatch["input"].shape[1]
@@ -130,7 +127,7 @@
             self.theta = numpy.zeros((self._n_outputs,1+self._n_inputs))
             self.forget()
         elif self._n_inputs!=minibatch_n_inputs or self._n_outputs!=minibatch_n_outputs:
-            # if the input or target changes dimension on the fly, we forget everything
+            # if the input or target changes dimension on the fly, we resize and forget everything
             self.forget()
             
     def forget(self):
@@ -139,9 +136,5 @@
             self.XtY.resize((1+self.n_inputs,self.n_outputs))
             self.XtX.data[:,:]=0
             self.XtY.data[:,:]=0
-            numpy.diag(self.XtX.data)[1:]=self.lambda
+            numpy.diag(self.XtX.data)[1:]=self.L2_regularizer
 
-    def updateEnd(self):
-        TLearner.updateEnd(self)
-        self.parameters = (self.W,self.b)
-
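For reference, the normal equations described in the LinearRegression docstring can be written directly in numpy. This is a stand-alone sketch of the same computation (sufficient statistics XtX and XtY, with L2_regularizer added on the diagonal except at (0,0)); the function name and the (1+n_inputs, n_outputs) layout of theta are assumptions for illustration, not the class's own attributes.

    import numpy

    def ridge_solve_sketch(inputs, targets, L2_regularizer):
        # inputs: (n_examples, n_inputs), targets: (n_examples, n_outputs)
        n_examples, n_inputs = inputs.shape
        # X is the input matrix with a first column of 1's, as in the docstring,
        # so the first row of theta is the bias b and the rest gives W
        X = numpy.hstack([numpy.ones((n_examples, 1)), inputs])
        XtX = numpy.dot(X.T, X)                  # (1+n_inputs, 1+n_inputs)
        XtY = numpy.dot(X.T, targets)            # (1+n_inputs, n_outputs)
        # add L2_regularizer on the diagonal except at (0,0): the bias is not penalized
        idx = numpy.arange(1, n_inputs + 1)
        XtX[idx, idx] += L2_regularizer
        theta = numpy.linalg.solve(XtX, XtY)     # solves XtX * theta[:,i] = XtY[:,i]
        b, W = theta[0, :], theta[1:, :].T       # b: (n_outputs,), W: (n_outputs, n_inputs)
        return b, W

In the class itself the same statistics are accumulated incrementally across minibatches (add_inplace on XtX and XtY) and the system is solved once in updateEnd().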
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mlp.py	Wed May 07 09:16:04 2008 -0400
@@ -0,0 +1,276 @@
+
+from learner import *
+from theano import tensor as t
+from theano.scalar import as_scalar
+
+# this is one of the simplest examples of a learner, and illustrates
+# the use of theano
+
+
+class OneHiddenLayerNNetClassifier(MinibatchUpdatesTLearner):
+    """
+    Implement a straightforward classicial feedforward
+    one-hidden-layer neural net, with L2 regularization.
+
+    The predictor parameters are obtained by minibatch/online gradient descent.
+    Training can proceed sequentially (with multiple calls to update with
+    different disjoint subsets of the training sets).
+
+    Hyper-parameters:
+      - L2_regularizer
+      - learning_rate
+      - n_hidden
+
+    For each (input_t,output_t) pair in a minibatch,::
+
+       output_activations_t = b2+W2*tanh(b1+W1*input_t)
+       output_t = softmax(output_activations_t)
+       output_class_t = argmax(output_activations_t)
+       class_error_t = 1_{output_class_t != target_t}
+       nll_t = -log(output_t[target_t])
+
+    and the training criterion is::
+
+       loss = L2_regularizer*(||W1||^2 + ||W2||^2) + sum_t nll_t
+
+    The parameters are [b1,W1,b2,W2] and are obtained by minimizing the loss by
+    stochastic minibatch gradient descent::
+
+       parameters[i] -= learning_rate * dloss/dparameters[i]
+       
+    The fields and attributes expected and produced by use and update are the following:
+
+     - Input and output fields (example-wise quantities):
+
+       - 'input' (always expected by use and update)
+       - 'target' (optionally expected by use and always by update)
+       - 'output' (optionally produced by use)
+       - 'output_class' (optionally produced by use)
+       - 'class_error' (optionally produced by use)
+       - 'nll' (optionally produced by use)
+       
+     - optional attributes (optionally expected as input_dataset attributes)
+       (warning, this may be dangerous, the 'use' method will use those provided in the 
+       input_dataset rather than those learned during 'update'; currently no support
+       for providing these to update):
+       
+       - 'L2_regularizer'
+       - 'b1' 
+       - 'W1'
+       - 'b2' 
+       - 'W2'
+       - 'parameters' = [b1, W1, b2, W2]
+       - 'regularization_term'
+
+    """
+
+    def attributeNames(self):
+        return ["parameters","b1","W1","b2","W2","L2_regularizer","regularization_term"]
+
+    def parameterAttributes(self):
+        return ["b1","W1", "b2", "W2"]
+    
+    def useInputAttributes(self):
+        return self.parameterAttributes()
+
+    def useOutputAttributes(self):
+        return []
+
+    def updateInputAttributes(self):
+        return self.parameterAttributes() + ["L2_regularizer"]
+
+    def updateMinibatchInputFields(self):
+        return ["input","target"]
+    
+    def updateMinibatchInputAttributes(self):
+        return self.parameterAttributes()
+    
+    def updateMinibatchOutputAttributes(self):
+        return self.parameterAttributes()
+    
+    def updateEndInputAttributes(self):
+        return self.parameterAttributes()
+
+    def updateEndOutputAttributes(self):
+        return ["regularization_term"]
+
+    def defaultOutputFields(self, input_fields):
+        output_fields = ["output", "output_class",]
+        if "target" in input_fields:
+            output_fields += ["class_error", "nll"]
+        return output_fields
+        
+    def __init__(self):
+        self._input = t.matrix('input') # n_examples x n_inputs
+        self._target = t.matrix('target') # n_examples x n_outputs
+        self._lambda = as_scalar(0.,'lambda')
+        self._theta = t.matrix('theta')
+        self._W = self._theta[:,1:] 
+        self._b = self._theta[:,0]
+        self._XtX = t.matrix('XtX')
+        self._XtY = t.matrix('XtY')
+        self._extended_input = t.prepend_one_to_each_row(self._input)
+        self._output = t.dot(self._input,self._W.T) + self._b  # (n_examples , n_outputs) matrix
+        self._squared_error = t.sum_within_rows(t.sqr(self._output-self._target)) # (n_examples ) vector
+        self._regularizer = self._lambda * t.dot(self._W,self._W)
+        self._new_XtX = add_inplace(self._XtX,t.dot(self._extended_input.T,self._extended_input))
+        self._new_XtY = add_inplace(self._XtY,t.dot(self._extended_input.T,self._target))
+        self._new_theta = t.solve_inplace(self._theta,self._XtX,self._XtY)
+
+        MinibatchUpdatesTLearner.__init__(self)
+            
+    def allocate(self,minibatch):
+        minibatch_n_inputs  = minibatch["input"].shape[1]
+        minibatch_n_outputs = minibatch["target"].shape[1]
+        if not self._n_inputs:
+            self._n_inputs = minibatch_n_inputs 
+            self._n_outputs = minibatch_n_outputs
+            self.XtX = numpy.zeros((1+self._n_inputs,1+self._n_inputs))
+            self.XtY = numpy.zeros((1+self._n_inputs,self._n_outputs))
+            self.theta = numpy.zeros((self._n_outputs,1+self._n_inputs))
+            self.forget()
+        elif self._n_inputs!=minibatch_n_inputs or self._n_outputs!=minibatch_n_outputs:
+            # if the input or target changes dimension on the fly, we resize and forget everything
+            self.forget()
+            
+    def forget(self):
+        if self._n_inputs and self._n_outputs:
+            self.XtX.resize((1+self.n_inputs,1+self.n_inputs))
+            self.XtY.resize((1+self.n_inputs,self.n_outputs))
+            self.XtX.data[:,:]=0
+            self.XtY.data[:,:]=0
+            numpy.diag(self.XtX.data)[1:]=self.L2_regularizer
+
+
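The equations in the OneHiddenLayerNNetClassifier docstring (tanh hidden layer, softmax output, negative log-likelihood plus L2 penalty, gradient step on [b1,W1,b2,W2]) correspond to the following stand-alone numpy sketch of one minibatch update. The class above is still a work in progress and is meant to build this computation with theano; the function name, the integer-coded targets, and the in-place parameter update below are assumptions made for illustration.

    import numpy

    def nnet_sgd_step_sketch(params, inputs, targets, L2_regularizer, learning_rate):
        # params = [b1, W1, b2, W2]; inputs: (n_examples, n_inputs);
        # targets: integer class indices of shape (n_examples,)
        b1, W1, b2, W2 = params
        h = numpy.tanh(numpy.dot(inputs, W1.T) + b1)          # (n_examples, n_hidden)
        out_act = numpy.dot(h, W2.T) + b2                     # (n_examples, n_classes)
        # softmax with max-subtraction for numerical stability
        e = numpy.exp(out_act - out_act.max(axis=1, keepdims=True))
        p = e / e.sum(axis=1, keepdims=True)
        n = inputs.shape[0]
        nll = -numpy.log(p[numpy.arange(n), targets]).sum()
        loss = L2_regularizer * ((W1 ** 2).sum() + (W2 ** 2).sum()) + nll
        # backpropagation of the loss above
        d_out = p.copy()
        d_out[numpy.arange(n), targets] -= 1.0                # dloss/d(out_act)
        dW2 = numpy.dot(d_out.T, h) + 2 * L2_regularizer * W2
        db2 = d_out.sum(axis=0)
        d_h = numpy.dot(d_out, W2) * (1 - h ** 2)             # through tanh
        dW1 = numpy.dot(d_h.T, inputs) + 2 * L2_regularizer * W1
        db1 = d_h.sum(axis=0)
        # parameters[i] -= learning_rate * dloss/dparameters[i]
        for param, grad in zip([b1, W1, b2, W2], [db1, dW1, db2, dW2]):
            param -= learning_rate * grad
        return loss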
+class MLP(MinibatchUpdatesTLearner):
+    """
+    Implement a feedforward multi-layer perceptron, with or without L1 and/or L2 regularization.
+
+    The predictor parameters are obtained by minibatch/online gradient descent.
+    Training can proceed sequentially (with multiple calls to update with
+    different disjoint subsets of the training sets).
+
+    Hyper-parameters:
+      - L1_regularizer
+      - L2_regularizer
+      - neuron_sparsity_regularizer
+      - initial_learning_rate
+      - learning_rate_decrease_rate
+      - n_hidden_per_layer (a list of integers)
+      - activation_function ("sigmoid","tanh", or "ratio")
+
+    The output/task type (classification, regression, etc.) is obtained by specializing MLP.
+
+    For each (input[t],output[t]) pair in a minibatch,::
+
+       activation[0] = input_t
+       for k=1 to n_hidden_layers:
+          activation[k]=activation_function(b[k]+ W[k]*activation[k-1])
+       output_t = output_activation_function(b[n_hidden_layers+1]+W[n_hidden_layers+1]*activation[n_hidden_layers])
+
+    and the b and W are obtained by minimizing the following by stochastic minibatch gradient descent::
+
+       L2_regularizer sum_{ijk} W_{kij}^2  + L1_regularizer sum_{kij} |W_{kij}|
+       + neuron_sparsity_regularizer sum_{ki} |b_{ki} + infinity|
+       - sum_t log P_{output_model}(target_t | output_t)
+
+    The fields and attributes expected and produced by use and update are the following:
+
+     - Input and output fields (example-wise quantities):
+
+       - 'input' (always expected by use and update)
+       - 'target' (optionally expected by use and always by update)
+       - 'output' (optionally produced by use)
+       - error fields produced by sub-class of MLP
+
+     - optional attributes (optionally expected as input_dataset attributes)
+       (warning, this may be dangerous, the 'use' method will use those provided in the 
+       input_dataset rather than those learned during 'update'; currently no support
+       for providing these to update):
+       
+       - 'L1_regularizer'
+       - 'L2_regularizer'
+       - 'b' 
+       - 'W'
+       - 'parameters' = [b[1], W[1], b[2], W[2], ...] 
+       - 'regularization_term'
+
+    """
+
+    def attributeNames(self):
+        return ["parameters","b","W","L1_regularizer","L2_regularizer","neuron_sparsity_regularizer","regularization_term"]
+
+    def useInputAttributes(self):
+        return ["b","W"]
+
+    def useOutputAttributes(self):
+        return []
+
+    def updateInputAttributes(self):
+        return ["b","W","L1_regularizer","L2_regularizer","neuron_sparsity_regularizer"]
+
+    def updateMinibatchInputFields(self):
+        return ["input","target"]
+    
+    def updateMinibatchInputAttributes(self):
+        return ["b","W"]
+    
+    def updateMinibatchOutputAttributes(self):
+        return ["new_XtX","new_XtY"]
+    
+    def updateEndInputAttributes(self):
+        return ["theta","XtX","XtY"]
+
+    def updateEndOutputAttributes(self):
+        return ["new_theta","b","W","regularization_term"] # CHECK: WILL b AND W CONTAIN OLD OR NEW THETA? @todo i.e. order of computation = ?
+
+    def parameterAttributes(self):
+        return ["b","W"]
+    
+    def defaultOutputFields(self, input_fields):
+        output_fields = ["output"]
+        if "target" in input_fields:
+            output_fields.append("squared_error")
+        return output_fields
+        
+    def __init__(self):
+        self._input = t.matrix('input') # n_examples x n_inputs
+        self._target = t.matrix('target') # n_examples x n_outputs
+        self._lambda = as_scalar(0.,'lambda')
+        self._theta = t.matrix('theta')
+        self._W = self._theta[:,1:] 
+        self._b = self._theta[:,0]
+        self._XtX = t.matrix('XtX')
+        self._XtY = t.matrix('XtY')
+        self._extended_input = t.prepend_one_to_each_row(self._input)
+        self._output = t.dot(self._input,self._W.T) + self._b  # (n_examples , n_outputs) matrix
+        self._squared_error = t.sum_within_rows(t.sqr(self._output-self._target)) # (n_examples ) vector
+        self._regularizer = self._lambda * t.dot(self._W,self._W)
+        self._new_XtX = add_inplace(self._XtX,t.dot(self._extended_input.T,self._extended_input))
+        self._new_XtY = add_inplace(self._XtY,t.dot(self._extended_input.T,self._target))
+        self._new_theta = t.solve_inplace(self._theta,self._XtX,self._XtY)
+
+        MinibatchUpdatesTLearner.__init__(self)
+            
+    def allocate(self,minibatch):
+        minibatch_n_inputs  = minibatch["input"].shape[1]
+        minibatch_n_outputs = minibatch["target"].shape[1]
+        if not self._n_inputs:
+            self._n_inputs = minibatch_n_inputs 
+            self._n_outputs = minibatch_n_outputs
+            self.XtX = numpy.zeros((1+self._n_inputs,1+self._n_inputs))
+            self.XtY = numpy.zeros((1+self._n_inputs,self._n_outputs))
+            self.theta = numpy.zeros((self._n_outputs,1+self._n_inputs))
+            self.forget()
+        elif self._n_inputs!=minibatch_n_inputs or self._n_outputs!=minibatch_n_outputs:
+            # if the input or target changes dimension on the fly, we resize and forget everything
+            self.forget()
+            
+    def forget(self):
+        if self._n_inputs and self._n_outputs:
+            self.XtX.resize((1+self.n_inputs,1+self.n_inputs))
+            self.XtY.resize((1+self.n_inputs,self.n_outputs))
+            self.XtX.data[:,:]=0
+            self.XtY.data[:,:]=0
+            numpy.diag(self.XtX.data)[1:]=self.L2_regularizer
+
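The generic forward recurrence in the MLP docstring (activation[k] = activation_function(b[k] + W[k]*activation[k-1]), followed by an output activation) can be sketched in numpy as below. The per-example vector layout and the index-0 placeholder in the b and W lists are assumptions for illustration; the output activation function and the error fields depend on the task-specific subclass of MLP.

    import numpy

    def mlp_forward_sketch(input_t, b, W, activation_function=numpy.tanh,
                           output_activation_function=lambda a: a):
        # b and W are lists indexed 1..n_hidden_layers+1, matching the docstring
        # (index 0 is an unused placeholder); W[k] has shape (n_units[k], n_units[k-1])
        # and input_t is a single example of shape (n_inputs,).
        n_hidden_layers = len(W) - 2
        activation = input_t
        for k in range(1, n_hidden_layers + 1):
            activation = activation_function(b[k] + numpy.dot(W[k], activation))
        return output_activation_function(
            b[n_hidden_layers + 1] + numpy.dot(W[n_hidden_layers + 1], activation))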