view mlp.py @ 111:88257dfedf8c

Added another work in progress, for mlp's
author bengioy@bengiomac.local
date Wed, 07 May 2008 09:16:04 -0400
parents
children d0a1bd0378c6
line wrap: on
line source


from learner import *
from theano import tensor as t
from theano.scalar import as_scalar

# This is one of the simplest examples of a learner, and it illustrates
# the use of theano.


class OneHiddenLayerNNetClassifier(MinibatchUpdatesTLearner):
    """
    Implement a straightforward classical feedforward
    one-hidden-layer neural net, with L2 regularization.

    The predictor parameters are obtained by minibatch/online gradient descent.
    Training can proceed sequentially (with multiple calls to update with
    different disjoint subsets of the training sets).

    Hyper-parameters:
      - L2_regularizer
      - learning_rate
      - n_hidden

    For each (input_t,output_t) pair in a minibatch,::

       output_activations_t = b2+W2*tanh(b1+W1*input_t)
       output_t = softmax(output_activations_t)
       output_class_t = argmax(output_activations_t)
       class_error_t = 1_{output_class_t != target_t}
       nll_t = -log(output_t[target_t])

    and the training criterion is::

       loss = L2_regularizer*(||W1||^2 + ||W2||^2) + sum_t nll_t

    The parameters are [b1,W1,b2,W2] and are obtained by minimizing the loss by
    stochastic minibatch gradient descent::

       parameters[i] -= learning_rate * dloss/dparameters[i]

    The fields and attributes expected and produced by use and update are the following:

     - Input and output fields (example-wise quantities):

       - 'input' (always expected by use and update)
       - 'target' (optionally expected by use and always by update)
       - 'output' (optionally produced by use)
       - 'output_class' (optionally produced by use)
       - 'class_error' (optionally produced by use)
       - 'nll' (optionally produced by use)

     - optional attributes (optionally expected as input_dataset attributes)
       (warning, this may be dangerous, the 'use' method will use those provided in the
       input_dataset rather than those learned during 'update'; currently no support
       for providing these to update):

       - 'L2_regularizer'
       - 'b1'
       - 'W1'
       - 'b2'
       - 'W2'
       - 'parameters' = [b1, W1, b2, W2]
       - 'regularization_term'

    NOTE: work in progress. The symbolic graph built in __init__ and the
    bookkeeping in allocate/forget still use the theta/XtX/XtY quantities of a
    regularized linear regression; they do not yet implement the network
    described above.
    """

    def attributeNames(self):
        """Return the names of all attributes documented for this learner."""
        return ["parameters", "b1", "W1", "b2", "W2",
                "L2_regularizer", "regularization_term"]

    def parameterAttributes(self):
        """Return the names of the learned parameters, in [b1, W1, b2, W2] order."""
        return ["b1", "W1", "b2", "W2"]

    def useInputAttributes(self):
        """Return the attribute names read by 'use' (the parameters)."""
        return self.parameterAttributes()

    def useOutputAttributes(self):
        """Return the attribute names produced by 'use' (none)."""
        return []

    def updateInputAttributes(self):
        """Return the attribute names read by 'update': parameters plus L2_regularizer."""
        return self.parameterAttributes() + ["L2_regularizer"]

    def updateMinibatchInputFields(self):
        """Return the example-wise fields required from each training minibatch."""
        return ["input", "target"]

    def updateMinibatchInputAttributes(self):
        """Return the attribute names read by each minibatch update."""
        return self.parameterAttributes()

    def updateMinibatchOutputAttributes(self):
        """Return the attribute names written by each minibatch update."""
        return self.parameterAttributes()

    def updateEndInputAttributes(self):
        """Return the attribute names read at the end of an update."""
        return self.parameterAttributes()

    def updateEndOutputAttributes(self):
        """Return the attribute names written at the end of an update."""
        return ["regularization_term"]

    def defaultOutputFields(self, input_fields):
        """
        Return the default output field names given the available input fields.

        'class_error' and 'nll' are only included when 'target' is among
        input_fields.
        """
        output_fields = ["output", "output_class"]
        if "target" in input_fields:
            output_fields += ["class_error", "nll"]
        return output_fields

    def __init__(self):
        """Build the symbolic graph, then initialize the base learner."""
        # Dimensions are unknown until allocate() sees a minibatch. They are
        # set before the base constructor runs so that allocate()/forget()
        # never hit a missing attribute.
        self._n_inputs = None
        self._n_outputs = None

        self._input = t.matrix('input') # n_examples x n_inputs
        self._target = t.matrix('target') # n_examples x n_outputs
        self._lambda = as_scalar(0.,'lambda')
        self._theta = t.matrix('theta')
        self._W = self._theta[:,1:]
        self._b = self._theta[:,0]
        self._XtX = t.matrix('XtX')
        self._XtY = t.matrix('XtY')
        self._extended_input = t.prepend_one_to_each_row(self._input)
        self._output = t.dot(self._input,self._W.T) + self._b  # (n_examples , n_outputs) matrix
        self._squared_error = t.sum_within_rows(t.sqr(self._output-self._target)) # (n_examples ) vector
        # NOTE(review): dot(W, W) is only defined for a square W and is not the
        # sum of squared weights given in the class docstring -- confirm the
        # intended expression.
        self._regularizer = self._lambda * t.dot(self._W,self._W)
        # add_inplace is qualified with the tensor module like the other ops here.
        self._new_XtX = t.add_inplace(self._XtX,t.dot(self._extended_input.T,self._extended_input))
        self._new_XtY = t.add_inplace(self._XtY,t.dot(self._extended_input.T,self._target))
        self._new_theta = t.solve_inplace(self._theta,self._XtX,self._XtY)

        # Initialize the class we actually derive from (was OneShotTLearner).
        MinibatchUpdatesTLearner.__init__(self)

    def allocate(self, minibatch):
        """
        Size theta, XtX and XtY from the shape of a minibatch.

        minibatch["input"] and minibatch["target"] must expose a 2-d `shape`;
        their second dimension gives the number of inputs and outputs.
        Nothing happens when the dimensions match those already recorded.
        On the first call, or when either dimension changes, the new
        dimensions are recorded, theta is re-created as zeros and forget()
        resets the accumulators.
        """
        minibatch_n_inputs = minibatch["input"].shape[1]
        minibatch_n_outputs = minibatch["target"].shape[1]
        if (self._n_inputs == minibatch_n_inputs
                and self._n_outputs == minibatch_n_outputs):
            return
        # Record the new dimensions *before* resetting, so that forget()
        # sizes the accumulators for the data actually being seen.
        self._n_inputs = minibatch_n_inputs
        self._n_outputs = minibatch_n_outputs
        self.theta = numpy.zeros((self._n_outputs, 1 + self._n_inputs))
        self.forget()

    def forget(self):
        """
        Reset the accumulated statistics XtX and XtY.

        Both matrices are re-created as zeros for the current dimensions, and
        the diagonal of XtX (except the first, bias entry) is set to the L2
        regularizer. Does nothing while the dimensions are still unknown.
        """
        if not (self._n_inputs and self._n_outputs):
            return
        n_extended = 1 + self._n_inputs
        self.XtX = numpy.zeros((n_extended, n_extended))
        self.XtY = numpy.zeros((n_extended, self._n_outputs))
        # Write the ridge term through explicit indices: numpy.diag() returns
        # a copy / read-only view, so assigning into it never reaches XtX.
        ridge = numpy.arange(1, n_extended)
        # 'lambda' is a reserved word and cannot be an attribute name; the
        # documented hyper-parameter is L2_regularizer. Fall back to 0., the
        # default of the symbolic scalar built in __init__, when it is unset.
        self.XtX[ridge, ridge] = getattr(self, "L2_regularizer", 0.)


class MLP(MinibatchUpdatesTLearner):
    """
    Implement a feedforward multi-layer perceptron, with or without L1 and/or L2 regularization.

    The predictor parameters are obtained by minibatch/online gradient descent.
    Training can proceed sequentially (with multiple calls to update with
    different disjoint subsets of the training sets).

    Hyper-parameters:
      - L1_regularizer
      - L2_regularizer
      - neuron_sparsity_regularizer
      - initial_learning_rate
      - learning_rate_decrease_rate
      - n_hidden_per_layer (a list of integers)
      - activation_function ("sigmoid","tanh", or "ratio")

    The output/task type (classification, regression, etc.) is obtained by specializing MLP.

    For each (input[t],output[t]) pair in a minibatch,::

       activation[0] = input_t
       for k=1 to n_hidden_layers:
          activation[k]=activation_function(b[k]+ W[k]*activation[k-1])
       output_t = output_activation_function(b[n_hidden_layers+1]+W[n_hidden_layers+1]*activation[n_hidden_layers])

    and the b and W are obtained by minimizing the following by stochastic minibatch gradient descent::

       L2_regularizer sum_{ijk} W_{kij}^2  + L1_regularizer sum_{kij} |W_{kij}|
       + neuron_sparsity_regularizer sum_{ki} |b_{ki} + infinity|
       - sum_t log P_{output_model}(target_t | output_t)

    The fields and attributes expected and produced by use and update are the following:

     - Input and output fields (example-wise quantities):

       - 'input' (always expected by use and update)
       - 'target' (optionally expected by use and always by update)
       - 'output' (optionally produced by use)
       - error fields produced by sub-class of MLP

     - optional attributes (optionally expected as input_dataset attributes)
       (warning, this may be dangerous, the 'use' method will use those provided in the
       input_dataset rather than those learned during 'update'; currently no support
       for providing these to update):

       - 'L1_regularizer'
       - 'L2_regularizer'
       - 'b'
       - 'W'
       - 'parameters' = [b[1], W[1], b[2], W[2], ...]
       - 'regularization_term'

    NOTE: work in progress. The symbolic graph built in __init__, the
    attribute names returned by the update* methods (new_XtX, new_XtY, theta,
    new_theta) and the 'squared_error' output field are those of a regularized
    linear regression; they do not yet implement the multi-layer model
    described above.
    """

    def attributeNames(self):
        """Return the names of all attributes documented for this learner."""
        return ["parameters", "b", "W", "L1_regularizer", "L2_regularizer",
                "neuron_sparsity_regularizer", "regularization_term"]

    def useInputAttributes(self):
        """Return the attribute names read by 'use'."""
        return ["b", "W"]

    def useOutputAttributes(self):
        """Return the attribute names produced by 'use' (none)."""
        return []

    def updateInputAttributes(self):
        """Return the attribute names read by 'update': parameters and regularizers."""
        return ["b", "W", "L1_regularizer", "L2_regularizer",
                "neuron_sparsity_regularizer"]

    def updateMinibatchInputFields(self):
        """Return the example-wise fields required from each training minibatch."""
        return ["input", "target"]

    def updateMinibatchInputAttributes(self):
        """Return the attribute names read by each minibatch update."""
        return ["b", "W"]

    def updateMinibatchOutputAttributes(self):
        """Return the attribute names written by each minibatch update."""
        # NOTE(review): these are the linear-regression accumulators, not b/W.
        return ["new_XtX", "new_XtY"]

    def updateEndInputAttributes(self):
        """Return the attribute names read at the end of an update."""
        # NOTE(review): linear-regression names, see the class docstring.
        return ["theta", "XtX", "XtY"]

    def updateEndOutputAttributes(self):
        """Return the attribute names written at the end of an update."""
        return ["new_theta","b","W","regularization_term"] # CHECK: WILL b AND W CONTAIN OLD OR NEW THETA? @todo i.e. order of computation = ?

    def parameterAttributes(self):
        """Return the names of the learned parameters."""
        return ["b", "W"]

    def defaultOutputFields(self, input_fields):
        """
        Return the default output field names given the available input fields.

        'squared_error' is only included when 'target' is among input_fields.
        """
        output_fields = ["output"]
        if "target" in input_fields:
            output_fields.append("squared_error")
        return output_fields

    def __init__(self):
        """Build the symbolic graph, then initialize the base learner."""
        # Dimensions are unknown until allocate() sees a minibatch. They are
        # set before the base constructor runs so that allocate()/forget()
        # never hit a missing attribute.
        self._n_inputs = None
        self._n_outputs = None

        self._input = t.matrix('input') # n_examples x n_inputs
        self._target = t.matrix('target') # n_examples x n_outputs
        self._lambda = as_scalar(0.,'lambda')
        self._theta = t.matrix('theta')
        self._W = self._theta[:,1:]
        self._b = self._theta[:,0]
        self._XtX = t.matrix('XtX')
        self._XtY = t.matrix('XtY')
        self._extended_input = t.prepend_one_to_each_row(self._input)
        self._output = t.dot(self._input,self._W.T) + self._b  # (n_examples , n_outputs) matrix
        self._squared_error = t.sum_within_rows(t.sqr(self._output-self._target)) # (n_examples ) vector
        # NOTE(review): dot(W, W) is only defined for a square W and is not the
        # sum of squared weights given in the class docstring -- confirm the
        # intended expression.
        self._regularizer = self._lambda * t.dot(self._W,self._W)
        # add_inplace is qualified with the tensor module like the other ops here.
        self._new_XtX = t.add_inplace(self._XtX,t.dot(self._extended_input.T,self._extended_input))
        self._new_XtY = t.add_inplace(self._XtY,t.dot(self._extended_input.T,self._target))
        self._new_theta = t.solve_inplace(self._theta,self._XtX,self._XtY)

        # Initialize the class we actually derive from (was OneShotTLearner).
        MinibatchUpdatesTLearner.__init__(self)

    def allocate(self, minibatch):
        """
        Size theta, XtX and XtY from the shape of a minibatch.

        minibatch["input"] and minibatch["target"] must expose a 2-d `shape`;
        their second dimension gives the number of inputs and outputs.
        Nothing happens when the dimensions match those already recorded.
        On the first call, or when either dimension changes, the new
        dimensions are recorded, theta is re-created as zeros and forget()
        resets the accumulators.
        """
        minibatch_n_inputs = minibatch["input"].shape[1]
        minibatch_n_outputs = minibatch["target"].shape[1]
        if (self._n_inputs == minibatch_n_inputs
                and self._n_outputs == minibatch_n_outputs):
            return
        # Record the new dimensions *before* resetting, so that forget()
        # sizes the accumulators for the data actually being seen.
        self._n_inputs = minibatch_n_inputs
        self._n_outputs = minibatch_n_outputs
        self.theta = numpy.zeros((self._n_outputs, 1 + self._n_inputs))
        self.forget()

    def forget(self):
        """
        Reset the accumulated statistics XtX and XtY.

        Both matrices are re-created as zeros for the current dimensions, and
        the diagonal of XtX (except the first, bias entry) is set to the L2
        regularizer. Does nothing while the dimensions are still unknown.
        """
        if not (self._n_inputs and self._n_outputs):
            return
        n_extended = 1 + self._n_inputs
        self.XtX = numpy.zeros((n_extended, n_extended))
        self.XtY = numpy.zeros((n_extended, self._n_outputs))
        # Write the ridge term through explicit indices: numpy.diag() returns
        # a copy / read-only view, so assigning into it never reaches XtX.
        ridge = numpy.arange(1, n_extended)
        # 'lambda' is a reserved word and cannot be an attribute name; the
        # documented hyper-parameter is L2_regularizer. Fall back to 0., the
        # default of the symbolic scalar built in __init__, when it is unset.
        self.XtX[ridge, ridge] = getattr(self, "L2_regularizer", 0.)