view learner.py @ 202:b9950ae5e54b

Added test for ApplyFunctionDataset
author Frederic Bastien <bastienf@iro.umontreal.ca>
date Thu, 15 May 2008 13:10:21 -0400
parents fb4837eed1a6
children 69759976b3ac
line wrap: on
line source


from exceptions import *
from dataset import AttributesHolder,ApplyFunctionDataSet,DataSet,CachedDataSet
import theano
from theano import compile
from theano import tensor as t
    
class Learner(AttributesHolder):
    """
    Base class for learning algorithms. It provides a common interface so
    that various learning algorithms can be used interchangeably by generic
    code.

    A L{Learner} can be seen as a learning algorithm, a function that when
    applied to training data returns a learned function (which is an object that
    can be applied to other data and return some output data).
    """

    def __init__(self):
        pass

    def forget(self):
        """
        Reset the state of the learner to a blank slate, before seeing
        training data. The operation may be non-deterministic if the
        learner has a random number generator that is set to use a
        different seed each time forget() is called.

        @raise NotImplementedError: always, in this base class; sub-classes
        are expected to override this method.
        """
        raise NotImplementedError

    def update(self,training_set,train_stats_collector=None):
        """
        Continue training a learner, with the evidence provided by the given training set.
        Hence update can be called multiple times. This is particularly useful in the
        on-line setting or the sequential (Bayesian or not) settings.
        The result is a function that can be applied on data, with the same
        semantics as the Learner.use method.

        The user may optionally provide a training L{StatsCollector} that is used to record
        some statistics of the outputs computed during training. It is update(d) during
        training.

        @param training_set: the training data (ignored by this default implementation).
        @param train_stats_collector: optional stats collector (ignored by this
        default implementation).
        @return: the bound method self.use.
        """
        # Default behavior is 'non-adaptive', i.e. update does not do anything.
        return self.use

    def __call__(self,training_set,train_stats_collector=None):
        """
        Train a learner from scratch using the provided training set,
        and return the learned function.

        This calls self.forget() and then returns the result of
        self.update(training_set,train_stats_collector).
        """
        self.forget()
        return self.update(training_set,train_stats_collector)

    def use(self,input_dataset,output_fieldnames=None,
            test_stats_collector=None,copy_inputs=False,
            put_stats_in_output_dataset=True,
            output_attributes=[]):
        """
        Once a L{Learner} has been trained by one or more calls to 'update', it can
        be used with one or more calls to 'use'. The argument is an input L{DataSet} (possibly
        containing a single example) and the result is an output L{DataSet} of the same length.
        If output_fieldnames is specified, it may be used to indicate which fields should
        be constructed in the output L{DataSet} (for example ['output','classification_error']).
        Otherwise, self.defaultOutputFields(input_dataset.fieldNames()) is called to choose
        the output fields (that method is not defined by Learner itself in this file;
        sub-classes such as TLearner declare it).
        Optionally, if copy_inputs, the input fields (of the input_dataset) can be made
        visible in the output L{DataSet} returned by this method.
        Optionally, attributes of the learner can be copied in the output dataset,
        and statistics computed by the stats collector also put in the output dataset.
        Note the distinction between fields (which are example-wise quantities, e.g. 'input')
        and attributes (which are not, e.g. 'regularization_term').

        We provide here a default implementation that does all this using
        a sub-class defined method: minibatchwiseUseFunction.

        @todo check if some of the learner attributes are actually SPECIFIED
        as attributes of the input_dataset, and if so use their values instead
        of the ones in the learner.

        @param input_dataset: the L{DataSet} on which to apply the learner.
        @param output_fieldnames: names of the output fields to compute; any
        false value (None, empty list) selects the default output fields.
        @param test_stats_collector: optional stats collector; when given it is
        update(d) with the output dataset.
        @param copy_inputs: if true, the result is input_dataset | output_dataset.
        @param put_stats_in_output_dataset: if true (and a test_stats_collector is
        given), the collector's attributes (test_stats_collector.attributeNames())
        are copied into the output dataset attributes.
        @param output_attributes: names of learner attributes to copy (as deep copies)
        into the output dataset. If None then all of the attributes in
        self.attributeNames() are copied, but if it is [] (the default), then none
        are copied. The default list is never mutated by this method.
        @return: the output L{DataSet}.
        """
        input_fieldnames = input_dataset.fieldNames()
        if not output_fieldnames:
            output_fieldnames = self.defaultOutputFields(input_fieldnames)

        minibatchwise_use_function = self.minibatchwiseUseFunction(input_fieldnames,
                                                                   output_fieldnames,
                                                                   test_stats_collector)
        virtual_output_dataset = ApplyFunctionDataSet(input_dataset,
                                                      minibatchwise_use_function,
                                                      output_fieldnames,
                                                      True,DataSet.numpy_vstack,
                                                      DataSet.numpy_hstack)
        # actually force the computation
        output_dataset = CachedDataSet(virtual_output_dataset,True)
        if copy_inputs:
            output_dataset = input_dataset | output_dataset
        # copy the wanted attributes in the dataset
        if output_attributes is None:
            output_attributes = self.attributeNames()
        if output_attributes:
            # Only attributes exported by the learner may be requested.
            assert set(output_attributes) <= set(self.attributeNames())
            output_dataset.setAttributes(output_attributes,
                                         self.names2attributes(output_attributes,return_copy=True))
        if test_stats_collector:
            test_stats_collector.update(output_dataset)
            if put_stats_in_output_dataset:
                output_dataset.setAttributes(test_stats_collector.attributeNames(),
                                             test_stats_collector.attributes())
        return output_dataset

    def minibatchwiseUseFunction(self, input_fields, output_fields, stats_collector):
        """
        Returns a function that can map the given input fields to the given output fields
        and to the attributes that the stats collector needs for its computation.
        That function is expected to operate on minibatches.
        The function returned makes use of the self.useInputAttributes() and
        sets the attributes specified by self.useOutputAttributes().

        @raise AbstractFunction: always, in this base class.
        """
        raise AbstractFunction()

    def attributeNames(self):
        """
        A Learner may have attributes that it wishes to export to other objects. To automate
        such export, sub-classes should define here the names (list of strings) of these attributes.

        @todo By default, attributeNames looks for all dictionary entries whose name does not start with _.

        @return: an empty list in this base class.
        """
        return []

    def attributes(self,return_copy=False):
        """
        Return a list with the values of the learner's attributes (or optionally, a deep copy).

        @param return_copy: if true, each value is deep-copied.
        """
        return self.names2attributes(self.attributeNames(),return_copy)

    def names2attributes(self,names,return_copy=False):
        """
        Private helper function that maps a list of attribute names to a list
        of values of attributes (optionally deep copies of them).

        @param names: iterable of attribute names to look up on self.
        @param return_copy: if true, each value is deep-copied before being
        returned, so the caller cannot alter the learner's state through it.
        @return: list of attribute values, in the order of names.
        """
        values = [self.__getattribute__(name) for name in names]
        if return_copy:
            # Imported locally so this method does not depend on the module's import block.
            import copy
            values = [copy.deepcopy(value) for value in values]
        return values

    def useInputAttributes(self):
        """
        A subset of self.attributeNames() which are the names of attributes needed by use() in order
        to do its work.

        @raise AbstractFunction: always, in this base class.
        """
        raise AbstractFunction()

    def useOutputAttributes(self):
        """
        A subset of self.attributeNames() which are the names of attributes modified/created by use() in order
        to do its work.

        @raise AbstractFunction: always, in this base class.
        """
        raise AbstractFunction()

    
class TLearner(Learner):
    """
    TLearner is a virtual class of L{Learner}s that attempts to factor
    out of the definition of a learner the steps that are common to
    many implementations of learning algorithms, so as to leave only
    'the equations' to define in particular sub-classes, using Theano.

    In the default implementations of use and update, it is assumed
    that the 'use' and 'update' methods visit examples in the input
    dataset sequentially. In the 'use' method only one pass through the
    dataset is done, whereas the sub-learner may wish to iterate over
    the examples multiple times. Subclasses where this basic model is
    not appropriate can simply redefine update or use.

    Sub-classes must provide the following functions and functionalities:
      - attributeNames(): defines all the names of attributes which can
        be used as fields or attributes in input/output datasets or in
        stats collectors.  All these attributes are expected to be
        theano.Result objects (with a .data property and recognized by
        theano.Function for compilation).  The sub-class constructor
        defines the relations between the Theano variables that may be
        used by 'use' and 'update' or by a stats collector.
      - defaultOutputFields(input_fields): return a list of default
        dataset output fields when None are provided by the caller of use.

    The following naming convention is assumed and important.  Attributes
    whose names are listed in attributeNames() can be of any type,
    but those that can be referenced as input/output dataset fields or
    as output attributes in 'use' or as input attributes in the stats
    collector should be associated with a Theano Result variable. If the
    exported attribute name is <name>, the corresponding Result name
    (an internal attribute of the TLearner, created in the sub-class
    constructor) should be _<name>.  Typically <name> will be numpy
    ndarray and _<name> will be the corresponding Theano Tensor (for
    symbolic manipulation).

    @todo push into Learner all the machinery that can be moved there
    without depending on Theano
    """

    def __init__(self):
        Learner.__init__(self)
        # Cache of compiled 'use' functions, keyed by
        # (tuple(input_fields), tuple(output_fields)).
        self.use_functions_dictionary={}

    def defaultOutputFields(self, input_fields):
        """
        Return a default list of output field names (to put in the output dataset).
        This will be used when None are provided (as output_fields) by the caller of the 'use' method.
        This may involve looking at the input_fields (names) available in the
        input_dataset.

        @raise AbstractFunction: always, in this class; sub-classes must override.
        """
        raise AbstractFunction()

    def minibatchwiseUseFunction(self, input_fields, output_fields, stats_collector):
        """
        Implement minibatchwiseUseFunction by exploiting Theano compilation
        and the expression graph defined by a sub-class constructor.

        The compiled function is cached in self.use_functions_dictionary, so
        asking again for the same input and output fields reuses it.

        @param input_fields: names of the fields given as inputs.
        @param output_fields: list of names of the fields to compute. It is
        extended IN PLACE with the attributes required by the stats collector
        that are neither input fields nor already requested.
        @param stats_collector: optional stats collector; when given, its
        input2UpdateAttributes() are added to the computed outputs.
        @return: a function taking the input field values and returning the
        output field values. As a side effect it sets on self the attributes
        named by self.useOutputAttributes().
        """
        if stats_collector:
            stats_collector_inputs = stats_collector.input2UpdateAttributes()
            for attribute in stats_collector_inputs:
                # NOTE: output_fields is deliberately extended in place: Learner.use
                # hands the same list object to ApplyFunctionDataSet afterwards.
                # The extra membership test avoids appending a field twice, e.g.
                # when the caller reuses the same list over several calls.
                if attribute not in input_fields and attribute not in output_fields:
                    output_fields.append(attribute)
        key = (tuple(input_fields),tuple(output_fields))
        if key not in self.use_functions_dictionary:
            use_input_attributes = self.useInputAttributes()
            use_output_attributes = self.useOutputAttributes()
            # list(...) so that names given as tuples can be concatenated too.
            complete_f = compile.function(self.names2OpResults(list(input_fields)+list(use_input_attributes)),
                                          self.names2OpResults(list(output_fields)+list(use_output_attributes)))
            def f(*input_field_values):
                # The attribute values are read at call time, so the function
                # sees the current state of the learner.
                input_attribute_values = self.names2attributes(use_input_attributes)
                results = complete_f(*(list(input_field_values) + input_attribute_values))
                # The first len(output_fields) results are the output fields;
                # the remaining ones are the output attributes.
                output_field_values = results[0:len(output_fields)]
                output_attribute_values = results[len(output_fields):len(results)]
                if use_output_attributes:
                    self.setAttributes(use_output_attributes,output_attribute_values)
                return output_field_values
            self.use_functions_dictionary[key]=f
        return self.use_functions_dictionary[key]

    def names2OpResults(self,names):
        """
        Private helper function that maps a list of attribute names to a list
        of corresponding Op Results (with the same name but with a '_' prefix).
        """
        return [self.__getattribute__('_'+name) for name in names]


class MinibatchUpdatesTLearner(TLearner):
    """
    This adds the following functions to a L{TLearner}:
      - updateStart(), updateEnd(), updateMinibatch(minibatch), isLastEpoch():
      functions executed at the beginning, the end, in the middle (for
      each minibatch) of the update method, and at the end of each
      epoch. This model only works for 'online' or one-shot learning
      that requires going only once through the training data. For more
      complicated models, more specialized subclasses of TLearner should
      be used or a learning-algorithm specific update method should
      be defined.

      - a 'parameters' attribute which is a list of parameters
      (whose names are specified by the user's subclass with the
      parameterAttributes() method)

    """

    def __init__(self):
        """
        Compile the two Theano functions used by update().

        The constructor looks up the '_'-prefixed attributes named by the
        updateMinibatch*/updateEnd* methods, so they must already exist on
        self when it is called.
        """
        TLearner.__init__(self)
        # The inputs must be the INPUT attributes followed by the input fields:
        # that is the order in which updateMinibatch passes the values.
        self.update_minibatch_function = compile.function(self.names2OpResults(self.updateMinibatchInputAttributes()+
                                                                               self.updateMinibatchInputFields()),
                                                          self.names2OpResults(self.updateMinibatchOutputAttributes()))
        self.update_end_function = compile.function(self.names2OpResults(self.updateEndInputAttributes()),
                                                    self.names2OpResults(self.updateEndOutputAttributes()))

    def allocate(self, minibatch):
        """
        This function is called at the beginning of each L{updateMinibatch}
        and should be used to check that all required attributes have been
        allocated and initialized (usually this function calls forget()
        when it has to do an initialization).

        @raise AbstractFunction: always, in this class; sub-classes must override.
        """
        raise AbstractFunction()

    def updateMinibatchInputFields(self):
        """
        Names of the minibatch fields given as inputs to the minibatch update function.
        Abstract: must be defined by sub-classes.
        """
        raise AbstractFunction()

    def updateMinibatchInputAttributes(self):
        """
        Names of the learner attributes given as inputs to the minibatch update function.
        Abstract: must be defined by sub-classes.
        """
        raise AbstractFunction()

    def updateMinibatchOutputAttributes(self):
        """
        Names of the learner attributes set from the outputs of the minibatch update function.
        Abstract: must be defined by sub-classes.
        """
        raise AbstractFunction()

    def updateEndInputAttributes(self):
        """
        Names of the learner attributes given as inputs to the end-of-update function.
        Abstract: must be defined by sub-classes.
        """
        raise AbstractFunction()

    def updateEndOutputAttributes(self):
        """
        Names of the learner attributes set from the outputs of the end-of-update function.
        Abstract: must be defined by sub-classes.
        """
        raise AbstractFunction()

    def parameterAttributes(self):
        """
        Names of the attributes that make up the 'parameters' list.
        Abstract: must be defined by sub-classes.
        """
        raise AbstractFunction()

    def updateStart(self,training_set):
        """
        Hook called once at the beginning of update(); does nothing by default.
        """
        pass

    def updateEnd(self):
        """
        Called once at the end of update(): apply the end-of-update function
        to the current values of updateEndInputAttributes(), store its results
        in updateEndOutputAttributes(), then refresh self.parameters.
        """
        self.setAttributes(self.updateEndOutputAttributes(),
                           self.update_end_function(*self.names2attributes(self.updateEndInputAttributes())))
        self.parameters = self.names2attributes(self.parameterAttributes())

    def updateMinibatch(self,minibatch):
        """
        Perform the update for one minibatch: apply the minibatch update
        function to the input attribute values followed by the minibatch field
        values, and store the results in updateMinibatchOutputAttributes().
        """
        # make sure all required fields are allocated and initialized
        self.allocate(minibatch)
        input_attributes = self.names2attributes(self.updateMinibatchInputAttributes())
        input_fields = minibatch(*self.updateMinibatchInputFields())
        self.setAttributes(self.updateMinibatchOutputAttributes(),
                           # concatenate the attribute values and field values and then apply update fn
                           self.update_minibatch_function(*(input_attributes+input_fields)))

    def isLastEpoch(self):
        """
        This method is called at the end of each epoch (cycling over the training set).
        It returns a boolean to indicate if this is the last epoch.
        By default just do one epoch.
        """
        return True

    def update(self,training_set,train_stats_collector=None):
        """
        Train on the given training set, one minibatch at a time, cycling over
        the training set until isLastEpoch() returns true.

        The minibatch size is self._minibatch_size when that attribute exists
        and is true, otherwise min(100,len(training_set)).

        @todo check if some of the learner attributes are actually SPECIFIED
        as attributes of the training_set.

        @param training_set: the dataset to train on.
        @param train_stats_collector: optional stats collector; it is reset
        (forget) at the beginning of each epoch and update(d) after each
        minibatch with the minibatch examples, on which the learner's
        attributes have been set.
        @return: the bound method self.use.
        """
        self.updateStart(training_set)
        stop=False
        if hasattr(self,'_minibatch_size') and self._minibatch_size:
            minibatch_size=self._minibatch_size
        else:
            minibatch_size=min(100,len(training_set))
        while not stop:
            if train_stats_collector:
                train_stats_collector.forget() # restart stats collection at the beginning of each epoch
            for minibatch in training_set.minibatches(minibatch_size=minibatch_size):
                self.updateMinibatch(minibatch)
                if train_stats_collector:
                    minibatch_set = minibatch.examples()
                    minibatch_set.setAttributes(self.attributeNames(),self.attributes())
                    train_stats_collector.update(minibatch_set)
            stop = self.isLastEpoch()
        self.updateEnd()
        return self.use

class OnlineGradientTLearner(MinibatchUpdatesTLearner):
    """
    Specialization of L{MinibatchUpdatesTLearner} in which the minibatch updates
    are obtained by performing an online (minibatch-based) gradient step.

    Sub-classes must define the following:
      - self._learning_rate (may be changed by the sub-class between epochs or minibatches)
      - self.lossAttribute()  = name of the loss field
    """
    def __init__(self,truly_online=False):
        """
        If truly_online then only one pass is made through the training set passed to update().

        NOTE: isLastEpoch() returns truly_online, so when truly_online is false the
        inherited update() keeps cycling over the training set unless a sub-class
        overrides isLastEpoch().

        SUBCLASSES SHOULD CALL THIS CONSTRUCTOR ONLY AFTER HAVING DEFINED ALL THEIR THEANO FORMULAS
        """
        self.truly_online=truly_online

        # create the formulas for the gradient update
        old_params = [self.__getattribute__("_"+name) for name in self.parameterAttributes()]
        new_params_names = ["_new_"+name for name in self.parameterAttributes()]
        loss = self.__getattribute__("_"+self.lossAttribute())
        # Gradient DESCENT: step against the gradient so that the loss decreases.
        self.setAttributes(new_params_names,
                           [t.add_inplace(param,-self._learning_rate*t.grad(loss,param))
                            for param in old_params])
        MinibatchUpdatesTLearner.__init__(self)


    def namesOfAttributesToComputeOutputs(self,output_names):
        """
        The output_names are attribute names (not the corresponding Result names, which have leading _).
        Return the corresponding input names, i.e. the names of the non-constant
        inputs of the graph that computes the given outputs, with their leading
        '_' removed so that they are attribute names too.
        """
        all_inputs = t.gof.graph.inputs(self.names2OpResults(output_names))
        input_names = []
        for r in all_inputs:
            # keep only the true inputs: drop constants and values
            if not isinstance(r,theano.Result):
                continue
            if isinstance(r,(theano.Constant,theano.Value)):
                continue
            name = r.name
            # Result names follow the '_<name>' convention; strip one leading '_'
            # to recover the attribute name (names2OpResults adds it back).
            if name and name.startswith('_'):
                name = name[1:]
            input_names.append(name)
        return input_names

    def isLastEpoch(self):
        """
        Return self.truly_online: stop after the first epoch only when truly online.
        """
        return self.truly_online

    def updateMinibatchInputAttributes(self):
        """
        The inputs of a minibatch update are the parameters.
        """
        return self.parameterAttributes()

    def updateMinibatchOutputAttributes(self):
        """
        The outputs of a minibatch update are the 'new_<name>' attributes, one
        per parameter.
        """
        return ["new_"+name for name in self.parameterAttributes()]

    def updateEndInputAttributes(self):
        """
        The inputs of the end-of-update function are the attributes needed to
        compute self.updateEndOutputAttributes().
        """
        return self.namesOfAttributesToComputeOutputs(self.updateEndOutputAttributes())

    def useInputAttributes(self):
        """
        The parameters are the attributes needed by use().
        """
        return self.parameterAttributes()

    def useOutputAttributes(self):
        """
        use() does not modify or create any attribute.
        """
        return []