changeset 110:8fa1ef2411a0

Worked on OneShotTLearner and implementation of LinearRegression
author bengioy@bengiomac.local
date Tue, 06 May 2008 22:24:55 -0400
parents d97f6fe6bdf9
children 88257dfedf8c
files dataset.py learner.py linear_regression.py lookup_list.py
diffstat 4 files changed, 229 insertions(+), 128 deletions(-) [+]
--- a/dataset.py	Tue May 06 20:01:34 2008 -0400
+++ b/dataset.py	Tue May 06 22:24:55 2008 -0400
@@ -9,7 +9,22 @@
 class AbstractFunction (Exception): """Derived class must override this function"""
 class NotImplementedYet (NotImplementedError): """Work in progress, this should eventually be implemented"""
 
-class DataSet(object):
+class AttributesHolder(object):
+    def __init__(self): pass
+
+    def attributeNames(self):
+        raise AbstractFunction()
+
+    def setAttributes(self,attribute_names,attribute_values,make_copies=False):
+        if make_copies:
+            for name,value in zip(attribute_names,attribute_values):
+                self.__setattr__(name,copy.deepcopy(value))
+        else:
+            for name,value in zip(attribute_names,attribute_values):
+                self.__setattr__(name,value)
+    
+    
+class DataSet(AttributesHolder):
     """A virtual base class for datasets.
 
     A DataSet can be seen as a generalization of a matrix, meant to be used in conjunction
@@ -149,10 +164,6 @@
 
     def attributeNames(self): return self._attribute_names
 
-    def setAttributes(self,attribute_names,attribute_values):
-        for name,value in zip(attribute_names,attribute_values):
-            self.__setattr__(name,value)
-    
     class MinibatchToSingleExampleIterator(object):
         """
         Converts the result of minibatch iterator with minibatch_size==1 into
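
The new AttributesHolder base class factors the attribute get/set protocol out of DataSet so that Learner (in learner.py below) can share it. A minimal usage sketch; the StatsHolder subclass is invented for illustration:

    from dataset import AttributesHolder

    class StatsHolder(AttributesHolder):
        # hypothetical subclass, for illustration only
        def attributeNames(self):
            return ["mean", "variance"]

    stats = StatsHolder()
    stats.setAttributes(["mean", "variance"], [0.5, 0.1])
    assert stats.mean == 0.5 and stats.variance == 0.1
    # with make_copies=True the stored values are deep copies of the originals
    mu = [0.5, 0.6]
    stats.setAttributes(["mean"], [mu], make_copies=True)
    assert stats.mean == mu and stats.mean is not mu
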
--- a/learner.py	Tue May 06 20:01:34 2008 -0400
+++ b/learner.py	Tue May 06 22:24:55 2008 -0400
@@ -1,7 +1,7 @@
 
 from dataset import *
     
-class Learner(object):
+class Learner(AttributesHolder):
     """Base class for learning algorithms, provides an interface
     that allows various algorithms to be applicable to generic learning
     algorithms.
@@ -66,6 +66,35 @@
         """
         return []
 
+    def updateInputAttributes(self):
+        """
+        Return the subset of self.attributeNames() naming the attributes that update()
+        needs in order to do its work.
+        """
+        raise AbstractFunction()
+
+    def useInputAttributes(self):
+        """
+        Return the subset of self.attributeNames() naming the attributes that use()
+        needs in order to do its work.
+        """
+        raise AbstractFunction()
+
+    def updateOutputAttributes(self):
+        """
+        Return the subset of self.attributeNames() naming the attributes that update()
+        modifies or creates.
+        """
+        raise AbstractFunction()
+
+    def useOutputAttributes(self):
+        """
+        Return the subset of self.attributeNames() naming the attributes that use()
+        modifies or creates.
+        """
+        raise AbstractFunction()
+
+    
 class TLearner(Learner):
     """
     TLearner is a virtual class of Learners that attempts to factor out of the definition
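
The four declaration methods above let a subclass state which attributes update() and use() read and write, so that the generic TLearner machinery can compile and call the right functions. A toy sketch of such a declaration; RunningMeanLearner is invented for illustration:

    class RunningMeanLearner(Learner):
        # hypothetical learner that maintains a running mean of its inputs
        def attributeNames(self):
            return ["count", "mean"]
        def updateInputAttributes(self):
            return ["count", "mean"]   # update() reads both accumulators
        def updateOutputAttributes(self):
            return ["count", "mean"]   # ...and writes them back
        def useInputAttributes(self):
            return ["mean"]            # use() only needs the mean
        def useOutputAttributes(self):
            return []                  # use() modifies no attribute
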
@@ -103,50 +132,82 @@
 
     def __init__(self):
         Learner.__init__(self)
+
+    def defaultOutputFields(self, input_fields):
+        """
+        Return a default list of output field names (to put in the output dataset).
+        This is used when the caller of the 'use' method passes None as output_fields.
+        This may involve looking at the input_fields (names) available in the
+        input_dataset.
+        """
+        raise AbstractFunction()
+
+    def allocate(self, minibatch):
+        """
+        This function is called at the beginning of each updateMinibatch
+        and should be used to check that all required attributes have been
+        allocated and initialized (usually this function calls forget()
+        when it has to do an initialization).
+        """
+        raise AbstractFunction()
         
-    def _minibatchwise_use_functions(self, input_fields, output_fields, stats_collector):
+    def minibatchwise_use_functions(self, input_fields, output_fields, stats_collector):
         """
         Private helper function called by the generic TLearner.use. It returns a function
         that can map the given input fields to the given output fields (along with the
-        attributes that the stats collector needs for its computation.
+        attributes that the stats collector needs for its computation). The returned
+        function also automatically reads the attributes named by self.useInputAttributes()
+        and stores those named by self.useOutputAttributes().
         """
         if not output_fields:
             output_fields = self.defaultOutputFields(input_fields)
         if stats_collector:
-            stats_collector_inputs = stats_collector.inputUpdateAttributes()
+            stats_collector_inputs = stats_collector.input2UpdateAttributes()
             for attribute in stats_collector_inputs:
                 if attribute not in input_fields:
                     output_fields.append(attribute)
         key = (tuple(input_fields),tuple(output_fields))
         if key not in self.use_functions_dictionary:
-            self.use_functions_dictionary[key]=Function(self._names2attributes(input_fields),
-                                                   self._names2attributes(output_fields))
+            use_input_attributes = self.useInputAttributes()
+            use_output_attributes = self.useOutputAttributes()
+            complete_f = Function(self.names2OpResults(input_fields+use_input_attributes),
+                                  self.names2OpResults(output_fields+use_output_attributes))
+            def f(*input_field_values):
+                input_attribute_values = self.names2attributes(use_input_attributes)
+                results = complete_f(*(list(input_field_values) + input_attribute_values))
+                output_field_values = results[0:len(output_fields)]
+                output_attribute_values = results[len(output_fields):len(results)]
+                if use_output_attributes:
+                    self.setAttributes(use_output_attributes,output_attribute_values)
+                return output_field_values
+            self.use_functions_dictionary[key]=f
         return self.use_functions_dictionary[key]
 
     def attributes(self,return_copy=False):
         """
         Return a list with the values of the learner's attributes (or optionally, a deep copy).
         """
-        return self.names2attributes(self.attributeNames())
-            
-    def _names2attributes(self,names,return_Result=False, return_copy=False):
+        return self.names2attributes(self.attributeNames(),return_copy)
+
+    def names2attributes(self,names,return_copy=False):
         """
         Private helper function that maps a list of attribute names to a list
-        of (optionally copies) values or of the Result objects that own these values.
+        of (optionally deep-copied) attribute values.
         """
-        if return_Result:
-            if return_copy:
-                return [copy.deepcopy(self.__getattr__(name)) for name in names]
-            else:
-                return [self.__getattr__(name) for name in names]
+        if return_copy:
+            return [copy.deepcopy(self.__getattr__(name).data) for name in names]
         else:
-            if return_copy:
-                return [copy.deepcopy(self.__getattr__(name).data) for name in names]
-            else:
-                return [self.__getattr__(name).data for name in names]
+            return [self.__getattr__(name).data for name in names]
+
+    def names2OpResults(self,names):
+        """
+        Private helper function that maps a list of attribute names to a list
+        of corresponding Op Results (with the same name but with a '_' prefix).
+        """
+        return [self.__getattr__('_'+name) for name in names]
 
     def use(self,input_dataset,output_fieldnames=None,output_attributes=[],
-            test_stats_collector=None,copy_inputs=True):
+            test_stats_collector=None,copy_inputs=True, put_stats_in_output_dataset=True):
         """
         The learner tries to compute in the output dataset the output fields specified
 
@@ -164,7 +225,7 @@
         If a test_stats_collector is provided, then its attributes (test_stats_collector.attributeNames())
         are also copied into the output dataset attributes.
         """
-        minibatchwise_use_function = _minibatchwise_use_functions(input_dataset.fieldNames(),
+        minibatchwise_use_function = self.minibatchwise_use_functions(input_dataset.fieldNames(),
                                                                   output_fieldnames,
                                                                   test_stats_collector)
         virtual_output_dataset = ApplyFunctionDataSet(input_dataset,
@@ -179,20 +240,21 @@
         if output_attributes is None:
             output_attributes = self.attributeNames()
         if output_attributes:
-            assert set(output_attributes) <= set(self.attributeNames())
+            assert set(output_attributes) <= set(self.attributeNames())
             output_dataset.setAttributes(output_attributes,
-                                         self._names2attributes(output_attributes,return_copy=True))
+                                         self.names2attributes(output_attributes,return_copy=True))
         if test_stats_collector:
             test_stats_collector.update(output_dataset)
-            output_dataset.setAttributes(test_stats_collector.attributeNames(),
-                                         test_stats_collector.attributes())
+            if put_stats_in_output_dataset:
+                output_dataset.setAttributes(test_stats_collector.attributeNames(),
+                                             test_stats_collector.attributes())
         return output_dataset
 
 
 class OneShotTLearner(TLearner):
     """
     This adds to TLearner a 
-      - update_start(), update_end(), update_minibatch(minibatch), end_epoch():
+      - updateStart(), updateEnd(), updateMinibatch(minibatch), isLastEpoch():
                           functions executed at the beginning, the end, in the middle
                           (for each minibatch) of the update method, and at the end
                           of each epoch. This model only
@@ -204,18 +266,56 @@
 
     def __init__(self):
         TLearner.__init__(self)
+        self.update_minibatch_function = Function(
+            self.names2OpResults(self.updateMinibatchInputAttributes()+
+                                 self.updateMinibatchInputFields()),
+            self.names2OpResults(self.updateMinibatchOutputAttributes()))
+        self.update_end_function = Function(self.names2OpResults(self.updateEndInputAttributes()),
+                                            self.names2OpResults(self.updateEndOutputAttributes()))
+
+    def updateMinibatchInputFields(self):
+        raise AbstractFunction()
+    
+    def updateMinibatchInputAttributes(self):
+        raise AbstractFunction()
+    
+    def updateMinibatchOutputAttributes(self):
+        raise AbstractFunction()
+    
+    def updateEndInputAttributes(self):
+        raise AbstractFunction()
+
+    def updateEndOutputAttributes(self):
+        raise AbstractFunction()
+
+    def updateStart(self,training_set): pass
+
+    def updateEnd(self):
+        self.setAttributes(self.updateEndOutputAttributes(),
+                           self.update_end_function(*self.names2attributes(self.updateEndInputAttributes())))
         
-    def update_start(self): pass
-    def update_end(self): pass
-    def update_minibatch(self,minibatch):
-        raise AbstractFunction()
+    def updateMinibatch(self,minibatch):
+        # make sure all required fields are allocated and initialized
+        self.allocate(minibatch)
+        self.setAttributes(self.updateMinibatchOutputAttributes(),
+                           self.update_minibatch_function(*(self.names2attributes(self.updateMinibatchInputAttributes())
+                                                            + minibatch(self.updateMinibatchInputFields()))))
+        
+    def isLastEpoch(self):
+        """
+        This method is called at the end of each epoch (cycling over the training set).
+        It returns a boolean to indicate if this is the last epoch.
+        By default just do one epoch.
+        """
+        return True
     
     def update(self,training_set,train_stats_collector=None):
         """
         @todo check if some of the learner attributes are actually SPECIFIED
         as attributes of the training_set.
         """
-        self.update_start()
+        self.updateStart(training_set)
         stop=False
         while not stop:
             if train_stats_collector:
@@ -227,7 +327,7 @@
                     minibatch_set = minibatch.examples()
                     minibatch_set.setAttributes(self.attributeNames(),self.attributes())
                     train_stats_collector.update(minibatch_set)
-            stop = self.end_epoch()
-        self.update_end()
+            stop = self.isLastEpoch()
+        self.updateEnd()
         return self.use
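
The update() loop above calls updateStart(training_set) once, updateMinibatch() on every minibatch, isLastEpoch() after each pass over the training set, and updateEnd() last. Since isLastEpoch() defaults to True, a subclass wanting several passes only has to override that hook; a minimal sketch (the class is invented, and the symbolic setup required by OneShotTLearner.__init__ is omitted):

    class MultiEpochLearner(OneShotTLearner):
        # hypothetical: train for a fixed number of epochs instead of one
        n_epochs = 10
        def updateStart(self, training_set):
            self.epoch = 0
        def isLastEpoch(self):
            self.epoch += 1
            return self.epoch >= self.n_epochs
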
 
--- a/linear_regression.py	Tue May 06 20:01:34 2008 -0400
+++ b/linear_regression.py	Tue May 06 22:24:55 2008 -0400
@@ -55,50 +55,43 @@
        
        - 'lambda' 
        - 'b' 
-       - 'W' 
-       - 'regularization_term' 
+       - 'W'
+       - 'parameters' = (b, W) tuple
+       - 'regularization_term'
+       - 'XtX'
+       - 'XtY'
 
     """
 
     def attributeNames(self):
-        return ["lambda","b","W","regularization_term"]
-    
+        return ["lambda","parameters","b","W","regularization_term","XtX","XtY"]
+
+    def useInputAttributes(self):
+        return ["b","W"]
+
+    def useOutputAttributes(self):
+        return []
+
+    def updateInputAttributes(self):
+        return ["lambda","XtX","XtY"]
+
+    def updateOutputAttributes(self):
+        return ["parameters"] + self.updateMinibatchOutputAttributes() + self.updateEndOutputAttributes()
 
-    def __init__(self):
-        self.input = t.matrix('input') # n_examples x n_inputs
-        self.target = t.matrix('target') # n_examples x n_outputs
-        self.lambda = as_scalar(0.,'lambda')
-        self.theta = t.matrix('theta')
-        self.W = self.theta[:,1:] 
-        self.b = self.theta[:,0]
-        self.XtX = t.matrix('XtX')
-        self.XtY = t.matrix('XtY')
-        self.regularizer = self.lambda * t.dot(self.W,self.W)
-        self.squared_error = 
-        self.loss = self.regularizer + t.sum(self.squared_error) # this only makes sense if the whole training set fits in memory in a minibatch
-        self.loss_function = Function([self.W,self.lambda,self.squared_error],[self.loss])
-        self.new_XtX = self.XtX + t.dot(self.extended_input.T,self.extended_input)
-        self.new_XtY = self.XtY + t.dot(self.extended_input.T,self.target)
-        self.new_theta = t.solve(self.XtX,self.XtY)
+    def updateMinibatchInputFields(self):
+        return ["input","target"]
+    
+    def updateMinibatchInputAttributes(self):
+        return ["XtX","XtY"]
+    
+    def updateMinibatchOutputAttributes(self):
+        return ["new_XtX","new_XtY"]
+    
+    def updateEndInputAttributes(self):
+        return ["theta","XtX","XtY"]
 
-    def initialize(self):
-        self.XtX.resize((1+self.n_inputs,1+self.n_inputs))
-        self.XtY.resize((1+self.n_inputs,self.n_outputs))
-        self.XtX.data[:,:]=0
-        self.XtY.data[:,:]=0
-        numpy.diag(self.XtX.data)[1:]=self.lambda.data
-        
-    def updated_variables(self):
-    
-    def minibatch_wise_inputs(self):
-    def minibatch_wise_outputs(self):
-        # self.input is a (n_examples, n_inputs) minibatch matrix
-        self.extended_input = t.prepend_one_to_each_row(self.input)
-        self.output = t.dot(self.input,self.W.T) + self.b  # (n_examples , n_outputs) matrix
-        self.squared_error = t.sum_within_rows(t.sqr(self.output-self.target)) # (n_examples ) vector
-
-    def attributeNames(self):
-        return ["lambda","b","W","regularization_term","XtX","XtY"]
+    def updateEndOutputAttributes(self):
+        return ["new_theta","b","W","regularization_term"] # CHECK: WILL b AND W CONTAIN OLD OR NEW THETA? @todo i.e. order of computation = ?
 
     def defaultOutputFields(self, input_fields):
         output_fields = ["output"]
@@ -106,58 +99,49 @@
             output_fields.append("squared_error")
         return output_fields
         
-    # general plumbing based on these functions
+    def __init__(self):
+        self._input = t.matrix('input') # n_examples x n_inputs
+        self._target = t.matrix('target') # n_examples x n_outputs
+        self._lambda = as_scalar(0.,'lambda')
+        self._theta = t.matrix('theta')
+        self._W = self._theta[:,1:] 
+        self._b = self._theta[:,0]
+        self._XtX = t.matrix('XtX')
+        self._XtY = t.matrix('XtY')
+        self._extended_input = t.prepend_one_to_each_row(self._input)
+        self._output = t.dot(self._input,self._W.T) + self._b  # (n_examples , n_outputs) matrix
+        self._squared_error = t.sum_within_rows(t.sqr(self._output-self._target)) # (n_examples ) vector
+        self._regularizer = self._lambda * t.sum(t.sqr(self._W)) # squared L2 norm of the weights
+        self._new_XtX = add_inplace(self._XtX,t.dot(self._extended_input.T,self._extended_input))
+        self._new_XtY = add_inplace(self._XtY,t.dot(self._extended_input.T,self._target))
+        self._new_theta = t.solve_inplace(self._theta,self._XtX,self._XtY)
 
-    
-    def __init__(self,lambda=0.,max_memory_use=500):
-        """
-        @type lambda: float
-        @param lambda: regularization coefficient
-        """
-        
-        W=t.matrix('W')
-        # b is a broadcastable row vector (can be replicated into
-        # as many rows as there are examples in the minibach)
-        b=t.row('b')
-        minibatch_input = t.matrix('input') # n_examples x n_inputs
-        minibatch_target = t.matrix('target') # n_examples x n_outputs
-        minibatch_output = t.dot(minibatch_input,W.T) + b  # n_examples x n_outputs
-        lambda = as_scalar(lambda)
-        regularizer = self.lambda * t.dot(W,W)
-        example_squared_error = t.sum_within_rows(t.sqr(minibatch_output-minibatch_target))
-        self.output_function = Function([W,b,minibatch_input],[minibatch_output])
-        self.squared_error_function = Function([minibatch_output,minibatch_target],[self.example_squared_error])
-        self.loss_function = Function([W,squared_error],[self.regularizer + t.sum(self.example_squared_error)])
-        self.W=None
-        self.b=None
-        self.XtX=None
-        self.XtY=None
-        
+        self._n_inputs = None
+        self._n_outputs = None
+        OneShotTLearner.__init__(self)
+
+    def allocate(self,minibatch):
+        minibatch_n_inputs  = minibatch["input"].shape[1]
+        minibatch_n_outputs = minibatch["target"].shape[1]
+        if not self._n_inputs:
+            self._n_inputs = minibatch_n_inputs 
+            self._n_outputs = minibatch_n_outputs
+            self.XtX = numpy.zeros((1+self._n_inputs,1+self._n_inputs))
+            self.XtY = numpy.zeros((1+self._n_inputs,self._n_outputs))
+            self.theta = numpy.zeros((self._n_outputs,1+self._n_inputs))
+            self.forget()
+        elif self._n_inputs!=minibatch_n_inputs or self._n_outputs!=minibatch_n_outputs:
+            # if the input or target changes dimension on the fly, we forget everything
+            self.forget()
+            
     def forget(self):
-        if self.W:
-            self.XtX *= 0
-            self.XtY *= 0
+        if self._n_inputs and self._n_outputs:
+            self.XtX.resize((1+self._n_inputs,1+self._n_inputs))
+            self.XtY.resize((1+self._n_inputs,self._n_outputs))
+            self.XtX[:,:]=0
+            self.XtY[:,:]=0
+            # ridge penalty on the diagonal, excluding the bias entry
+            d = numpy.arange(1,1+self._n_inputs)
+            self.XtX[d,d] = self._lambda.data
 
-    def use(self,input_dataset,output_fieldnames=None,copy_inputs=True):
-        input_fieldnames = input_dataset.fieldNames()
-        assert "input" in input_fieldnames
-        if not output_fields:
-            output_fields = ["output"]
-            if "target" in input_fieldnames:
-                output_fields += ["squared_error"]
-        else:
-            if "squared_error" in output_fields or "total_loss" in output_fields:
-                assert "target" in input_fieldnames
+    def updateEnd(self):
+        OneShotTLearner.updateEnd(self)
+        self.parameters = (self.b,self.W)
 
-        use_functions = []
-        for output_fieldname in output_fieldnames:
-            if output_fieldname=="output":
-                use_functions.append(self.output_function)
-            elif output_fieldname=="squared_error":
-                use_functions.append(lambda self.output_function)
-    
-        n_examples = len(input_dataset)
-        
-        for minibatch in input_dataset.minibatches(minibatch_size=minibatch_size, allow_odd_last_minibatch=True):
-            use_function(
-                             
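
For reference, the symbolic graph above implements ridge regression through its normal equations: with X~ the input matrix extended by a leading column of ones, the learner accumulates XtX = X~^T X~ (plus lambda on the non-bias diagonal) and XtY = X~^T Y across minibatches, then solves XtX theta^T = XtY for the parameters. A plain numpy sketch of the same computation; the function name and signature are invented for illustration:

    import numpy

    def ridge_fit(minibatches, n_inputs, n_outputs, lam=0.1):
        # sufficient statistics, accumulated one minibatch at a time
        XtX = numpy.zeros((1 + n_inputs, 1 + n_inputs))
        XtY = numpy.zeros((1 + n_inputs, n_outputs))
        XtX[1:, 1:] = lam * numpy.eye(n_inputs)  # penalize the weights, not the bias
        for x, y in minibatches:  # x: (m, n_inputs), y: (m, n_outputs)
            xext = numpy.hstack([numpy.ones((x.shape[0], 1)), x])
            XtX += numpy.dot(xext.T, xext)
            XtY += numpy.dot(xext.T, y)
        theta = numpy.linalg.solve(XtX, XtY).T  # (n_outputs, 1 + n_inputs)
        return theta[:, 0], theta[:, 1:]        # b, W
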
--- a/lookup_list.py	Tue May 06 20:01:34 2008 -0400
+++ b/lookup_list.py	Tue May 06 22:24:55 2008 -0400
@@ -9,6 +9,7 @@
     following syntactic constructions work as one would expect:
        example = LookupList(['x','y','z'],[1,2,3])
        example['x'] = [1, 2, 3] # set or change a field
+       print example('z','y') # prints [3,2]
        x, y, z = example
        x = example[0]
        x = example["x"]
@@ -88,7 +89,6 @@
             new_example.append_keyval(item[0],item[1])
         return new_example
 
-        
     def __eq__(self, other):
         return self._values==other._values and self._name2index==other._name2index and self._names==other._names
 
@@ -97,3 +97,9 @@
 
     def __hash__():
         raise NotImplementedError()
+
+    def __call__(self,*names):
+        """
+        Return a list of values associated with the given names (which must all be keys of the lookup list).
+        """
+        return [self[name] for name in names]
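
A quick sketch of the new __call__ accessor in use:

    example = LookupList(['x', 'y', 'z'], [1, 2, 3])
    assert example('z', 'y') == [3, 2]   # values returned in the requested order
    x, y = example('x', 'y')             # convenient for unpacking a few fields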