diff linear_regression.py @ 92:c4726e19b8ec

Finished first draft of TLearner
author Yoshua Bengio <bengioy@iro.umontreal.ca>
date Mon, 05 May 2008 18:14:32 -0400
parents 3499918faa9d
children c4916445e025
--- a/linear_regression.py	Mon May 05 11:49:40 2008 -0400
+++ b/linear_regression.py	Mon May 05 18:14:32 2008 -0400
@@ -11,7 +11,12 @@
     Implement linear regression, with or without L2 regularization
     (the former is called Ridge Regression and the latter Ordinary Least Squares).
 
-    The predictor is obtained analytically.
+    The predictor parameters are obtained analytically from the training set.
+    Training can proceed sequentially (with multiple calls to update, each
+    on a disjoint subset of the training set). After each call to update,
+    the predictor is ready to be used, optimized for the union of all the
+    training sets passed to update since construction (or since the last
+    call to forget).
 
     The solution is obtained analytically for a given L2 regularization coefficient lambda.
     For each (input[t],output[t]) pair in a minibatch,::
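
The docstring above describes sequential analytic training: the sufficient
statistics XtX = X'X and XtY = X'Y are accumulated across minibatches, and the
regularized normal equations (XtX + lambda*I) theta = XtY are solved after each
update. As a minimal numpy sketch of that scheme (independent of this module's
DataSet/Function machinery; the class RidgeSketch and the name lambda_ are
illustrative, not part of the module's API)::

    import numpy as np

    class RidgeSketch:
        """Sketch of sequential analytic ridge regression: accumulate XtX and
        XtY over minibatches, then solve the regularized normal equations.
        Illustrative only; not this module's API."""

        def __init__(self, n_inputs, n_outputs, lambda_=0.):
            self.lambda_ = lambda_
            # one extra row/column for the bias b, absorbed into theta
            self.XtX = np.zeros((n_inputs + 1, n_inputs + 1))
            self.XtY = np.zeros((n_inputs + 1, n_outputs))

        def forget(self):
            # discard the sufficient statistics accumulated so far
            self.XtX[:] = 0.
            self.XtY[:] = 0.

        def update(self, X, Y):
            # append a column of ones so the bias is learned along with W
            Xb = np.hstack([np.ones((X.shape[0], 1)), X])
            self.XtX += Xb.T @ Xb
            self.XtY += Xb.T @ Y
            # re-solve after every call: the predictor is then ready to use
            reg = self.lambda_ * np.eye(self.XtX.shape[0])
            reg[0, 0] = 0.  # conventionally the bias is not penalized
            self.theta = np.linalg.solve(self.XtX + reg, self.XtY)

        def use(self, X):
            Xb = np.hstack([np.ones((X.shape[0], 1)), X])
            return Xb @ self.theta
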
@@ -45,22 +50,25 @@
 
      - optional input attributes (optionally expected as input_dataset attributes)
 
-       - 'lambda' (only used by update)
-       - 'b' (only used by use)
-       - 'W' (only used by use)
-
-     - optional output attributes (available in self and optionally in output dataset)
-
-       - 'b' (only set by update)
-       - 'W' (only set by update)
-       - 'regularization_term' (only set by update)
-       - 'XtX' (only set by update)
-       - 'XtY' (only set by update)
+     - optional attributes (optionally expected as input_dataset attributes)
+       (warning: this may be dangerous; the 'use' method will use the values
+       provided in the input_dataset rather than those learned during 'update',
+       and there is currently no support for providing these to 'update'):
        
+       - 'lambda_'
+       - 'b' 
+       - 'W' 
+       - 'regularization_term' 
+       - 'XtX'
+       - 'XtY'
     """
 
+    def attributeNames(self):
+        return ["lambda","b","W","regularization_term","XtX","XtY"]
+    
# definitions specific to linear regression:
 
+
     def global_inputs(self):
         self.lambda_ = as_scalar(0., 'lambda')  # 'lambda' is a Python keyword, so store as lambda_
         self.theta = t.matrix('theta')
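
A hypothetical usage of the sketch above, showing the sequential-training
contract: update is called on two disjoint subsets, and after each call the
predictor is ready and optimized for the union of the data seen so far::

    rng = np.random.default_rng(0)
    X = rng.normal(size=(100, 3))
    Y = X @ np.array([[1.], [2.], [-1.]]) + 0.5

    model = RidgeSketch(n_inputs=3, n_outputs=1, lambda_=0.1)
    model.update(X[:50], Y[:50])    # the predictor is already usable here
    model.update(X[50:], Y[50:])    # now optimized for the union (all 100 rows)
    predictions = model.use(X)      # shape (100, 1)
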
@@ -107,63 +115,6 @@
         
     # general machinery based on these functions
 
-    def minibatchwise_use_functions(self, input_fields, output_fields, stats_collector):
-        if not output_fields:
-            output_fields = self.defaultOutputFields(input_fields)
-        if stats_collector:
-            stats_collector_inputs = stats_collector.inputUpdateAttributes()
-            for attribute in stats_collector_inputs:
-                if attribute not in input_fields:
-                    output_fields.append(attribute)
-        key = (input_fields,output_fields)
-        if key not in self.use_functions_dictionary:
-            self.use_functions_dictionary[key]=Function(self.names2attributes(input_fields),
-                                                   self.names2attributes(output_fields))
-        return self.use_functions_dictionary[key]
-
-    def attributes(self,return_copy=False):
-        return self.names2attributes(self.attributeNames())
-            
-    def names2attributes(self,names,return_Result=False, return_copy=False):
-        if return_Result:
-            if return_copy:
-                return [copy.deepcopy(self.__getattr__(name)) for name in names]
-            else:
-                return [self.__getattr__(name) for name in names]
-        else:
-            if return_copy:
-                return [copy.deepcopy(self.__getattr__(name).data) for name in names]
-            else:
-                return [self.__getattr__(name).data for name in names]
-
-    def use(self,input_dataset,output_fieldnames=None,test_stats_collector=None,copy_inputs=True):
-        minibatchwise_use_function = minibatchwise_use_functions(input_dataset.fieldNames(),output_fieldnames,test_stats_collector)
-        virtual_output_dataset = ApplyFunctionDataSet(input_dataset,
-                                                      minibatchwise_use_function,
-                                                      True,DataSet.numpy_vstack,
-                                                      DataSet.numpy_hstack)
-        # actually force the computation
-        output_dataset = CachedDataSet(virtual_output_dataset,True)
-        if copy_inputs:
-            output_dataset = input_dataset | output_dataset
-        # compute the attributes that should be copied in the dataset
-        output_dataset.setAttributes(self.attributeNames(),self.attributes(return_copy=True))
-        if test_stats_collector:
-            test_stats_collector.update(output_dataset)
-            for attribute in test_stats_collector.attributeNames():
-                output_dataset[attribute] = copy.deepcopy(test_stats_collector[attribute])
-        return output_dataset
-
-    def update(self,training_set,train_stats_collector=None):
-        self.update_start()
-        for minibatch in training_set.minibatches(self.training_set_input_fields, minibatch_size=self.minibatch_size):
-            self.update_minibatch(minibatch)
-            if train_stats_collector:
-                minibatch_set = minibatch.examples()
-                minibatch_set.setAttributes(self.attributeNames(),self.attributes())
-                train_stats_collector.update(minibatch_set)
-        self.update_end()
-        return self.use
     
     def __init__(self, lambda_=0., max_memory_use=500):  # 'lambda' is a Python keyword
         """