changeset 78:3499918faa9d

In the middle of designing TLearner
author bengioy@bengiomac.local
date Mon, 05 May 2008 09:35:30 -0400
parents 1e2bb5bad636
children 158653a9bc7c
files dataset.py learner.py linear_regression.py
diffstat 3 files changed, 85 insertions(+), 17 deletions(-) [+]
line wrap: on
line diff
--- a/dataset.py	Sun May 04 15:09:22 2008 -0400
+++ b/dataset.py	Mon May 05 09:35:30 2008 -0400
@@ -88,6 +88,9 @@
      the name <property>. The following properties should be supported:
           - 'description': a textual description or name for the dataset
           - 'fieldtypes': a list of types (one per field)
+    A DataSet may have other attributes that it makes visible to other objects. These are
+    used to store information that is not example-wise but global to the dataset.
+    The list of names of these attributes is given by the attributeNames() method.
 
     Datasets can be concatenated either vertically (increasing the length) or
     horizontally (augmenting the set of fields), if they are compatible, using
@@ -114,7 +117,7 @@
     or other properties of the dataset or associated with the dataset or the result
     of a computation stored in a dataset. These can be accessed through the [key] syntax
     when key is a string (or more specifically, neither an integer, a slice, nor a list).
-    
+
     A DataSet sub-class should always redefine the following methods:
        - __len__ if it is not a stream
        - fieldNames
@@ -125,6 +128,11 @@
        - hasFields
        - __getitem__ may not be feasible with some streams
        - __iter__
+    A sub-class should also append attributes to self._attribute_names
+    (the default value returned by attributeNames()).
+    By convention, attributes not in attributeNames() should have a name
+    starting with an underscore.
+    @todo enforce/test that convention!
     """
 
     numpy_vstack = lambda fieldname,values: return numpy.vstack(values)
@@ -136,6 +144,15 @@
             description = type(self).__name__ + " ( " + join([x.__name__ for x in type(self).__bases__]) + " )"
         self.description=description
         self.fieldtypes=fieldtypes
+        self._attribute_names = ["description"]
+        if fieldtypes:
+            self._attribute_names.append("fieldtypes")
+
+    def attributeNames(self): return self._attribute_names
+
+    def setAttributes(self,attribute_names,attribute_values):
+        for name,value in zip(attribute_names,attribute_values):
+            self.__setattr__(name,value)
     
     class MinibatchToSingleExampleIterator(object):
         """
--- a/learner.py	Sun May 04 15:09:22 2008 -0400
+++ b/learner.py	Mon May 05 09:35:30 2008 -0400
@@ -57,4 +57,35 @@
         """
         raise NotImplementedError
 
+    def attribute_names(self):
+        """
+        A Learner may have attributes that it wishes to export to other objects. To automate
+        such export, sub-classes should override this method to return the names (a list of
+        strings) of these attributes.
+        return []
 
+class TLearner(Learner):
+    """
+    TLearner is a virtual class of Learners that attempts to factor out of the definition
+    of a learner the steps that are common to many implementations of learning algorithms,
+    so as to leave only "the equations" to define in particular sub-classes, using Theano.
+
+    The default implementations of 'use' and 'update' assume that examples in the input
+    dataset are visited sequentially. The 'use' method makes only one pass through the dataset,
+    whereas a sub-learner may wish to iterate over the examples multiple times. Subclasses for which this
+    basic model is not appropriate can simply redefine update or use.
+    
+    Sub-classes must provide the following functions and functionalities:
+      - attributeNames(): defines all the names of attributes which can be used as fields or
+                          attributes in input/output datasets or in stats collectors.
+                          All these attributes are expected to be theano.Result objects
+                          (with a .data property and recognized by theano.Function for compilation).
+                          The sub-class constructor defines the relations between
+                          the Theano variables that may be used by 'use' and 'update'
+                          or by a stats collector.
+      - defaultOutputFields(input_fields): return a list of default dataset output fields when
+                          None are provided by the caller of use.
+      - 
+      
+    """
+    
--- a/linear_regression.py	Sun May 04 15:09:22 2008 -0400
+++ b/linear_regression.py	Mon May 05 09:35:30 2008 -0400
@@ -96,10 +96,10 @@
         self.output = t.dot(self.input,self.W.T) + self.b  # (n_examples , n_outputs) matrix
         self.squared_error = t.sum_within_rows(t.sqr(self.output-self.target)) # (n_examples ) vector
 
-    def attribute_names(self):
+    def attributeNames(self):
         return ["lambda","b","W","regularization_term","XtX","XtY"]
 
-    def default_output_fields(self, input_fields):
+    def defaultOutputFields(self, input_fields):
         output_fields = ["output"]
         if "target" in input_fields:
             output_fields.append("squared_error")
@@ -107,23 +107,37 @@
         
     # poutine generale basee sur ces fonctions
 
-    def minibatchwise_use_functions(self, input_fields, output_fields):
+    def minibatchwise_use_functions(self, input_fields, output_fields, stats_collector):
         if not output_fields:
-            output_fields = self.default_output_fields(input_fields)
+            output_fields = self.defaultOutputFields(input_fields)
+        if stats_collector:
+            stats_collector_inputs = stats_collector.inputUpdateAttributes()
+            for attribute in stats_collector_inputs:
+                if attribute not in input_fields:
+                    output_fields.append(attribute)
         key = (input_fields,output_fields)
-        if key not in use_functions_dictionary:
-            use_functions_dictionary[key]=Function(self.names2attributes(input_fields),
+        if key not in self.use_functions_dictionary:
+            self.use_functions_dictionary[key]=Function(self.names2attributes(input_fields),
                                                    self.names2attributes(output_fields))
-        return use_functions_dictionary[key]
+        return self.use_functions_dictionary[key]
 
-    def names2attributes(self,names,return_Result=True):
+    def attributes(self,return_copy=False):
+        return self.names2attributes(self.attributeNames())
+            
+    def names2attributes(self,names,return_Result=False, return_copy=False):
         if return_Result:
-            return [self.__getattr__(name) for name in names]
+            if return_copy:
+                return [copy.deepcopy(self.__getattr__(name)) for name in names]
+            else:
+                return [self.__getattr__(name) for name in names]
         else:
-            return [self.__getattr__(name).data for name in names]
+            if return_copy:
+                return [copy.deepcopy(self.__getattr__(name).data) for name in names]
+            else:
+                return [self.__getattr__(name).data for name in names]
 
     def use(self,input_dataset,output_fieldnames=None,test_stats_collector=None,copy_inputs=True):
-        minibatchwise_use_function = use_functions(input_dataset.fieldNames(),output_fieldnames)
+        minibatchwise_use_function = minibatchwise_use_functions(input_dataset.fieldNames(),output_fieldnames,test_stats_collector)
         virtual_output_dataset = ApplyFunctionDataSet(input_dataset,
                                                       minibatchwise_use_function,
                                                       True,DataSet.numpy_vstack,
@@ -133,17 +147,23 @@
         if copy_inputs:
             output_dataset = input_dataset | output_dataset
         # compute the attributes that should be copied in the dataset
-        for attribute in self.attribute_names():
-            # .data assumes that all attributes are Result objects
-            output_dataset.__setattr__(attribute) = copy.deepcopy(self.__getattr__(attribute).data)
+        output_dataset.setAttributes(self.attributeNames(),self.attributes(return_copy=True))
         if test_stats_collector:
             test_stats_collector.update(output_dataset)
-            for attribute in test_stats_collector.attribute_names():
+            for attribute in test_stats_collector.attributeNames():
                 output_dataset[attribute] = copy.deepcopy(test_stats_collector[attribute])
         return output_dataset
 
     def update(self,training_set,train_stats_collector=None):
-        
+        self.update_start()
+        for minibatch in training_set.minibatches(self.training_set_input_fields, minibatch_size=self.minibatch_size):
+            self.update_minibatch(minibatch)
+            if train_stats_collector:
+                minibatch_set = minibatch.examples()
+                minibatch_set.setAttributes(self.attributeNames(),self.attributes())
+                train_stats_collector.update(minibatch_set)
+        self.update_end()
+        return self.use
     
     def __init__(self,lambda=0.,max_memory_use=500):
         """