# HG changeset patch
# User bengioy@bengiomac.local
# Date 1209928162 14400
# Node ID 1e2bb5bad636c61130405a8e246e46b6e2652a68
# Parent  ccd6ae89a7c4c15fe4ae84f9630391b8d8f76a4b
toying with different ways to implement learners

diff -r ccd6ae89a7c4 -r 1e2bb5bad636 dataset.py
--- a/dataset.py Sat May 03 22:00:37 2008 -0400
+++ b/dataset.py Sun May 04 15:09:22 2008 -0400
@@ -127,6 +127,9 @@
       - __iter__
     """
 
+    numpy_vstack = lambda fieldname,values: numpy.vstack(values)
+    numpy_hstack = lambda fieldnames,values: numpy.hstack(values)
+
     def __init__(self,description=None,fieldtypes=None):
         if description is None:
             # by default return "(,,...)"
@@ -970,7 +973,10 @@
   (and cached) upon construction of the CachedDataSet, rather at the
   first access.
 
-  TODO: add disk-buffering capability, so that when the cache becomes too
+  @todo when cache_all_upon_construction create mini-batches that are as
+  large as possible but not so large as to fill up memory.
+
+  @todo add disk-buffering capability, so that when the cache becomes too
   big for memory, we cache things on disk, trying to keep in memory only
   the record most likely to be accessed next.
   """
@@ -978,6 +984,10 @@
       self.source_dataset=source_dataset
       self.cache_all_upon_construction=cache_all_upon_construction
       if cache_all_upon_construction:
+          # this potentially brings all the source examples
+          # into memory at once, which may be too much
+          # the work could possibly be done by minibatches
+          # that are as large as possible but no more than what memory allows.
           self.cached_examples = zip(*source_dataset.minibatches(minibatch_size=len(source_dataset)).__iter__().next())
       else:
           self.cached_examples = []
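The numpy_vstack and numpy_hstack class attributes added above look like aggregation callbacks: given the values a field takes across several minibatches, they concatenate them back into one array (they are passed to ApplyFunctionDataSet further down, in linear_regression.py). A minimal plain-numpy sketch of that behaviour, with made-up shapes and a hypothetical local name (vstack_field), assuming this reading is correct:

    import numpy

    # stack the arrays produced for one field by successive minibatches
    # along the example axis, in the same spirit as DataSet.numpy_vstack above
    def vstack_field(fieldname, values):
        return numpy.vstack(values)

    # two hypothetical minibatches of 2 examples with 3 output values each
    minibatch_outputs = [numpy.zeros((2, 3)), numpy.ones((2, 3))]
    whole_field = vstack_field('output', minibatch_outputs)   # shape (4, 3)
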
""" raise NotImplementedError + + diff -r ccd6ae89a7c4 -r 1e2bb5bad636 linear_regression.py --- a/linear_regression.py Sat May 03 22:00:37 2008 -0400 +++ b/linear_regression.py Sun May 04 15:09:22 2008 -0400 @@ -53,14 +53,99 @@ - 'b' (only set by update) - 'W' (only set by update) - - 'total_squared_error' (set by use and by update) = sum over examples of example_wise_squared_error - - 'total_loss' (set by use and by update) = regularizer + total_squared_error + - 'regularization_term' (only set by update) - 'XtX' (only set by update) - 'XtY' (only set by update) """ - def __init__(self,lambda=0.): +# definitions specifiques a la regression lineaire: + + def global_inputs(self): + self.lambda = as_scalar(0.,'lambda') + self.theta = t.matrix('theta') + self.W = self.theta[:,1:] + self.b = self.theta[:,0] + self.XtX = t.matrix('XtX') + self.XtY = t.matrix('XtY') + + def global_outputs(self): + self.regularizer = self.lambda * t.dot(self.W,self.W) + self.loss = self.regularizer + t.sum(self.squared_error) # this only makes sense if the whole training set fits in memory in a minibatch + self.loss_function = Function([self.W,self.lambda,self.squared_error],[self.loss]) + + def initialize(self): + self.XtX.resize((1+self.n_inputs,1+self.n_inputs)) + self.XtY.resize((1+self.n_inputs,self.n_outputs)) + self.XtX.data[:,:]=0 + self.XtY.data[:,:]=0 + numpy.diag(self.XtX.data)[1:]=self.lambda.data + + def updated_variables(self): + self.new_XtX = self.XtX + t.dot(self.extended_input.T,self.extended_input) + self.new_XtY = self.XtY + t.dot(self.extended_input.T,self.target) + self.new_theta = t.solve(self.XtX,self.XtY) + + def minibatch_wise_inputs(self): + self.input = t.matrix('input') # n_examples x n_inputs + self.target = t.matrix('target') # n_examples x n_outputs + + def minibatch_wise_outputs(self): + # self.input is a (n_examples, n_inputs) minibatch matrix + self.extended_input = t.prepend_one_to_each_row(self.input) + self.output = t.dot(self.input,self.W.T) + self.b # (n_examples , n_outputs) matrix + self.squared_error = t.sum_within_rows(t.sqr(self.output-self.target)) # (n_examples ) vector + + def attribute_names(self): + return ["lambda","b","W","regularization_term","XtX","XtY"] + + def default_output_fields(self, input_fields): + output_fields = ["output"] + if "target" in input_fields: + output_fields.append("squared_error") + return output_fields + + # poutine generale basee sur ces fonctions + + def minibatchwise_use_functions(self, input_fields, output_fields): + if not output_fields: + output_fields = self.default_output_fields(input_fields) + key = (input_fields,output_fields) + if key not in use_functions_dictionary: + use_functions_dictionary[key]=Function(self.names2attributes(input_fields), + self.names2attributes(output_fields)) + return use_functions_dictionary[key] + + def names2attributes(self,names,return_Result=True): + if return_Result: + return [self.__getattr__(name) for name in names] + else: + return [self.__getattr__(name).data for name in names] + + def use(self,input_dataset,output_fieldnames=None,test_stats_collector=None,copy_inputs=True): + minibatchwise_use_function = use_functions(input_dataset.fieldNames(),output_fieldnames) + virtual_output_dataset = ApplyFunctionDataSet(input_dataset, + minibatchwise_use_function, + True,DataSet.numpy_vstack, + DataSet.numpy_hstack) + # actually force the computation + output_dataset = CachedDataSet(virtual_output_dataset,True) + if copy_inputs: + output_dataset = input_dataset | output_dataset + # compute the 
diff -r ccd6ae89a7c4 -r 1e2bb5bad636 linear_regression.py
--- a/linear_regression.py Sat May 03 22:00:37 2008 -0400
+++ b/linear_regression.py Sun May 04 15:09:22 2008 -0400
@@ -53,14 +53,99 @@
        - 'b' (only set by update)
        - 'W' (only set by update)
-       - 'total_squared_error' (set by use and by update) = sum over examples of example_wise_squared_error
-       - 'total_loss' (set by use and by update) = regularizer + total_squared_error
+       - 'regularization_term' (only set by update)
        - 'XtX' (only set by update)
        - 'XtY' (only set by update)
 
     """
 
-    def __init__(self,lambda=0.):
+#   definitions specific to linear regression:
+
+    def global_inputs(self):
+        self.lambda = as_scalar(0.,'lambda')
+        self.theta = t.matrix('theta')
+        self.W = self.theta[:,1:]
+        self.b = self.theta[:,0]
+        self.XtX = t.matrix('XtX')
+        self.XtY = t.matrix('XtY')
+
+    def global_outputs(self):
+        self.regularizer = self.lambda * t.dot(self.W,self.W)
+        self.loss = self.regularizer + t.sum(self.squared_error) # this only makes sense if the whole training set fits in memory in a minibatch
+        self.loss_function = Function([self.W,self.lambda,self.squared_error],[self.loss])
+
+    def initialize(self):
+        self.XtX.resize((1+self.n_inputs,1+self.n_inputs))
+        self.XtY.resize((1+self.n_inputs,self.n_outputs))
+        self.XtX.data[:,:]=0
+        self.XtY.data[:,:]=0
+        numpy.diag(self.XtX.data)[1:]=self.lambda.data
+
+    def updated_variables(self):
+        self.new_XtX = self.XtX + t.dot(self.extended_input.T,self.extended_input)
+        self.new_XtY = self.XtY + t.dot(self.extended_input.T,self.target)
+        self.new_theta = t.solve(self.XtX,self.XtY)
+
+    def minibatch_wise_inputs(self):
+        self.input = t.matrix('input') # n_examples x n_inputs
+        self.target = t.matrix('target') # n_examples x n_outputs
+
+    def minibatch_wise_outputs(self):
+        # self.input is a (n_examples, n_inputs) minibatch matrix
+        self.extended_input = t.prepend_one_to_each_row(self.input)
+        self.output = t.dot(self.input,self.W.T) + self.b # (n_examples , n_outputs) matrix
+        self.squared_error = t.sum_within_rows(t.sqr(self.output-self.target)) # (n_examples ) vector
+
+    def attribute_names(self):
+        return ["lambda","b","W","regularization_term","XtX","XtY"]
+
+    def default_output_fields(self, input_fields):
+        output_fields = ["output"]
+        if "target" in input_fields:
+            output_fields.append("squared_error")
+        return output_fields
+
+    # general machinery based on these functions
+
+    def minibatchwise_use_functions(self, input_fields, output_fields):
+        if not output_fields:
+            output_fields = self.default_output_fields(input_fields)
+        key = (input_fields,output_fields)
+        if key not in self.use_functions_dictionary:
+            self.use_functions_dictionary[key]=Function(self.names2attributes(input_fields),
+                                                        self.names2attributes(output_fields))
+        return self.use_functions_dictionary[key]
+
+    def names2attributes(self,names,return_Result=True):
+        if return_Result:
+            return [self.__getattr__(name) for name in names]
+        else:
+            return [self.__getattr__(name).data for name in names]
+
+    def use(self,input_dataset,output_fieldnames=None,test_stats_collector=None,copy_inputs=True):
+        minibatchwise_use_function = self.minibatchwise_use_functions(input_dataset.fieldNames(),output_fieldnames)
+        virtual_output_dataset = ApplyFunctionDataSet(input_dataset,
+                                                      minibatchwise_use_function,
+                                                      True,DataSet.numpy_vstack,
+                                                      DataSet.numpy_hstack)
+        # actually force the computation
+        output_dataset = CachedDataSet(virtual_output_dataset,True)
+        if copy_inputs:
+            output_dataset = input_dataset | output_dataset
+        # compute the attributes that should be copied in the dataset
+        for attribute in self.attribute_names():
+            # .data assumes that all attributes are Result objects
+            output_dataset.__setattr__(attribute,copy.deepcopy(self.__getattr__(attribute).data))
+        if test_stats_collector:
+            test_stats_collector.update(output_dataset)
+            for attribute in test_stats_collector.attribute_names():
+                output_dataset[attribute] = copy.deepcopy(test_stats_collector[attribute])
+        return output_dataset
+
+    def update(self,training_set,train_stats_collector=None):
+
+
+    def __init__(self,lambda=0.,max_memory_use=500):
         """
         @type lambda: float
         @param lambda: regularization coefficient
@@ -107,3 +192,8 @@
             elif output_fieldname=="squared_error":
                 use_functions.append(lambda self.output_function)
 
+        n_examples = len(input_dataset)
+
+        for minibatch in input_dataset.minibatches(minibatch_size=minibatch_size, allow_odd_last_minibatch=True):
+            use_function(
+
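The update() method and the last hunk are left unfinished in this changeset (the minibatch loop stops at the dangling use_function( call). A plausible reading, given the max_memory_use argument and the XtX/XtY variables above, is that the loop would accumulate the sufficient statistics one minibatch at a time and then re-solve for theta; the following is only a plain-numpy sketch under that assumption, with assumed names (accumulate_and_solve, minibatches yielding (input, target) pairs):

    import numpy

    def accumulate_and_solve(minibatches, n_inputs, n_outputs, lambda_=0.):
        XtX = numpy.zeros((1 + n_inputs, 1 + n_inputs))
        XtY = numpy.zeros((1 + n_inputs, n_outputs))
        idx = numpy.arange(1, n_inputs + 1)
        XtX[idx, idx] = lambda_                   # regularizer on the diagonal, bias excluded
        for X, Y in minibatches:                  # only one minibatch needs to fit in memory
            Z = numpy.hstack([numpy.ones((len(X), 1)), X])
            XtX += numpy.dot(Z.T, Z)
            XtY += numpy.dot(Z.T, Y)
        return numpy.linalg.solve(XtX, XtY)       # theta, shape (1 + n_inputs, n_outputs)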