changeset 421:e01f17be270a

Kernel regression learning algorithm
author Yoshua Bengio <bengioy@iro.umontreal.ca>
date Sat, 19 Jul 2008 10:11:22 -0400
parents 040cb796f4e0
children 32c5f87bc54e
files kernel_regression.py linear_regression.py
diffstat 2 files changed, 214 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kernel_regression.py	Sat Jul 19 10:11:22 2008 -0400
@@ -0,0 +1,209 @@
+"""
+Implementation of kernel regression.
+"""
+
+from pylearn.learner import OfflineLearningAlgorithm
+from theano import tensor as T
+from nnet_ops import prepend_1_to_each_row
+from theano.scalar import as_scalar
+from common.autoname import AutoName
+import theano
+import numpy
+# assumed to come from pylearn's dataset module; used by KernelPredictor.__call__
+from dataset import ApplyFunctionDataSet, CachedDataSet
+
+class KernelRegression(OfflineLearningAlgorithm):
+    """
+Implementation of kernel regression:
+* the data are n (x_t,y_t) pairs and we want to estimate E[y|x]
+* the predictor computes
+     f(x) = b + \sum_{t=1}^n \alpha_t K(x,x_t)
+  with free parameters b and alpha, training inputs x_t,
+  and kernel function K (gaussian by default).
+  Clearly, each prediction involves O(n) computations.
+* the learner chooses b and alpha to minimize
+     lambda alpha' G alpha + \sum_{t=1}^n (f(x_t)-y_t)^2
+  where G is the matrix with entries G_ij = K(x_i,x_j).
+  The first (L2 regularization) term is the squared L2
+  norm of the primal weights w = \sum_t \alpha_t phi(x_t)
+  where phi is the function s.t. K(u,v)=phi(u).phi(v).
+* this involves solving a linear system with an (n+1)x(n+1)
+  matrix, which is an O(n^3) computation. In addition,
+  that linear system matrix requires O(n^2) memory.
+  So this learning algorithm should be used only for
+  small datasets.
+* the linear system is
+      (M + lambda I_n) theta = (1, y)'
+  where theta = (b, alpha), I_n is the (n+1)x(n+1) matrix that is the identity 
+  except with a 0 at (0,0), M is the matrix with G in the sub-matrix starting 
+  at (1,1), 1's in column 0, except for a value of n at (0,0), and sum_i G_{i,j} 
+  in the rest of row 0 (a NumPy sketch of this solve follows below).
+  
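+For illustration, here is a minimal NumPy-only sketch of this closed-form
+solve (a hypothetical stand-alone helper, not part of this class; X is the
+(n, n_inputs) input matrix and y the (n, n_outputs) target matrix):
+
+    import numpy
+
+    def solve_kernel_regression(X, y, gamma=1.0, L2_regularizer=0.1):
+        n = len(X)
+        sq = numpy.sum(X*X, axis=1)
+        # Gaussian Gram matrix G_ij = exp(-||x_i - x_j||^2 / gamma^2)
+        G = numpy.exp(-(sq[:,None] - 2*numpy.dot(X, X.T) + sq[None,:])/gamma**2)
+        M = numpy.zeros((n+1, n+1))
+        M[0,0] = n
+        M[1:,0] = 1
+        M[0,1:] = G.sum(axis=0)
+        M[1:,1:] = G + L2_regularizer*numpy.eye(n)
+        rhs = numpy.vstack([numpy.ones((1, y.shape[1])), y])
+        theta = numpy.linalg.solve(M, rhs)
+        return theta[0], theta[1:] # b, alpha
+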
+Note that this gives an estimate of E[y|x,training_set] that is the
+same as obtained with a Gaussian process regression. The GP
+regression would also provide a Bayesian Var[y|x,training_set].
+It corresponds to an assumption that f is a random variable
+with Gaussian (process) prior distribution with covariance
+function K. Because we assume Gaussian noise we obtain a Gaussian
+posterior for f (whose mean is computed here).
+
+
+    Usage:
+
+       kernel_regressor=KernelRegression(L2_regularizer=0.1,kernel=GaussianKernel(gamma=0.5))
+       kernel_predictor=kernel_regressor(training_set)
+       all_results_dataset=kernel_predictor(test_set) # creates a dataset with "output" and "squared_error" fields
+       outputs = kernel_predictor.compute_outputs(inputs) # inputs and outputs are numpy arrays
+       outputs, errors = kernel_predictor.compute_outputs_and_errors(inputs,targets)
+       errors = kernel_predictor.compute_errors(inputs,targets)
+       mse = kernel_predictor.compute_mse(inputs,targets)
+       
+       
+
+    The training_set must have fields "input" and "target".
+    The test_set must have field "input", and needs "target" if
+    we want to compute the squared errors.
+
+    The predictor parameters are obtained analytically from the training set.
+    Training is only done on a whole training set rather than on minibatches
+    (no online implementation).
+
+    The dataset fields expected and produced by the learning algorithm and the trained model
+    are the following:
+
+     - Input and output dataset fields (example-wise quantities):
+
+       - 'input' (always expected as an input_dataset field)
+       - 'target' (always expected by the learning algorithm, optional for learned model)
+       - 'output' (always produced by learned model)
+       - 'squared_error' (optionally produced by learned model if 'target' is provided)
+          = example-wise squared error
+    """
+    def __init__(self, kernel=None, L2_regularizer=0, gamma=1):
+        self.kernel = kernel
+        self.L2_regularizer=L2_regularizer
+        self.gamma = gamma # until things are generalized, the kernel type is fixed to Gaussian
+        self.equations = KernelRegressionEquations()
+
+    def __call__(self,trainset):
+        n_examples = len(trainset)
+        first_example = trainset[0]
+        n_inputs = first_example['input'].size
+        n_outputs = first_example['target'].size
+        M = numpy.zeros((n_examples+1,n_examples+1))
+        Y = numpy.zeros((n_examples+1,n_outputs))
+        for i in xrange(n_examples): # regularize the n_examples alpha weights, not the bias
+            M[i+1,i+1]=self.L2_regularizer
+        data = trainset.fields()
+        train_inputs = numpy.array(data['input'])
+        Y[0]=1
+        Y[1:,:] = numpy.array(data['target'])
+        M,train_inputs_square=self.equations.compute_system_matrix(train_inputs,self.gamma,M)
+        theta=numpy.linalg.solve(M,Y)
+        return KernelPredictor(theta,self.gamma, train_inputs, train_inputs_square)
+
+class KernelPredictorEquations(AutoName):
+    train_inputs = T.matrix() # n_examples x n_inputs
+    train_inputs_square = T.vector() # n_examples
+    inputs = T.matrix() # minibatchsize x n_inputs
+    targets = T.matrix() # minibatchsize x n_outputs
+    theta = T.matrix() # (n_examples+1) x n_outputs
+    gamma = T.scalar()
+    inv_gamma2 = 1./(gamma*gamma)
+    b = theta[0]
+    alpha = theta[1:,:]
+    inputs_square = T.sum(inputs*inputs,axis=1) # minibatchsize
+    # squared distances ||x - x_t||^2 = ||x_t||^2 - 2 x.x_t + ||x||^2;
+    # inputs_square is made a column so it broadcasts across the training examples
+    Kx = T.exp(-(train_inputs_square-2*T.dot(inputs,train_inputs.T)+inputs_square.dimshuffle(0,'x'))*inv_gamma2)
+    outputs = T.dot(Kx,alpha) + b # minibatchsize x n_outputs
+    squared_errors = T.sum(T.sqr(targets-outputs),axis=1)
+
+    __compiled = False
+    @classmethod
+    def compile(cls,linker='c|py'):
+        if cls.__compiled:
+            return
+        def fn(input_vars,output_vars):
+            return staticmethod(theano.function(input_vars,output_vars, linker=linker))
+
+        cls.compute_outputs = fn([cls.inputs,cls.theta,cls.gamma,cls.train_inputs,cls.train_inputs_square],cls.outputs)
+        cls.compute_errors = fn([cls.outputs,cls.targets],cls.squared_errors)
+
+        cls.__compiled = True
+
+    def __init__(self):
+        self.compile()
+        
+class KernelRegressionEquations(KernelPredictorEquations):
+    M = T.matrix() # (n_examples+1) x (n_examples+1)
+    inputs = T.matrix() # n_examples x n_inputs
+    gamma = T.scalar()
+    inputs_square = T.sum(inputs*inputs,axis=1) # n_examples
+    # Gram matrix G_ij = K(x_i,x_j), with the same Gaussian kernel as the predictor
+    G = T.exp(-(inputs_square.dimshuffle(0,'x')-2*T.dot(inputs,inputs.T)+inputs_square)/(gamma*gamma))
+    # fill M as described in the class docstring: G (plus the regularizer already
+    # on its diagonal) in the sub-matrix starting at (1,1), n at (0,0), 1's in
+    # the rest of column 0, and column sums of G in the rest of row 0
+    M2 = T.set_subtensor(M[1:,1:], M[1:,1:]+G)
+    M2 = T.set_subtensor(M2[0,0], T.cast(M.shape[0]-1, M.dtype))
+    M2 = T.set_subtensor(M2[1:,0], 1.)
+    M2 = T.set_subtensor(M2[0,1:], T.sum(G,axis=0))
+    
+    __compiled = False
+    
+    @classmethod
+    def compile(cls,linker='c|py'):
+        if cls.__compiled:
+            return
+        def fn(input_vars,output_vars):
+            return staticmethod(theano.function(input_vars,output_vars, linker=linker))
+
+        cls.compute_system_matrix = fn([cls.inputs,cls.gamma,cls.M],[cls.M2,cls.inputs_square])
+
+        cls.__compiled = True
+
+    def __init__(self):
+        self.compile()
+
+class KernelPredictor(object):
+    """
+    A kernel predictor has parameters theta (a bias vector and a weight matrix alpha)
+    that it uses to make a non-linear prediction (according to the KernelPredictorEquations).
+    It can compute its output (bias + alpha * kernel(train_inputs,input)) and a squared error (||output - target||^2).
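+
+    For reference, a minimal NumPy-only sketch of the same computation
+    (a hypothetical helper; X_train are the training inputs, X the query
+    inputs, and b, alpha, gamma the parameters above):
+
+        import numpy
+
+        def kernel_predict(X, b, alpha, X_train, gamma):
+            sq_train = numpy.sum(X_train*X_train, axis=1) # ||x_t||^2
+            sq = numpy.sum(X*X, axis=1)                   # ||x||^2
+            Kx = numpy.exp(-(sq_train - 2*numpy.dot(X, X_train.T) + sq[:,None])/gamma**2)
+            return b + numpy.dot(Kx, alpha)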
+    """
+    def __init__(self, theta, gamma, train_inputs, train_inputs_square):
+        self.theta=theta
+        self.gamma=gamma
+        self.train_inputs=train_inputs
+        self.train_inputs_square=train_inputs_square
+        self.equations = KernelPredictorEquations()
+
+    def compute_outputs(self,inputs):
+        return self.equations.compute_outputs(inputs,self.theta,self.gamma,self.train_inputs,self.train_inputs_square)
+    def compute_errors(self,inputs,targets):
+        return self.equations.compute_errors(self.compute_outputs(inputs),targets)
+    def compute_outputs_and_errors(self,inputs,targets):
+        outputs = self.compute_outputs(inputs)
+        return [outputs,self.equations.compute_errors(outputs,targets)]
+    def compute_mse(self,inputs,targets):
+        errors = self.compute_errors(inputs,targets)
+        return numpy.sum(errors)/errors.size
+    
+    def __call__(self,dataset,output_fieldnames=None,cached_output_dataset=False):
+        assert dataset.hasFields(["input"])
+        if output_fieldnames is None:
+            if dataset.hasFields(["target"]):
+                output_fieldnames = ["output","squared_error"]
+            else:
+                output_fieldnames = ["output"]
+        output_fieldnames.sort()
+        if output_fieldnames == ["squared_error"]:
+            f = self.compute_errors
+        elif output_fieldnames == ["output"]:
+            f = self.compute_outputs
+        elif output_fieldnames == ["output","squared_error"]:
+            f = self.compute_outputs_and_errors
+        else:
+            raise ValueError("unknown field(s) in output_fieldnames: "+str(output_fieldnames))
+        
+        ds=ApplyFunctionDataSet(dataset,f,output_fieldnames)
+        if cached_output_dataset:
+            return CachedDataSet(ds)
+        else:
+            return ds
+        
+
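+# A hypothetical NumPy-only smoke test (not part of the pylearn API): with no
+# regularization, rows 1..n of the linear system read b + G alpha = y, so the
+# fitted predictor should (nearly) interpolate its training targets.
+if __name__ == '__main__':
+    rng = numpy.random.RandomState(0)
+    X = rng.randn(20, 3)
+    y = rng.randn(20, 2)
+    gamma = 1.0
+    sq = numpy.sum(X*X, axis=1)
+    G = numpy.exp(-(sq[:,None] - 2*numpy.dot(X, X.T) + sq[None,:])/gamma**2)
+    n = len(X)
+    M = numpy.zeros((n+1, n+1))
+    M[0,0] = n
+    M[1:,0] = 1
+    M[0,1:] = G.sum(axis=0)
+    M[1:,1:] = G
+    rhs = numpy.vstack([numpy.ones((1, y.shape[1])), y])
+    theta = numpy.linalg.solve(M, rhs)
+    b, alpha = theta[0], theta[1:]
+    print "max abs training error:", abs(b + numpy.dot(G, alpha) - y).max()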
--- a/linear_regression.py	Tue Jul 15 12:57:21 2008 -0400
+++ b/linear_regression.py	Sat Jul 19 10:11:22 2008 -0400
@@ -34,12 +34,15 @@
     we want to compute the squared errors.
 
     The predictor parameters are obtained analytically from the training set.
+
+    *** NOT IMPLEMENTED YET ***
     Training can proceed sequentially (with multiple calls to update with
     different disjoint subsets of the training sets). After each call to
     update the predictor is ready to be used (and optimized for the union
     of all the training sets passed to update since construction or since
     the last call to forget).
-
+    ***************************
+    
     For each (input[t],output[t]) pair in a minibatch,::
     
        output_t = b + W * input_t
@@ -74,7 +77,7 @@
     def __init__(self, L2_regularizer=0,minibatch_size=10000):
         self.L2_regularizer=L2_regularizer
         self.equations = LinearRegressionEquations()
-        self.minibatch_size=1000
+        self.minibatch_size=minibatch_size
 
     def __call__(self,trainset):
         first_example = trainset[0]