diff kernel_regression.py @ 421:e01f17be270a

Kernel regression learning algorithm
author Yoshua Bengio <bengioy@iro.umontreal.ca>
date Sat, 19 Jul 2008 10:11:22 -0400
parents
children 32c5f87bc54e
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kernel_regression.py	Sat Jul 19 10:11:22 2008 -0400
@@ -0,0 +1,209 @@
+"""
+Implementation of kernel regression.
+"""
+
+from pylearn.learner import OfflineLearningAlgorithm
+from theano import tensor as T
+from nnet_ops import prepend_1_to_each_row
+from theano.scalar import as_scalar
+from common.autoname import AutoName
+import theano
+import numpy
+# ApplyFunctionDataSet and CachedDataSet are used by KernelPredictor.__call__;
+# assumed to be importable from the top-level dataset module
+from dataset import ApplyFunctionDataSet, CachedDataSet
+
+class KernelRegression(OfflineLearningAlgorithm):
+    """
+Implementation of kernel regression:
+* the data are n (x_t,y_t) pairs and we want to estimate E[y|x]
+* the predictor computes
+     f(x) = b + \sum_{t=1}^n \alpha_t K(x,x_t)
+  with free parameters b and alpha, training inputs x_t,
+  and kernel function K (gaussian by default).
+  Clearly, each prediction involves O(n) computations.
+* the learner chooses b and alpha to minimize
+     lambda alpha' G alpha + \sum_{t=1}^n (f(x_t)-y_t)^2
+  where G is the matrix with entries G_ij = K(x_i,x_j).
+  The first (L2 regularization) term is the squared L2
+  norm of the primal weights w = \sum_t \alpha_t phi(x_t)
+  where phi is the function s.t. K(u,v)=phi(u).phi(v).
+* this involves solving a linear system with (n+1,n+1)
+  matrix, which is an O(n^3) computation. In addition,
+  that linear system matrix requires O(n^2) memory.
+  So this learning algorithm should be used only for
+  small datasets.
+* the linear system is
+      (M + lambda I_n) theta = (sum_t y_t, y)'
+  where theta = (b, alpha), I_n is the (n+1)x(n+1) matrix that is the identity
+  except with a 0 at (0,0), and M is the matrix with G in the sub-matrix
+  starting at (1,1), 1's in the rest of column 0, a value of n at (0,0), and
+  the column sums sum_i G_{i,j} in the rest of row 0.
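+  For example, with n=2 training pairs, M + lambda I_n is
+      [ 2   G_11+G_21     G_12+G_22   ]
+      [ 1   G_11+lambda   G_12        ]
+      [ 1   G_21          G_22+lambda ]
+  with theta = (b, alpha_1, alpha_2)' and right-hand side (y_1+y_2, y_1, y_2)'.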
+  
+Note that this gives an estimate of E[y|x,training_set] that is the
+same as obtained with a Gaussian process regression. The GP
+regression would also provide a Bayesian Var[y|x,training_set].
+It corresponds to an assumption that f is a random variable
+with Gaussian (process) prior distribution with covariance
+function K. Because we assume Gaussian noise we obtain a Gaussian
+posterior for f (whose mean is computed here).
+
+
+    Usage:
+
+       kernel_regressor=KernelRegression(L2_regularizer=0.1,gamma=0.5) # Gaussian kernel of width gamma
+       kernel_predictor=kernel_regressor(training_set)
+       all_results_dataset=kernel_predictor(test_set) # creates a dataset with "output" and "squared_error" fields
+       outputs = kernel_predictor.compute_outputs(inputs) # inputs and outputs are numpy arrays
+       outputs, errors = kernel_predictor.compute_outputs_and_errors(inputs,targets)
+       errors = kernel_predictor.compute_errors(inputs,targets)
+       mse = kernel_predictor.compute_mse(inputs,targets)
+
+    The training_set must have fields "input" and "target".
+    The test_set must have field "input", and needs "target" if
+    we want to compute the squared errors.
+
+    The predictor parameters are obtained analytically from the training set.
+    Training is only done on a whole training set rather than on minibatches
+    (no online implementation).
+
+    The dataset fields expected and produced by the learning algorithm and the trained model
+    are the following:
+
+     - Input and output dataset fields (example-wise quantities):
+
+       - 'input' (always expected as an input_dataset field)
+       - 'target' (always expected by the learning algorithm, optional for learned model)
+       - 'output' (always produced by learned model)
+       - 'squared_error' (optionally produced by learned model if 'target' is provided)
+          = example-wise squared error
+    """
+    def __init__(self, kernel=None, L2_regularizer=0, gamma=1):
+        self.kernel = kernel
+        self.L2_regularizer=L2_regularizer
+        self.gamma = gamma # for now the kernel is fixed to be Gaussian and the kernel argument is ignored
+        self.equations = KernelRegressionEquations()
+
+    def __call__(self,trainset):
+        n_examples = len(trainset)
+        first_example = trainset[0]
+        n_inputs = first_example['input'].size
+        n_outputs = first_example['target'].size
+        M = numpy.zeros((n_examples+1,n_examples+1))
+        Y = numpy.zeros((n_examples+1,n_outputs))
+        # lambda I_n: L2 regularizer on the diagonal entries that correspond to alpha
+        for i in xrange(n_examples):
+            M[i+1,i+1]=self.L2_regularizer
+        data = trainset.fields()
+        train_inputs = numpy.array(data['input'])
+        Y[1:,:] = numpy.array(data['target'])
+        Y[0,:] = Y[1:,:].sum(axis=0) # right-hand side (sum_t y_t, y)' of the linear system
+        M,train_inputs_square=self.equations.compute_system_matrix(train_inputs,M,self.gamma)
+        theta=numpy.linalg.solve(M,Y)
+        return KernelPredictor(theta,self.gamma, train_inputs, train_inputs_square)
+
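+# For reference, a minimal NumPy-only sketch of the same fit (a hypothetical
+# helper, not used by the classes in this file): it builds the (n+1)x(n+1)
+# linear system described in the KernelRegression docstring, with a Gaussian
+# kernel, and solves it directly.
+def _numpy_kernel_regression_sketch(X, Y, gamma=1., L2_regularizer=0.):
+    """Return theta = (b, alpha) for inputs X (n x d) and targets Y (n x m)."""
+    n = X.shape[0]
+    sq = (X*X).sum(axis=1)
+    # Gaussian kernel Gram matrix: G_ij = exp(-||x_i-x_j||^2/gamma^2)
+    G = numpy.exp(-(sq[:,numpy.newaxis]-2*numpy.dot(X,X.T)+sq[numpy.newaxis,:])/(gamma*gamma))
+    M = numpy.zeros((n+1,n+1))
+    M[1:,1:] = G + L2_regularizer*numpy.eye(n)
+    M[0,0] = n
+    M[1:,0] = 1
+    M[0,1:] = G.sum(axis=0)
+    rhs = numpy.zeros((n+1,Y.shape[1]))
+    rhs[1:,:] = Y
+    rhs[0,:] = Y.sum(axis=0)
+    return numpy.linalg.solve(M,rhs)
+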
+class KernelPredictorEquations(AutoName):
+    train_inputs = T.matrix() # n_examples x n_inputs
+    train_inputs_square = T.vector() # n_examples
+    inputs = T.matrix() # minibatchsize x n_inputs
+    targets = T.matrix() # minibatchsize x n_outputs
+    theta = T.matrix() # (n_examples+1) x n_outputs
+    gamma = T.scalar()
+    inv_gamma2 = 1./(gamma*gamma)
+    b = theta[0]
+    alpha = theta[1:,:]
+    inputs_square = T.sum(inputs*inputs,axis=1)
+    # Gaussian kernel between each input row and each training input, using
+    # ||u-v||^2 = ||u||^2 - 2 u.v + ||v||^2 to vectorize the computation
+    Kx = T.exp(-(train_inputs_square-2*T.dot(inputs,train_inputs.T)+inputs_square.dimshuffle(0,'x'))*inv_gamma2)
+    outputs = T.dot(Kx,alpha) + b # minibatchsize x n_outputs
+    squared_errors = T.sum(T.sqr(targets-outputs),axis=1)
+
+    __compiled = False
+    @classmethod
+    def compile(cls,linker='c|py'):
+        if cls.__compiled:
+            return
+        def fn(input_vars,output_vars):
+            return staticmethod(theano.function(input_vars,output_vars, linker=linker))
+
+        # single (non-list) output variables, so the compiled functions
+        # return arrays rather than one-element lists
+        cls.compute_outputs = fn([cls.inputs,cls.theta,cls.gamma,cls.train_inputs,cls.train_inputs_square],cls.outputs)
+        cls.compute_errors = fn([cls.outputs,cls.targets],cls.squared_errors)
+
+        cls.__compiled = True
+
+    def __init__(self):
+        self.compile()
+        
+class KernelRegressionEquations(KernelPredictorEquations):
+    P = KernelPredictorEquations
+    M = T.matrix() # (n_examples+1) x (n_examples+1), with lambda already on the diagonal of the (1,1) sub-matrix
+    inputs = T.matrix() # n_examples x n_inputs
+    inputs_square = T.sum(inputs*inputs,axis=1) # n_examples
+    # Gram matrix of the Gaussian kernel over the training inputs:
+    # G_ij = exp(-||x_i-x_j||^2/gamma^2)
+    G = T.exp(-(inputs_square.dimshuffle(0,'x')-2*T.dot(inputs,inputs.T)+inputs_square)*P.inv_gamma2)
+    # Fill in the system matrix described in the KernelRegression docstring:
+    # G in the sub-matrix starting at (1,1), n at (0,0), 1's in the rest of
+    # column 0, and the column sums of G in the rest of row 0. (Theano
+    # variables do not support item assignment, so use set_subtensor.)
+    M2 = T.set_subtensor(M[1:,1:], M[1:,1:]+G)
+    M2 = T.set_subtensor(M2[0,0], T.cast(M.shape[0]-1, M.dtype))
+    M2 = T.set_subtensor(M2[1:,0], 1.)
+    M2 = T.set_subtensor(M2[0,1:], T.sum(G,axis=0))
+    
+    __compiled = False
+    
+    @classmethod
+    def compile(cls,linker='c|py'):
+        if cls.__compiled:
+            return
+        def fn(input_vars,output_vars):
+            return staticmethod(theano.function(input_vars,output_vars, linker=linker))
+
+        cls.compute_system_matrix = fn([cls.inputs,cls.M,cls.gamma],[cls.M2,cls.inputs_square])
+
+        cls.__compiled = True
+
+    def __init__(self):
+        self.compile()
+
+class KernelPredictor(object):
+    """
+    A kernel predictor has parameters theta (a bias vector and a weight matrix
+    alpha) that it uses to make a non-linear prediction (according to the
+    KernelPredictorEquations). It can compute its output
+    (bias + alpha * kernel(train_inputs,input)) and a squared error
+    (||output - target||^2).
+    """
+    def __init__(self, theta, gamma, train_inputs, train_inputs_square):
+        self.theta=theta
+        self.gamma=gamma
+        self.train_inputs=train_inputs
+        self.train_inputs_square=train_inputs_square
+        self.equations = KernelPredictorEquations()
+
+    def compute_outputs(self,inputs):
+        return self.equations.compute_outputs(inputs,self.theta,self.gamma,self.train_inputs,self.train_inputs_square)
+    def compute_errors(self,inputs,targets):
+        return self.equations.compute_errors(self.compute_outputs(inputs),targets)
+    def compute_outputs_and_errors(self,inputs,targets):
+        outputs = self.compute_outputs(inputs)
+        return [outputs,self.equations.compute_errors(outputs,targets)]
+    def compute_mse(self,inputs,targets):
+        errors = self.compute_errors(inputs,targets)
+        return numpy.sum(errors)/errors.size
+    
+    def __call__(self,dataset,output_fieldnames=None,cached_output_dataset=False):
+        assert dataset.hasFields(["input"])
+        if output_fieldnames is None:
+            if dataset.hasFields(["target"]):
+                output_fieldnames = ["output","squared_error"]
+            else:
+                output_fieldnames = ["output"]
+        output_fieldnames.sort()
+        if output_fieldnames == ["squared_error"]:
+            f = self.compute_errors
+        elif output_fieldnames == ["output"]:
+            f = self.compute_outputs
+        elif output_fieldnames == ["output","squared_error"]:
+            f = self.compute_outputs_and_errors
+        else:
+            raise ValueError("unknown field(s) in output_fieldnames: "+str(output_fieldnames))
+        
+        ds=ApplyFunctionDataSet(dataset,f,output_fieldnames)
+        if cached_output_dataset:
+            return CachedDataSet(ds)
+        else:
+            return ds
+        
+
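+# A hypothetical smoke test of the NumPy sketch above together with
+# KernelPredictor: fit a tiny 1-d problem and report the training mean
+# squared error (it should be near 0 for a small L2_regularizer).
+if __name__ == '__main__':
+    rng = numpy.random.RandomState(0)
+    X = rng.randn(20,1)
+    Y = numpy.sin(3*X)
+    theta = _numpy_kernel_regression_sketch(X, Y, gamma=1., L2_regularizer=1e-6)
+    predictor = KernelPredictor(theta, 1., X, (X*X).sum(axis=1))
+    print "training MSE =", predictor.compute_mse(X, Y)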