view kernel_regression.py @ 463:8df9179b3239

Updated file locations
author Joseph Turian <turian@iro.umontreal.ca>
date Wed, 08 Oct 2008 17:31:26 -0400
parents 0f8c81b0776d
children 05f802184606
line wrap: on
line source

"""
Implementation of kernel regression:
"""

from pylearn.learner import OfflineLearningAlgorithm
from theano import tensor as T
from nnet_ops import prepend_1_to_each_row
from theano.scalar import as_scalar
from common.autoname import AutoName
import theano
import numpy

# map a N-vector to a 1xN matrix
row_vector = theano.elemwise.DimShuffle((False,),['x',0])
# map a N-vector to a Nx1 matrix
col_vector = theano.elemwise.DimShuffle((False,),[0,'x'])

class KernelRegression(OfflineLearningAlgorithm):
    """
Implementation of kernel regression:
* the data are n (x_t,y_t) pairs and we want to estimate E[y|x]
* the predictor computes
     f(x) = b + \sum_{t=1}^n \alpha_t K(x,x_t)
  with free parameters b and alpha, training inputs x_t,
  and kernel function K (gaussian by default).
  Clearly, each prediction involves O(n) computations.
* the learner chooses b and alpha to minimize
     lambda alpha' G' G alpha + \sum_{t=1}^n (f(x_t)-y_t)^2
  where G is the matrix with entries G_ij = K(x_i,x_j).
  The first (L2 regularization) term is the squared L2
  norm of the primal weights w = \sum_t \alpha_t phi(x_t)
  where phi is the function s.t. K(u,v)=phi(u).phi(v).
* this involves solving a linear system with (n+1,n+1)
  matrix, which is an O(n^3) computation. In addition,
  that linear system matrix requires O(n^2) memory.
  So this learning algorithm should be used only for
  small datasets.
* the linear system is
      (M + lambda I_n) theta = (1, y)'
  where theta = (b, alpha), I_n is the (n+1)x(n+1) matrix that is the identity 
  except with a 0 at (0,0), M is the matrix with G in the sub-matrix starting 
  at (1,1), 1's in column 0, except for a value of n at (0,0), and sum_i G_{i,j} 
  in the rest of row 0.
  
Note that this is gives an estimate of E[y|x,training_set] that is the
same as obtained with a Gaussian process regression. The GP
regression would also provide a Bayesian Var[y|x,training_set].
It corresponds to an assumption that f is a random variable
with Gaussian (process) prior distribution with covariance
function K. Because we assume Gaussian noise we obtain a Gaussian
posterior for f (whose mean is computed here).


    Usage:

       kernel_regressor=KernelRegression(L2_regularizer=0.1,gamma=0.5) (kernel=GaussianKernel(gamma=0.5))
       kernel_predictor=kernel_regressor(training_set)
       all_results_dataset=kernel_predictor(test_set) # creates a dataset with "output" and "squared_error" field
       outputs = kernel_predictor.compute_outputs(inputs) # inputs and outputs are numpy arrays
       outputs, errors = kernel_predictor.compute_outputs_and_errors(inputs,targets)
       errors = kernel_predictor.compute_errors(inputs,targets)
       mse = kernel_predictor.compute_mse(inputs,targets)
       
       

    The training_set must have fields "input" and "target".
    The test_set must have field "input", and needs "target" if
    we want to compute the squared errors.

    The predictor parameters are obtained analytically from the training set.
    Training is only done on a whole training set rather than on minibatches
    (no online implementation).

    The dataset fields expected and produced by the learning algorithm and the trained model
    are the following:

     - Input and output dataset fields (example-wise quantities):

       - 'input' (always expected as an input_dataset field)
       - 'target' (always expected by the learning algorithm, optional for learned model)
       - 'output' (always produced by learned model)
       - 'squared_error' (optionally produced by learned model if 'target' is provided)
          = example-wise squared error
    """
    def __init__(self, kernel=None, L2_regularizer=0, gamma=1, use_bias=False):
        # THE VERSION WITH BIAS DOES NOT SEEM RIGHT
        self.kernel = kernel
        self.L2_regularizer=L2_regularizer
        self.use_bias=use_bias
        self.gamma = gamma # until we fix things, the kernel type is fixed, Gaussian
        self.equations = KernelRegressionEquations()

    def __call__(self,trainset):
        n_examples = len(trainset)
        first_example = trainset[0]
        n_inputs = first_example['input'].size
        n_outputs = first_example['target'].size
        b1=1 if self.use_bias else 0
        M = numpy.zeros((n_examples+b1,n_examples+b1))
        Y = numpy.zeros((n_examples+b1,n_outputs))
        for i in xrange(n_examples):
            M[i+b1,i+b1]=self.L2_regularizer
        data = trainset.fields()
        train_inputs = numpy.array(data['input'])
        if self.use_bias:
            Y[0]=1
        Y[b1:,:] = numpy.array(data['target'])
        train_inputs_square,sumG,G=self.equations.compute_system_matrix(train_inputs,self.gamma)
        M[b1:,b1:] += G
        if self.use_bias:
            M[0,1:] = sumG
            M[1:,0] = 1
            M[0,0] = M.shape[0]
        self.M=M
        self.Y=Y
        theta=numpy.linalg.solve(M,Y)
        return KernelPredictor(theta,self.gamma, train_inputs, train_inputs_square)

class KernelPredictorEquations(AutoName):
    train_inputs = T.matrix() # n_examples x n_inputs
    train_inputs_square = T.vector() # n_examples
    inputs = T.matrix() # minibatchsize x n_inputs
    targets = T.matrix() # minibatchsize x n_outputs
    theta = T.matrix() # (n_examples+1) x n_outputs
    b1 = T.shape(train_inputs_square)[0]<T.shape(theta)[0]
    gamma = T.scalar()
    inv_gamma2 = 1./(gamma*gamma)
    b = b1*theta[0]
    alpha = theta[b1:,:]
    inputs_square = T.sum(inputs*inputs,axis=1)
    Kx = T.exp(-(row_vector(train_inputs_square)-2*T.dot(inputs,train_inputs.T)+col_vector(inputs_square))*inv_gamma2)
    outputs = T.dot(Kx,alpha) + b # minibatchsize x n_outputs
    squared_errors = T.sum(T.sqr(targets-outputs),axis=1)

    __compiled = False
    @classmethod
    def compile(cls,linker='c|py'):
        if cls.__compiled:
            return
        def fn(input_vars,output_vars):
            return staticmethod(theano.function(input_vars,output_vars, linker=linker))

        cls.compute_outputs = fn([cls.inputs,cls.theta,cls.gamma,cls.train_inputs,cls.train_inputs_square],[cls.outputs])
        cls.compute_errors = fn([cls.outputs,cls.targets],[cls.squared_errors])

        cls.__compiled = True

    def __init__(self):
        self.compile()
        
class KernelRegressionEquations(KernelPredictorEquations):
    #M = T.matrix() # (n_examples+1) x (n_examples+1)
    inputs = T.matrix() # n_examples x n_inputs
    gamma = T.scalar()
    inv_gamma2 = 1./(gamma*gamma)
    inputs_square = T.sum(inputs*inputs,axis=1)
    #new_G = G+T.dot(inputs,inputs.T)
    #new_G = T.gemm(G,1.,inputs,inputs.T,1.)
    G = T.exp(-(row_vector(inputs_square)-2*T.dot(inputs,inputs.T)+col_vector(inputs_square))*inv_gamma2)
    sumG = T.sum(G,axis=0)
    
    __compiled = False
    
    @classmethod
    def compile(cls,linker='c|py'):
        if cls.__compiled:
            return
        def fn(input_vars,output_vars):
            return staticmethod(theano.function(input_vars,output_vars, linker=linker))

        cls.compute_system_matrix = fn([cls.inputs,cls.gamma],[cls.inputs_square,cls.sumG,cls.G])

        cls.__compiled = True

    def __init__(self):
        self.compile()

class KernelPredictor(object):
    """
    A kernel predictor has parameters theta (a bias vector and a weight matrix alpha)
    it can use to make a non-linear prediction (according to the KernelPredictorEquations).
    It can compute its output (bias + alpha * kernel(train_inputs,input) and a squared error (||output - target||^2).
    """
    def __init__(self, theta, gamma, train_inputs, train_inputs_square=None):
        self.theta=theta
        self.gamma=gamma
        self.train_inputs=train_inputs
        if train_inputs_square==None:
            train_inputs_square = numpy.sum(train_inputs*train_inputs,axis=1)
        self.train_inputs_square=train_inputs_square
        self.equations = KernelPredictorEquations()

    def compute_outputs(self,inputs):
        return self.equations.compute_outputs(inputs,self.theta,self.gamma,self.train_inputs,self.train_inputs_square)
    def compute_errors(self,inputs,targets):
        return self.equations.compute_errors(self.compute_outputs(inputs),targets)
    def compute_outputs_and_errors(self,inputs,targets):
        outputs = self.compute_outputs(inputs)
        return [outputs,self.equations.compute_errors(outputs,targets)]
    def compute_mse(self,inputs,targets):
        errors = self.compute_errors(inputs,targets)
        return numpy.sum(errors)/errors.size
    
    def __call__(self,dataset,output_fieldnames=None,cached_output_dataset=False):
        assert dataset.hasFields(["input"])
        if output_fieldnames is None:
            if dataset.hasFields(["target"]):
                output_fieldnames = ["output","squared_error"]
            else:
                output_fieldnames = ["output"]
        output_fieldnames.sort()
        if output_fieldnames == ["squared_error"]:
            f = self.compute_errors
        elif output_fieldnames == ["output"]:
            f = self.compute_outputs
        elif output_fieldnames == ["output","squared_error"]:
            f = self.compute_outputs_and_errors
        else:
            raise ValueError("unknown field(s) in output_fieldnames: "+str(output_fieldnames))
        
        ds=ApplyFunctionDataSet(dataset,f,output_fieldnames)
        if cached_output_dataset:
            return CachedDataSet(ds)
        else:
            return ds
        

def kernel_predictor(inputs,params,*otherargs):
  p = KernelPredictor(params,*otherargs[0])
  return p.compute_outputs(inputs)