Mercurial > pylearn
kernel_regression.py @ 435:eac0a7d44ff0 (merge)

author:   Olivier Breuleux <breuleuo@iro.umontreal.ca>
date:     Mon, 04 Aug 2008 16:29:30 -0400
parents:  0f8c81b0776d
children: 05f802184606
comparing 434:0f366ecb11ee with 435:eac0a7d44ff0
1 """ | |
2 Implementation of kernel regression: | |
3 """ | |
4 | |
from pylearn.learner import OfflineLearningAlgorithm
# ApplyFunctionDataSet and CachedDataSet are used by KernelPredictor.__call__
# below; they are assumed here to come from pylearn's dataset module.
from pylearn.dataset import ApplyFunctionDataSet, CachedDataSet
from theano import tensor as T
from nnet_ops import prepend_1_to_each_row
from theano.scalar import as_scalar
from common.autoname import AutoName
import theano
import numpy

# map a N-vector to a 1xN matrix
row_vector = theano.elemwise.DimShuffle((False,),['x',0])
# map a N-vector to a Nx1 matrix
col_vector = theano.elemwise.DimShuffle((False,),[0,'x'])
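# e.g. row_vector maps a vector of shape (N,) to shape (1,N) by inserting a
# broadcastable axis (the 'x' in the DimShuffle pattern); col_vector likewise
# yields shape (N,1), so either can broadcast against an MxN matrix.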

class KernelRegression(OfflineLearningAlgorithm):
19 """ | |
20 Implementation of kernel regression: | |
21 * the data are n (x_t,y_t) pairs and we want to estimate E[y|x] | |
22 * the predictor computes | |
23 f(x) = b + \sum_{t=1}^n \alpha_t K(x,x_t) | |
24 with free parameters b and alpha, training inputs x_t, | |
25 and kernel function K (gaussian by default). | |
26 Clearly, each prediction involves O(n) computations. | |
27 * the learner chooses b and alpha to minimize | |
28 lambda alpha' G' G alpha + \sum_{t=1}^n (f(x_t)-y_t)^2 | |
29 where G is the matrix with entries G_ij = K(x_i,x_j). | |
30 The first (L2 regularization) term is the squared L2 | |
31 norm of the primal weights w = \sum_t \alpha_t phi(x_t) | |
32 where phi is the function s.t. K(u,v)=phi(u).phi(v). | |
33 * this involves solving a linear system with (n+1,n+1) | |
34 matrix, which is an O(n^3) computation. In addition, | |
35 that linear system matrix requires O(n^2) memory. | |
36 So this learning algorithm should be used only for | |
37 small datasets. | |
38 * the linear system is | |
39 (M + lambda I_n) theta = (1, y)' | |
40 where theta = (b, alpha), I_n is the (n+1)x(n+1) matrix that is the identity | |
41 except with a 0 at (0,0), M is the matrix with G in the sub-matrix starting | |
42 at (1,1), 1's in column 0, except for a value of n at (0,0), and sum_i G_{i,j} | |
43 in the rest of row 0. | |
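      In the no-bias case the system reduces to (G + lambda I) alpha = y,
      which follows from setting the gradient of the objective to zero:
         2 lambda G alpha + 2 G (G alpha - y) = 0
      i.e. G ((G + lambda I) alpha - y) = 0, so any alpha solving
      (G + lambda I) alpha = y is a minimizer.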

    Note that this gives an estimate of E[y|x,training_set] that is the
    same as obtained with a Gaussian process regression. The GP
    regression would also provide a Bayesian Var[y|x,training_set].
    It corresponds to an assumption that f is a random variable
    with Gaussian (process) prior distribution with covariance
    function K. Because we assume Gaussian noise we obtain a Gaussian
    posterior for f (whose mean is computed here).

    Usage:

       kernel_regressor=KernelRegression(L2_regularizer=0.1,gamma=0.5) # eventually: kernel=GaussianKernel(gamma=0.5)
       kernel_predictor=kernel_regressor(training_set)
       all_results_dataset=kernel_predictor(test_set) # creates a dataset with "output" and "squared_error" fields
       outputs = kernel_predictor.compute_outputs(inputs) # inputs and outputs are numpy arrays
       outputs, errors = kernel_predictor.compute_outputs_and_errors(inputs,targets)
       errors = kernel_predictor.compute_errors(inputs,targets)
       mse = kernel_predictor.compute_mse(inputs,targets)

    The training_set must have fields "input" and "target".
    The test_set must have field "input", and needs "target" if
    we want to compute the squared errors.

    The predictor parameters are obtained analytically from the training set.
    Training is only done on a whole training set rather than on minibatches
    (no online implementation).

    The dataset fields expected and produced by the learning algorithm and the trained model
    are the following:

    - Input and output dataset fields (example-wise quantities):

      - 'input' (always expected as an input_dataset field)
      - 'target' (always expected by the learning algorithm, optional for learned model)
      - 'output' (always produced by learned model)
      - 'squared_error' (optionally produced by learned model if 'target' is provided)
        = example-wise squared error
    """
    def __init__(self, kernel=None, L2_regularizer=0, gamma=1, use_bias=False):
        # THE VERSION WITH BIAS DOES NOT SEEM RIGHT
        self.kernel = kernel
        self.L2_regularizer=L2_regularizer
        self.use_bias=use_bias
        self.gamma = gamma # until we fix things, the kernel type is fixed, Gaussian
        self.equations = KernelRegressionEquations()

    def __call__(self,trainset):
        n_examples = len(trainset)
        first_example = trainset[0]
        n_inputs = first_example['input'].size
        n_outputs = first_example['target'].size
        b1=1 if self.use_bias else 0
        # M and Y are the system matrix and right-hand side of the linear
        # system described in the class docstring; the diagonal gets the L2
        # regularizer (skipping the bias row/column when there is one).
        M = numpy.zeros((n_examples+b1,n_examples+b1))
        Y = numpy.zeros((n_examples+b1,n_outputs))
        for i in xrange(n_examples):
            M[i+b1,i+b1]=self.L2_regularizer
        data = trainset.fields()
        train_inputs = numpy.array(data['input'])
        if self.use_bias:
            Y[0]=1
        Y[b1:,:] = numpy.array(data['target'])
        # G is the Gaussian Gram matrix of the training inputs and sumG its
        # column sums (used to fill row 0 in the bias case).
        train_inputs_square,sumG,G=self.equations.compute_system_matrix(train_inputs,self.gamma)
        M[b1:,b1:] += G
        if self.use_bias:
            M[0,1:] = sumG
            M[1:,0] = 1
            M[0,0] = M.shape[0]
        self.M=M # kept around for inspection
        self.Y=Y
        theta=numpy.linalg.solve(M,Y)
        return KernelPredictor(theta,self.gamma, train_inputs, train_inputs_square)

class KernelPredictorEquations(AutoName):
    train_inputs = T.matrix() # n_examples x n_inputs
    train_inputs_square = T.vector() # n_examples
    inputs = T.matrix() # minibatchsize x n_inputs
    targets = T.matrix() # minibatchsize x n_outputs
    theta = T.matrix() # (n_examples+1) x n_outputs
    b1 = T.shape(train_inputs_square)[0]<T.shape(theta)[0] # 1 iff theta has an extra bias row
    gamma = T.scalar()
    inv_gamma2 = 1./(gamma*gamma)
    b = b1*theta[0]
    alpha = theta[b1:,:]
    inputs_square = T.sum(inputs*inputs,axis=1)
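    # Gaussian kernel matrix between test and training rows, computed with the
    # expansion ||u-v||^2 = ||u||^2 - 2 u.v + ||v||^2 so that no pairwise
    # difference tensor is formed:
    #   Kx[i,t] = exp(-||inputs[i]-train_inputs[t]||^2 / gamma^2)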
    Kx = T.exp(-(row_vector(train_inputs_square)-2*T.dot(inputs,train_inputs.T)+col_vector(inputs_square))*inv_gamma2)
    outputs = T.dot(Kx,alpha) + b # minibatchsize x n_outputs
    squared_errors = T.sum(T.sqr(targets-outputs),axis=1)

    __compiled = False
    @classmethod
    def compile(cls,linker='c|py'):
        if cls.__compiled:
            return
        def fn(input_vars,output_vars):
            return staticmethod(theano.function(input_vars,output_vars, linker=linker))

        cls.compute_outputs = fn([cls.inputs,cls.theta,cls.gamma,cls.train_inputs,cls.train_inputs_square],[cls.outputs])
        cls.compute_errors = fn([cls.outputs,cls.targets],[cls.squared_errors])

        cls.__compiled = True

    def __init__(self):
        self.compile()

class KernelRegressionEquations(KernelPredictorEquations):
    #M = T.matrix() # (n_examples+1) x (n_examples+1)
    inputs = T.matrix() # n_examples x n_inputs
    gamma = T.scalar()
    inv_gamma2 = 1./(gamma*gamma)
    inputs_square = T.sum(inputs*inputs,axis=1)
    #new_G = G+T.dot(inputs,inputs.T)
    #new_G = T.gemm(G,1.,inputs,inputs.T,1.)
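    # Gram matrix of the training inputs under the Gaussian kernel, via the
    # same squared-distance expansion used for Kx in KernelPredictorEquations:
    #   G[i,j] = exp(-||inputs[i]-inputs[j]||^2 / gamma^2)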
    G = T.exp(-(row_vector(inputs_square)-2*T.dot(inputs,inputs.T)+col_vector(inputs_square))*inv_gamma2)
    sumG = T.sum(G,axis=0)

    __compiled = False

    @classmethod
    def compile(cls,linker='c|py'):
        if cls.__compiled:
            return
        def fn(input_vars,output_vars):
            return staticmethod(theano.function(input_vars,output_vars, linker=linker))

        cls.compute_system_matrix = fn([cls.inputs,cls.gamma],[cls.inputs_square,cls.sumG,cls.G])

        cls.__compiled = True

    def __init__(self):
        self.compile()

class KernelPredictor(object):
    """
    A kernel predictor has parameters theta (a bias vector and a weight matrix alpha)
    it can use to make a non-linear prediction (according to the KernelPredictorEquations).
    It can compute its output (bias + alpha * kernel(train_inputs,input)) and a squared error (||output - target||^2).
    """
    def __init__(self, theta, gamma, train_inputs, train_inputs_square=None):
        self.theta=theta
        self.gamma=gamma
        self.train_inputs=train_inputs
        if train_inputs_square is None:
            train_inputs_square = numpy.sum(train_inputs*train_inputs,axis=1)
        self.train_inputs_square=train_inputs_square
        self.equations = KernelPredictorEquations()

    def compute_outputs(self,inputs):
        return self.equations.compute_outputs(inputs,self.theta,self.gamma,self.train_inputs,self.train_inputs_square)
    def compute_errors(self,inputs,targets):
        return self.equations.compute_errors(self.compute_outputs(inputs),targets)
    def compute_outputs_and_errors(self,inputs,targets):
        outputs = self.compute_outputs(inputs)
        return [outputs,self.equations.compute_errors(outputs,targets)]
    def compute_mse(self,inputs,targets):
        errors = self.compute_errors(inputs,targets)
        return numpy.sum(errors)/errors.size

    def __call__(self,dataset,output_fieldnames=None,cached_output_dataset=False):
        assert dataset.hasFields(["input"])
        if output_fieldnames is None:
            if dataset.hasFields(["target"]):
                output_fieldnames = ["output","squared_error"]
            else:
                output_fieldnames = ["output"]
        output_fieldnames.sort()
        if output_fieldnames == ["squared_error"]:
            f = self.compute_errors
        elif output_fieldnames == ["output"]:
            f = self.compute_outputs
        elif output_fieldnames == ["output","squared_error"]:
            f = self.compute_outputs_and_errors
        else:
            raise ValueError("unknown field(s) in output_fieldnames: "+str(output_fieldnames))

        ds=ApplyFunctionDataSet(dataset,f,output_fieldnames)
        if cached_output_dataset:
            return CachedDataSet(ds)
        else:
            return ds


def kernel_predictor(inputs,params,*otherargs):
    # convenience wrapper: otherargs[0] packs the remaining KernelPredictor
    # constructor arguments (gamma, train_inputs[, train_inputs_square])
    p = KernelPredictor(params,*otherargs[0])
    return p.compute_outputs(inputs)
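
# The block below is an illustrative usage sketch, not part of the original
# module: it fits the no-bias system (G + lambda I) alpha = y from the
# KernelRegression docstring directly with numpy (bypassing the pylearn
# dataset interface) and feeds the result to a KernelPredictor.
if __name__ == '__main__':
    numpy.random.seed(0)
    n, d = 20, 3
    X = numpy.random.randn(n, d)   # n training inputs
    y = numpy.random.randn(n, 1)   # n scalar targets
    gamma, lambda_ = 0.5, 0.1
    sq = numpy.sum(X*X, axis=1)
    # Gaussian Gram matrix: G[i,j] = exp(-||x_i - x_j||^2 / gamma^2)
    G = numpy.exp(-(sq[numpy.newaxis,:] - 2*numpy.dot(X,X.T) + sq[:,numpy.newaxis])/gamma**2)
    alpha = numpy.linalg.solve(G + lambda_*numpy.eye(n), y)
    predictor = KernelPredictor(alpha, gamma, X)
    # the predictor's outputs on the training inputs should match G * alpha
    # (depending on the theano version, the compiled function may wrap the
    # result in a one-element list)
    print predictor.compute_outputs(X)
    print numpy.dot(G, alpha)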