comparison pylearn/algorithms/pca_online_estimator.py @ 1409:cedb48a300fc

added a pca online estimator
author:   Philippe Hamel <higgsbosonh@hotmail.com>
date:     Mon, 31 Jan 2011 12:23:20 -0500
children: 8f15ef656598
# Copyright 2009 PA Manzagol (manzagop AT iro DOT umontreal DOT ca)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy
from scipy import linalg

# Todo:
# - complete docstring (explain arguments, pseudo-code)
# - consider the case where discount = 1.0
# - reevaluation when not at the end of a minibatch

class PcaOnlineEstimator(object):
    """Online estimation of the leading eigenvalues/eigenvectors of the
    covariance of some samples.

    Maintains a moving (discounted) low-rank (n_eigen) estimate of the
    covariance matrix of the observations. New observations are accumulated
    until the minibatch is complete, at which point the low-rank estimate is
    reevaluated.

    Example:

        pca_esti = pca_online_estimator.PcaOnlineEstimator(dimension_of_the_samples)

        for i in range(number_of_samples):
            pca_esti.observe(samples[i])

        [eigvals, eigvecs] = pca_esti.getLeadingEigen()
    """

    def __init__(self, n_dim, n_eigen=10, minibatch_size=25, gamma=0.999,
                 regularizer=1e-6, centering=True):
        # Dimension of the observations.
        self.n_dim = n_dim
        # Rank of the low-rank estimate.
        self.n_eigen = n_eigen
        # Number of observations between reevaluations of the low-rank estimate.
        self.minibatch_size = minibatch_size
        # Discount factor of the moving estimate.
        self.gamma = gamma
        # Regularizer of the covariance estimate.
        self.regularizer = regularizer
        # Whether we center the observations or not: obtain the leading eigen
        # pairs of the covariance (centering = True) vs the second moment
        # (centering = False).
        self.centering = centering

        # Total number of observations: used to compute the normalizer for the
        # mean and the covariance.
        self.n_observations = 0
        # Index in the current minibatch.
        self.minibatch_index = 0

        # Matrix containing, on its *rows*:
        # - the current unnormalized eigenvector estimates;
        # - the observations since the last reevaluation.
        self.Xt = numpy.zeros([self.n_eigen + self.minibatch_size, self.n_dim])

        # The discounted sum of the observations.
        self.x_sum = numpy.zeros([self.n_dim])

        # The Gram matrix of the observations, i.e. Xt Xt' (since Xt holds the
        # observations on its rows).
        self.G = numpy.zeros([self.n_eigen + self.minibatch_size,
                              self.n_eigen + self.minibatch_size])
        for i in range(self.n_eigen):
            self.G[i,i] = self.regularizer
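        # (The first n_eigen rows of Xt are still zero at this point, so
        # without this regularization the top-left block of G would be exactly
        # singular, which could trip up the eigendecomposition.)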

        # It is probably not worth preallocating the next three (though they
        # need to be declared); there is no obvious way to fill them in place.

        # Hold the results of the eigendecomposition of the Gram matrix G
        # (eigenvectors on the columns of V).
        self.d = numpy.zeros([self.n_eigen + self.minibatch_size])
        self.V = numpy.zeros([self.n_eigen + self.minibatch_size,
                              self.n_eigen + self.minibatch_size])

        # Holds the unnormalized eigenvectors of the covariance matrix before
        # they are copied back into Xt.
        self.Ut = numpy.zeros([self.n_eigen, self.n_dim])

    def observe(self, x):
        assert numpy.size(x) == self.n_dim

        self.n_observations += 1

        # Add the *non-centered* observation to Xt.
        row = self.n_eigen + self.minibatch_index
        self.Xt[row] = x

        # Update the discounted sum of the observations.
        self.x_sum *= self.gamma
        self.x_sum += x

        # To get the mean, the sum must be normalized by:
        # gamma^(n_observations-1) + gamma^(n_observations-2) + ... + 1
        normalizer = (1.0 - pow(self.gamma, self.n_observations)) / (1.0 - self.gamma)
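        # (Geometric series: for gamma = 0.999 this approaches
        # 1 / (1 - gamma) = 1000 as n_observations grows, so the moving
        # average has an effective memory of about a thousand observations.)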
        #print "normalizer: ", normalizer

        # Now center the observation.
        # We lose the first observation, as it is the only one in the mean.
        if self.centering:
            self.Xt[row] -= self.x_sum / normalizer

        # Multiply the observation by the discount compensator: we make this
        # observation look "younger" than the previous ones. The actual
        # discount is applied in the reevaluation (and when solving the
        # equations in the case of TONGA) by multiplying every direction by
        # the same aging factor.
        rn = pow(self.gamma, -0.5 * (self.minibatch_index + 1))
        self.Xt[row] *= rn

        # Update the Gram matrix.
        # The new column.
        self.G[:row+1,row] = numpy.dot(self.Xt[:row+1,:], self.Xt[row,:].transpose())
        # The symmetric row.
        # There are row+1 values, but the diagonal does not need to be copied.
        self.G[row,:row] = self.G[:row,row].transpose()
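        # (Only the row/column of the new observation changes, so each call to
        # observe costs O((n_eigen + minibatch_index) * n_dim) for these dot
        # products rather than a full recomputation of the Gram matrix.)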

        self.minibatch_index += 1

        if self.minibatch_index == self.minibatch_size:
            self.reevaluate()

    def reevaluate(self):
        # TODO: modify to handle the case where this is not true.
        assert self.minibatch_index == self.minibatch_size

        # Regularize. Not strictly necessary, but kept as a safeguard.
        for i in range(self.n_eigen + self.minibatch_size):
            self.G[i,i] += self.regularizer

        # The Gram matrix is up to date. Get its low-rank eigendecomposition.
        # NOTE: the eigenvalues are in ASCENDING order and the vectors are on
        # the columns.
        # With scipy 0.7 you can ask for only the n_eigen top eigenvalues, but
        # it doesn't look like you can with scipy 0.6.
        self.d, self.V = linalg.eigh(self.G) #, overwrite_a=True)

        # Convert the n_eigen LAST eigenvectors of the Gram matrix contained
        # in V into *unnormalized* eigenvectors U of the covariance
        # (unnormalized wrt the eigenvalues, not the moving average).
        self.Ut = numpy.dot(self.V[:,-self.n_eigen:].transpose(), self.Xt)
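        # (Why this works: if G = Xt Xt' and G v = lambda v, then
        # (Xt' Xt)(Xt' v) = Xt' (Xt Xt') v = lambda (Xt' v), so Xt' v is an
        # unnormalized eigenvector of the covariance estimate Xt' Xt with the
        # same eigenvalue; the rows of Ut are exactly these vectors.)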

        # Take the discount factor into account.
        # Here minibatch_index equals minibatch_size, and we age everyone.
        # Because of the earlier multiplications that made some observations
        # look "younger", we can multiply everyone by the same factor.
        # TODO VERIFY THIS!
        rn = pow(self.gamma, -0.5 * (self.minibatch_index + 1))
        inv_rn2 = 1.0 / (rn * rn)
        self.Ut *= 1.0 / rn
        self.d *= inv_rn2

        #print "*** Reevaluate! ***"
        #normalizer = (1.0 - pow(self.gamma, self.n_observations)) / (1.0 - self.gamma)
        #print "normalizer: ", normalizer
        #print self.d / normalizer
        #print self.Ut # unnormalized eigenvectors (wrt eigenvalues AND moving average)

        # Update Xt, G and minibatch_index.
        self.Xt[:self.n_eigen,:] = self.Ut

        for i in range(self.n_eigen):
            self.G[i,i] = self.d[-self.n_eigen+i]

        self.minibatch_index = 0

    # Returns a copy of the current estimate of the eigenvalues and
    # eigenvectors (normalized vectors on the rows), normalized by the
    # discounted number of observations.
    def getLeadingEigen(self):
        # We subtract self.minibatch_index in case this call does not come
        # right after a reevaluate call.
        normalizer = (1.0 - pow(self.gamma, self.n_observations - self.minibatch_index)) / (1.0 - self.gamma)

        eigvals = self.d[-self.n_eigen:] / normalizer
        eigvecs = numpy.zeros([self.n_eigen, self.n_dim])
        for i in range(self.n_eigen):
            eigvecs[i] = self.Ut[-self.n_eigen+i] / numpy.sqrt(numpy.dot(self.Ut[-self.n_eigen+i], self.Ut[-self.n_eigen+i]))

        return [eigvals, eigvecs]
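

# A minimal usage sketch (not part of the original module). It feeds random
# anisotropic Gaussian samples to the estimator and compares the leading
# eigenvalues against a direct batch eigendecomposition of the sample
# covariance. Agreement is only approximate, since the online estimate is
# discounted, regularized and low rank.
if __name__ == "__main__":
    rng = numpy.random.RandomState(0)
    n_dim, n_samples, n_eigen = 20, 1000, 5

    # Anisotropic data so the leading eigenvalues are well separated.
    scales = numpy.linspace(1.0, 5.0, n_dim)
    samples = rng.randn(n_samples, n_dim) * scales

    pca_esti = PcaOnlineEstimator(n_dim, n_eigen=n_eigen, minibatch_size=25,
                                  gamma=0.999, regularizer=1e-6)
    for i in range(n_samples):
        pca_esti.observe(samples[i])
    [eigvals, eigvecs] = pca_esti.getLeadingEigen()

    # Batch reference: eigenvalues of the centered sample covariance,
    # in ascending order to match getLeadingEigen.
    centered = samples - samples.mean(axis=0)
    ref = linalg.eigh(numpy.dot(centered.T, centered) / n_samples)[0]

    print "online estimate:", eigvals
    print "batch reference:", ref[-n_eigen:]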