comparison pylearn/algorithms/pca_online_estimator.py @ 1409:cedb48a300fc

added a pca online estimator
author:   Philippe Hamel <higgsbosonh@hotmail.com>
date:     Mon, 31 Jan 2011 12:23:20 -0500
children: 8f15ef656598
# Copyright 2009 PA Manzagol (manzagop AT iro DOT umontreal DOT ca)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy
from scipy import linalg

# Todo:
# - complete docstring (explain arguments, pseudo-code)
# - consider the case where discount = 1.0
# - reevaluation when not at the end of a minibatch

class PcaOnlineEstimator(object):
    """Online estimation of the leading eigenvalues/eigenvectors of the
    covariance of some samples.

    Maintains a moving (discounted) low-rank (n_eigen) estimate of the
    covariance matrix of the observations. New observations are accumulated
    until the minibatch is complete, at which point the low-rank estimate is
    reevaluated.

    Example:

        pca_esti = pca_online_estimator.PcaOnlineEstimator(dimension_of_the_samples)

        for i in range(number_of_samples):
            pca_esti.observe(samples[i])

        [eigvals, eigvecs] = pca_esti.getLeadingEigen()
    """

    def __init__(self, n_dim, n_eigen=10, minibatch_size=25, gamma=0.999,
                 regularizer=1e-6, centering=True):
        # Dimension of the observations.
        self.n_dim = n_dim
        # Rank of the low-rank estimate.
        self.n_eigen = n_eigen
        # Number of observations between reevaluations of the low-rank estimate.
        self.minibatch_size = minibatch_size
        # Discount factor of the moving estimate.
        self.gamma = gamma
        # Regularizer of the covariance estimate.
        self.regularizer = regularizer
        # Whether we center the observations or not: obtain the leading eigen
        # pairs of the covariance (centering = True) vs the second moment
        # (centering = False).
        self.centering = centering

        # Total number of observations: used to compute the normalizer for the
        # mean and the covariance.
        self.n_observations = 0
        # Index in the current minibatch.
        self.minibatch_index = 0

        # Matrix containing, on its *rows*:
        # - the current unnormalized eigenvector estimates;
        # - the observations since the last reevaluation.
        self.Xt = numpy.zeros([self.n_eigen + self.minibatch_size, self.n_dim])

        # The discounted sum of the observations.
        self.x_sum = numpy.zeros([self.n_dim])

        # The Gram matrix of the observations, i.e. Xt Xt' (since Xt holds the
        # observations on its rows).
        self.G = numpy.zeros([self.n_eigen + self.minibatch_size,
                              self.n_eigen + self.minibatch_size])
        for i in range(self.n_eigen):
            self.G[i,i] = self.regularizer
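        # (The first n_eigen rows of Xt are still zero at this point, so
        # without this regularization the top-left block of G would be exactly
        # singular, which could trip up the eigendecomposition.)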

        # It is probably not worth preallocating the next three (though they
        # need to be declared); there is no obvious way to fill them in place.

        # Hold the results of the eigendecomposition of the Gram matrix G
        # (eigenvectors on the columns of V).
        self.d = numpy.zeros([self.n_eigen + self.minibatch_size])
        self.V = numpy.zeros([self.n_eigen + self.minibatch_size,
                              self.n_eigen + self.minibatch_size])

        # Holds the unnormalized eigenvectors of the covariance matrix before
        # they are copied back into Xt.
        self.Ut = numpy.zeros([self.n_eigen, self.n_dim])

    def observe(self, x):
        assert numpy.size(x) == self.n_dim

        self.n_observations += 1

        # Add the *non-centered* observation to Xt.
        row = self.n_eigen + self.minibatch_index
        self.Xt[row] = x

        # Update the discounted sum of the observations.
        self.x_sum *= self.gamma
        self.x_sum += x

        # To get the mean, the sum must be normalized by:
        # gamma^(n_observations-1) + gamma^(n_observations-2) + ... + 1
        normalizer = (1.0 - pow(self.gamma, self.n_observations)) / (1.0 - self.gamma)
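        # (Geometric series: for gamma = 0.999 this approaches
        # 1 / (1 - gamma) = 1000 as n_observations grows, so the moving
        # average has an effective memory of about a thousand observations.)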
        #print "normalizer: ", normalizer

        # Now center the observation.
        # We lose the first observation, as it is the only one in the mean.
        if self.centering:
            self.Xt[row] -= self.x_sum / normalizer

        # Multiply the observation by the discount compensator: we make this
        # observation look "younger" than the previous ones. The actual
        # discount is applied in the reevaluation (and when solving the
        # equations in the case of TONGA) by multiplying every direction by
        # the same aging factor.
        rn = pow(self.gamma, -0.5 * (self.minibatch_index + 1))
        self.Xt[row] *= rn

        # Update the Gram matrix.
        # The new column.
        self.G[:row+1,row] = numpy.dot(self.Xt[:row+1,:], self.Xt[row,:].transpose())
        # The symmetric row.
        # There are row+1 values, but the diagonal does not need to be copied.
        self.G[row,:row] = self.G[:row,row].transpose()
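        # (Only the row/column of the new observation changes, so each call to
        # observe costs O((n_eigen + minibatch_index) * n_dim) for these dot
        # products rather than a full recomputation of the Gram matrix.)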

        self.minibatch_index += 1

        if self.minibatch_index == self.minibatch_size:
            self.reevaluate()

    def reevaluate(self):
        # TODO: modify to handle the case where this is not true.
        assert self.minibatch_index == self.minibatch_size

        # Regularize. Not strictly necessary, but kept as a safeguard.
        for i in range(self.n_eigen + self.minibatch_size):
            self.G[i,i] += self.regularizer

        # The Gram matrix is up to date. Get its low-rank eigendecomposition.
        # NOTE: the eigenvalues are in ASCENDING order and the vectors are on
        # the columns.
        # With scipy 0.7 you can ask for only the n_eigen top eigenvalues, but
        # it doesn't look like you can with scipy 0.6.
        self.d, self.V = linalg.eigh(self.G) #, overwrite_a=True)

        # Convert the n_eigen LAST eigenvectors of the Gram matrix contained
        # in V into *unnormalized* eigenvectors U of the covariance
        # (unnormalized wrt the eigenvalues, not the moving average).
        self.Ut = numpy.dot(self.V[:,-self.n_eigen:].transpose(), self.Xt)
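        # (Why this works: if G = Xt Xt' and G v = lambda v, then
        # (Xt' Xt)(Xt' v) = Xt' (Xt Xt') v = lambda (Xt' v), so Xt' v is an
        # unnormalized eigenvector of the covariance estimate Xt' Xt with the
        # same eigenvalue; the rows of Ut are exactly these vectors.)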

        # Take the discount factor into account.
        # Here minibatch_index equals minibatch_size, and we age everyone.
        # Because of the earlier multiplications that made some observations
        # look "younger", we can multiply everyone by the same factor.
        # TODO VERIFY THIS!
        rn = pow(self.gamma, -0.5 * (self.minibatch_index + 1))
        inv_rn2 = 1.0 / (rn * rn)
        self.Ut *= 1.0 / rn
        self.d *= inv_rn2

        #print "*** Reevaluate! ***"
        #normalizer = (1.0 - pow(self.gamma, self.n_observations)) / (1.0 - self.gamma)
        #print "normalizer: ", normalizer
        #print self.d / normalizer
        #print self.Ut # unnormalized eigenvectors (wrt eigenvalues AND moving average)

        # Update Xt, G and minibatch_index.
        self.Xt[:self.n_eigen,:] = self.Ut

        for i in range(self.n_eigen):
            self.G[i,i] = self.d[-self.n_eigen+i]

        self.minibatch_index = 0

    # Returns a copy of the current estimate of the eigenvalues and
    # eigenvectors (normalized vectors on the rows), normalized by the
    # discounted number of observations.
    def getLeadingEigen(self):
        # We subtract self.minibatch_index in case this call does not come
        # right after a reevaluate call.
        normalizer = (1.0 - pow(self.gamma, self.n_observations - self.minibatch_index)) / (1.0 - self.gamma)

        eigvals = self.d[-self.n_eigen:] / normalizer
        eigvecs = numpy.zeros([self.n_eigen, self.n_dim])
        for i in range(self.n_eigen):
            eigvecs[i] = self.Ut[-self.n_eigen+i] / numpy.sqrt(numpy.dot(self.Ut[-self.n_eigen+i], self.Ut[-self.n_eigen+i]))

        return [eigvals, eigvecs]
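

# A minimal usage sketch (not part of the original module). It feeds random
# anisotropic Gaussian samples to the estimator and compares the leading
# eigenvalues against a direct batch eigendecomposition of the sample
# covariance. Agreement is only approximate, since the online estimate is
# discounted, regularized and low rank.
if __name__ == "__main__":
    rng = numpy.random.RandomState(0)
    n_dim, n_samples, n_eigen = 20, 1000, 5

    # Anisotropic data so the leading eigenvalues are well separated.
    scales = numpy.linspace(1.0, 5.0, n_dim)
    samples = rng.randn(n_samples, n_dim) * scales

    pca_esti = PcaOnlineEstimator(n_dim, n_eigen=n_eigen, minibatch_size=25,
                                  gamma=0.999, regularizer=1e-6)
    for i in range(n_samples):
        pca_esti.observe(samples[i])
    [eigvals, eigvecs] = pca_esti.getLeadingEigen()

    # Batch reference: eigenvalues of the centered sample covariance,
    # in ascending order to match getLeadingEigen.
    centered = samples - samples.mean(axis=0)
    ref = linalg.eigh(numpy.dot(centered.T, centered) / n_samples)[0]

    print "online estimate:", eigvals
    print "batch reference:", ref[-n_eigen:]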