pylearn/algorithms/pca_online_estimator.py @ 1524:9d21919e2332

summary:    autopep8
author:     Frederic Bastien <nouiz@nouiz.org>
date:       Fri, 02 Nov 2012 13:02:18 -0400
parents:    c584d8f8f280
children:   (none)
import numpy
from scipy import linalg

# TODO:
# - complete docstring (explain arguments, pseudo code)
# - consider the case with discount = 1.0
# - reevaluation when not at the end of a minibatch


class PcaOnlineEstimator(object):
    """Online estimation of the leading eigenvalues/eigenvectors of the
    covariance of some samples.

    Maintains a moving (with discount) low-rank (n_eigen) estimate of the
    covariance matrix of some observations. New observations are accumulated
    until the minibatch is complete, at which point the low-rank estimate is
    reevaluated.

    Example:
        pca_esti = pca_online_estimator.PcaOnlineEstimator(dimension_of_the_samples)
        for i in range(number_of_samples):
            pca_esti.observe(samples[i])
        [eigvals, eigvecs] = pca_esti.getLeadingEigen()
    """

    def __init__(self, n_dim, n_eigen=10, minibatch_size=25, gamma=0.999,
                 regularizer=1e-6, centering=True):
        # Dimension of the observations.
        self.n_dim = n_dim
        # Rank of the low-rank estimate.
        self.n_eigen = n_eigen
        # How many observations between reevaluations of the low-rank
        # estimate.
        self.minibatch_size = minibatch_size
        # The discount factor in the moving estimate.
        self.gamma = gamma
        # Regularizer of the covariance estimate.
        self.regularizer = regularizer
        # Whether we center the observations or not: obtain the leading eigen
        # of the covariance (centering = True) vs the second moment
        # (centering = False).
        self.centering = centering

        # Total number of observations: used to compute the normalizer for
        # the mean and the covariance.
        self.n_observations = 0
        # Index in the current minibatch.
        self.minibatch_index = 0

        # Matrix containing on its *rows*:
        # - the current unnormalized eigenvector estimates
        # - the observations since the last reevaluation
        self.Xt = numpy.zeros([self.n_eigen + self.minibatch_size,
                               self.n_dim])

        # The discounted sum of the observations.
        self.x_sum = numpy.zeros([self.n_dim])

        # The Gram matrix of the observations, i.e. Xt Xt' (since Xt holds
        # them on its rows).
        self.G = numpy.zeros([self.n_eigen + self.minibatch_size,
                              self.n_eigen + self.minibatch_size])
        for i in range(self.n_eigen):
            self.G[i, i] = self.regularizer

        # It is probably not worth preallocating the next three (though they
        # need to be declared), as the operations below are not in place.
        # d and V hold the results of the eigendecomposition of the Gram
        # matrix G (eigenvectors on the columns of V).
        self.d = numpy.zeros([self.n_eigen + self.minibatch_size])
        self.V = numpy.zeros([self.n_eigen + self.minibatch_size,
                              self.n_eigen + self.minibatch_size])
        # Ut holds the unnormalized eigenvectors of the covariance matrix
        # before they are copied back to Xt.
        self.Ut = numpy.zeros([self.n_eigen, self.n_dim])

    def observe(self, x):
        assert numpy.size(x) == self.n_dim
        self.n_observations += 1

        # Add the *non-centered* observation to Xt.
        row = self.n_eigen + self.minibatch_index
        self.Xt[row] = x

        # Update the discounted sum of the observations.
        self.x_sum *= self.gamma
        self.x_sum += x

        # To get the mean, we must normalize the sum by:
        # gamma^(n_observations-1) + gamma^(n_observations-2) + ... + 1
        normalizer = ((1.0 - pow(self.gamma, self.n_observations))
                      / (1.0 - self.gamma))

        # Now center the observation.
        # We lose the first observation, as it is the only one in the mean.
        if self.centering:
            self.Xt[row] -= self.x_sum / normalizer

        # Multiply the observation by the discount compensator: we make this
        # observation look "younger" than the previous ones. The actual
        # discount is applied in the reevaluation (and when solving the
        # equations in the case of TONGA) by multiplying every direction by
        # the same aging factor.
        rn = pow(self.gamma, -0.5 * (self.minibatch_index + 1))
        self.Xt[row] *= rn

        # Update the Gram matrix.
        # The column.
        self.G[:row + 1, row] = numpy.dot(self.Xt[:row + 1, :],
                                          self.Xt[row, :])
        # The symmetric row. There are row+1 values, but the diagonal entry
        # does not need to be copied.
        self.G[row, :row] = self.G[:row, row]

        self.minibatch_index += 1
        if self.minibatch_index == self.minibatch_size:
            self.reevaluate()

    def reevaluate(self):
        # TODO: handle the case when this does not hold.
        assert self.minibatch_index == self.minibatch_size

        # Regularize - not necessary, but just in case.
        for i in range(self.n_eigen + self.minibatch_size):
            self.G[i, i] += self.regularizer

        # The Gram matrix is up to date. Get its low-rank eigendecomposition.
        # NOTE: the eigenvalues are in ASCENDING order and the vectors are on
        # the columns.
        # With scipy 0.7 you can ask for only some of the eigenvalues (the
        # n_eigen top ones), but it does not look like scipy 0.6 supports it.
        self.d, self.V = linalg.eigh(self.G)  # (could pass overwrite_a=True)

        # Convert the n_eigen LAST eigenvectors of the Gram matrix contained
        # in V into *unnormalized* eigenvectors U of the covariance
        # (unnormalized wrt the eigenvalues, not the moving average).
        self.Ut = numpy.dot(self.V[:, -self.n_eigen:].transpose(), self.Xt)

        # Take the discount factor into account. Here minibatch_index equals
        # minibatch_size, so we age everyone. Because of the previous
        # multiplications that made some observations "younger", we multiply
        # everyone by the same factor.
        # TODO: VERIFY THIS!
        rn = pow(self.gamma, -0.5 * (self.minibatch_index + 1))
        inv_rn2 = 1.0 / (rn * rn)
        self.Ut *= 1.0 / rn
        self.d *= inv_rn2

        # Update Xt, G and minibatch_index.
        self.Xt[:self.n_eigen, :] = self.Ut
        for i in range(self.n_eigen):
            self.G[i, i] = self.d[-self.n_eigen + i]
        self.minibatch_index = 0

    def getLeadingEigen(self):
        """Return a copy of the current estimate of the eigenvalues and
        eigenvectors (normalized vectors on rows), normalized by the
        discounted number of observations."""
        # We subtract self.minibatch_index in case this call does not come
        # right after a reevaluate() call.
        normalizer = ((1.0 - pow(self.gamma,
                                 self.n_observations - self.minibatch_index))
                      / (1.0 - self.gamma))

        eigvals = self.d[-self.n_eigen:] / normalizer
        eigvecs = numpy.zeros([self.n_eigen, self.n_dim])
        for i in range(self.n_eigen):
            u = self.Ut[i]
            eigvecs[i] = u / numpy.sqrt(numpy.dot(u, u))

        return [eigvals, eigvecs]
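A quick way to sanity-check the estimator is to stream stationary Gaussian samples through it and compare the recovered leading eigenvalues against a direct eigendecomposition of the batch sample covariance. The sketch below is not part of the original module; the sizes, seed, and the rng/scales/estimator names are illustrative, and it assumes the class above is in scope.

import numpy

# Illustrative sizes; n_samples is a multiple of minibatch_size, so the last
# reevaluation happens right before we read the estimate.
n_dim, n_samples = 20, 5000

rng = numpy.random.RandomState(0)
# Diagonal covariance with one clearly dominant direction.
scales = numpy.ones(n_dim)
scales[0] = 5.0
samples = rng.randn(n_samples, n_dim) * numpy.sqrt(scales)

estimator = PcaOnlineEstimator(n_dim, n_eigen=3, minibatch_size=25,
                               gamma=0.999, regularizer=1e-6, centering=True)
for i in range(n_samples):
    estimator.observe(samples[i])
eigvals, eigvecs = estimator.getLeadingEigen()

# Reference: exact eigenvalues of the centered sample covariance.
centered = samples - samples.mean(axis=0)
ref_vals = numpy.linalg.eigvalsh(numpy.dot(centered.T, centered) / n_samples)

# Both lists are printed largest-first. With gamma close to 1 and a
# stationary stream, the discounted estimate should roughly match the batch
# values (about 5.0 for the dominant direction, near 1.0 elsewhere).
print("online estimate: %s" % eigvals[::-1])
print("batch reference: %s" % ref_vals[-3:][::-1])

Note that the comparison is only approximate: the discounting effectively weights recent samples more, and the low-rank truncation at each reevaluation discards variance outside the retained n_eigen directions.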