# pylearn/algorithms/pca_online_estimator.py

import numpy
from scipy import linalg

# Todo:
#   - complete docstring (explain arguments, pseudo code)
#   - Consider case with discount = 1.0
#   - reevaluation when not at the end of a minibatch

class PcaOnlineEstimator(object):
    """Online estimation of the leading eigen values/vectors of the covariance
    of some samples.

    Maintains a moving (with discount) low rank (n_eigen) estimate of the
    covariance matrix of some observations. New observations are accumulated
    until the batch is complete, at which point the low rank estimate is
    reevaluated.

    Example:

      pca_esti = pca_online_estimator.PcaOnlineEstimator(dimension_of_the_samples)

      for i in range(number_of_samples):
        pca_esti.observe(samples[i])

      [eigvals, eigvecs] = pca_esti.getLeadingEigen()

    """


    def __init__(self, n_dim, n_eigen=10, minibatch_size=25, gamma=0.999,
                 regularizer=1e-6, centering=True):
        # dimension of the observations
        self.n_dim = n_dim
        # rank of the low-rank estimate
        self.n_eigen = n_eigen
        # how many observations between reevaluations of the low rank estimate
        self.minibatch_size = minibatch_size
        # the discount factor in the moving estimate
        self.gamma = gamma
        # regularizer of the covariance estimate
        self.regularizer = regularizer
        # whether we center the observations or not: obtain leading eigen of
        # the covariance (centering = True) vs the second moment (centering = False)
        self.centering = centering

        # Total number of observations: to compute the normalizer for the mean and
        # the covariance.
        self.n_observations = 0
        # Index in the current minibatch
        self.minibatch_index = 0

        # Matrix containing on its *rows*:
        # - the current unnormalized eigen vector estimates
        # - the observations since the last reevaluation
        self.Xt = numpy.zeros([self.n_eigen + self.minibatch_size, self.n_dim])

        # The discounted sum of the observations.
        self.x_sum = numpy.zeros([self.n_dim])

        # The Gram matrix of the observations, i.e. Xt Xt' (since the
        # observations are stored on the rows of Xt).
        self.G = numpy.zeros([self.n_eigen + self.minibatch_size, self.n_eigen + self.minibatch_size])
        for i in range(self.n_eigen):
            self.G[i,i] = self.regularizer

        # It is probably not worth preallocating these next three (though they
        # need to be declared), since they are reassigned rather than updated
        # in place at every reevaluation.

        # Hold the results of the eigendecomposition of the Gram matrix G
        # (eigen vectors on columns of V).
        self.d = numpy.zeros([self.n_eigen + self.minibatch_size])
        self.V = numpy.zeros([self.n_eigen + self.minibatch_size, self.n_eigen + self.minibatch_size])

        # Holds the unnormalized eigenvectors of the covariance matrix before
        # they're copied back to Xt.
        self.Ut = numpy.zeros([self.n_eigen, self.n_dim])


    def observe(self, x):
        assert numpy.size(x) == self.n_dim

        self.n_observations += 1

        # Add the *non-centered* observation to Xt.
        row = self.n_eigen + self.minibatch_index
        self.Xt[row] = x

        # Update the discounted sum of the observations.
        self.x_sum *= self.gamma
        self.x_sum += x

        # To get the mean, we must normalize the sum by
        #   gamma^(n_observations-1) + gamma^(n_observations-2) + ... + 1
        #   = (1 - gamma^n_observations) / (1 - gamma).
        normalizer = (1.0 - pow(self.gamma, self.n_observations)) / (1.0 - self.gamma)
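        # E.g. after 3 observations with gamma = 0.9:
        #   normalizer = 1 + 0.9 + 0.81 = (1 - 0.9^3) / (1 - 0.9) = 2.71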
        #print "normalizer: ", normalizer

        # Now center the observation.
        # Note that the first observation is zeroed out by this: being the
        # only contribution to the mean, it is subtracted from itself.
        if self.centering:
            self.Xt[row] -= self.x_sum / normalizer

        # Multiply the observation by the discount compensator. Basically
        # we make this observation look "younger" than the previous ones. The actual
        # discount is applied in the reevaluation (and when solving the equations in
        # the case of TONGA) by multiplying every direction with the same aging factor.
        rn = pow(self.gamma, -0.5*(self.minibatch_index+1))
        self.Xt[row] *= rn
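        # E.g. the row is scaled by gamma^-0.5 when minibatch_index == 0 and
        # by gamma^-1 when minibatch_index == 1; reevaluate() later rescales
        # every row by gamma^(0.5*(minibatch_size+1)), so earlier rows in the
        # minibatch end up more strongly discounted than recent ones.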

        # Update the Gram Matrix.
        # The column.
        self.G[:row+1, row] = numpy.dot(self.Xt[:row+1, :], self.Xt[row, :])
        # The symmetric row.
        # There are row+1 values, but the diagonal doesn't need to get copied.
        self.G[row, :row] = self.G[:row, row]

        self.minibatch_index += 1

        if self.minibatch_index == self.minibatch_size:
            self.reevaluate()


    def reevaluate(self):
        # TODO: handle the case where this does not hold.
        assert self.minibatch_index == self.minibatch_size

        # Regularize: not strictly necessary, but kept as a safeguard.
        for i in range(self.n_eigen + self.minibatch_size):
            self.G[i,i] += self.regularizer

        # The Gram matrix is up to date. Get its low rank eigendecomposition.
        # NOTE: the eigenvalues are in ASCENDING order and the vectors are on
        # the columns.
        # With scipy 0.7 you can ask for only the n_eigen top eigenvalues, but
        # that does not appear to be possible with scipy 0.6.
        self.d, self.V = linalg.eigh(self.G)  # could pass overwrite_a=True

        # Convert the n_eigen LAST eigenvectors of the Gram matrix contained in V
        # into *unnormalized* eigenvectors U of the covariance (unnormalized wrt
        # the eigenvalues, not the moving average).
        self.Ut = numpy.dot(self.V[:,-self.n_eigen:].transpose(), self.Xt)
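
        # Why this works: if v is an eigenvector of G = Xt Xt' with eigenvalue
        # d, then (Xt' Xt)(Xt' v) = Xt' (Xt Xt') v = d (Xt' v), so Xt' v is an
        # (unnormalized) eigenvector of the covariance Xt' Xt with eigenvalue d.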

        # Take into account the discount factor.
        # Here, minibatch index is minibatch_size. We age everyone. Because of the
        # previous multiplications to make some observations "younger" we multiply
        # everyone by the same factor.
        # TODO VERIFY THIS!
        rn = pow(self.gamma, -0.5*(self.minibatch_index+1))
        inv_rn2 = 1.0/(rn*rn)
        self.Ut *= 1.0/rn
        self.d *= inv_rn2

        #print "*** Reevaluate! ***"
        #normalizer = (1.0 - pow(self.gamma, self.n_observations)) /(1.0 - self.gamma)
        #print "normalizer: ", normalizer
        #print self.d / normalizer
        #print self.Ut # unnormalized eigen vectors (wrt eigenvalues AND moving average).

        # Update Xt, G and minibatch_index
        self.Xt[:self.n_eigen,:] = self.Ut

        for i in range(self.n_eigen):
            self.G[i,i] = self.d[-self.n_eigen+i]

        self.minibatch_index = 0

    # Returns a copy of the current estimate of the eigenvalues and eigenvectors
    # (normalized vectors on the rows), normalized by the discounted number of
    # observations.
    def getLeadingEigen(self):
        # We subtract self.minibatch_index in case this call is not right after a reevaluate call.
        normalizer = (1.0 - pow(self.gamma, self.n_observations - self.minibatch_index)) / (1.0 - self.gamma)

        eigvals = self.d[-self.n_eigen:] / normalizer
        eigvecs = numpy.zeros([self.n_eigen, self.n_dim])
        for i in range(self.n_eigen):
            v = self.Ut[-self.n_eigen + i]
            eigvecs[i] = v / numpy.sqrt(numpy.dot(v, v))

        return [eigvals, eigvecs]
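

if __name__ == "__main__":
    # Minimal usage sketch: feed Gaussian samples with per-dimension scales
    # into the estimator and compare its leading eigenvalues against a direct
    # eigendecomposition of the empirical covariance. The seed, sizes and
    # defaults below are illustrative; with gamma close to 1 the two estimates
    # should roughly agree (the online one is discounted and low rank).
    rng = numpy.random.RandomState(0)
    n_dim, n_samples, n_eigen = 20, 1000, 5
    samples = rng.randn(n_samples, n_dim) * numpy.arange(1, n_dim + 1)

    estimator = PcaOnlineEstimator(n_dim, n_eigen=n_eigen)
    for i in range(n_samples):
        estimator.observe(samples[i])
    eigvals, eigvecs = estimator.getLeadingEigen()

    # Reference: leading eigenvalues of the centered empirical covariance.
    ref = numpy.sort(linalg.eigh(numpy.cov(samples, rowvar=False),
                                 eigvals_only=True))[-n_eigen:]
    print("online estimate:", eigvals)
    print("direct estimate:", ref)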