view pylearn/preprocessing/pca.py @ 962:0fee974dca1d

work on pca file
author James Bergstra <bergstrj@iro.umontreal.ca>
date Fri, 20 Aug 2010 09:29:30 -0400

"""

There are potentially many approaches to PCA; this file may eventually cover several of them.


Elements of this implementation have been borrowed from the MDP toolkit:
    mdp/nodes/pca_nodes.py
"""

# TODO: estimate the number of principal components by cross-validation (early stopping)

import numpy
import scipy.linalg

def diag_as_vector(x):
    """Return a strided view onto the main diagonal of the 2-d array `x`."""
    if x.ndim != 2:
        raise TypeError('this diagonal is implemented only for matrices', x)
    # Take the first min(rows, cols) elements of the first row, then widen the
    # stride so that each step also advances one row: the view walks the diagonal.
    rval = x[0, :min(*x.shape)]
    rval.strides = (rval.strides[0] + x.strides[0],)
    return rval
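
# Illustrative check (an added example, not original code): diag_as_vector
# should agree with numpy.diagonal on any 2-d array, e.g.
#
#     A = numpy.arange(12.).reshape(3, 4)
#     assert numpy.allclose(diag_as_vector(A), numpy.diagonal(A))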


def pca_from_cov(cov, lower=0, max_components=None, max_energy_fraction=None):
    """Return (eigvals, eigvecs) of data with covariance `cov`.

    The returned eigvals will be a numpy ndarray vector.
    The returned eigvecs will be a numpy ndarray matrix whose *cols* are the eigenvectors.

    This is recommended for retrieving many components from high-dimensional data.

    :param cov: data covariance matrix
    :type cov: a numpy ndarray

    :param lower: use the lower triangle of `cov` if true, the upper triangle if
        false (passed through to scipy.linalg.eigh)

    :param max_components: return at most this many leading components

    :param max_energy_fraction: return only enough leading components to account
        for this fraction of the total variance
    """

    w, v = scipy.linalg.eigh(a=cov, lower=lower)
    # eigh guarantees that
    #   cov * v[:,i] = w[i] * v[:,i]
    #   v.H * v = identity


    # The total variance can be computed at this point:
    # the trace of the covariance equals the sum of the eigenvalues.
    vartot = diag_as_vector(cov).sum()

    assert numpy.allclose(vartot, w.sum())

    # sort the eigenvalues (and the corresponding eigenvectors) so that the
    # largest eigenvalue comes first
    a = numpy.argsort(w)[::-1]

    w = w[a]
    v = v[:, a]

    if max_components is not None:
        w = w[:max_components]
        v = v[:, :max_components]

    if max_energy_fraction is not None:
        if not (0.0 <= max_energy_fraction <= 1.0):
            raise ValueError('illegal value for max_energy_fraction', max_energy_fraction)
        if max_energy_fraction < 1.0:
            # keep the smallest number of leading components whose eigenvalues
            # sum to at least max_energy_fraction of the total variance
            energy = 0
            i = 0
            while i < len(w) and energy < max_energy_fraction * vartot:
                energy += w[i]
                i += 1
            if i < len(w):
                w = w[:i]
                v = v[:, :i]


    return w,v
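
# Usage sketch (illustrative; the data shape and seed below are arbitrary
# assumptions, not part of the original module):
#
#     rng = numpy.random.RandomState(0)
#     obs = rng.randn(100, 5)
#     w, v = pca_from_cov(numpy.cov(obs.T), max_components=2)
#     # w.shape == (2,), v.shape == (5, 2), eigenvalues sorted largest-first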


def pca_from_examples(X, max_components=None, max_energy_fraction=None, x_centered=False):
    """Return (eigvals, eigvecs), centered_X of observations `X` (1-per-row)

    This function exists to wrap several algorithms for getting the principle components.

    :param max_components:
        Return no more than this many principle components.

    :param max_energy_fraction: 
        Return [only] enough components to account for this fraction of the energy (aka
        variance) in the observations.

    :param x_centered:
        True means to consider X as having mean 0 (even if it actually doesn't!)

    """
    if x_centered:
        centered_X = X
    else:
        centered_X = X - numpy.mean(X, axis=0)
    return pca_from_cov( numpy.cov(centered_X.T), max_components=max_components,
            max_energy_fraction=max_energy_fraction), centered_X
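
# Usage sketch (illustrative; the data shape and the number of retained
# components are arbitrary assumptions):
#
#     rng = numpy.random.RandomState(0)
#     X = rng.randn(200, 10)
#     (w, v), cX = pca_from_examples(X, max_components=3)
#     # w.shape == (3,), v.shape == (10, 3), and cX has (approximately) zero column means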


def pca_whiten(pca, centered_X, eps=1e-8):
    """
    Return the projection of X onto its principal components.

    The return value has the same number of rows as X, but the number of columns is the number
    of principal components.  Columns of the return value have mean 0, variance 1, and are
    uncorrelated.

    :param pca: the (eigvals, eigvecs) pair returned by e.g. pca_from_examples(X)

    :param eps: small constant added to the denominator to avoid division by zero
        for near-zero eigenvalues

    """
    eigvals, eigvecs = pca
    # project onto the principal directions, then rescale each column to
    # (approximately) unit variance
    pca_of_X = numpy.dot(centered_X, eigvecs)
    pca_of_X /= numpy.sqrt(eigvals) + eps
    return pca_of_X
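
# Sketch of the contract documented above, continuing the pca_from_examples
# example sketch (an illustration, not original code):
#
#     Y = pca_whiten((w, v), cX)
#     # Y.shape == (200, 3), and numpy.cov(Y.T) is approximately the 3x3 identity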

def zca_whiten(pca, centered_X):
    """Return the whitened projection of X, rotated back into the original vector space
    (ZCA whitening).

    See also fft_whiten.py
    """
    eigvals, eigvecs = pca
    pca_of_X = pca_whiten((eigvals, eigvecs), centered_X)
    return numpy.dot(pca_of_X, eigvecs.T)
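

# The demo below is not part of the original module; it is a small smoke-test
# sketch added for illustration.  The data shape, seed, and tolerance are
# arbitrary assumptions.
if __name__ == '__main__':
    rng = numpy.random.RandomState(0)
    X = rng.randn(500, 8)

    # keep enough leading components to explain 90% of the variance
    (w, v), cX = pca_from_examples(X, max_energy_fraction=0.9)
    assert 1 <= len(w) <= X.shape[1]

    # PCA whitening: the covariance of the projection is approximately identity
    Y = pca_whiten((w, v), cX)
    assert numpy.allclose(numpy.cov(Y.T), numpy.eye(len(w)), atol=1e-6)

    # ZCA whitening: same dimensionality as the original data
    Z = zca_whiten((w, v), cX)
    assert Z.shape == X.shape

    print('pca.py smoke test passed; kept %i of %i components' % (len(w), X.shape[1]))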