view pylearn/preprocessing/pca.py @ 1356:26644a775a0d

pca - added some assertions
author James Bergstra <bergstrj@iro.umontreal.ca>
date Fri, 05 Nov 2010 23:06:27 -0400
parents 881bce55a203
children 8cc66dac6430
line wrap: on
line source

"""

There is potentially a lot of approaches to PCA, this file may get there eventually.


Elements of this implementation have been borrowed from the MDP toolkit:
    mdp/nodes/pca_nodes.py
"""

#TODO: estimate number of principle components by cross-validation (early stopping)

#TODO: include the original feature means in the `pca` tuple object so that the full transform
# can be saved, applied to new datasets, and approximately inverted.

import numpy
import scipy.linalg

def diag_as_vector(x):
    """Return a writable view of the leading diagonal of the 2-d array `x`.

    The returned vector shares memory with `x`: writing to an element of it
    writes into the corresponding diagonal element of `x`.  Its length is
    ``min(x.shape)``.

    :param x: a 2-d numpy ndarray
    :raises TypeError: if `x` is not 2-dimensional
    """
    if x.ndim != 2:
        raise TypeError('this diagonal is implemented only for matrices', x)
    n = min(x.shape)
    # Walk the diagonal by stepping one row plus one column per element.
    # Using as_strided builds the view directly; the original code assigned
    # to `.strides` on a slice, which newer numpy versions disallow.
    return numpy.lib.stride_tricks.as_strided(
            x, shape=(n,), strides=(x.strides[0] + x.strides[1],))


def pca_from_cov(cov, lower=0, max_components=None, max_energy_fraction=None):
    """Return (eigvals, eigvecs) of data with covariance `cov`.

    The returned eigvals will be a numpy ndarray vector, sorted in decreasing
    order.
    The returned eigvecs will be a numpy ndarray matrix whose *cols* are the
    eigenvectors, in the same order as the eigvals.

    This is recommended for retrieving many components from high-dimensional data.

    :param cov: data covariance matrix
    :type cov: a numpy ndarray
    :param lower: passed through to scipy.linalg.eigh — selects which
        triangle of `cov` is read
    :param max_components: return at most this many leading components
    :param max_energy_fraction: return only enough leading components to
        account for this fraction of the energy (variance); must be in [0, 1]
    :raises ValueError: if `max_energy_fraction` is outside [0, 1]
    """

    w, v = scipy.linalg.eigh(a=cov, lower=lower)
    # definition of eigh
    #  a * v[:,i] = w[i] * vr[:,i]
    #  v.H * v = identity

    # A covariance matrix is positive semi-definite, so any negative
    # eigenvalue can only come from rounding error; clamp them to zero.
    assert w.min() >= -1e-12
    w = numpy.maximum(w, 0)

    # total variance (== trace of cov) is the sum of the eigenvalues
    vartot = w.sum()

    # sort components by decreasing eigenvalue
    order = numpy.argsort(w)[::-1]
    w = w[order]
    v = v[:, order]

    if max_components is not None:
        w = w[:max_components]
        v = v[:, :max_components]

    if max_energy_fraction is not None:
        if not (0.0 <= max_energy_fraction <= 1.0):
            raise ValueError('illegal value for max_energy_fraction', max_energy_fraction)
        if max_energy_fraction < 1.0:
            # keep the smallest prefix of components whose cumulative
            # variance reaches the requested fraction of the total
            energy = 0
            i = 0
            while (energy < max_energy_fraction * vartot) and (i < len(w)):
                energy += w[i]
                i += 1
            if i < len(w):
                w = w[:i]
                v = v[:, :i]
    return w, v


def pca_from_examples(X, max_components=None, max_energy_fraction=None, x_centered=False):
    """Return (eigvals, eigvecs), centered_X of observations `X` (1-per-row)

    This function exists to wrap several algorithms for getting the principle components.

    :param max_components:
        Return no more than this many principle components.

    :param max_energy_fraction: 
        Return [only] enough components to account for this fraction of the energy (aka
        variance) in the observations.

    :param x_centered:
        True means to consider X as having mean 0 (even if it actually doesn't!)

    """
    # remove the per-feature means unless the caller says X is already centered
    centered_X = X if x_centered else X - numpy.mean(X, axis=0)
    eig_pair = pca_from_cov(
            numpy.cov(centered_X.T),
            max_components=max_components,
            max_energy_fraction=max_energy_fraction)
    return eig_pair, centered_X


def pca_whiten(pca, centered_X, eps=1e-8):
    """
    Return the projection of X onto its principle components.

    The return value has the same number of rows as X, but the number of columns is the number
    of principle components.  Columns of the return value have mean 0, variance 1, and are
    uncorrelated.

    :param pca: the (eigvals, eigvecs) pair returned by e.g. pca_from_examples(X)
    :param centered_X: observations (1-per-row) with feature means removed
    :param eps: added to sqrt(eigvals) to avoid division by zero on
        zero-variance components

    """
    # PEP 3113 fix: the original signature used Python-2-only tuple
    # unpacking `def pca_whiten((eigvals, eigvecs), ...)`; unpacking in
    # the body keeps the call syntax identical under Python 3.
    eigvals, eigvecs = pca
    pca_of_X = numpy.dot(centered_X, eigvecs)
    pca_of_X /= numpy.sqrt(eigvals) + eps
    return pca_of_X

def pca_whiten_inverse(pca, whitened_X, eps=1e-8):
    """
    Return an approximate inverse of the `pca_whiten` transform.

    The inverse is not perfect because pca_whitening discards the least-significant components
    of the data.

    :param pca: the (eigvals, eigvecs) pair used for the forward transform
    :param whitened_X: the output of `pca_whiten`
    :param eps: must match the `eps` used in the forward transform
    """
    # PEP 3113 fix: the original Python-2 signature unpacked the pair in
    # the parameter list; callers still pass (eigvals, eigvecs) positionally.
    eigvals, eigvecs = pca
    return numpy.dot(whitened_X * (numpy.sqrt(eigvals) + eps), eigvecs.T)

def zca_whiten(pca, centered_X):
    """Return the PCA of X but rotated back into the original vector space.

    See also fft_whiten.py

    :param pca: the (eigvals, eigvecs) pair returned by e.g. pca_from_examples(X)
    :param centered_X: observations (1-per-row) with feature means removed
    """
    # PEP 3113 fix: unpack the pair in the body instead of in the
    # Python-2-only signature; callers are unaffected.
    eigvals, eigvecs = pca
    pca_of_X = pca_whiten((eigvals, eigvecs), centered_X)
    return numpy.dot(pca_of_X, eigvecs.T)