changeset 953:2eb98a740823

Added 'preprocessing' module
author James Bergstra <bergstrj@iro.umontreal.ca>
date Thu, 19 Aug 2010 11:53:52 -0400
parents 5f80351bc762
children 9b0fd89599c7
files pylearn/preprocessing/README.txt pylearn/preprocessing/__init__.py pylearn/preprocessing/fft_whiten.py pylearn/preprocessing/pca.py
diffstat 4 files changed, 214 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/preprocessing/README.txt	Thu Aug 19 11:53:52 2010 -0400
@@ -0,0 +1,2 @@
+
+Module documentation is given by the docstring in the __init__.py file.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/preprocessing/__init__.py	Thu Aug 19 11:53:52 2010 -0400
@@ -0,0 +1,13 @@
+
+"""Preprocessing module
+
+This module provides small numpy-based functions (not necessarily Theano expressions) for
+one-time pre-processing of data prior to modeling experiments.  The module is a work in
+progress, but the sorts of things that belong here include:
+
+ - PCA
+ - whitening (for images and video)
+ - TF/IDF (for text)
+ - MFCC (for audio)
+
+"""
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/preprocessing/fft_whiten.py	Thu Aug 19 11:53:52 2010 -0400
@@ -0,0 +1,76 @@
+"""
+Whitening algorithm used in Olshausen & Field, 1997.
+
+If you want to use some other images, there are a number of
+preprocessing steps you need to consider beforehand.  First, you
+should make sure all images have approximately the same overall
+contrast.  One way of doing this is to normalize each image so that
+the variance of the pixels is the same (i.e., 1).  Then you will need
+to prewhiten the images.  For a full explanation of whitening see
+
+Olshausen BA, Field DJ (1997)  Sparse Coding with an Overcomplete
+Basis Set: A Strategy Employed by V1?  Vision Research, 37: 3311-3325. 
+
+Basically, to whiten an image of size NxN, you multiply by the filter
+f*exp(-(f/f_0)^4) in the frequency domain, where f_0=0.4*N (f_0 is the
+cutoff frequency of a lowpass filter that is combined with the
+whitening filter).  Once you have preprocessed a number of images this
+way, all the same size, then you should combine them into one big N^2
+x M array, where M is the number of images.  Then rescale this array
+so that the average image variance is 0.1 (the parameters in sparsenet
+are set assuming that the data variance is 0.1).  Name this array
+IMAGES, save it to a file for future use, and you should be off and
+running.  The following Matlab commands should do this:
+
+
+    N=image_size;
+    M=num_images;
+
+    [fx fy]=meshgrid(-N/2:N/2-1,-N/2:N/2-1);
+    rho=sqrt(fx.*fx+fy.*fy);
+    f_0=0.4*N;
+    filt=rho.*exp(-(rho/f_0).^4);
+
+    for i=1:M
+        image=get_image;  % you will need to provide get_image
+        If=fft2(image);
+        imagew=real(ifft2(If.*fftshift(filt)));
+        IMAGES(:,i)=reshape(imagew,N^2,1);
+    end
+
+    IMAGES=sqrt(0.1)*IMAGES/sqrt(mean(var(IMAGES)));
+
+    save MY_IMAGES IMAGES
+
+"""
+
+import numpy as np
+def whiten(X, f0=None, n=None):
+    """
+    :param X: a 3D tensor n_images x rows x cols
+    :param f0: filter parameter (see docs)
+    :param n: filter parameter (see docs)
+
+    :returns: 3D tensor n_images x rows x cols of filtered images.
+    """
+    # May be mixed up with the size2 and size1s because matlab does things differnetly
+    R, C = X.shape[-2:]
+    if R %2:
+        raise NotImplementedError()
+    if C %2:
+        raise NotImplementedError()
+
+    if f0 is None:
+        f0 = .4 * min(R,C)
+    if n is None:
+        n = 4
+
+
+    fx,fy = np.mgrid[-R/2:R/2, -C/2:C/2]
+    rho=np.sqrt(fx**2 + fy**2)
+    filt=rho * np.exp( - (rho/f0)**4)
+    If = np.fft.fft2(X)
+    imagew=np.real(np.fft.ifft2(If * np.fft.fftshift(filt)))
+    assert imagew.shape == X.shape
+    return imagew
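+
+
+if __name__ == '__main__':
+    # Minimal usage sketch on random data (illustrative only; real use would load
+    # natural images into an n_images x rows x cols array with even rows and cols).
+    rng = np.random.RandomState(23)
+    images = rng.randn(5, 64, 64)
+    images_w = whiten(images)
+    # rescale so that the average per-image variance is 0.1, as in the Matlab
+    # snippet in the module docstring
+    avg_var = images_w.reshape(len(images_w), -1).var(axis=1).mean()
+    images_w *= np.sqrt(0.1 / avg_var)
+    print images_w.shape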
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/preprocessing/pca.py	Thu Aug 19 11:53:52 2010 -0400
@@ -0,0 +1,123 @@
+"""
+
+There are potentially many approaches to PCA; this file may eventually cover more of them.
+
+
+Elements of this implementation have been borrowed from the MDP toolkit:
+    mdp/nodes/pca_nodes.py
+"""
+
+# TODO: estimate the number of principal components by cross-validation (early stopping)
+
+import numpy
+import scipy.linalg
+
+def diag_as_vector(x):
+    """Return a view (no copy) of the diagonal of the matrix `x`."""
+    if x.ndim != 2:
+        raise TypeError('this diagonal is implemented only for matrices', x)
+    # stride trick: take the first row (truncated to the shorter dimension) and
+    # then step by one row plus one column per element, walking down the diagonal
+    rval = x[0, :min(*x.shape)]
+    rval.strides = (rval.strides[0] + x.strides[0],)
+    return rval
+
+
+def pca_from_cov(cov, lower=0, max_components=None, max_energy_fraction=None):
+    """Return (eigvals, eigvecs) of data with covariance `cov`.
+
+    The returned eigvals will be a numpy ndarray vector.
+    The returned eigvecs will be a numpy ndarray matrix whose *cols* are the eigenvectors.
+
+    This is recommended for retrieving many components from high-dimensional data.
+
+    :param cov: data covariance matrix
+    :type cov: a numpy ndarray
+    :param lower: passed to scipy.linalg.eigh; nonzero means use the lower triangle of `cov`
+    :param max_components: return at most this many (leading) components
+    :param max_energy_fraction: return only enough components to account for this fraction
+        of the total variance
+    """
+
+    w, v = scipy.linalg.eigh(a=cov, lower=lower)
+    # definition of eigh:
+    #   cov * v[:,i] = w[i] * v[:,i]
+    #   v.T * v = identity
+
+
+    # total variance can be computed at this point:
+    # note that vartot == w.sum() (checked by the assert below)
+    vartot = diag_as_vector(cov).sum()
+
+    assert numpy.allclose(vartot, w.sum())
+
+    a = numpy.argsort(w)[::-1]
+
+    w = w[a]
+    v = v[:,a]
+
+    if max_components is not None:
+        w = w[:max_components]
+        v = v[:, :max_components]
+
+    if max_energy_fraction is not None:
+        if not (0.0 <= max_energy_fraction <= 1.0):
+            raise ValueError('illegal value for max_energy_fraction', max_energy_fraction)
+        if max_energy_fraction < 1.0:
+            energy = 0
+            i = 0
+            # accumulate leading components until they account for the requested
+            # fraction of the total variance (but never past the end of w, which
+            # may already have been truncated by max_components)
+            while i < len(w) and energy < max_energy_fraction * vartot:
+                energy += w[i]
+                i += 1
+            if i < len(w):
+                w = w[:i]
+                v = v[:,:i]
+
+
+    return w,v
+
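+# Usage sketch for pca_from_cov (illustrative only): given observations `X`
+# with one example per row,
+#
+#     cov = numpy.cov(X.T)
+#     w, v = pca_from_cov(cov, max_energy_fraction=0.99)
+#
+# where w[i] is the variance captured by component i, and numpy.dot(X - X.mean(0), v)
+# projects the centered data onto the retained components.
+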
+
+def pca_from_examples(X, max_components=None, max_energy_fraction=None, x_centered=False):
+    """Return (eigvals, eigvecs) of observations `X` (1-per-row)
+
+    This function exists to wrap several algorithms for getting the principal components.
+
+    :param max_components:
+        Return no more than this many principal components.
+
+    :param max_energy_fraction:
+        Return only enough components to account for this fraction of the energy (aka
+        variance) in the observations.
+
+    :param x_centered:
+        True means to consider X as having mean 0 (even if it actually doesn't!)
+
+    """
+    if x_centered:
+        centered_X = X
+    else:
+        centered_X = X - numpy.mean(X, axis=0)
+    return pca_from_cov( numpy.cov(centered_X.T), max_components=max_components,
+            max_energy_fraction=max_energy_fraction)
+
+
+def pca_whiten(X, pca):
+    """
+    Return the projection of X onto its principal components.
+
+    The return value has the same number of rows as X, but the number of columns is the number
+    of principal components.  Columns of the return value have mean 0, variance 1, and are
+    uncorrelated.
+
+
+    See also fft_whiten.py
+
+    :param pca: the (w,v) pair returned by e.g. pca_from_examples(X)
+
+    """
+    w, v = pca
+
+    centered_X = X - numpy.mean(X, axis=0)
+
+    # project onto the principal directions and rescale each column by the
+    # inverse square root of its eigenvalue so the columns have unit variance
+    rotated_X = numpy.dot(centered_X, v)
+    rotated_X /= numpy.sqrt(w)
+
+    return rotated_X
+
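+
+if __name__ == '__main__':
+    # Minimal usage sketch on random data (illustrative only): extract principal
+    # components of a random design matrix and whiten it.
+    rng = numpy.random.RandomState(42)
+    X = rng.randn(500, 20)
+    w, v = pca_from_examples(X, max_energy_fraction=0.99)
+    Xw = pca_whiten(X, (w, v))
+    # whitened columns should have roughly zero mean and unit variance
+    print Xw.shape
+    print numpy.allclose(Xw.mean(axis=0), 0, atol=1e-8)
+    print numpy.allclose(Xw.var(axis=0), 1.0, atol=0.05)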
+