# HG changeset patch
# User James Bergstra
# Date 1282233232 14400
# Node ID 2eb98a7408232567f61cf1f83bc466c6f490fdf1
# Parent  5f80351bc762404966650d995daf156bdf11e17c
Added 'preprocessing' module

diff -r 5f80351bc762 -r 2eb98a740823 pylearn/preprocessing/README.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/preprocessing/README.txt	Thu Aug 19 11:53:52 2010 -0400
@@ -0,0 +1,2 @@
+
+Module documentation is given by the docstring in the __init__.py file.
diff -r 5f80351bc762 -r 2eb98a740823 pylearn/preprocessing/__init__.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/preprocessing/__init__.py	Thu Aug 19 11:53:52 2010 -0400
@@ -0,0 +1,13 @@
+
+"""Preprocessing module
+
+This module provides small numpy functions (not necessarily Theano expressions) for one-time
+pre-processing of data for modeling experiments.  The module is a work in progress, but the
+sort of thing that belongs here is:
+
+ - PCA
+ - whitening (for images and video)
+ - TF/IDF (for text)
+ - MFCC (for audio)
+
+"""
diff -r 5f80351bc762 -r 2eb98a740823 pylearn/preprocessing/fft_whiten.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/preprocessing/fft_whiten.py	Thu Aug 19 11:53:52 2010 -0400
@@ -0,0 +1,76 @@
+"""
+Whitening algorithm used in Olshausen & Field, 1997.
+
+If you want to use some other images, there are a number of
+preprocessing steps you need to consider beforehand.  First, you
+should make sure all images have approximately the same overall
+contrast.  One way of doing this is to normalize each image so that
+the variance of the pixels is the same (i.e., 1).  Then you will need
+to prewhiten the images.  For a full explanation of whitening see
+
+Olshausen BA, Field DJ (1997) Sparse Coding with an Overcomplete
+Basis Set: A Strategy Employed by V1?  Vision Research, 37: 3311-3325.
+
+Basically, to whiten an image of size NxN, you multiply by the filter
+f*exp(-(f/f_0)^4) in the frequency domain, where f_0=0.4*N (f_0 is the
+cutoff frequency of a lowpass filter that is combined with the
+whitening filter).  Once you have preprocessed a number of images this
+way, all the same size, then you should combine them into one big N^2
+x M array, where M is the number of images.  Then rescale this array
+so that the average image variance is 0.1 (the parameters in sparsenet
+are set assuming that the data variance is 0.1).  Name this array
+IMAGES, save it to a file for future use, and you should be off and
+running.  The following Matlab commands should do this:
+
+    N=image_size;
+    M=num_images;
+
+    [fx fy]=meshgrid(-N/2:N/2-1,-N/2:N/2-1);
+    rho=sqrt(fx.*fx+fy.*fy);
+    f_0=0.4*N;
+    filt=rho.*exp(-(rho/f_0).^4);
+
+    for i=1:M
+      image=get_image;  % you will need to provide get_image
+      If=fft2(image);
+      imagew=real(ifft2(If.*fftshift(filt)));
+      IMAGES(:,i)=reshape(imagew,N^2,1);
+    end
+
+    IMAGES=sqrt(0.1)*IMAGES/sqrt(mean(var(IMAGES)));
+
+    save MY_IMAGES IMAGES
+
+"""
+
+import numpy as np
+
+
+def whiten(X, f0=None, n=None):
+    """Whiten a stack of images in the frequency domain.
+
+    :param X: a 3D tensor n_images x rows x cols
+    :param f0: cutoff frequency of the lowpass filter (default: 0.4 * min(rows, cols))
+    :param n: exponent of the lowpass filter (default: 4)
+
+    :returns: 3D tensor n_images x rows x cols of filtered images.
+    """
+    # N.B. rows and columns may be swapped relative to the Matlab code above,
+    # because Matlab orders dimensions differently.
+    R, C = X.shape[-2:]
+    if R % 2:
+        raise NotImplementedError('odd number of rows', R)
+    if C % 2:
+        raise NotImplementedError('odd number of columns', C)
+
+    if f0 is None:
+        f0 = 0.4 * min(R, C)
+    if n is None:
+        n = 4
+
+    # Build the whitening filter rho * exp(-(rho / f0) ** n) on a centred frequency grid.
+    fx, fy = np.mgrid[-(R // 2):R // 2, -(C // 2):C // 2]
+    rho = np.sqrt(fx ** 2 + fy ** 2)
+    filt = rho * np.exp(-(rho / f0) ** n)
+
+    # fft2/ifft2 act on the last two axes, so the whole image stack is filtered at once.
+    If = np.fft.fft2(X)
+    imagew = np.real(np.fft.ifft2(If * np.fft.fftshift(filt)))
+    assert imagew.shape == X.shape
+    return imagew
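
A minimal usage sketch of whiten() for orientation; the 16 random 32x32 "images" below are
hypothetical stand-ins for real data, and the per-image contrast normalization follows the
advice in the module docstring:

    import numpy as np
    from pylearn.preprocessing.fft_whiten import whiten

    rng = np.random.RandomState(0)
    images = rng.randn(16, 32, 32)          # n_images x rows x cols, even dimensions
    for i in range(len(images)):
        images[i] /= images[i].std()        # roughly equalize contrast across images

    whitened = whiten(images)               # default f0 = 0.4 * 32, n = 4
    assert whitened.shape == (16, 32, 32)
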
diff -r 5f80351bc762 -r 2eb98a740823 pylearn/preprocessing/pca.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/preprocessing/pca.py	Thu Aug 19 11:53:52 2010 -0400
@@ -0,0 +1,123 @@
+"""
+There are potentially many approaches to PCA; this file may get there eventually.
+
+Elements of this implementation have been borrowed from the MDP toolkit:
+    mdp/nodes/pca_nodes.py
+"""
+
+#TODO: estimate number of principal components by cross-validation (early stopping)
+
+import numpy
+import scipy.linalg
+
+
+def diag_as_vector(x):
+    """Return a view of the diagonal of matrix `x` (via a stride trick)."""
+    if x.ndim != 2:
+        raise TypeError('this diagonal is implemented only for matrices', x)
+    rval = x[0, :min(*x.shape)]
+    rval.strides = (rval.strides[0] + x.strides[0],)
+    return rval
+
+
+def pca_from_cov(cov, lower=0, max_components=None, max_energy_fraction=None):
+    """Return (eigvals, eigvecs) of data with covariance `cov`.
+
+    The returned eigvals will be a numpy ndarray vector.
+    The returned eigvecs will be a numpy ndarray matrix whose *cols* are the eigenvectors.
+
+    This is recommended for retrieving many components from high-dimensional data.
+
+    :param cov: data covariance matrix
+    :type cov: a numpy ndarray
+    """
+    w, v = scipy.linalg.eigh(a=cov, lower=lower)
+    # definition of eigh:
+    #   a * v[:,i] = w[i] * v[:,i]
+    #   v.H * v = identity
+
+    # Total variance can be computed at this point; note that vartot == w.sum().
+    vartot = diag_as_vector(cov).sum()
+    assert numpy.allclose(vartot, w.sum())
+
+    # Sort eigenvalues (and their eigenvectors) in decreasing order.
+    a = numpy.argsort(w)[::-1]
+    w = w[a]
+    v = v[:, a]
+
+    if max_components is not None:
+        w = w[:max_components]
+        v = v[:, :max_components]
+
+    if max_energy_fraction is not None:
+        if not (0.0 <= max_energy_fraction <= 1.0):
+            raise ValueError('illegal value for max_energy_fraction', max_energy_fraction)
+        if max_energy_fraction < 1.0:
+            # Keep just enough leading components to account for the requested energy.
+            energy = 0
+            i = 0
+            while energy < max_energy_fraction * vartot:
+                energy += w[i]
+                i += 1
+            if i < len(w):
+                w = w[:i]
+                v = v[:, :i]
+
+    return w, v
+
+
+def pca_from_examples(X, max_components=None, max_energy_fraction=None, x_centered=False):
+    """Return (eigvals, eigvecs) of observations `X` (one per row).
+
+    This function exists to wrap several algorithms for getting the principal components.
+
+    :param max_components:
+        Return no more than this many principal components.
+
+    :param max_energy_fraction:
+        Return [only] enough components to account for this fraction of the energy (aka
+        variance) in the observations.
+
+    :param x_centered:
+        True means to consider X as having mean 0 (even if it actually doesn't!)
+    """
+    if x_centered:
+        centered_X = X
+    else:
+        centered_X = X - numpy.mean(X, axis=0)
+    return pca_from_cov(numpy.cov(centered_X.T), max_components=max_components,
+            max_energy_fraction=max_energy_fraction)
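
As a rough illustration of the two extraction helpers above (the random design matrix here is
hypothetical, not anything from the changeset), pca_from_examples centers X, forms its
covariance, and defers to pca_from_cov:

    import numpy
    from pylearn.preprocessing.pca import pca_from_examples

    rng = numpy.random.RandomState(0)
    X = rng.randn(500, 20)                        # 500 observations, 20 features
    w, v = pca_from_examples(X, max_energy_fraction=0.9)

    # w: eigenvalues in decreasing order; v: matrix whose columns are the eigenvectors.
    # With max_energy_fraction=0.9, only enough leading components to cover ~90% of the
    # total variance are returned.
    assert v.shape == (20, w.shape[0])
    assert w.shape[0] <= 20
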
+
+
+def pca_whiten(X, pca):
+    """
+    Return the projection of X onto its principal components.
+
+    The return value has the same number of rows as X, but the number of columns is the number
+    of principal components.  Columns of the return value have mean 0, variance 1, and are
+    uncorrelated.
+
+    See also fft_whiten.py.
+
+    :param pca: the (w, v) pair returned by e.g. pca_from_examples(X)
+    """
+    w, v = pca
+
+    centered_X = X - numpy.mean(X, axis=0)
+
+    # Rotate into the principal-component basis, then rescale each column to unit variance.
+    rotated_X = numpy.dot(centered_X, v)
+    rotated_X /= numpy.sqrt(w)
+
+    return rotated_X
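
A short end-to-end sketch of the PCA path (again with hypothetical random data): extract
components with pca_from_examples, then decorrelate and rescale the observations with
pca_whiten:

    import numpy
    from pylearn.preprocessing.pca import pca_from_examples, pca_whiten

    rng = numpy.random.RandomState(42)
    X = rng.randn(1000, 10) * numpy.arange(1, 11)    # columns with different variances
    pca = pca_from_examples(X, max_components=5)

    Xw = pca_whiten(X, pca)
    assert Xw.shape == (1000, 5)
    # Whitened columns are (approximately) zero-mean, unit-variance and uncorrelated.
    assert numpy.allclose(numpy.cov(Xw.T), numpy.eye(5), atol=1e-6)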