changeset 965:bf54637bb994

merge
author James Bergstra <bergstrj@iro.umontreal.ca>
date Fri, 20 Aug 2010 09:31:39 -0400
parents 6a778bca0dec (diff) d944e1c26a57 (current diff)
children e88d7b7d53ed
files
diffstat 30 files changed, 1146 insertions(+), 808 deletions(-)
--- a/pylearn/__init__.py	Mon Aug 16 10:39:36 2010 -0400
+++ b/pylearn/__init__.py	Fri Aug 20 09:31:39 2010 -0400
@@ -1,3 +1,22 @@
+"""Pylearn - Theano-based machine learning library
+
+Modules:
+
+ - `io` - Python parsers/writers for various file formats
+
+ - `datasets` - loading of various datasets (depends on `io`)
+
+ - `preprocessing` - pre-processing and feature extraction for various modalities
+
+ - `external` - code for interacting with other packages
+
+ - `gd` - gradient-based optimization (for stochastic online learning)
+
+ - `sampling` - theano expressions and tools for sampling from energy functions
+
+ - `algorithms` - implementations of [published] models, typically with a script that reproduces a published result (depends on all of the above)
+
+"""
 #import exceptions
 
 
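For readers skimming the new layout, a minimal sketch of how the submodules listed above are meant to be combined (the module names come from this changeset; the particular pairing of imports is illustrative only):

    from pylearn.datasets import cifar10      # dataset loaders (build on `io`)
    from pylearn.gd import sgd, stopper       # gradient-based optimization + early stopping
    from pylearn.preprocessing import pca     # one-time preprocessing / feature extraction
    from pylearn.io import image_tiling       # readers/writers and image utilities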
--- a/pylearn/algorithms/minimizer.py	Mon Aug 16 10:39:36 2010 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,36 +0,0 @@
-"""Define the interface and factory for gradient-based minimizers.
-"""
-import theano
-
-class DummyMinimizer(theano.Module):
-    """ The idea of a minimizer is that it provides an `step` function that will
-    eventually converge toward (maybe realize?) the minimum of a cost function.
-
-    The step_cost function takes a step and returns the cost associated with either
-    the current or previous parameter values (return whichever is easiest to compute, it's
-    meant for user feedback.)
-
-    """
-    def __init__(self, args, cost, parameters, gradients=None):
-        super(DummyMinimizer, self).__init__()
-
-    def _instance_step(self, obj, *args):
-        """Move the parameters toward the minimum of a cost
-
-        :param args: The arguments here should be values for the Variables that were in the
-        `args` argument to the constructor.
-
-        :Return:  None
-        """
-        pass
-
-    def _instance_step_cost(self, obj, *args):
-        """Move the parameters toward the minimum of a cost, and compute the cost
-
-        :param args: The arguments here should be values for the Variables that were in the
-        `args` argument to the constructor.
-
-        :Return:  The current cost value.
-        """
-        pass
-
--- a/pylearn/algorithms/sgd.py	Mon Aug 16 10:39:36 2010 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,75 +0,0 @@
-"""A stochastic gradient descent minimizer. (Possibly the simplest minimizer.)
-"""
-
-import theano
-
-class StochasticGradientDescent(theano.Module):
-    """Fixed stepsize gradient descent
-
-    Methods for gradient descent are:
-    - step(arg_vals) which returns None and updates the params
-    - step_cost(arg_vals) which returns the cost value, and updates the params
-    
-    """
-    def __init__(self, args, cost, params, 
-                 gradients=None, stepsize=None, 
-                 updates=None, auxout=None, methods=True):
-        """
-        :param stepsize: the step to take in (negative) gradient direction
-        :type stepsize: None, scalar value, or scalar TensorVariable
-
-        :param updates: extra symbolic updates to make when evating either step or step_cost
-        (these override the gradients if necessary)
-        :type updates: dict Variable -> Variable
-        :param auxout: auxiliary outputs, list containing output symbols to 
-                      compute at the same time as cost (for efficiency)
-        :param methods: Should this module define the step and step_cost methods?
-        """
-        super(StochasticGradientDescent, self).__init__()
-        self.stepsize_init = None
-
-        if stepsize is None:
-            self.stepsize = theano.tensor.dscalar()
-        elif isinstance(stepsize, theano.tensor.TensorVariable):
-            self.stepsize = stepsize
-        else:
-            self.stepsize = (theano.tensor.as_tensor_variable(stepsize))
-
-        if self.stepsize.ndim != 0:
-            raise TypeError('stepsize must be a scalar', stepsize)
-
-        self.params = params
-        self.gparams = theano.tensor.grad(cost, self.params) if gradients is None else gradients
-
-        self._updates = (dict((p, p - self.stepsize * g) for p, g in zip(self.params, self.gparams)))
-        if updates is not None:
-            self._updates.update(updates)
-
-        if methods:
-            if auxout is None:
-                self.step = theano.Method(args, [], updates=self._updates)
-                self.step_cost = theano.Method(args, cost, updates=self._updates)
-            else:
-                # step cost always returns a list if auxout
-                self.step = theano.Method(
-                        args, [] + auxout,
-                        updates=self._updates)
-                self.step_cost = theano.Method(
-                        args, [cost]+auxout,
-                        updates=self._updates)
-
-
-    updates = property(lambda self: self._updates.copy())
-
-    def _instance_initialize(self, obj):
-        pass
-
-def sgd_minimizer(stepsize=None):
-    """Curry the stepsize argument to StochasticGradientDescent, providing standard minimizer interface
-    
-    :returns: standard minimizer constructor f(args, cost, params, gradient=None)
-    """
-    def f(args, cost, params, gradients=None, updates=None, auxout=None):
-        return StochasticGradientDescent(args, cost, params, gradients=gradients, stepsize=stepsize,
-                updates=updates, auxout=auxout)
-    return f
--- a/pylearn/algorithms/stopper.py	Mon Aug 16 10:39:36 2010 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,131 +0,0 @@
-import time
-"""Early stopping iterators
-
-The idea here is to supply early-stopping heuristics that can be used in the
-form:
-
-    stopper = SomeEarlyStopper()
-
-    for i in stopper():
-        # train from data
-        if i.set_score:
-            i.score = validation_score
-
-
-So far I only have one heuristic, so maybe this won't scale.
-"""
-
-class Stopper(object):
-
-    def train(self, data, update_rows_fn, update, validate, save=None):
-        """Return the best model trained on data
-
-        Parameters:
-        data - a thing that accepts getitem(<list of int64>), or a tuple of such things
-        update_rows_fn - fn : int --> <list or tensor of int>
-        update - fn: update an internal model from elements of data
-        validate - fn: evaluate an internal model based on elements of data
-        save - fn: return a copy of the internal model
-
-        The body of this function exhausts the <self> iterator, and trains a
-        model using early stopping in the process.
-        """
-
-        best = None
-        for stp in self:
-            i = stp.iter
-
-            # call update on some training set rows
-            t_rows = update_rows_fn(i)
-            if isinstance(data, (tuple, list)):
-                update(*[d[t_rows] for d in data])
-            else:
-                update(data[t_rows])
-
-            if stp.set_score:
-                stp.score = validate()
-                if (stp.score < stp.best_score) and save:
-                    best = save()
-        return best
-
-    def find_min(self, step, check, save):
-        best = None
-        for stp in self:
-            step()
-            if stp.set_score:
-                stp.score = check()
-                if (stp.score < stp.best_score) and save:
-                    best = (save(), stp.iter, stp.score)
-        return best
-
-class ICML08Stopper(Stopper):
-    @staticmethod
-    def icml08(ntrain, batchsize):
-        """Some setting similar to what I used for ICML08 submission"""
-        #TODO: what did I actually use? put that in here.
-        return ICML08Stopper(30*ntrain/batchsize,
-                ntrain/batchsize, 0.96, 2.0, 100000000)
-
-    def __init__(self, i_wait, v_int, min_improvement, patience, hard_limit, hard_time_limit=None):
-        self.initial_wait = i_wait
-        self.set_score_interval = v_int
-        self.min_improvement = min_improvement
-        self.patience = patience
-        self.hard_limit = hard_limit
-        self.hard_limit_seconds = hard_time_limit
-        self.start_time = time.time()
-
-        self.best_score = float('inf')
-        self.best_iter = -1
-        self.iter = -1
-
-        self.set_score = False
-        self.score = None
-
-    def __iter__(self):
-        return self
-
-    E_set_score = 'when iter.set_score is True, caller must assign a score to iter.score'
-    def next(self):
-
-        #print 'ICML08 stopper, were doing a next'
-
-        if self.set_score: #left over from last time
-            if self.score is None:
-                raise Exception(ICML08Stopper.E_set_score)
-            if self.score < (self.best_score * self.min_improvement):
-                (self.best_score, self.best_iter) = (self.score, self.iter)
-            self.score = None #un-set it
-
-
-        starting = self.iter < self.initial_wait
-        waiting = self.iter < (self.patience * self.best_iter)
-        if self.hard_limit_seconds != None:
-            times_up = (time.time() - self.start_time) > self.hard_limit_seconds
-        else: times_up = False
-        if (starting or waiting) and not times_up:
-            # continue to iterate
-            self.iter += 1
-            if self.iter == self.hard_limit:
-                raise StopIteration
-            self.set_score = (self.iter % self.set_score_interval == 0)
-            return self
-
-        raise StopIteration
-
-class NStages(ICML08Stopper):
-    """Run for a fixed number of steps, checking validation set every so
-    often."""
-    def __init__(self, hard_limit, v_int):
-        ICML08Stopper.__init__(self, hard_limit, v_int, 1.0, 1.0, hard_limit)
-
-    #TODO: could optimize next() function. Most of what's in ICML08Stopper.next()
-    #is not necessary
-
-def geometric_patience(i_wait, v_int, min_improvement, patience, hard_limit):
-    return ICML08Stopper(i_wait, v_int, min_improvement, patience, hard_limit)
-
-def nstages(hard_limit, v_int):
-    return ICML08Stopper(hard_limit, v_int, 1.0, 1.0, hard_limit)
-
-
--- a/pylearn/algorithms/tests/test_sgd.py	Mon Aug 16 10:39:36 2010 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,75 +0,0 @@
-import theano
-from theano.compile.debugmode import DebugMode
-from pylearn.algorithms import sgd
-
-mode = theano.compile.mode.get_default_mode()
-if isinstance(mode,DebugMode):
-    mode = 'FAST_RUN'
-
-def test_sgd0():
-
-    x = theano.tensor.dscalar('x')
-    y = theano.tensor.dscalar('y')
-
-    M = sgd.StochasticGradientDescent([x], (1.0 - x * y)**2, [y], stepsize=0.01)
-    M.y = y
-    m = M.make(mode=mode)
-    m.y = 5.0
-    for i in xrange(100):
-        c = m.step_cost(3.0)
-        #print c[0], m.y
-
-    assert c < 1.0e-5
-    assert abs(m.y - (1.0 / 3)) < 1.0e-4
-
-def test_sgd_stepsize_variable():
-
-    x = theano.tensor.dscalar('x')
-    y = theano.tensor.dscalar('y')
-    lr = theano.tensor.dscalar('lr')
-
-    M = sgd.StochasticGradientDescent([x], (1.0 - x * y)**2, [y], stepsize=lr)
-    M.y = y
-    M.lr = lr
-    m = M.make(mode=mode)
-    m.y = 5.0
-    m.lr = 0.01
-    for i in xrange(100):
-        c = m.step_cost(3.0)
-        # print c, m.y
-
-    assert c < 1.0e-5
-    assert abs(m.y - (1.0 / 3)) < 1.0e-4
-
-
-    #test that changing the lr has impact
-
-    m.y = 5.0
-    m.lr = 0.0
-    for i in xrange(10):
-        c = m.step_cost(3.0)
-        # print c, m.y
-
-    assert m.y == 5.0
-
-def test_sgd_stepsize_none():
-
-    x = theano.tensor.dscalar('x')
-    y = theano.tensor.dscalar('y')
-
-    M = sgd.StochasticGradientDescent([x], (1.0 - x * y)**2, [y])
-    M.y = y
-    m = M.make(mode=mode)
-    m.y = 5.0
-    #there should be a learning rate here by default
-    assert m.stepsize is None
-    m.stepsize = 0.01
-    for i in xrange(100):
-        c = m.step_cost(3.0)
-        # print c, m.y
-
-    assert c < 1.0e-5
-    assert abs(m.y - (1.0 / 3)) < 1.0e-4
-
-if __name__ == '__main__':
-    test_sgd0()
--- a/pylearn/dataset_ops/cifar10.py	Mon Aug 16 10:39:36 2010 -0400
+++ b/pylearn/dataset_ops/cifar10.py	Fri Aug 20 09:31:39 2010 -0400
@@ -59,7 +59,9 @@
 
 
 def cifar10(s_idx, split, dtype='float64', rasterized=False, color='grey'):
-    """
+    """ 
+    Returns a pair (img, label) of theano expressions for cifar-10 samples
+
     :param s_idx: the indexes
 
     :param split:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/dataset_ops/image_patches.py	Fri Aug 20 09:31:39 2010 -0400
@@ -0,0 +1,41 @@
+import os, numpy
+import theano
+
+from pylearn.datasets.image_patches import  (
+        olshausen_field_1996_whitened_images,
+        extract_random_patches)
+
+from .protocol import TensorFnDataset # protocol.py __init__.py
+from .memo import memo
+
+@memo
+def get_dataset(N,R,C,dtype):
+    seed=98234
+    rng = numpy.random.RandomState(seed)
+    img_stack = olshausen_field_1996_whitened_images()
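+    # NOTE: olshausen_field_1996_whitened_images returns a (512, 512, 10) array with the 10
+    # images stacked on the *last* axis, while extract_random_patches documents an
+    # (n_images, rows, cols) layout; a transpose of img_stack may be needed here.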
+    patch_stack = extract_random_patches(img_stack, N,R,C,rng)
+    return patch_stack.astype(dtype).reshape((N,(R*C)))
+
+def image_patches(s_idx, dims,
+        split='train', dtype=theano.config.floatX, rasterized=False):
+    N,R,C=dims
+
+    if split != 'train':
+        raise NotImplementedError('train/test/valid splits for randomly sampled image patches?')
+
+    if not rasterized:
+        raise NotImplementedError()
+
+    op = TensorFnDataset(dtype, bcast=(False,), fn=(get_dataset, (N,R,C,dtype,)), single_shape=(R*C,))
+    x = op(s_idx%N)
+    if x.ndim == 1:
+        if not rasterized:
+            # currently unreachable (the non-rasterized case raises above); use the requested
+            # patch shape rather than a hard-coded one
+            x = x.reshape((R, C))
+    elif x.ndim == 2:
+        if not rasterized:
+            x = x.reshape((x.shape[0], R, C))
+    else:
+        assert False, 'what happened?'
+
+    return x
+
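A rough sketch (not part of the changeset) of how this op is meant to be wired up: rasterized=True is required since the non-rasterized path raises NotImplementedError, and actually evaluating the compiled function assumes the Olshausen image data is installed under data_root(); the patch count and size below are made up.

    import theano
    import theano.tensor as TT
    from pylearn.dataset_ops.image_patches import image_patches

    s_idx = TT.lscalar('idx')
    # symbolic expression for the s_idx'th of 10000 random 8x8 patches, as a flat vector
    x = image_patches(s_idx, (10000, 8, 8), rasterized=True)
    f = theano.function([s_idx], x)
    # f(0) would return an ndarray of shape (64,) once the dataset files are available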
--- a/pylearn/dataset_ops/protocol.py	Mon Aug 16 10:39:36 2010 -0400
+++ b/pylearn/dataset_ops/protocol.py	Fri Aug 20 09:31:39 2010 -0400
@@ -66,6 +66,12 @@
     property of this Op, then pickling the Op would require pickling the entire dataset.
     """
     def __init__(self, dtype, bcast, fn, single_shape=None, batch_size=None):
+        """
+        :type fn: callable or (callable, args) tuple [MUST BE PICKLABLE!]
+
+        :param fn: function that returns the dataset as a tensor. Leading index is the example
+        index, others are considered part of each example.
+        """
         super(TensorFnDataset, self).__init__(dtype, bcast, single_shape, batch_size)
         try:
             self.fn, self.fn_args = fn
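Since the docstring stresses that `fn` must be picklable, the usual pattern (as in the new dataset_ops/image_patches.py above) is to pass a module-level function plus an args tuple rather than a lambda or closure. A hedged sketch with made-up shapes:

    import numpy
    from pylearn.dataset_ops.protocol import TensorFnDataset

    def load_examples(n, dim):                          # module-level, hence picklable
        return numpy.zeros((n, dim), dtype='float64')   # stand-in for a real loader

    op = TensorFnDataset('float64', bcast=(False,),
                         fn=(load_examples, (1000, 784)),
                         single_shape=(784,))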
--- a/pylearn/datasets/cifar10.py	Mon Aug 16 10:39:36 2010 -0400
+++ b/pylearn/datasets/cifar10.py	Fri Aug 20 09:31:39 2010 -0400
@@ -88,3 +88,19 @@
 
 def first_1k(dtype='uint8', ntrain=1000, nvalid=200, ntest=200):
     return cifar10(dtype, ntrain, nvalid, ntest)
+
+def tile_rasterized_examples(X):
+    """Returns an ndarray that is ready to be passed to `image_tiling.save_tiled_raster_images`
+
+    This function is for the `x` matrices in the cifar dataset, or for the weight matrices
+    (filters) used to multiply them.
+    """
+    X = X.astype('float32')
+    r = X[:,:1024]
+    g = X[:,1024:2048]
+    b = X[:,2048:]
+    from pylearn.io.image_tiling import tile_raster_images
+    rval = tile_raster_images((r,g,b,None), img_shape=(32,32))
+    return rval
+
+
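A hedged sketch of how the new helper pairs with the image_tiling module (both functions are part of this changeset; the loader call and the attribute layout of its return value are assumptions, modeled on the MNIST loader used elsewhere in the repo):

    from pylearn.datasets.cifar10 import cifar10, tile_rasterized_examples
    from pylearn.io.image_tiling import save_tiled_raster_images

    data = cifar10(dtype='uint8')        # assumed to expose train.x like the MNIST loader
    X = data.train.x[:100]               # rows of length 3072: 1024 red, 1024 green, 1024 blue
    img = tile_rasterized_examples(X)    # ndarray ready for saving
    save_tiled_raster_images(img, 'cifar_examples.png')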
--- a/pylearn/datasets/image_patches.py	Mon Aug 16 10:39:36 2010 -0400
+++ b/pylearn/datasets/image_patches.py	Fri Aug 20 09:31:39 2010 -0400
@@ -5,6 +5,8 @@
 import os
 import numpy
 
+import scipy.io  #for loadmat
+
 from .config import data_root
 from .dataset import Dataset
 
@@ -14,6 +16,13 @@
          '12by12_whiten_01': ('natural_images_patches_whiten_12_by_12_0_1.amat',(12,12))}
 
 def load_dataset(ntrain=70000, nvalid=15000, ntest=15000, variant='20by20_whiten_01'):
+    # This implementation loads files which are way too big (storing bytes as doubles), and
+    # stored in ascii!??, and which appear to be un-documented variants on the original
+    # raw/whitened
+    # data.
+    # I'd like to deprecate this function.
+    # -JB Aug 2010
+    print >> sys.stderr, "WARNING: pylearn.datasets.image_patches.load_dataset is badly documented and does some weird stuff... could someone who uses this function do something about it?"
     
     ndata = 100000
 
@@ -40,3 +49,65 @@
     rval.img_shape = paths[variant][1]
 
     return rval
+
+
+#TODO: a little function to render a tiling of example images to an image using PIL
+
+#TODO: a pca_load_dataset function that loads the data, as projected onto principle components
+
+def olshausen_field_1996_whitened_images(path=None):
+    """Returns a (512,512,10) ndarray containing 10 whitened images.
+    
+    The images are in floating point.
+    Whitening was done by the paper authors, with band-pass whitening I think.
+    """
+    if path is None:
+        path=os.path.join(data_root(), 'image_patches', 'olshausen',
+                'original', 'IMAGES.mat')
+    images = scipy.io.loadmat(path)['IMAGES']
+    assert images.shape == (512,512,10)
+    return images.astype('float32')
+
+def olshausen_field_1996_raw_images(path=None):
+    """Returns a (512,512,10) ndarray containing 10 images.
+    
+    The images are in floating point.
+    """
+    if path is None:
+        path=os.path.join(data_root(), 'image_patches', 'olshausen',
+                'original', 'IMAGES_RAW.mat')
+    images = scipy.io.loadmat(path)['IMAGES_RAW']
+    assert images.shape == (512,512,10)
+    return images.astype('float32')
+
+def extract_random_patches(img_stack, N, R,C, rng):
+    """Return subimages from the img_stack
+
+    :param img_stack: a 3D ndarray (n_images, rows, cols) or a list of 2D images.
+    :param N: number of patches to extract
+    :param R: number of rows in patch
+    :param C: number of cols in patch
+    :param rng: numpy RandomState
+    
+    Each patch is drawn by first choosing an image from the img_stack uniformly at random, and
+    then choosing a patch position uniformly within that image.  A given patch location in a
+    larger image is therefore sampled less often than one in a smaller image.
+
+    To use ZCA whitening, extract patches from the raw data, and pass them to
+    preprocessing.pca.zca_whiten.
+    """
+    rval = numpy.empty((N,R,C), dtype=img_stack[0].dtype)
+
+    L = len(img_stack)
+    img_idxlist = rng.randint(L, size=N)
+
+    for n, img_idx in enumerate(img_idxlist):
+        img_n = img_stack[img_idx]
+        offset_R = rng.randint(img_n.shape[0]-R+1)
+        offset_C = rng.randint(img_n.shape[1]-C+1)
+        rval[n] = img_n[offset_R:offset_R+R,offset_C:offset_C+C]
+
+    return rval
+
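For reference, the new dataset_ops/image_patches.py in this changeset passes the loader's output straight to extract_random_patches; the sketch below instead transposes first so that the array matches the (n_images, rows, cols) layout documented above (patch count and size are made up):

    import numpy
    from pylearn.datasets.image_patches import (
            olshausen_field_1996_whitened_images, extract_random_patches)

    rng = numpy.random.RandomState(98234)
    imgs = olshausen_field_1996_whitened_images()     # (512, 512, 10), images on the last axis
    stack = imgs.transpose(2, 0, 1)                   # -> (10, 512, 512)
    patches = extract_random_patches(stack, 1000, 16, 16, rng)
    assert patches.shape == (1000, 16, 16)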
--- a/pylearn/examples/daa.py	Mon Aug 16 10:39:36 2010 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,206 +0,0 @@
-import math, sys, time, copy, cPickle, shutil, functools
-import numpy 
-
-import theano
-from theano import tensor
-from theano.compile import module
-from theano.sandbox.softsign import softsign
-from theano import tensor as T, sparse as S
-from theano.tensor import nnet, as_tensor
-from theano.tensor.randomstreams import RandomStreams
-from theano.printing import Print
-from theano.compile.mode import Mode
-
-from ..algorithms import cost, minimizer, logistic_regression, sgd, stopper
-from ..io import filetensor
-from ..datasets import MNIST
-
-
-class AbstractFunction(Exception): """Override me"""
-
-#
-# default corruption function for BasicDAA
-# to extend
-#
-def corrupt_random_zeros(x, rng, p_zero):
-    mask = rng.binomial(T.shape(x), 1, 1.0 - p_zero)
-    return mask * x
-
-#
-# BasicDAA
-# ========
-#
-# Re-usable module, could be in pylearn.algorithms.daa
-#
-class BasicDAA(theano.Module):
-    def __init__(self, n_visible, n_hidden, seed,
-            w=None, 
-            vbias=None, 
-            hbias=None,
-            sigmoid=nnet.sigmoid,
-            corrupt=functools.partial(corrupt_random_zeros, p_zero=0.1),
-            reconstruction_cost=cost.cross_entropy,
-            w_scale = None
-            ):
-        """
-        :param w_scale: should be a floating-point value or None.  The weights are initialized
-        to a random value on the interval (-1,1) and scaled by this value.
-        None means the default of 1/sqrt(max(n_visible, n_hidden))
-
-
-        """
-        super(BasicDAA, self).__init__()
-
-        self.n_visible = n_visible
-        self.n_hidden = n_hidden
-        self.random = RandomStreams()
-        self.seed = seed
-        self.w_scale = w_scale if w_scale is not None else 1.0 / math.sqrt(max(n_visible,n_hidden))
-        self.sigmoid = sigmoid
-        self.corrupt = corrupt
-        self.reconstruction_cost = reconstruction_cost
-
-        self.w = tensor.dmatrix() if w is None else w
-        self.vbias = tensor.dvector() if vbias is None else vbias
-        self.hbias = tensor.dvector() if hbias is None else hbias
-
-        self.params = [self.w, self.vbias, self.hbias]
-
-    def _instance_initialize(self, obj):
-        #consider offering override parameters for seed, n_visible, n_hidden, w_scale
-        super(BasicDAA, self)._instance_initialize(obj)
-        rng = numpy.random.RandomState(self.seed)
-        s = rng.randint(2**30)
-        obj.w = (rng.rand(self.n_visible, self.n_hidden) * 2.0 - 1.0) * self.w_scale
-        obj.vbias = numpy.zeros(self.n_visible)
-        obj.hbias = numpy.zeros(self.n_hidden)
-        obj.random.initialize(int(s))
-
-    def hidden_act(self, visible):
-        return theano.dot(visible, self.w) + self.hbias
-
-    def hidden(self, visible):
-        return self.sigmoid(self.hidden_act(visible))
-
-    def visible_act(self, hidden):
-        return theano.dot(hidden, self.w.T) + self.vbias
-
-    def visible(self, hidden):
-        return self.sigmoid(self.visible_act(hidden))
-
-    def reconstruction(self, visible):
-        return self.visible(self.hidden(self.corrupt(visible, self.random)))
-
-    def daa_cost(self, visible):
-        return self.reconstruction_cost(visible, self.reconstruction(visible))
-
-    def l2_cost(self):
-        return self.w.norm(2)
-
-
-
-#
-# StackedDAA
-# ==========
-#
-# Hacky experiment type code, which would *not* be in pylearn.
-#
-# This Module can be extended / parametrized so many ways that I think this code is best cut &
-# pasted.
-#
-class StackedDAA(theano.Module):
-    def __init__(self, layer_widths, seed, finetune_lr=1e-3, pretrain_lr=1e-4):
-        super(StackedDAA, self).__init__()
-        input = theano.tensor.dmatrix()
-
-        #the parameters of this function, required for the minimizer
-        self.params = [] 
-        daa_widths = layer_widths[:-1]
-
-        #create the stack of DAA modules, and the self.params list
-        self.daa_list = []
-        daa_input = input
-        for i in xrange(len(daa_widths)-1):
-            self.daa_list.append(BasicDAA(daa_widths[i], daa_widths[i+1], seed+i))
-            daa_input = self.daa_list[-1].hidden(daa_input)
-            self.params.extend(self.daa_list[-1].params)
-
-        #put a logistic regression module on top for classification
-        self.classif = logistic_regression.LogRegN(input=daa_input,
-                n_in=layer_widths[-2], n_out = layer_widths[-1])
-        self.params.extend(self.classif.params)
-
-        #set up the fine-tuning function (minimizer.step)
-        FineTuneMinimizer = sgd.sgd_minimizer(stepsize=finetune_lr)
-        self.finetune = FineTuneMinimizer([input, self.classif.target], self.classif.unregularized_cost, self.params)
-
-        #set up the pre-training function
-        pretrain_cost, pretrain_input, pretrain_params = reduce(
-                lambda (c,i,p), daa: (c + daa.daa_cost(i), daa.hidden(i), p + daa.params),
-                self.daa_list,
-                (0.0, input, []))
-        PreTrainMinimizer = sgd.sgd_minimizer(stepsize=pretrain_lr)
-        self.pretrain = PreTrainMinimizer([input], pretrain_cost, pretrain_params)
-        
-    def _instance_initialize(self, obj):
-        #consider offering override parameters for seed, n_visible, n_hidden, w_scale
-        super(StackedDAA, self)._instance_initialize(obj)
-
-        #ugh... why do i need to do this???
-        for daa in obj.daa_list:
-            daa.initialize()
-        obj.classif.initialize()
-        obj.finetune.initialize()
-        obj.pretrain.initialize()
-
-#
-# DRIVER
-# ======
-#
-# This learning algorithm is the sort of thing that we've put in 'experiment' functions, that
-# can be run using dbdict.  
-#
-def demo_random(layer_widths=[3,4,5]):
-    sdaa = StackedDAA(layer_widths, seed=666).make(mode='FAST_COMPILE')
-
-    # create some training data
-    rng = numpy.random.RandomState(7832)
-    input_data = rng.randn(10,3)
-    targ_data = rng.binomial(1,.5, size=10)
-
-    print 'Pre-training ...'
-    for i in xrange(5):
-        print sdaa.pretrain.step_cost(input_data)
-
-    print 'Fine-tuning ...'
-    for i in xrange(5):
-        print sdaa.finetune.step_cost(input_data, targ_data)
-
-
-def demo_mnist(layer_widths=[784,500,500]):
-    sdaa = StackedDAA(layer_widths, seed=666).make()
-
-    mnist = MNIST.full()
-    batchsize=10
-    n_pretrain_batches=10000
-    n_finetune_batches=10000
-
-    t0 = time.time()
-    print 'Pre-training ...'
-    for i in xrange(n_pretrain_batches):
-        ii = (i*batchsize) % len(mnist.train.x)
-        x = mnist.train.x[ii:ii+batchsize]
-        c = sdaa.pretrain.step_cost(x)
-        if not i % 100:
-            print i, n_pretrain_batches, time.time() - t0, c
-
-    t1 = time.time()
-    print 'Fine-tuning ...'
-    for i in xrange(n_finetune_batches):
-        ii = (i*batchsize) % len(mnist.train.x)
-        x = mnist.train.x[ii:ii+batchsize]
-        y = mnist.train.y[ii:ii+batchsize]
-        c = sdaa.finetune.step_cost(x, y)
-        if not i % 100:
-            print i, n_finetune_batches, time.time() - t1, c
-
--- a/pylearn/examples/linear_classifier.py	Mon Aug 16 10:39:36 2010 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,224 +0,0 @@
-#! /usr/bin/env python
-"""
-T. Bertin-Mahieux (2008) University of Montreal
-bertinmt@iro.umontreal.ca
-
-linear_classifier.py
-Simple script that creates a linear_classifier, and
-learns the parameters using backpropagation.
-
-This is to illustrate how to use theano/pylearn.
-Anyone who knows how to make this script simpler/clearer is welcome to
-make the modifications.
-"""
-
-
-import os
-import sys
-import time
-import copy
-import pickle
-import numpy
-import numpy as N
-import numpy.random as NR
-from pylearn import cost
-import theano
-from theano import tensor as T
-
-
-def cost_function(*args,**kwargs) :
-    """ default cost function, quadratic """
-    return cost.quadratic(*args,**kwargs)
-
-
-class modelgraph() :
-    """ class that contains the graph of the model """
-    lr = T.scalar()                              # learning rate
-    inputs = T.matrix()                          # inputs (one example per line)
-    true_outputs = T.matrix()                    # outputs (one example per line)
-    W = T.matrix()                               # weights input * W + b= output
-    b = T.vector()                               # bias
-    outputs = T.dot(inputs,W) + b                # output, one per line
-    costs = cost_function(true_outputs,outputs)  # costs
-    g_W = T.grad(costs,W)                        # gradient of W
-    g_b = T.grad(costs,b)                        # gradient of b
-    new_W = T.sub_inplace(W, lr * g_W)           # update inplace of W
-    new_b = T.sub_inplace(b, lr * g_b)           # update inplace of b
-
-
-class model() :
-    """ 
-    The model! 
-    Contains needed matrices, needed functions, and a link to the model graph.
-    """
-
-    def __init__(self,input_size,output_size) :
-        """ init matrix and bias, creates the graph, create a dict of compiled functions """
-        # graph
-        self.graph = modelgraph()
-        # weights and bias, saved in self.params
-        seed = 666
-        r = NR.RandomState(seed)
-        W = r.uniform(size = [input_size, output_size], low = -1/N.sqrt(input_size), high = 1/N.sqrt(input_size))
-        b = numpy.zeros((output_size, ))
-        self.params = [W,b]
-        # dictionary of compiled functions
-        self.func_dict = dict()
-        # keep some init_infos (may not be necessary)
-        self.init_params = [input_size,output_size]
-
-
-    def update(self,lr,true_inputs,true_outputs) :
-        """ does an update of the model, one gradient descent """
-        # do we already have the proper theano function?
-        if self.func_dict.has_key('update_func') :
-            self.func_dict['update_func'](lr,true_inputs,true_outputs,self.params[0],self.params[1])
-            return
-        else :
-            # create the theano function, tell him what are the inputs and outputs)
-            func = theano.function([self.graph.lr,self.graph.inputs,self.graph.true_outputs,
-                                self.graph.W, self.graph.b],
-                               [self.graph.new_W,self.graph.new_b])
-            # add function to dictionary, so we don't compile it again
-            self.func_dict['update_func'] = func
-            # use this function
-            func(lr,true_inputs,true_outputs,self.params[0],self.params[1])
-            return
-    
-    def costs(self,true_inputs,true_outputs) :
-        """ get the costs for given examples, don't update """
-        # do we already have the proper theano function?
-        if self.func_dict.has_key('costs_func') :
-            return self.func_dict['costs_func'](true_inputs,true_outputs,self.params[0],self.params[1])
-        else :
-            # create the theano function, tell him what are the inputs and outputs)
-            func = theano.function([self.graph.inputs,self.graph.true_outputs,self.graph.W,self.graph.b],
-                               [self.graph.costs])
-            # add function to dictionary, se we don't compile it again
-            self.func_dict['costs_func'] = func
-            # use this function
-            return func(true_inputs,true_outputs,self.params[0],self.params[1])
-
-    def outputs(self,true_inputs) :
-        """ get the output for a set of examples (could be called 'predict') """
-        # do we already have the proper theano function?
-        if self.func_dict.has_key('outputs_func') :
-            return self.func_dict['outputs_func'](true_inputs,self.params[0],self.params[1])
-        else :
-            # create the theano function, tell him what are the inputs and outputs)
-            func = theano.function([self.graph.inputs, self.graph.W, self.graph.b],
-                               [self.graph.outputs])
-            # add function to dictionary, se we don't compile it again
-            self.func_dict['outputs_func'] = func
-            # use this function
-            return func(true_inputs,self.params[0],self.params[1])
-
-    def __getitem__(self,inputs) :
-        """ for simplicity, we can use the model this way: predictions = model[inputs] """
-        return self.outputs(inputs)
-
-    def __getstate__(self) :
-        """
-        To save/copy the model, used by pickle.dump() and by copy.deepcopy().
-        @return a dictionnary with the params (matrix + bias)
-        """
-        d = dict()
-        d['params'] = self.params
-        d['init_params'] = self.init_params
-        return d
-        
-    def __setstate__(self,d) :
-        """
-        Get the dictionary created by __getstate__(), use it to recreate the model.
-        """
-        self.params = d['params']
-        self.init_params = d['init_params']
-        self.graph = modelgraph() # we did not save the model graph 
-
-    def __str__(self) :
-        """ returns a string representing the model """
-        res = "Linear regressor, input size =",str(self.init_params[0])
-        res += ", output size =", str(self.init_params[1])
-        return res
-
-    def __equal__(self,other) :
-        """ 
-        Compares the model based on the params.
-        @return True if the params are the same, False otherwise
-        """
-        # class
-        if not isinstance(other,model) :
-            return False
-        # input size
-        if self.params[0].shape[0] != other.params[0].shape[0] :
-            return False
-        # output size
-        if self.params[0].shape[1] != other.params[0].shape[1] :
-            return False
-        # actual values
-        if not (self.params[0] == other.params[0]).all():
-            return False
-        if not (self.params[1] == other.params[1]).all():
-            return False
-        # all good
-        return True
-
-
-def die_with_usage() :
-    """ help menu """
-    print 'simple script to illustrate how to use theano/pylearn'
-    print 'to launch:'
-    print '  python linear_classifier.py -launch'
-    sys.exit(0)
-
-
-
-#************************************************************
-# main
-
-if __name__ == '__main__' :
-
-    if len(sys.argv) < 2 :
-        die_with_usage()
-
-    # print create data
-    inputs = numpy.array([[.1,.2],
-                          [.2,.8],
-                          [.9,.3],
-                          [.6,.5]])
-    outputs = numpy.array([[0],
-                           [0],
-                           [1],
-                           [1]])
-    assert inputs.shape[0] == outputs.shape[0]
-
-    # create model
-    m = model(2,1)
-    
-    # predict
-    print 'prediction before training:'
-    print m[inputs]
-
-    # update it for 100 iterations
-    for k in range(50) :
-        m.update(.1,inputs,outputs)
-
-     # predict
-    print 'prediction after training:'
-    print m[inputs]
-
-    # show points
-    import pylab as P
-    colors = outputs.flatten().tolist()
-    x = inputs[:,0]
-    y = inputs[:,1]
-    P.plot(x[numpy.where(outputs==0)[0]],y[numpy.where(outputs==0)[0]],'r+')
-    P.plot(x[numpy.where(outputs==1)[0]],y[numpy.where(outputs==1)[0]],'b+')
-    # decision line
-    p1 = (.5 - m.params[1] * 1.) / m.params[0][1,0] # abs = 0
-    p2 = (.5 - m.params[1] * 1.) / m.params[0][0,0] # ord = 0
-    P.plot((0,p2[0],2*p2[0]),(p1[0],0,-p1[0]),'g-')
-    # show
-    P.axis([-1,2,-1,2])
-    P.show()
-    
--- a/pylearn/examples/theano_update.py	Mon Aug 16 10:39:36 2010 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,56 +0,0 @@
-import theano
-from theano import tensor
-
-import numpy
-
-# Two scalar symbolic variables
-a = tensor.scalar()
-b = tensor.scalar()
-
-# Definition of output symbolic variable
-c = a * b
-# Definition of the function computing it
-fprop = theano.function([a,b], [c])
-
-# Initialize numerical variables
-a_val = numpy.array(12.)
-b_val = numpy.array(2.)
-print 'a_val =', a_val
-print 'b_val =', b_val
-
-# Numerical value of output is returned by the call to "fprop"
-c_val = fprop(a_val, b_val)
-print 'c_val =', c_val
-
-
-# Definition of simple update (increment by one)
-new_b = b + 1
-update = theano.function([b], [new_b])
-
-# New numerical value of b is returned by the call to "update"
-b_val = update(b_val)
-print 'new b_val =', b_val
-# We can use the new value in "fprop"
-c_val = fprop(a_val, b_val)
-print 'c_val =', c_val
-
-
-# Definition of in-place update (increment by one)
-re_new_b = tensor.add_inplace(b, 1.)
-re_update = theano.function([b], [re_new_b])
-
-# "re_update" can be used the same way as "update"
-b_val = re_update(b_val)
-print 'new b_val =', b_val
-# We can use the new value in "fprop"
-c_val = fprop(a_val, b_val)
-print 'c_val =', c_val
-
-# It is not necessary to keep the return value when the update is done in place
-re_update(b_val)
-print 'new b_val =', b_val
-c_val = fprop(a_val, b_val)
-print 'c_val =', c_val
-
-
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/gd/README.txt	Fri Aug 20 09:31:39 2010 -0400
@@ -0,0 +1,2 @@
+
+see __init__.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/gd/__init__.py	Fri Aug 20 09:31:39 2010 -0400
@@ -0,0 +1,11 @@
+"""Gradient Descent
+
+This module should contain tools and algorithms related to [stochastic] gradient descent.  For
+example:
+
+ - SGD with/without momentum
+ - Hessian Free GD
+ - TONGA
+ - Stopping criteria (incl. for use in theano functions)
+
+"""
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/gd/sgd.py	Fri Aug 20 09:31:39 2010 -0400
@@ -0,0 +1,75 @@
+"""A stochastic gradient descent minimizer. (Possibly the simplest minimizer.)
+"""
+
+import theano
+
+class StochasticGradientDescent(theano.Module):
+    """Fixed stepsize gradient descent
+
+    Methods for gradient descent are:
+    - step(arg_vals) which returns None and updates the params
+    - step_cost(arg_vals) which returns the cost value, and updates the params
+    
+    """
+    def __init__(self, args, cost, params, 
+                 gradients=None, stepsize=None, 
+                 updates=None, auxout=None, methods=True):
+        """
+        :param stepsize: the step to take in (negative) gradient direction
+        :type stepsize: None, scalar value, or scalar TensorVariable
+
+        :param updates: extra symbolic updates to make when evaluating either step or step_cost
+        (these override the gradients if necessary)
+        :type updates: dict Variable -> Variable
+        :param auxout: auxiliary outputs, list containing output symbols to 
+                      compute at the same time as cost (for efficiency)
+        :param methods: Should this module define the step and step_cost methods?
+        """
+        super(StochasticGradientDescent, self).__init__()
+        self.stepsize_init = None
+
+        if stepsize is None:
+            self.stepsize = theano.tensor.dscalar()
+        elif isinstance(stepsize, theano.tensor.TensorVariable):
+            self.stepsize = stepsize
+        else:
+            self.stepsize = (theano.tensor.as_tensor_variable(stepsize))
+
+        if self.stepsize.ndim != 0:
+            raise TypeError('stepsize must be a scalar', stepsize)
+
+        self.params = params
+        self.gparams = theano.tensor.grad(cost, self.params) if gradients is None else gradients
+
+        self._updates = (dict((p, p - self.stepsize * g) for p, g in zip(self.params, self.gparams)))
+        if updates is not None:
+            self._updates.update(updates)
+
+        if methods:
+            if auxout is None:
+                self.step = theano.Method(args, [], updates=self._updates)
+                self.step_cost = theano.Method(args, cost, updates=self._updates)
+            else:
+                # step and step_cost always return a list when auxout is given
+                self.step = theano.Method(
+                        args, [] + auxout,
+                        updates=self._updates)
+                self.step_cost = theano.Method(
+                        args, [cost]+auxout,
+                        updates=self._updates)
+
+
+    updates = property(lambda self: self._updates.copy())
+
+    def _instance_initialize(self, obj):
+        pass
+
+def sgd_minimizer(stepsize=None):
+    """Curry the stepsize argument to StochasticGradientDescent, providing standard minimizer interface
+    
+    :returns: standard minimizer constructor f(args, cost, params, gradients=None)
+    """
+    def f(args, cost, params, gradients=None, updates=None, auxout=None):
+        return StochasticGradientDescent(args, cost, params, gradients=gradients, stepsize=stepsize,
+                updates=updates, auxout=auxout)
+    return f
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/gd/stopper.py	Fri Aug 20 09:31:39 2010 -0400
@@ -0,0 +1,131 @@
+import time
+"""Early stopping iterators
+
+The idea here is to supply early-stopping heuristics that can be used in the
+form:
+
+    stopper = SomeEarlyStopper()
+
+    for i in stopper():
+        # train from data
+        if i.set_score:
+            i.score = validation_score
+
+
+So far I only have one heuristic, so maybe this won't scale.
+"""
+
+class Stopper(object):
+
+    def train(self, data, update_rows_fn, update, validate, save=None):
+        """Return the best model trained on data
+
+        Parameters:
+        data - a thing that accepts getitem(<list of int64>), or a tuple of such things
+        update_rows_fn - fn : int --> <list or tensor of int>
+        update - fn: update an internal model from elements of data
+        validate - fn: evaluate an internal model based on elements of data
+        save - fn: return a copy of the internal model
+
+        The body of this function exhausts the <self> iterator, and trains a
+        model using early stopping in the process.
+        """
+
+        best = None
+        for stp in self:
+            i = stp.iter
+
+            # call update on some training set rows
+            t_rows = update_rows_fn(i)
+            if isinstance(data, (tuple, list)):
+                update(*[d[t_rows] for d in data])
+            else:
+                update(data[t_rows])
+
+            if stp.set_score:
+                stp.score = validate()
+                if (stp.score < stp.best_score) and save:
+                    best = save()
+        return best
+
+    def find_min(self, step, check, save):
+        best = None
+        for stp in self:
+            step()
+            if stp.set_score:
+                stp.score = check()
+                if (stp.score < stp.best_score) and save:
+                    best = (save(), stp.iter, stp.score)
+        return best
+
+class ICML08Stopper(Stopper):
+    @staticmethod
+    def icml08(ntrain, batchsize):
+        """Some setting similar to what I used for ICML08 submission"""
+        #TODO: what did I actually use? put that in here.
+        return ICML08Stopper(30*ntrain/batchsize,
+                ntrain/batchsize, 0.96, 2.0, 100000000)
+
+    def __init__(self, i_wait, v_int, min_improvement, patience, hard_limit, hard_time_limit=None):
+        self.initial_wait = i_wait
+        self.set_score_interval = v_int
+        self.min_improvement = min_improvement
+        self.patience = patience
+        self.hard_limit = hard_limit
+        self.hard_limit_seconds = hard_time_limit
+        self.start_time = time.time()
+
+        self.best_score = float('inf')
+        self.best_iter = -1
+        self.iter = -1
+
+        self.set_score = False
+        self.score = None
+
+    def __iter__(self):
+        return self
+
+    E_set_score = 'when iter.set_score is True, caller must assign a score to iter.score'
+    def next(self):
+
+        #print 'ICML08 stopper, were doing a next'
+
+        if self.set_score: #left over from last time
+            if self.score is None:
+                raise Exception(ICML08Stopper.E_set_score)
+            if self.score < (self.best_score * self.min_improvement):
+                (self.best_score, self.best_iter) = (self.score, self.iter)
+            self.score = None #un-set it
+
+
+        starting = self.iter < self.initial_wait
+        waiting = self.iter < (self.patience * self.best_iter)
+        if self.hard_limit_seconds != None:
+            times_up = (time.time() - self.start_time) > self.hard_limit_seconds
+        else: times_up = False
+        if (starting or waiting) and not times_up:
+            # continue to iterate
+            self.iter += 1
+            if self.iter == self.hard_limit:
+                raise StopIteration
+            self.set_score = (self.iter % self.set_score_interval == 0)
+            return self
+
+        raise StopIteration
+
+class NStages(ICML08Stopper):
+    """Run for a fixed number of steps, checking validation set every so
+    often."""
+    def __init__(self, hard_limit, v_int):
+        ICML08Stopper.__init__(self, hard_limit, v_int, 1.0, 1.0, hard_limit)
+
+    #TODO: could optimize next() function. Most of what's in ICML08Stopper.next()
+    #is not necessary
+
+def geometric_patience(i_wait, v_int, min_improvement, patience, hard_limit):
+    return ICML08Stopper(i_wait, v_int, min_improvement, patience, hard_limit)
+
+def nstages(hard_limit, v_int):
+    return ICML08Stopper(hard_limit, v_int, 1.0, 1.0, hard_limit)
+
+
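To tie the two new gd submodules together, a hedged sketch of driving training with find_min; the toy 'model' and its step/validation functions below are placeholders standing in for a real sgd module instance and data pipeline:

    import copy
    from pylearn.gd import stopper

    model = {'w': 0.0}                                   # toy stand-in for a real model
    def train_step(): model['w'] += 0.1                  # stand-in for m.step_cost(batch)
    def validation_error(): return abs(1.0 - model['w'])

    stop = stopper.nstages(hard_limit=1000, v_int=100)
    best = stop.find_min(step=train_step,
                         check=validation_error,
                         save=lambda: copy.deepcopy(model))
    # best is (saved_model, iteration_of_best_score, best_score), or None if never improved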
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/gd/tests/test_sgd.py	Fri Aug 20 09:31:39 2010 -0400
@@ -0,0 +1,75 @@
+import theano
+from theano.compile.debugmode import DebugMode
+from pylearn.gd import sgd
+
+mode = theano.compile.mode.get_default_mode()
+if isinstance(mode,DebugMode):
+    mode = 'FAST_RUN'
+
+def test_sgd0():
+
+    x = theano.tensor.dscalar('x')
+    y = theano.tensor.dscalar('y')
+
+    M = sgd.StochasticGradientDescent([x], (1.0 - x * y)**2, [y], stepsize=0.01)
+    M.y = y
+    m = M.make(mode=mode)
+    m.y = 5.0
+    for i in xrange(100):
+        c = m.step_cost(3.0)
+        #print c[0], m.y
+
+    assert c < 1.0e-5
+    assert abs(m.y - (1.0 / 3)) < 1.0e-4
+
+def test_sgd_stepsize_variable():
+
+    x = theano.tensor.dscalar('x')
+    y = theano.tensor.dscalar('y')
+    lr = theano.tensor.dscalar('lr')
+
+    M = sgd.StochasticGradientDescent([x], (1.0 - x * y)**2, [y], stepsize=lr)
+    M.y = y
+    M.lr = lr
+    m = M.make(mode=mode)
+    m.y = 5.0
+    m.lr = 0.01
+    for i in xrange(100):
+        c = m.step_cost(3.0)
+        # print c, m.y
+
+    assert c < 1.0e-5
+    assert abs(m.y - (1.0 / 3)) < 1.0e-4
+
+
+    #test that changing the lr has impact
+
+    m.y = 5.0
+    m.lr = 0.0
+    for i in xrange(10):
+        c = m.step_cost(3.0)
+        # print c, m.y
+
+    assert m.y == 5.0
+
+def test_sgd_stepsize_none():
+
+    x = theano.tensor.dscalar('x')
+    y = theano.tensor.dscalar('y')
+
+    M = sgd.StochasticGradientDescent([x], (1.0 - x * y)**2, [y])
+    M.y = y
+    m = M.make(mode=mode)
+    m.y = 5.0
+    #there is no default learning rate; stepsize must be assigned before stepping
+    assert m.stepsize is None
+    m.stepsize = 0.01
+    for i in xrange(100):
+        c = m.step_cost(3.0)
+        # print c, m.y
+
+    assert c < 1.0e-5
+    assert abs(m.y - (1.0 / 3)) < 1.0e-4
+
+if __name__ == '__main__':
+    test_sgd0()
--- a/pylearn/io/image_tiling.py	Mon Aug 16 10:39:36 2010 -0400
+++ b/pylearn/io/image_tiling.py	Fri Aug 20 09:31:39 2010 -0400
@@ -11,7 +11,8 @@
     ndar *= 1.0 / (ndar.max()+eps)
     return ndar
 
-def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0,0),
+def tile_raster_images(X, img_shape, 
+        tile_shape=None, tile_spacing=(1,1),
         scale_rows_to_unit_interval=True, 
         output_pixel_vals=True
         ):
@@ -27,16 +28,26 @@
     :type img_shape: tuple; (height, width)
     :param img_shape: the original shape of each image
     :type tile_shape: tuple; (rows, cols)
-    :param tile_shape: the number of images to tile (rows, cols)
+    :param tile_shape: the number of images to tile (rows, cols) (Defaults to a square-ish
+        shape with the right area for the number of images)
 
     :returns: array suitable for viewing as an image.  (See:`PIL.Image.fromarray`.)
     :rtype: a 2-d array with same dtype as X.
 
     """
+    if isinstance(X, tuple): 
+        n_images_in_x = X[0].shape[0]
+    else:
+        n_images_in_x = X.shape[0]
+
+    if tile_shape is None:
+        tile_shape = most_square_shape(n_images_in_x)
+
     assert len(img_shape) == 2
     assert len(tile_shape) == 2
     assert len(tile_spacing) == 2
 
+    #out_shape is the shape in pixels of the returned image array
     out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp 
         in zip(img_shape, tile_shape, tile_spacing)]
 
@@ -82,3 +93,25 @@
         return out_array
 
 
+def most_square_shape(N):
+    """rectangle (height, width) with area N that is closest to sqaure
+    """
+    for i in xrange(int(numpy.sqrt(N)),0, -1):
+        if 0 == N % i:
+            return (i, N/i)
+
+def save_tiled_raster_images(tiled_img, filename):
+    """Save a a return value from `tile_raster_images` to `filename`.
+
+    Returns the PIL image that was saved
+    """
+    if tiled_img.ndim==2:
+        img = Image.fromarray( tiled_img, 'L')
+    elif tiled_img.ndim==3:
+        img = Image.fromarray( tiled_img, 'RGBA')
+    else:
+        raise TypeError('bad ndim', tiled_img)
+
+    img.save(filename)
+    return img
+
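A quick worked example of the new defaults (the numbers follow directly from most_square_shape and the out_shape formula above; nothing here is part of the changeset):

    from pylearn.io.image_tiling import most_square_shape

    most_square_shape(12)    # -> (3, 4): 3 is the largest divisor of 12 not exceeding sqrt(12)
    # With img_shape=(28,28), tile_shape=(3,4) and tile_spacing=(1,1) the tiled image is
    #   ((28+1)*3 - 1, (28+1)*4 - 1) == (86, 115) pixels.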
--- a/pylearn/lib/README.txt	Mon Aug 16 10:39:36 2010 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-This folder is for general, reusable code.
-Code may be moved upstream from time to time.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/preprocessing/README.txt	Fri Aug 20 09:31:39 2010 -0400
@@ -0,0 +1,2 @@
+
+Module documentation is given by the docstring in the __init__.py file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/preprocessing/__init__.py	Fri Aug 20 09:31:39 2010 -0400
@@ -0,0 +1,13 @@
+
+"""Preprocessing module
+
+This module provides little numpy functions (not necessarily Theano expressions) for one-time
+pre-processing of data for modeling experiments.  The module is a work in progress, but the
+sort of thing that belongs here is:
+
+ - PCA
+ - whitening (for images and video)
+ - TF/IDF (for text)
+ - MFCC (for audio)
+
+"""
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/preprocessing/fft_whiten.py	Fri Aug 20 09:31:39 2010 -0400
@@ -0,0 +1,76 @@
+"""
+Whitening algorithm used in Olshausen & Field, 1997.
+
+If you want to use some other images, there are a number of
+preprocessing steps you need to consider beforehand.  First, you
+should make sure all images have approximately the same overall
+contrast.  One way of doing this is to normalize each image so that
+the variance of the pixels is the same (i.e., 1).  Then you will need
+to prewhiten the images.  For a full explanation of whitening see
+
+Olshausen BA, Field DJ (1997)  Sparse Coding with an Overcomplete
+Basis Set: A Strategy Employed by V1?  Vision Research, 37: 3311-3325. 
+
+Basically, to whiten an image of size NxN, you multiply by the filter
+f*exp(-(f/f_0)^4) in the frequency domain, where f_0=0.4*N (f_0 is the
+cutoff frequency of a lowpass filter that is combined with the
+whitening filter).  Once you have preprocessed a number of images this
+way, all the same size, then you should combine them into one big N^2
+x M array, where M is the number of images.  Then rescale this array
+so that the average image variance is 0.1 (the parameters in sparsenet
+are set assuming that the data variance is 0.1).  Name this array
+IMAGES, save it to a file for future use, and you should be off and
+running.  The following Matlab commands should do this:
+
+
+    N=image_size;
+    M=num_images;
+
+    [fx fy]=meshgrid(-N/2:N/2-1,-N/2:N/2-1);
+    rho=sqrt(fx.*fx+fy.*fy);
+    f_0=0.4*N;
+    filt=rho.*exp(-(rho/f_0).^4);
+
+    for i=1:M
+    image=get_image;  % you will need to provide get_image
+    If=fft2(image);
+    imagew=real(ifft2(If.*fftshift(filt)));
+    IMAGES(:,i)=reshape(imagew,N^2,1);
+    end
+
+    IMAGES=sqrt(0.1)*IMAGES/sqrt(mean(var(IMAGES)));
+
+    save MY_IMAGES IMAGES
+
+"""
+
+import numpy as np
+def whiten(X, f0=None, n=None):
+    """
+    :param X: a 3D tensor n_images x rows x cols
+    :param f0: filter parameter (see docs)
+    :param n: filter parameter (see docs)
+
+    :returns: 3D tensor n_images x rows x cols of filtered images.
+    """
+    # May be mixed up with the size2 and size1s because matlab does things differently
+    R, C = X.shape[-2:]
+    if R %2:
+        raise NotImplementedError()
+    if C %2:
+        raise NotImplementedError()
+
+    if f0 is None:
+        f0 = .4 * min(R,C)
+    if n is None:
+        n = 4
+
+
+    fx,fy = np.mgrid[-R/2:R/2, -C/2:C/2]
+    rho=np.sqrt(fx**2 + fy**2)
+    filt=rho * np.exp( - (rho/f0)**4)
+    If = np.fft.fft2(X)
+    imagew=np.real(np.fft.ifft2(If * np.fft.fftshift(filt)))
+    assert imagew.shape == X.shape
+    return imagew
+
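A hedged usage sketch combining this with the new datasets.image_patches loaders (the transpose reflects that the loader stacks images on the last axis; the rest follows the signature above):

    from pylearn.datasets.image_patches import olshausen_field_1996_raw_images
    from pylearn.preprocessing.fft_whiten import whiten

    raw = olshausen_field_1996_raw_images()   # (512, 512, 10), images on the last axis
    X = raw.transpose(2, 0, 1)                # -> (10, 512, 512): n_images x rows x cols
    Xw = whiten(X)                            # same shape, filtered in the frequency domain
    assert Xw.shape == X.shape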
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/preprocessing/pca.py	Fri Aug 20 09:31:39 2010 -0400
@@ -0,0 +1,119 @@
+"""
+
+There are potentially many approaches to PCA; this file may cover more of them eventually.
+
+
+Elements of this implementation have been borrowed from the MDP toolkit:
+    mdp/nodes/pca_nodes.py
+"""
+
+#TODO: estimate number of principal components by cross-validation (early stopping)
+
+import numpy
+import scipy.linalg
+
+def diag_as_vector(x):
+    if x.ndim != 2:
+        raise TypeError('this diagonal is implemented only for matrices', x)
+    rval = x[0,:min(*x.shape)]
+    rval.strides = (rval.strides[0] + x.strides[0],)
+    return rval
+
+
+def pca_from_cov(cov, lower=0, max_components=None, max_energy_fraction=None):
+    """Return (eigvals, eigvecs) of data with covariance `cov`.
+
+    The returned eigvals will be a numpy ndarray vector.
+    The returned eigvecs will be a numpy ndarray matrix whose *cols* are the eigenvectors.
+
+    This is recommended for retrieving many components from high-dimensional data.
+
+    :param cov: data covariance matrix
+    :type cov: a numpy ndarray 
+    """
+
+    w, v = scipy.linalg.eigh(a=cov, lower=lower)
+    # definition of eigh
+    #  a * v[:,i] = w[i] * vr[:,i]
+    #  v.H * v = identity
+
+
+    # total variance can be computed at this point:
+    # note that vartot == w.sum()
+    vartot = diag_as_vector(cov).sum()
+
+    assert numpy.allclose(vartot, w.sum())
+
+    a = numpy.argsort(w)[::-1]
+
+    w = w[a]
+    v = v[:,a]
+
+    if max_components != None:
+        w = w[:max_components]
+        v = v[:, :max_components]
+
+    if max_energy_fraction != None:
+        if not (0.0 <= max_energy_fraction <= 1.0):
+            raise ValueError('illegal value for max_energy_fraction', max_energy_fraction)
+        if max_energy_fraction < 1.0:
+            energy = 0
+            i = 0
+            while energy < max_energy_fraction * vartot:
+                energy += w[i]
+                i += 1
+            if i < len(w):
+                w = w[:i]
+                v = v[:,:i]
+
+
+    return w,v
+
+
+def pca_from_examples(X, max_components=None, max_energy_fraction=None, x_centered=False):
+    """Return (eigvals, eigvecs), centered_X of observations `X` (1-per-row)
+
+    This function exists to wrap several algorithms for getting the principal components.
+
+    :param max_components:
+        Return no more than this many principal components.
+
+    :param max_energy_fraction: 
+        Return [only] enough components to account for this fraction of the energy (aka
+        variance) in the observations.
+
+    :param x_centered:
+        True means to consider X as having mean 0 (even if it actually doesn't!)
+
+    """
+    if x_centered:
+        centered_X = X
+    else:
+        centered_X = X - numpy.mean(X, axis=0)
+    return pca_from_cov( numpy.cov(centered_X.T), max_components=max_components,
+            max_energy_fraction=max_energy_fraction), centered_X
+
+
+def pca_whiten((eigvals, eigvecs), centered_X,eps=1e-8):
+    """
+    Return the projection of X onto its principal components.
+    
+    The return value has the same number of rows as X, but the number of columns is the number
+    of principal components.  Columns of the return value have mean 0, variance 1, and are
+    uncorrelated.
+
+    :param pca: the (w,v) pair returned by e.g. pca_from_examples(X)
+
+    """
+    pca_of_X = numpy.dot(centered_X, eigvecs)
+    pca_of_X /= numpy.sqrt(eigvals)+eps
+    return pca_of_X
+
+def zca_whiten((eigvals, eigvecs), centered_X):
+    """Return the PCA of X but rotated back into the original vector space.
+
+    See also fft_whiten.py
+    """
+    pca_of_X = pca_whiten((eigvals,eigvecs), centered_X)
+    return numpy.dot(pca_of_X, eigvecs.T)
+
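A short end-to-end sketch of the intended call pattern (the signatures are from this file; the data X is a random placeholder):

    import numpy
    from pylearn.preprocessing.pca import pca_from_examples, pca_whiten, zca_whiten

    X = numpy.random.RandomState(0).randn(500, 64)        # one example per row
    (eigvals, eigvecs), centered_X = pca_from_examples(X, max_components=16)
    X_pca = pca_whiten((eigvals, eigvecs), centered_X)     # (500, 16), decorrelated unit-variance columns
    X_zca = zca_whiten((eigvals, eigvecs), centered_X)     # projected back into the original 64-d space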
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/sampling/README.txt	Fri Aug 20 09:31:39 2010 -0400
@@ -0,0 +1,2 @@
+See __init__.py
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/sampling/__init__.py	Fri Aug 20 09:31:39 2010 -0400
@@ -0,0 +1,17 @@
+"""Sampling
+
+This module [will] contain theano-related code for various sampling algorithms, such as for
+example:
+
+ - MCMC
+
+ - [Block] Gibbs Sampling
+
+ - Slice sampling
+
+ - HMC
+
+ - Tempering methods
+
+
+"""
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/sampling/hmc.py	Fri Aug 20 09:31:39 2010 -0400
@@ -0,0 +1,203 @@
+"""Hybrid / Hamiltonian Monte Carlo Sampling
+
+This algorithm is described in Radford Neal's PhD Thesis, pages 63--70.
+
+"""
+import sys
+import logging
+import numpy as np
+from theano import function, shared
+from theano import tensor as TT
+import theano.sparse #installs the sparse shared var handler
+
+
+#TODO: 
+#  Consider how to refactor this code so that the key functions /functionality are provided as
+#  theano expressions??
+#  It could be limiting that the implementation requires that the position be a list of shared
+#  variables, and basically uses procedural logic on top of that.
+
+#TODO: 
+# Consider a heuristic for updating the *MASS* of the particles.  We might want the mass to be
+# such that the momentum is in the same range as the gradient on the energy.  Look at Radford's
+# recent book chapter, maybe there are hints. (2010).
+
+class HMC_sampler(object):
+    """Batch-wise Hybrid Monte-Carlo sampler
+
+
+    The `draw` function advances the Markov chain and returns the current sample by calling
+    `simulate` and `get_position` in sequence.
+
+
+    :note:
+
+      The 'mass' of the so-called particles is taken to be 1, so that the kinetic energy (K)
+      is half the sum of squared velocities (p):
+
+        :math:`K = \sum_i p_i^2 / 2`
+
+    """
+
+    # Constants taken from Marc'Aurelio's 'train_mcRBM.py' file, found in the code published
+    # online for his paper.
+    stepsize_dec = 0.98
+    stepsize_min = 0.001
+    stepsize_max = 0.25
+    stepsize_inc = 1.02
+    avg_acceptance_slowness = 0.9  # used in geometric avg. 1.0 would be not moving at all
+    n_steps=20
+
+    def __init__(self, positions, energy_fn, 
+            velocity=None,
+            initial_stepsize=0.01,
+            target_acceptance_rate=0.9,
+            seed=12345,
+            dtype=theano.config.floatX):
+        """
+        :param positions: list of shared ndarray variables.
+
+        :param energy: 
+
+            callable such that energy_fn(positions) 
+            returns theano vector of energies.  
+            The len of this vector is the batchsize.
+
+            The sum of this energy vector must be differentiable (with theano.tensor.grad) with
+            respect to the positions for HMC sampling to work.
+
+        """
+
+        self.stepsize = initial_stepsize
+        batchsize = positions[0].value.shape[0]
+        self.target_acceptance_rate = target_acceptance_rate
+        self.avg_acceptance_rate = self.target_acceptance_rate
+        self.s_rng = TT.shared_randomstreams.RandomStreams(seed)
+        self.positions = positions
+        if velocity is None:
+            self.velocity = [shared(np.zeros_like(q.value)) for q in self.positions]
+        else:
+            self.velocity = velocity
+
+        self.start_positions = [shared(np.zeros_like(q.value)) for q in self.positions]
+        self.initial_hamiltonian = shared(np.zeros(batchsize).astype(dtype) + float('inf'))
+
+        energy = energy_fn(positions)
+        # assuming mass is 1
+        kinetic_energy = 0.5 * sum([(p**2).sum(axis=1) for p in self.velocity])
+        hamiltonian = energy + kinetic_energy
+
+        dE_dpos = TT.grad(energy.sum(), self.positions)
+
+        s_stepsize = TT.scalar('stepsize')
+
+        self.velocity_step = function([s_stepsize],
+                [dE_dpos[0].norm(2)],
+                updates=[(p, p - s_stepsize*dE_dq) for p,dE_dq in zip(self.velocity,dE_dpos)])
+        self.position_step = function([s_stepsize], [],
+                updates=[(q, q + s_stepsize*p) for (q,p) in zip(self.positions, self.velocity)])
+
+        self.save_start_positions = function([], [],
+                updates=[(self.initial_hamiltonian, hamiltonian)] + \
+                        [(sq, q) for sq, q in zip(self.start_positions, self.positions)])
+
+        # sample the initial velocity from a
+        # gaussian with mean 0.
+        # Note: I think the fact that this distribution is symmetric about zero justifies not
+        # sampling forward versus backward dynamics.
+        self.randomize_velocity = function([], [],
+                updates=[(p, self.s_rng.normal(size=p.value.shape)) for p in self.velocity])
+
+
+        # accept-reject according to Metropolis algo
+
+        accept = TT.exp(self.initial_hamiltonian - hamiltonian) - self.s_rng.uniform(size=(batchsize,)) >= 0
+
+        self.accept_reject_positions = function([], accept.mean(),
+                updates=[
+                    (self.initial_hamiltonian, 
+                        TT.switch(accept, hamiltonian, self.initial_hamiltonian))] + [
+                    (q, 
+                        TT.switch(accept.dimshuffle(0, *(('x',)*(sq.ndim-1))), q, sq)) 
+                        for sq, q in zip(self.start_positions, self.positions)])
+
+    def simulate(self, n_steps=None):
+        if n_steps is None:
+            n_steps = self.n_steps
+
+        # updates self.velocity with new random numbers
+        self.randomize_velocity()
+        self.save_start_positions()
+
+        if 0:
+            # not necessary for random initial direction
+            if np.random.rand() > 0.5:
+                step_amount = self.stepsize
+            else:
+                step_amount = -self.stepsize
+        else:
+            step_amount = self.stepsize
+
+        if 0:
+            # debugging output; note that this class stores the previous total Hamiltonian
+            # (not just the potential energy) in self.initial_hamiltonian
+            print "initial",
+            print "Hamiltonian", self.initial_hamiltonian.value,
+            print "kinetic E", (self.velocity[0].value**2).sum(axis=1)
+
+
+        # Note on the order of leap-frog steps:
+        #
+        # the leap-frog algorithm can start with either the position or the velocity update,
+        # but one of them has to run an extra time (because of the two half-steps).
+        # The position_step is cheap to evaluate, whereas the velocity_step is expensive,
+        # so we evaluate the position_step the extra time.
+        #
+        # At the same time, we cannot drop the last half-step update of the position, because
+        # the position is what we actually care about.
+
+        #opening half-step in leap-frog algorithm
+        self.position_step(step_amount/2.0)
+
+        # full leap-frog steps
+        for ss in range(n_steps):
+            self.velocity_step(step_amount)
+            if ss == n_steps-1:
+                # closing half-step in the leap-frog algorithm
+                self.position_step(step_amount/2.0)
+            else:
+                self.position_step(step_amount)
+
+        acceptance_rate = self.accept_reject_positions()
+        self.avg_acceptance_rate = self.avg_acceptance_slowness * self.avg_acceptance_rate \
+                + (1.0 - self.avg_acceptance_slowness) * acceptance_rate
+
+        if self.avg_acceptance_rate < self.target_acceptance_rate:
+            self.stepsize = max(self.stepsize*self.stepsize_dec,self.stepsize_min)
+        else:
+            self.stepsize = min(self.stepsize*self.stepsize_inc,self.stepsize_max)
+
+        if 0:
+            # debugging output; see the note on self.initial_hamiltonian above
+            print "final Hamiltonian", self.initial_hamiltonian.value,
+            print "kinetic E", (self.velocity[0].value**2).sum(axis=1)
+
+
+        # post-condition:
+        # self.positions contains a new sample from our Markov chain.
+
+        # It is not returned from this function because accessing the .value of a shared
+        # variable can require making a copy;
+        # see `draw()` or `get_position()` for that behaviour.
+
+    def get_position(self):
+        return [q.value for q in self.positions]
+
+    def draw(self, n_steps=None):
+        """Return the current sample in the Markov chain as a list of numpy arrays
+
+        The size of the arrays will match the size of the `initial_position` argument to
+        __init__.
+        """
+        self.simulate(n_steps=n_steps)
+        return self.get_position()
+
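+
+# A minimal usage sketch (the toy energy and names below are only illustrative, not part
+# of the sampler's API): draw batched samples from an isotropic Gaussian.
+def _hmc_usage_sketch(batchsize=10, n_draws=100):
+    rng = np.random.RandomState(0)
+    pos = shared(rng.randn(batchsize, 2).astype(theano.config.floatX))
+    def toy_energy(positions):
+        x, = positions
+        return 0.5 * (x ** 2).sum(axis=1)
+    sampler = HMC_sampler([pos], toy_energy)
+    # each draw() advances the chain and returns [ndarray of shape (batchsize, 2)]
+    return [sampler.draw() for _ in xrange(n_draws)]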
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/sampling/mcmc.py	Fri Aug 20 09:31:39 2010 -0400
@@ -0,0 +1,103 @@
+"""Metropolis-Hastings Monte Carlo
+
+
+See also Hamiltonian Monte Carlo (aka Hybrid Monte Carlo) in the hmc.py file.
+
+"""
+import sys
+import logging
+import numpy as np
+from theano import function, shared
+from theano import tensor as TT
+import theano.sparse #installs the sparse shared var handler
+from theano.printing import Print
+
+#TODO: 
+#  Consider how to refactor this code so that the key functions /functionality are provided as
+#  theano expressions??
+#  It could be limiting that the implementation requires that the position be a list of shared
+#  variables, and basically uses procedural logic on top of that.
+#
+#  Theano can only do function optimization on one function at a time, so until these are
+#  written as update expressions, they must be compiled as their own functions and cannot be
+#  rolled into things the user is doing anyway.
+
+class MCMC_sampler(object):
+    """Generic Metropolis-Hastings Monte-Carlo sampler
+
+    The `draw` function advances the Markov chain and returns the current sample by calling
+    `simulate` and `get_position` in sequence.
+
+    The stepsize is updated dynamically so that the running average acceptance rate
+    tracks `target_acceptance_rate`.
+
+    """
+
+    # TODO: does anyone have better guesses for these constants?
+    stepsize_dec = 0.98
+    stepsize_min = 0.001
+    stepsize_max = 0.25
+    stepsize_inc = 1.02
+    target_acceptance_rate=0.7
+
+    avg_acceptance_slowness = 0.9  # used in geometric avg. 1.0 would be not moving at all
+    n_steps=1
+
+    def __init__(self, positions, energy_fn, 
+            initial_stepsize=0.01,
+            seed=12345):
+        """
+        :param positions: list of shared ndarray variables.
+
+        :param energy: 
+
+            callable such that energy_fn(positions) 
+            returns theano vector of energies.  
+            The len of this vector is the batchsize.
+        """
+
+        batchsize = positions[0].value.shape[0]
+        self.s_rng = TT.shared_randomstreams.RandomStreams(seed)
+        self.positions = positions
+        self.prev_energy = shared(np.zeros(batchsize) + float('inf'))
+        self.avg_acceptance_rate = 0.5
+        self.stepsize = initial_stepsize
+
+        s_stepsize = TT.scalar('stepsize')
+
+        new_positions = [p + s_stepsize * self.s_rng.normal(size=p.value.shape)
+                for p in self.positions]
+
+        # accept-reject according to Metropolis-Hastings 
+
+        energy = energy_fn(new_positions)
+        accept = TT.exp(self.prev_energy - energy) - self.s_rng.uniform(size=(batchsize,)) >= 0
+
+        self.accept_reject_positions = function([s_stepsize], accept.mean(),
+                updates=[(self.prev_energy, TT.switch(accept, energy, self.prev_energy))] + [
+                    (q, TT.switch(accept.dimshuffle(0, *(('x',)*(q.ndim-1))), new_q, q)) 
+                        for q, new_q in zip(self.positions, new_positions)])
+
+    def simulate(self, n_steps=None):
+        if n_steps is None:
+            n_steps = self.n_steps
+        for ss in range(n_steps):
+            acceptance_rate = self.accept_reject_positions(self.stepsize)
+            self.avg_acceptance_rate = self.avg_acceptance_slowness * self.avg_acceptance_rate \
+                    + (1.0 - self.avg_acceptance_slowness) * acceptance_rate
+            if self.avg_acceptance_rate < self.target_acceptance_rate:
+                self.stepsize = max(self.stepsize*self.stepsize_dec,self.stepsize_min)
+            else:
+                self.stepsize = min(self.stepsize*self.stepsize_inc,self.stepsize_max)
+
+    def get_position(self):
+        return [q.value for q in self.positions]
+
+    def draw(self, n_steps=None):
+        """Return the current sample in the Markov chain as a list of numpy arrays
+
+        The size of the arrays will match the size of the `initial_position` argument to
+        __init__.
+        """
+        self.simulate(n_steps=n_steps)
+        return self.get_position()
+
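+
+# A minimal usage sketch (the toy energy and names below are only illustrative): draw
+# batched samples from an isotropic Gaussian with the generic M-H sampler.
+def _mcmc_usage_sketch(batchsize=10, n_draws=100):
+    rng = np.random.RandomState(0)
+    pos = shared(rng.randn(batchsize, 2).astype(theano.config.floatX))
+    def toy_energy(positions):
+        x, = positions
+        return 0.5 * (x ** 2).sum(axis=1)
+    sampler = MCMC_sampler([pos], toy_energy)
+    # each draw() advances the chain and returns [ndarray of shape (batchsize, 2)]
+    return [sampler.draw() for _ in xrange(n_draws)]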
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/sampling/tests/test_hmc.py	Fri Aug 20 09:31:39 2010 -0400
@@ -0,0 +1,65 @@
+from pylearn.sampling.hmc import *
+
+def _sampler_on_2d_gaussian(sampler_cls, burnin, n_samples):
+    batchsize=3
+
+    rng = np.random.RandomState(234)
+
+    #
+    # Define a covariance and mu for a gaussian
+    #
+    tmp = rng.randn(2,2).astype(theano.config.floatX)
+    tmp[0] += tmp[1] #induce some covariance
+    cov = np.dot(tmp, tmp.T)
+    cov_inv = np.linalg.inv(cov).astype(theano.config.floatX)
+    mu = np.asarray([5, 9.5], dtype=theano.config.floatX)
+
+    def gaussian_energy(xlist):
+        x, = xlist
+        return 0.5 * (TT.dot((x-mu),cov_inv)*(x-mu)).sum(axis=1)
+
+
+    position = shared(rng.randn(batchsize, 2).astype(theano.config.floatX))
+    sampler = sampler_cls([position], gaussian_energy)
+
+    print 'initial position', position.value
+    print 'initial stepsize', sampler.stepsize
+
+    # DRAW SAMPLES
+
+    samples = [sampler.draw() for r in xrange(burnin)] #burn-in
+    samples = np.asarray([sampler.draw() for r in xrange(n_samples)])
+
+    assert sampler.avg_acceptance_rate > 0
+    assert sampler.avg_acceptance_rate < 1
+
+    # TEST THAT THEY ARE FROM THE RIGHT DISTRIBUTION
+
+    # samples.shape == (n_samples, 1, batchsize, 2)
+
+    print 'target mean:', mu
+    print 'empirical mean: ', samples.mean(axis=0)[0]
+    #assert np.all(abs(mu - samples.mean(axis=0)) < 1)
+
+
+    print 'final stepsize', sampler.stepsize
+    print 'final acceptance_rate', sampler.avg_acceptance_rate
+
+    print 'target cov', cov
+    s = samples[:,0,0,:]
+    empirical_cov = np.cov(samples[:,0,0,:].T)
+    print ''
+    print 'cov/empirical_cov', cov/empirical_cov
+    empirical_cov = np.cov(samples[:,0,1,:].T)
+    print 'cov/empirical_cov', cov/empirical_cov
+    empirical_cov = np.cov(samples[:,0,2,:].T)
+    print 'cov/empirical_cov', cov/empirical_cov
+    return sampler
+
+def test_hmc():
+    print ('HMC')
+    sampler = _sampler_on_2d_gaussian(HMC_sampler, burnin=3000/20, n_samples=90000/20)
+    assert abs(sampler.avg_acceptance_rate - sampler.target_acceptance_rate) < .1
+    assert sampler.stepsize >= sampler.stepsize_min
+    assert sampler.stepsize <= sampler.stepsize_max
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/sampling/tests/test_mcmc.py	Fri Aug 20 09:31:39 2010 -0400
@@ -0,0 +1,61 @@
+from pylearn.sampling.mcmc import *
+
+def _sampler_on_2d_gaussian(sampler_cls, burnin, n_samples):
+    batchsize=3
+
+    rng = np.random.RandomState(234)
+
+    #
+    # Define a covariance and mu for a gaussian
+    #
+    tmp = rng.randn(2,2)
+    tmp[0] += tmp[1] #induce some covariance
+    cov = np.dot(tmp, tmp.T)
+    cov_inv = np.linalg.inv(cov)
+    mu = np.asarray([5, 9.5], dtype=theano.config.floatX)
+
+    def gaussian_energy(xlist):
+        x, = xlist
+        return 0.5 * (TT.dot((x-mu),cov_inv)*(x-mu)).sum(axis=1)
+
+
+    position = shared(rng.randn(batchsize, 2).astype(theano.config.floatX))
+    sampler = sampler_cls([position], gaussian_energy)
+
+    print 'initial position', position.value
+    print 'initial stepsize', sampler.stepsize
+
+    # DRAW SAMPLES
+
+    samples = [sampler.draw() for r in xrange(burnin)] #burn-in
+    samples = np.asarray([sampler.draw() for r in xrange(n_samples)])
+
+    assert sampler.avg_acceptance_rate > 0
+    assert sampler.avg_acceptance_rate < 1
+
+    # TEST THAT THEY ARE FROM THE RIGHT DISTRIBUTION
+
+    # samples.shape == (n_samples, 1, batchsize, 2)
+
+    print 'target mean:', mu
+    print 'empirical mean: ', samples.mean(axis=0)[0]
+    #assert np.all(abs(mu - samples.mean(axis=0)) < 1)
+
+
+    print 'final stepsize', sampler.stepsize
+    print 'final acceptance_rate', sampler.avg_acceptance_rate
+
+    print 'target cov', cov
+    s = samples[:,0,0,:]
+    empirical_cov = np.cov(samples[:,0,0,:].T)
+    print ''
+    print 'cov/empirical_cov', cov/empirical_cov
+    empirical_cov = np.cov(samples[:,0,1,:].T)
+    print 'cov/empirical_cov', cov/empirical_cov
+    empirical_cov = np.cov(samples[:,0,2,:].T)
+    print 'cov/empirical_cov', cov/empirical_cov
+    return sampler
+
+def test_mcmc():
+    print ('MCMC')
+    sampler = _sampler_on_2d_gaussian(MCMC_sampler, burnin=3000, n_samples=90000)