changeset 1331:0541e7d6e916

merge
author gdesjardins
date Thu, 14 Oct 2010 23:55:55 -0400
parents 3efd0effb2a7 (current diff) 63fe96ede21d (diff)
children 837768915081
files
diffstat 25 files changed, 2613 insertions(+), 601 deletions(-)
--- a/doc/formulas.txt	Thu Oct 14 23:55:41 2010 -0400
+++ b/doc/formulas.txt	Thu Oct 14 23:55:55 2010 -0400
@@ -5,13 +5,18 @@
 ==========
 .. taglist:: 
 
+pylearn.formulas.activations
+----------------------------
+.. automodule:: pylearn.formulas.activations
+    :members:
+
 pylearn.formulas.costs
------------------------
+----------------------
 .. automodule:: pylearn.formulas.costs
     :members:
 
 pylearn.formulas.noise
------------------------
+----------------------
 .. automodule:: pylearn.formulas.noise
     :members:
  
--- a/doc/v2_planning/API_coding_style.txt	Thu Oct 14 23:55:41 2010 -0400
+++ b/doc/v2_planning/API_coding_style.txt	Thu Oct 14 23:55:55 2010 -0400
@@ -259,10 +259,10 @@
 
         """Module docstring as the first line, as usual."""
 
-        __authors__ = "Olivier Delalleau, Frederic Bastien, David Warde-Farley"
+        __authors__   = "Olivier Delalleau, Frederic Bastien, David Warde-Farley"
         __copyright__ = "(c) 2010, Universite de Montreal"
-        __license__ = "3-clause BSD License"
-        __contact__ = "Name Of Current Guardian of this file <email@address>"
+        __license__   = "3-clause BSD License"
+        __contact__   = "Name Of Current Guardian of this file <email@address>"
 
     * Use ``//`` for integer division and ``/ float(...)`` if you want the
       floating point operation (for readability and compatibility across all
@@ -318,6 +318,9 @@
       test infinite / NaN values. This is important because ``numpy.nan !=
       float('nan')``.
 
+    * Whenever possible, mimic the numpy / scipy interfaces when writing code
+      similar to what can be found in these packages.
+
     * Avoid backslashes whenever possible. They make it more
       difficult to edit code, and they are ugly (as well as potentially
       dangerous if there are trailing white spaces).
@@ -345,6 +348,8 @@
       The position of the first
       element (on the same line or a new line) should be chosen depending on
       what is easiest to read (sometimes both can be ok).
+      Other formattings may be acceptable depending on the specific situation;
+      use common sense and pick whichever looks best.
 
       .. code-block:: python
 
@@ -474,18 +479,19 @@
 ===========
 
 The following code sample illustrates some of the coding guidelines one should
-follow in Pylearn. This is still a work-in-progress.
+follow in Pylearn. This is still a work-in-progress. Feel free to improve it and
+add more!
 
 .. code-block:: python
 
     #! /usr/env/bin python
 
-    """Sample code. There may still be mistakes / missing elements."""
+    """Sample code. Edit it as you like!"""
 
-    __authors__ = "Olivier Delalleau"
+    __authors__   = "Olivier Delalleau"
     __copyright__ = "(c) 2010, Universite de Montreal"
-    __license__ = "3-clause BSD License"
-    __contact__ = "Olivier Delalleau <delallea@iro>"
+    __license__   = "3-clause BSD License"
+    __contact__   = "Olivier Delalleau <delallea@iro>"
 
     # Standard library imports are on a single line.
     import os, sys, time
@@ -495,30 +501,145 @@
     import numpy
     import scipy
     import theano
-    # Put 'from' imports below.
+    # Individual 'from' imports come after packages.
     from numpy import argmax
     from theano import tensor
     
     # Application-specific imports come last.
-    from pylearn import dataset
-    from pylearn.optimization import minimize
+    # The absolute path should always be used.
+    from pylearn import datasets, learner
+    from pylearn.formulas import noise
+
+
+    # All exceptions inherit from Exception.
+    class PylearnError(Exception):
+        # TODO Write doc.
+        pass
+
+    # All top-level classes inherit from object.
+    class StorageExample(object):
+        # TODO Write doc.
+        pass
+    
+
+    # Two blank lines between definitions of top-level classes and functions.
+    class AwesomeLearner(learner.Learner):
+        # TODO Write doc.
+
+        def __init__(self, print_fields=None):
+            # TODO Write doc.
+            # print_fields is a list of strings whose counts found in the
+            # training set should be printed at the end of training. If None,
+            # then nothing is printed.
+            # Do not forget to call the parent class constructor.
+            super(AwesomeLearner, self).__init__()
+            # Use None instead of an empty list as default argument to
+            # print_fields to avoid issues with mutable default arguments.
+            self.print_fields = if_none(print_fields, [])
+        
+        # One blank line between method definitions.
+        def add_field(self, field):
+            # TODO Write doc.
+            # Test if something belongs to a container with `in`, not
+            # container-specific methods like `index`.
+            if field in self.print_fields:
+                # TODO Print a warning and do nothing.
+                pass
+            else:
+                # This is why using [] as default to print_fields in the
+                # constructor would have been a bad idea.
+                self.print_fields.append(field)
 
-    def print_files_in(directory):
-        """Print the first line of each file in given directory."""
-        # TODO To be continued...
+        def train(self, dataset):
+            # TODO Write doc (store the mean of each field in the training
+            # set).
+            self.mean_fields = {}
+            count = {}
+            for sample_dict in dataset:
+                # Whenever it is sufficient for your needs, use the iterator
+                # versions of dictionary methods instead of the list versions.
+                for field, value in sample_dict.iteritems():
+                    # Keep line length to max 80 characters, using parentheses
+                    # instead of \ to continue long lines.
+                    self.mean_fields[field] = (self.mean_fields.get(field, 0) +
+                                               value)
+                    count[field] = count.get(field, 0) + 1
+            for field in self.mean_fields:
+                self.mean_fields[field] /= float(count[field])
+            for field in self.print_fields:
+                # Test is done with `in`, not `has_key`.
+                if field in self.mean_fields:
+                    # TODO Use log module instead.
+                    print '%s: %s' % (field, self.mean_fields[field])
+                else:
+                    # TODO Print warning.
+                    pass
+        
+        def test_error(self, dataset):
+            # TODO Write doc.
+            if not hasattr(self, 'mean_fields'):
+                # Exceptions should be raised as follows (in particular, no
+                # string exceptions!).
+                raise PylearnError('Cannot test a learner that was not '
+                                   'trained.')
+            error = 0
+            count = 0
+            for sample_dict in dataset:
+                for field, value in sample_dict.iteritems():
+                    try:
+                        # Minimize the amount of code inside a try statement.
+                        mean = self.mean_fields[field]
+                    # Always specify which kind of exception you are
+                    # intercepting with except.
+                    except KeyError:
+                        raise PylearnError(
+                            "Found in a test sample a field ('%s') that had "
+                            "never been seen in the training set." % field)
+                    error += (value - mean)**2
+                    count += 1
+            # Remember to divide by a floating point number unless you
+            # explicitly want an integer division (in which case you should
+            # use //).
+            mse = error / float(count)
+            # TODO Use log module instead.
+            print 'MSE: %s' % mse
+            return mse
+            
+
+    def if_none(val_if_not_none, val_if_none):
+        # TODO Write doc.
+        if val_if_not_none is not None:
+            return val_if_not_none
+        else:
+            return val_if_none
+        
+
+    def print_subdirs_in(directory):
+        # TODO Write doc.
+        # Using list comprehension rather than filter.
+        sub_dirs = sorted([d for d in os.listdir(directory)
+                             if os.path.isdir(os.path.join(directory, d))])
+        print '%s: %s' % (directory, ' '.join(sub_dirs))
+        # A `for` loop is often easier to read than a call to `map`.
+        for d in sub_dirs:
+            print_subdirs_in(os.path.join(directory, d))
+
 
     def main():
         if len(sys.argv) != 2:
             # Note: conventions on how to display script documentation and
-            # parse arguments are still to-be-determined.
+            # parse arguments are still to-be-determined. This is just one
+            # way to do it.
             print("""\
     Usage: %s <directory>
-    Print first line of each file in given directory (in alphabetic order)."""
+    For the given directory and all sub-directories found inside it, print
+    the list of the directories they contain."""
                   % os.path.basename(sys.argv[0]))
             return 1
-        print_files_in(sys.argv[1])
+        print_subdirs_in(sys.argv[1])
         return 0
 
+
     # Top-level executable code should be minimal.
     if __name__ == '__main__':
         sys.exit(main())
@@ -534,6 +655,44 @@
 
 .. _Wiki page: http://www.iro.umontreal.ca/~lisa/twiki/bin/view.cgi/Divers/VimPythonRecommendations
 
+Commit message
+==============
+
+    * A one line summary. Try to keep it short, and provide the information
+      that seems most useful to other developers: in particular the goal of
+      a change is more useful than its description (which is always
+      available through the changeset patch log). E.g. say "Improved stability
+      of cost computation" rather than "Replaced log(exp(a) + exp(b)) by
+      a + log(1 + exp(b - a)) in cost computation".
+    * If needed, a blank line followed by a more detailed summary
+    * Make a commit for each logical modification
+        * This makes reviews easier to do
+        * This makes debugging easier as we can more easily pinpoint errors
+          in commits with hg bisect
+    * NEVER commit reformatting with functionality changes
+    * Review your change before committing
+        * "hg diff <files>..." to see the diff you have done
+        * "hg record" allows you to select which changes to a file should be
+          committed. To enable it, put into the file ~/.hgrc:
+
+          .. code-block:: bash
+
+              [extensions]
+              hgext.record=
+
+        * hg record / diff force you to review your code; never commit without
+          running one of these two commands first
+    * Write detailed commit messages in the past tense, not present tense.
+        * Good: "Fixed Unicode bug in RSS API."
+        * Bad: "Fixes Unicode bug in RSS API."
+        * Bad: "Fixing Unicode bug in RSS API."
+    * Separate bug fixes from feature changes.
+    * When fixing a ticket, start the message with "Fixed #abc"
+        * Could we set up a system that automatically updates the ticket?
+    * When referencing a ticket, start the message with "Refs #abc"
+        * Could we set up a system that automatically adds a comment to the ticket?
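+
+    * For illustration, a commit message following these guidelines could look
+      like this (the ticket number is made up)::
+
+          Fixed #123: improved stability of the cost computation
+
+          Used the log-sum-exp identity,
+          log(exp(a) + exp(b)) = a + log(1 + exp(b - a)),
+          to avoid overflow when a and b are large.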
+
+
 TODO
 ====
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/v2_planning/arch_FB.txt	Thu Oct 14 23:55:55 2010 -0400
@@ -0,0 +1,76 @@
+Current and extension of our framework
+=======================================
+
+This proposition is complementary to PL's hook system and OB's checkpoint proposal. It could be part of the backend of James' system. I don't remember/know the other proposals well enough.
+
+Assumptions I make:
+
+* The Dataset, Learner and Layers committees have done their work
+    * That means we have an easier way to build a learning model.
+* Checkpointing is solved: we either ignore it (short jobs), don't care, use manual checkpoints, use structured checkpoints with an example, or use OB's system.
+
+Example MLP
+-----------
+
+* Select the hyper parameter search space with `jobman sqlschedules`
+* Dispatch the jobs with dbidispatch
+* *Manually* (fixable) reset job status to START.
+   * I started working on this, but I will change the syntax to make it more generic.
+* *Manually* relaunch crashed jobs.
+* *Manually* (fixable) analyse/visualise the results. (We need to start those meetings at some point)
+
+Example MLP+cross validation
+-----------------------------
+
+* Modify the dataset interface to accept 2 new hyper parameters: nb_cross_fold=X, id_cross_fold.
+* Schedule all of the folds to run in parallel with `jobman sqlschedules`
+* *Manually* (fixable) reset job status to START.
+* *Manually* relaunch crashed jobs.
+* *Manually* (fixable) analyse/visualize the results.
+   * Those tools need to understand the concept of cross validation
+* *Manually* (fixable with the proposition below) launch a retrain on the full dataset with the best hyper parameters
+
+
+Example DBN
+-----------
+
+* *Concept* JOB PHASE. A DBN has unsupervised and supervised phases.
+   * We suppose the job script has a parameter telling it which phase it should run.
+* *Jobman Extension* We can extend jobman to handle dependencies between jobs.
+    * Proposed syntax:
+
+.. code-block:: bash
+
+      jobman sqlschedule p0={{}} ... -- p1={{}} ... -- p2=...
+
+* The parameters before the first `--` tell which jobs the new jobs depend on (this allows depending on several jobs at the same time).
+* The parameters between the two `--` markers tell that we want to create a new group of jobs for all those jobs.
+* The parameters after the second `--` describe the new jobs to be created for each new group of jobs.
+
+* *Jobman Extension* create `jobman dispatch`
+    * This will dispatch new jobs to run on the cluster with dbidispatch once all of a job's dependencies have finished.
+* *Jobman Extension* create `jobman monitor`
+    * This repeatedly calls `jobman condor_check` to find jobs that have potentially crashed and prints them on the screen. It needs to filter the output of condor_check.
+    * We can create other `jobman CLUSTER_check` commands for mammouth, colosse, angel, ...
+* *Jobman Extension* when we change the status of a job to START in jobman, also change the status of the jobs that depend on it.
+* *Jobman Extension* determine whether a job finished correctly or not
+   * If a job did not finish correctly, don't start the jobs that depend on it.
+* *Jobman Policy* All changes to the db should be doable through jobman commands.
+
+* *Manually* relaunch crashed jobs.
+* *Manually* (fixable) analyse/visualise the results.
+   * Those tools need to understand the concept of job phases or be agnostic to it.
+
+
+* *Cross validation retrain* can be done with an additional phase in the extensions.
+    * The new job needs to know how to determine the best hyper parameters from the results.
+
+
+* This can be extended for double cross validation.
+   * Dataset must support double cross validation
+   * We create more phases in jobman.
+
+Hyper parameter search in Pylearn
+---------------------------------
+
+In some cases we would want the hyper parameter search to be done within pylearn. This will add a dependency on jobman. We can finish/verify how jobman works with sqlite so that it does not require an installed db; sqlite is included in python 2.5, and jobman already requires python 2.5. We could make jobman's dependency on sqlalchemy optional when sqlite is used, to limit the number of dependencies.
--- a/doc/v2_planning/code_review.txt	Thu Oct 14 23:55:41 2010 -0400
+++ b/doc/v2_planning/code_review.txt	Thu Oct 14 23:55:55 2010 -0400
@@ -10,10 +10,10 @@
 TODO
 ----
 
-- make a list of point to compare tools
-- review interresting projects
-- make a politic of review(who,what,what,how)
-- make a decission on projects
+- Install Review Board and try it
+    - test the review of merges
+    - see how it works with branches
+- Write our proposed policy
 
 Some system that we should check:
 ---------------------------------
@@ -30,7 +30,7 @@
     - Interesting, but some questions remain (how well it integrates with hg,
       notably)
     - Some advantages over Google code (comment on multi-line chunks, list of
-      unreviewed commits)
+      unreviewed commits, nicer aesthetics, handles many repos, and we can easily keep Assembla)
     - Fred will install it so we can test it more thoroughly
 
 - `Code Striker <http://codestriker.sourceforge.net/>`_
@@ -54,6 +54,11 @@
 - `Google Code <http://code.google.com/>`_
     - Test bench with a clone of Theano at
       http://code.google.com/p/theanoclone/
+    - review is post-commit
+    - no list of unreviewed commits
+    - no Python syntax highlighting
+    - per-line commenting is awkward
+    - diffs of merges seem buggy
     - Maybe
 
 What we could want from our code review
@@ -62,9 +67,11 @@
 - integrate with our ticket system?
     - Should we keep our current ticket system?
 - work with mercurial, git?
+    - how branches/merges are handled
 - check each commit of theano/pylearn
 - check experimental repository code when asked
-- how show diff? patch? syntax highlight as vimdiff?
+- how to show diffs? side-by-side as in vimdiff? as a patch?
+- syntax highlighting
 - If we commit something that is disabled by default and not fully working, we can say it in the commit message to have a faster review(only check that by default it is disabled). Then we should say in the commit message when it is ready for a full review.
 - Review should be done by everybody.
 - Who choose the reviewer(random, commiter)? pool of reviewers? pool level 1,2,3 where 1 is everybody with commit right. pool for specific topic(gpu, ML algo, ...)?
@@ -94,7 +101,7 @@
 
 - We want at least 2 people to read all code. That mean we need a reviewer
 - This help to find better solution to problem
-- This help to train people on our tools ans framework.
+- This helps to train people on our tools and framework.
 
 Check list for review
 ---------------------
@@ -108,3 +115,25 @@
 - Is the code clear/comprehensible
 - Are the comment describing what is being done?
 - Answer question by de commiter, this can also serve to train people
+- Check for typos
+- No debug code (print, breakpoint, ...)
+- If the commit message says not to review, check that the code is disabled by default and that it prints a warning when enabled.
+
+Proposed Policy
+----------------
+
+- For each commit, if the author doesn't want it to be reviewed, they should say so in the commit message
+   - Useful for experimental repositories, not Theano
+   - Useful for the official repos when we commit something that is disabled by default and we want to split it into many commits, or start using it even if not fully tested.
+   - The reviewer should check that the feature is indeed disabled by default and that it prints a warning when enabled.
+- We check all commits to Theano, Pylearn and Jobman (official tools).
+- We check experimental repos when asked.
+- One official reviewer per week.
+    - They review all code and ask experts when needed.
+    - They should check the check list against all reviews.
+    - We choose the reviewer among the Theano users of the lab with commit rights.
+    - We make a list of experts per problem domain (gpu, optimization, algo, ...)
+    - If somebody breaks the build bot, they become the reviewer for the next week
+    - Maximum of one week per month.
+    - During a big week, or a rush that involves everybody in the lab, we can rotate more frequently.
+    - If a commit has a problem, the original reviewer should do the follow-up.
--- a/doc/v2_planning/coding_style.txt	Thu Oct 14 23:55:41 2010 -0400
+++ b/doc/v2_planning/coding_style.txt	Thu Oct 14 23:55:55 2010 -0400
@@ -22,11 +22,15 @@
                 out which of two versions is most recent)
 
     * OD: I like always doing the following when subclassing
+
+.. code-block:: python
+
       a class A:
         class B(A):
             def __init__(self, b_arg_1, b_arg_2, **kw):
                 super(B, self).__init__(**kw)
                 ...
+
       The point here is that the constructor always allow for extra keyword
       arguments (except for the class at the very top of the hierarchy), which
       are automatically passed to the parent class.
@@ -65,9 +69,11 @@
      OD: That'd make sense. However, when I wrote the above I hadn't looked
          closely at PEP257 yet, and I just noticed the following official
          recommendation for one-line docstrings in it:
-            The docstring is a phrase ending in a period. It prescribes the
-            function or method's effect as a command ("Do this", "Return that"), not as a
-            description; e.g. don't write "Returns the pathname ...".
+
+             The docstring is a phrase ending in a period. It prescribes the
+             function or method's effect as a command ("Do this", "Return that"), not as a
+             description; e.g. don't write "Returns the pathname ...".
+
          Anyone knows which style is most popular in the open-source
          community?
      OD: In lab meeting Yoshua ruled out: it is a waste of time to even
@@ -97,6 +103,7 @@
      JB: I thought that these are nice:
          - "from foo import Bar" 
          - "from foo import Bar, Blah"
+
         What's wrong with them?  They keep the code listing short and readable.
         I would discourage these forms when symbols 'Bar' and 'Blah' are
         ambiguous, in which case the parent module prefix serves to disambiguate
@@ -114,6 +121,13 @@
             from foo import Bar, Blah
          when imported stuff is re-used multiple times in the same file, and
          there is no ambiguity.
+     DWF: One exception I'd like to propose to the "import A as B" moratorium
+          is that we adopt the "import numpy as np" standard that's used in
+          NumPy and SciPy itself. For NumPy heavy code this really cuts down
+          on clutter, without significant impact on readability (IMHO).
+     OD: We discussed it during a meeting. We agreed not to use the 'np'
+         abbreviation at first. We may still revisit this point in the future
+         if we find situations where it would really help.
 
    * Imports should usually be on separate lines.
      OD: I would add an exception, saying it is ok to group multiple imports
@@ -197,23 +211,30 @@
 
    * If necessary, you can add an extra pair of parentheses around an
      expression, but sometimes using a backslash looks better.
-    --> I rarely find that backslash looks better. In most situations you can
-        get rid of them. Typically I prefer:
+     --> I rarely find that backslash looks better. In most situations you can
+     get rid of them. Typically I prefer:
+
+.. code-block:: python
+
             if (cond_1 and
                 cond_2 and
                 cond_3):
-        to
+
+     to
+
+.. code-block:: python
+
             if cond_1 and \
                cond_2 and \
                cond_3:
 
    * You should use two spaces after a sentence-ending period.
-    --> Looks weird to me.
-    (DWF: This is an old convention from the typewriter era. It has more
-    or less been wiped out by HTML's convention of ignoring extra 
-    whitespace: see http://en.wikipedia.org/wiki/Sentence_spacing for
-    more detail. I think it's okay to drop this convention in source code.)
-    OD: Cool, thanks, I guess we can drop it then.
+     --> Looks weird to me.
+     (DWF: This is an old convention from the typewriter era. It has more
+     or less been wiped out by HTML's convention of ignoring extra 
+     whitespace: see http://en.wikipedia.org/wiki/Sentence_spacing for
+     more detail. I think it's okay to drop this convention in source code.)
+     OD: Cool, thanks, I guess we can drop it then.
 
     * Missing in PEP 8:
         - How to indent multi-line statements? E.g. do we want
@@ -236,7 +257,50 @@
 Documentation
 -------------
 
-How do we write doc?
+How do we write docs?
+
+Ideas (DE):
+
+   * Most major Python projects suggest following PEP-257:
+     http://www.python.org/dev/peps/pep-0257/, which contains conventions on
+     writing docstrings (what they should contain, not the specific markup)
+     for Python. These are very general conventions, however.
+   
+   * Numpy, in particular, has a very nice page on how to document things if
+     contributing: http://projects.scipy.org/numpy/wiki/CodingStyleGuidelines
+     (it's mostly about documentation, not coding style, despite the page
+     name). 
+  
+   * A pretty good example from numpy, with relevant comments:
+     http://github.com/numpy/numpy/blob/master/doc/example.py 
+  
+   * A real-life example (record arrays) from numpy:
+     http://github.com/numpy/numpy/blob/master/numpy/core/records.py
+ 
+   * The recommendations are quite sane and common-sense; we should follow them.
+ 
+   * numpy's way of doing things is a bit different from the way we currently
+     document Theano: they don't use param/type/rtype, for instance, but nice
+     readable section titles. I personally find their approach better-looking
+     and they do have a sphinx extension that would allow us to have the same
+     style
+     (http://github.com/numpy/numpy/blob/master/doc/sphinxext/numpydoc.py).
+     The disadvantage of taking this approach is that Theano and Pylearn will
+     be documented slightly differently.
+
+   * Make sure that what we write is compatible with tools like sphinx's
+     autodoc extension:
+     http://sphinx.pocoo.org/ext/autodoc.html#module-sphinx.ext.autodoc (which
+     we will most probably use to generate semi-automatic pretty docs)
+  
+   * Nice cheat-sheet for docutils:
+     http://docutils.sourceforge.net/docs/user/rst/quickref.html
+  
+   * http://docs.python.org/release/2.5.2/lib/module-doctest.html -
+     in-documentation unit-testing: we should perhaps encourage people to
+     write such things where warranted (where there are interesting usage
+     examples).  nosetests can automatically run those, so no configuration
+     overhead is necessary.
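+
+   * To illustrate the difference, here is a minimal, made-up example of the
+     two styles (the function name and contents are invented for the sketch;
+     the section names in the first version follow the numpy guide linked
+     above):
+
+     .. code-block:: python
+
+         import numpy
+
+         def scale(x, factor=1.0):
+             """Scale `x` by `factor` (numpy-style docstring).
+
+             Parameters
+             ----------
+             x : numpy.ndarray
+                 Input array.
+             factor : float, optional
+                 Multiplicative factor, 1.0 by default.
+
+             Returns
+             -------
+             numpy.ndarray
+                 The array ``x * factor``.
+
+             Examples
+             --------
+             >>> scale(numpy.ones(3), 2.0)
+             array([ 2.,  2.,  2.])
+             """
+             return x * factor
+
+         def scale_current_style(x, factor=1.0):
+             """Scale `x` by `factor` (current Theano-style docstring).
+
+             :param x: input array
+             :param factor: multiplicative factor (default 1.0)
+             :rtype: numpy.ndarray
+             :returns: the array ``x * factor``
+             """
+             return x * factor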
 
 Compatibility with various Python versions
 ------------------------------------------
@@ -244,7 +308,7 @@
     * Which Python 2.x version do we want to support?
 
     * Is it reasonable to have coding guidelines that would make the code as
-compatible as possible with Python 3?
+      compatible as possible with Python 3?
 
 C coding style
 --------------
@@ -255,52 +319,52 @@
 ------------------
 
    * Coding guidelines
-PEP 8 & Google should be a good basis to start with.
-Task: Highlight the most important points in them (OD).
+     PEP 8 & Google should be a good basis to start with.
+     Task: Highlight the most important points in them (OD).
 
    * Documentation
-Use RST with Sphinx.
-Task: Provide specific examples on how to document a class, method, and some
-specific classes like Op (DE). Modify the theano documentation to include that.
-OD: May want to check out
-    http://projects.scipy.org/numpy/wiki/CodingStyleGuidelines
+     Use RST with Sphinx.
+     Task: Provide specific examples on how to document a class, method, and some
+     specific classes like Op (DE). Modify the theano documentation to include that.
+     OD: May want to check out
+     http://projects.scipy.org/numpy/wiki/CodingStyleGuidelines
 
    * Python versions to be supported
-Support 2.4 (because some of the clusters are still running 2.4) and write
-code that can be converted to 3.x with 2to3 in a straightforward way.
-Task: Write to-do's and to-not-do's to avoid compatibility issues. (OD)
+     Support 2.4 (because some of the clusters are still running 2.4) and write
+     code that can be converted to 3.x with 2to3 in a straightforward way.
+     Task: Write to-do's and to-not-do's to avoid compatibility issues. (OD)
 
    * C coding style
-How to write C code (in particular for Numpy / Cuda), and how to mix C and
-Python.
-Task: See if there would be a sensible C code style to follow (maybe look how
-Numpy does it), and how projects that mix C and Python deal with it (e.g. use
-separate files, or be able to have mixed syntax highlighting?) (FB)
+     How to write C code (in particular for Numpy / Cuda), and how to mix C and
+     Python.
+     Task: See if there would be a sensible C code style to follow (maybe look how
+     Numpy does it), and how projects that mix C and Python deal with it (e.g. use
+     separate files, or be able to have mixed syntax highlighting?) (FB)
 
    * Program output
-Use the warning and logging modules. Avoid print as much as possible.
-Task: Look into these modules to define general guidelines e.g. to decide when
-to use warning instead of logging. (DWF)
+     Use the warning and logging modules. Avoid print as much as possible.
+     Task: Look into these modules to define general guidelines e.g. to decide when
+     to use warning instead of logging. (DWF)
 
    * Automatized code verification
-Use pychecker & friends to make sure everything is fine.
-Task: Look into the various options available (DE)
-Result: See sections 'Tools to help us out' and 'Automating and enforcing coding
-style'
+     Use pychecker & friends to make sure everything is fine.
+     Task: Look into the various options available (DE)
+     Result: See sections 'Tools to help us out' and 'Automating and enforcing coding
+     style'
 
    * Tests
-Force people to write tests. Automatic email reminder of code lines not
-covered by tests (see if we can get this from nosetests). Decorator to mark
-some classes / methods as not being tested yet, so as to be able to
-automatically warn the user when he is using untested stuff (and to remind
-ourselves we should add a test).
-Task: See feasibility. (OD)
-Result: See section 'Enforcing strict testing policy'.
+     Force people to write tests. Automatic email reminder of code lines not
+     covered by tests (see if we can get this from nosetests). Decorator to mark
+     some classes / methods as not being tested yet, so as to be able to
+     automatically warn the user when he is using untested stuff (and to remind
+     ourselves we should add a test).
+     Task: See feasibility. (OD)
+     Result: See section 'Enforcing strict testing policy'.
 
    * VIM / Emacs plugins / config files
-To enforce good coding style automatically.
-Task: Look for existing options. (FB)
-(DWF: I have put some time into this for vim, I will send around my files)
+     To enforce good coding style automatically.
+     Task: Look for existing options. (FB)
+     (DWF: I have put some time into this for vim, I will send around my files)
 
 Suggestion by PV
 ----------------
@@ -334,34 +398,70 @@
 During committee's meeting, Fred mentioned a bug with Assembla links for
 multi-line commits.
 
+OD: About what I mentioned in a meeting:
+    - The size of the first line does not really matter (although keeping it
+      short is better, it is not truncated when you do 'hg log')
+    - The fetch commit message is like "Automated merge with
+      https://theanoclone.googlecode.com/hg/". It is too long to ask people to
+      use the same. I guess "Merge" would be the most logical message to use.
+
+FB: The proposed guidelines
+    * A one line summary. Try to keep it short, and provide the information
+      that seems most useful to other developers: in particular the goal of
+      a change is more useful than its description (which is always
+      available through the changeset patch log). E.g. say "Improved stability
+      of cost computation" rather than "Replaced log(exp(a) + exp(b)) by
+      a + log(1 + exp(b - a)) in cost computation".
+    * If needed a blank line followed by a more detailed summary
+    * Make a commit for each logical modification
+        * This makes reviews easier to do
+        * This makes debugging easier as we can more easily pinpoint errors in commits with hg bisect
+    * NEVER commit reformatting with functionality changes
+    * HG RECORD/DIFF are your friend
+        * hg record allows you to select which changes to a file should be
+          committed
+        * hg record / diff force you to review your code, never commit without
+          running one of these two commands first
+    * Stuff from django guide
+        * Write detailed commit messages in the past tense, not present tense.
+            * Good: "Fixed Unicode bug in RSS API."
+            * Bad: "Fixes Unicode bug in RSS API."
+            * Bad: "Fixing Unicode bug in RSS API."
+        * Separate bug fixes from feature changes.
+        * If fixing a ticket, make the message start with "Fixed #abc"
+            * Can we make a system to change the ticket?
+        * If referencing a ticket, make the message start with "Refs #abc"
+            * Can we make a system to put a comment on the ticket?
+
+
 Tools to help us out
 ---------------------
 
 Dumi:
 
   * pylint: highly configurable and very popular tool, similar in spirit to lint
-  for C. Can specify a config file, customize/disable warnings and errors, hook
-  it to vim/emacs and include coding style convensions in the check too. A nice
-  feature is that you can include a comment like "# pylint: disable-msg=C0103"
-  into a file and disable a message locally. This is nice and dangerous at the
-  same time. Another cool feature is incremental checking with caching of
-  results, which also allows tracking of progress.
+    for C. Can specify a config file, customize/disable warnings and errors, hook
+    it to vim/emacs and include coding style conventions in the check too. A nice
+    feature is that you can include a comment like "# pylint: disable-msg=C0103"
+    into a file and disable a message locally. This is nice and dangerous at the
+    same time. Another cool feature is incremental checking with caching of
+    results, which also allows tracking of progress.
 
   * pyflakes: pylint alternative that is supposedly faster, but is I think more
-  limited in the number of things it is good at: "PyFlakes will tell you when
-  you have forgotten an import, mistyped a variable name, defined two functions
-  with the same name, shadowed a variable from another scope, imported a module
-  twice, or two different modules with the same name, and so on.". Most reviews
-  found online praise the speed, but note that pylint is clearly superior in
-  every other respect.
+    limited in the number of things it is good at: "PyFlakes will tell you when
+    you have forgotten an import, mistyped a variable name, defined two functions
+    with the same name, shadowed a variable from another scope, imported a module
+    twice, or two different modules with the same name, and so on.". Most reviews
+    found online praise the speed, but note that pylint is clearly superior in
+    every other respect.
 
   * pychecker: it actually *imports* each module (not sure if pylint does this).
-  It seems that pylint = pychecker + coding style and that pylint is more
-  popular.
+    It seems that pylint = pychecker + coding style and that pylint is more
+    popular.
 
   * pep8: if all you care is about obeying PEP-8:
-  http://pypi.python.org/pypi/pep8 (includes the actual PEP-8 snippets with the
-  errors found, which is neat). Otherwise, pylint seems like a superset of this. 
+    http://pypi.python.org/pypi/pep8 (includes the actual PEP-8 snippets with the
+    errors found, which is neat). Otherwise, pylint seems like a superset of this. 
 
   * http://www.doughellmann.com/articles/pythonmagazine/completely-different/2008-03-linters/index.html
   - article from 2008 comparing pylint, pychecker, and pyflakes. The conclusion
@@ -382,36 +482,36 @@
 Dumi: there are several ways of approaching this, independently of the tools used:
 
    * Create a precommit hook for mercurial, which runs the tool(s) of choice and
-   generates warnings or aborts the commit process. This hook is a simple Python
-   module (well, as simple as we want it to be), which we can include into
-   everyone's hgrc, in the precommit.pylint variable, for instance. An example
-   is http://github.com/jrburke/dvcs_jslint/blob/master/dvcs_jslint.js. The
-   advantage of this approach is that the load is distributed and
-   errors/warnings are caught client-side, before the commit.
+     generates warnings or aborts the commit process. This hook is a simple Python
+     module (well, as simple as we want it to be), which we can include into
+     everyone's hgrc, in the precommit.pylint variable, for instance. An example
+     is http://github.com/jrburke/dvcs_jslint/blob/master/dvcs_jslint.js. The
+     advantage of this approach is that the load is distributed and
+     errors/warnings are caught client-side, before the commit (a sketch of
+     such a hook configuration is shown after this list).
 
    * Another client-side option is to have editor plugins for the various style
-   checkers: vim and emacs can access pylint pretty easily for instance.
+     checkers: vim and emacs can access pylint pretty easily for instance.
 
    * Instead of doing this client-side, one can do things server-side. On
-   Assembla, this means using their Webhooks
-   (http://www.assembla.com/spaces/demostuff/webhook_tool), since HTTP-based
-   hooks that we would need to tie with our buildbot server (whichever server we
-   choose that to be).
+     Assembla, this means using their Webhooks
+     (http://www.assembla.com/spaces/demostuff/webhook_tool), since HTTP-based
+     hooks that we would need to tie with our buildbot server (whichever server we
+     choose that to be).
 
    * I (DE) prefer starting with the client-side approach, as it is easier to
-   implement, has no single point of failure and is deployable fast. We could
-   have a "batch" script that runs our lint tools in conjunction with hg
-   annotate and sends hate-mail once a week to offenders who have somehow
-   slipped things through the cracks. Also on the server-side we could run
-   time-consuming checking (though how such checks would differ from tests is
-   unclear).
+     implement, has no single point of failure and is deployable fast. We could
+     have a "batch" script that runs our lint tools in conjunction with hg
+     annotate and sends hate-mail once a week to offenders who have somehow
+     slipped things through the cracks. Also on the server-side we could run
+     time-consuming checking (though how such checks would differ from tests is
+     unclear).
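+
+As a concrete illustration of the client-side approach described above, a
+minimal ~/.hgrc snippet could look like the following (the checker script
+path and name are invented; it would wrap pylint / pep8 / pyflakes as
+decided):
+
+.. code-block:: bash
+
+    [hooks]
+    # Run the style checker before each commit; a non-zero exit status
+    # aborts the commit.
+    precommit.pylint = python /path/to/check_style.py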
 
 Note that:
 
   * I haven't found anything ready-made online, so we need to write these
-  hooks ourselves.
+    hooks ourselves.
   * I think we should make it so that it is not possible to commit things if
-  pylint reports an actual error.
+    pylint reports an actual error.
 
 Type checking
 -------------
@@ -510,3 +610,17 @@
 OD: Finish code sample that showcases all (or many) guidelines, look into
 feasibility of passing arguments to super classes with **kw.
 
+Meeting 2010/10/01
+------------------
+OD: Do your job!
+DE: Send email to ask the lab whether we should go numpy style or theano style
+for class / method documentation, and move guidelines to API file.
+DWF: Finish guidelines for serializable code
+FB: Write guidelines for commits
+
+Doc format in header?
+---------------------
+
+OD: Should we add in each file's header something like:
+    __docformat__ = "restructuredtext en"
+
--- a/doc/v2_planning/committees.txt	Thu Oct 14 23:55:41 2010 -0400
+++ b/doc/v2_planning/committees.txt	Thu Oct 14 23:55:55 2010 -0400
@@ -1,4 +1,4 @@
-List of committees and their members (leader marked with a *):
+List of committees and their members (leader marked with a \*):
 
 * Existing Python ML libraries investigation: GD, DWF, IG, DE
 * Dataset interface: DE*, OB, OD, AB, PV
--- a/doc/v2_planning/existing_python_ml_libraries.txt	Thu Oct 14 23:55:41 2010 -0400
+++ b/doc/v2_planning/existing_python_ml_libraries.txt	Thu Oct 14 23:55:55 2010 -0400
@@ -96,31 +96,31 @@
 
 3. Recommendations for implementations to wrap
 
-shogun:
-      large scale kernel learning (mostly svms). this wraps other
-libraries we should definitely be interested in, such as libsvm
-(because it is well-established) and others that get state of the art
-performance or are good for extremely large datasets, etc.
-milk:
-      k-means
-      svm's with arbitrary python types for kernel arguments
-pybrain:
-      lstm
-mlpy:
-      feature selection
-mdp:
-      ica
-      LLE
-scikit.learn:
-      lasso
-      nearest neighbor
-      isomap
-      various metrics
-      mean shift
-      cross validation
-      LDA
-      HMMs
-Yet Another Python Graph Library:
-      graph similarity functions that could be useful if we want to
-learn with graphs as data
+* shogun:
+    * large scale kernel learning (mostly svms). this wraps other
+      libraries we should definitely be interested in, such as libsvm
+      (because it is well-established) and others that get state of the art
+      performance or are good for extremely large datasets, etc.
+* milk:
+    * k-means
+    * svm's with arbitrary python types for kernel arguments
+* pybrain:
+    * lstm
+* mlpy:
+    * feature selection
+* mdp:
+    * ica
+    * LLE
+* scikit.learn:
+    * lasso
+    * nearest neighbor
+    * isomap
+    * various metrics
+    * mean shift
+    * cross validation
+    * LDA
+    * HMMs
+* Yet Another Python Graph Library:
+    * graph similarity functions that could be useful if we want to
+      learn with graphs as data
 
--- a/doc/v2_planning/requirements.txt	Thu Oct 14 23:55:41 2010 -0400
+++ b/doc/v2_planning/requirements.txt	Thu Oct 14 23:55:55 2010 -0400
@@ -134,10 +134,10 @@
 
 R15. If you see the library as a driver that controls several components ( and
 we argue that any approach can be seen like this), the driver should always :
-  - be serializable
-  - respond to internal interrupts ("checkpoints")
-  - respond to external interrupts ( timeout)
-  - async interrupts ( eg. SIGTERM)
+    - be serializable
+    - respond to internal interrupts ("checkpoints")
+    - respond to external interrupts ( timeout)
+    - async interrupts ( eg. SIGTERM)
 
 R16 Cognitive load should be minimal (debatable requirement)
   Notes : Is hard to actually appreciate cognitive load, so this should be
--- a/pylearn/algorithms/mcRBM.py	Thu Oct 14 23:55:41 2010 -0400
+++ b/pylearn/algorithms/mcRBM.py	Thu Oct 14 23:55:55 2010 -0400
@@ -45,9 +45,9 @@
         - \sum_j c_j g_j
 
 For the energy function to correspond to a probability distribution, P must be non-positive.  P
-is initialized to be a diagonal, and in our experience it can be left as such because even in
-the paper it has a very low learning rate, and is only allowed to be updated after the filters
-in U are learned (in effect).
+is initialized to be a diagonal or a topological pooling matrix, and in our experience it can
+be left as such because even in the paper it has a very low learning rate, and is only allowed
+to be updated after the filters in U are learned (in effect).
 
 Version in published train_mcRBM code
 -------------------------------------
@@ -90,6 +90,13 @@
         - \sum_j \sum_i W_{ij} g_j v_i
         - \sum_j c_j g_j
 
+    E (v, h, g) =
+        - 0.5 \sum_f \sum_k P_{fk} h_k (\sum_i U_{if} v_i / sqrt(\sum_i v_i^2/I + 0.5))^2 
+        - \sum_k b_k h_k
+        + 0.5 \sum_i v_i^2
+        - \sum_j \sum_i W_{ij} g_j v_i
+        - \sum_j c_j g_j
+
       
 
 Conventions in this file
@@ -101,12 +108,14 @@
 
 Global functions like `free_energy` work on an mcRBM as parametrized in a particular way.
 Suppose we have 
-I input dimensions, 
-F squared filters, 
-J mean variables, and 
-K covariance variables.
-The mcRBM is parametrized by 5 variables:
+ - I input dimensions, 
+ - F squared filters, 
+ - J mean variables, and
+ - K covariance variables.
 
+The mcRBM is parametrized by 6 variables:
+
+ - `P`, a matrix whose rows indicate covariance filter groups (F x K)
  - `U`, a matrix whose rows are visible covariance directions (I x F)
  - `W`, a matrix whose rows are visible mean directions (I x J)
  - `b`, a vector of hidden covariance biases (K)
@@ -199,9 +208,9 @@
 from theano import tensor as TT
 floatX = theano.config.floatX
 
+sharedX = lambda X, name : shared(numpy.asarray(X, dtype=floatX), name=name)
+
 import pylearn
-#TODO: clean up the HMC_sampler code
-#TODO: think of naming convention for acronyms + suffix?
 from pylearn.sampling.hmc import HMC_sampler
 from pylearn.io import image_tiling
 from pylearn.gd.sgd import sgd_updates
@@ -213,31 +222,83 @@
 #
 ###########################################
 
-#TODO: Document, move to pylearn's math lib
 def l1(X):
+    """
+    :param X: TensorType variable
+
+    :rtype: TensorType scalar
+
+    :returns: the sum of absolute values of the terms in X
+
+    :math: \sum_i |X_i|
+
+    Where i is an appropriately dimensioned index.
+
+    """
     return abs(X).sum()
 
-#TODO: Document, move to pylearn's math lib
 def l2(X):
+    """
+    :param X: TensorType variable
+
+    :rtype: TensorType scalar
+
+    :returns: the square root of the sum of squared terms in X (the L2 norm)
+
+    :math: \sqrt{ \sum_i X_i^2 }
+
+    Where i is an appropriately dimensioned index.
+
+    """
     return TT.sqrt((X**2).sum())
 
-#TODO: Document, move to pylearn's math lib
 def contrastive_cost(free_energy_fn, pos_v, neg_v):
+    """
+    :param free_energy_fn: lambda (TensorType matrix MxN) ->  TensorType vector of M free energies
+    :param pos_v: TensorType matrix MxN of M "positive phase" particles
+    :param neg_v: TensorType matrix MxN of M "negative phase" particles
+
+    :returns: TensorType scalar that's the sum of the difference of free energies
+
+    :math: \sum_i free_energy(pos_v[i]) - free_energy(neg_v[i])
+
+    """
     return (free_energy_fn(pos_v) - free_energy_fn(neg_v)).sum()
 
-#TODO: Typical use of contrastive_cost is to later use tensor.grad, but in that case we want to
-#      block  gradient going through neg_v
-def contrastive_grad(free_energy_fn, pos_v, neg_v, params, other_cost=0):
+def contrastive_grad(free_energy_fn, pos_v, neg_v, wrt, other_cost=0):
     """
+    :param free_energy_fn: lambda (TensorType matrix MxN) ->  TensorType vector of M free energies
     :param pos_v: positive-phase sample of visible units
     :param neg_v: negative-phase sample of visible units
+    :param wrt: TensorType variables with respect to which we want gradients (similar to the
+        'wrt' argument to tensor.grad)
+    :param other_cost: TensorType scalar 
+
+    :returns: TensorType variables for the gradient on each of the 'wrt' arguments
+
+
+    :math: Cost = other_cost + \sum_i free_energy(pos_v[i]) - free_energy(neg_v[i])
+    :math: d Cost / dW for W in `wrt`
+
+
+    This function is similar to tensor.grad - it returns the gradient[s] on a cost with respect
+    to one or more parameters.  The difference between tensor.grad and this function is that
+    the negative phase term (`neg_v`) is considered constant, i.e. d `Cost` / d `neg_v` = 0.
+    This is desirable because `neg_v` might be the result of a sampling expression involving
+    some of the parameters, but the contrastive divergence algorithm does not call for
+    backpropagating through the sampling procedure.
+
+    Warning - if other_cost depends on pos_v or neg_v and you *do* want to backpropagate from
+    the `other_cost` through those terms, then this function is inappropriate.  In that case,
+    you should call tensor.grad separately for the other_cost and add the gradient expressions
+    you get from ``contrastive_grad(..., other_cost=0)``
+
     """
-    #block the grad through neg_v
     cost=contrastive_cost(free_energy_fn, pos_v, neg_v)
     if other_cost:
         cost = cost + other_cost
     return theano.tensor.grad(cost,
-            wrt=params,
+            wrt=wrt,
             consider_constant=[neg_v])
 
 ###########################################
@@ -246,97 +307,236 @@
 #
 ###########################################
 
-#TODO: make global function to initialize parameter
+class mcRBM(object):
+    """Light-weight class that provides the math related to inference
+
+    Attributes:
+
+      - U - the covariance filters (theano shared variable)
+      - W - the mean filters (theano shared variable)
+      - a - the visible bias (theano shared variable)
+      - b - the covariance bias (theano shared variable)
+      - c - the mean bias (theano shared variable)
 
-def hidden_cov_units_preactivation_given_v(rbm, v, small=0.5):
-    """Return argument to the sigmoid that would give mean of covariance hid units
+    """
+    def __init__(self, U, W, a, b, c):
+        self.U = U
+        self.W = W
+        self.a = a
+        self.b = b
+        self.c = c
 
-    See the math at the top of this file for what 'adjusted' means.
+    def hidden_cov_units_preactivation_given_v(self, v, small=0.5):
+        """Return argument to the sigmoid that would give mean of covariance hid units
+
+        See the math at the top of this file for what 'adjusted' means.
 
-    return b - 0.5 * dot(adjusted(v), U)**2
-    """
-    (U,W,a,b,c) = rbm
-    unit_v = v / (TT.sqrt(TT.mean(v**2, axis=1)+small)).dimshuffle(0,'x') # adjust row norm
-    return b - 0.5 * dot(unit_v, U)**2
+        return b - 0.5 * dot(adjusted(v), U)**2
+        """
+        unit_v = v / (TT.sqrt(TT.mean(v**2, axis=1)+small)).dimshuffle(0,'x') # adjust row norm
+        return self.b - 0.5 * dot(unit_v, self.U)**2
+
+    def free_energy_terms_given_v(self, v):
+        """Returns theano expression for the terms that are added to form the free energy of
+        visible vector `v` in an mcRBM.
 
-def free_energy_terms_given_v(rbm, v):
-    """Returns theano expression for the terms that are added to form the free energy of
-    visible vector `v` in an mcRBM.
+         1.  Free energy related to covariance hiddens
+         2.  Free energy related to mean hiddens
+         3.  Free energy related to L2-Norm of `v`
+         4.  Free energy related to projection of `v` onto biases `a`
+        """
+        t0 = -TT.sum(TT.nnet.softplus(self.hidden_cov_units_preactivation_given_v(v)),axis=1)
+        t1 = -TT.sum(TT.nnet.softplus(self.c + dot(v,self.W)), axis=1)
+        t2 =  0.5 * TT.sum(v**2, axis=1)
+        t3 = -TT.dot(v, self.a)
+        return [t0, t1, t2, t3]
+
+    def free_energy_given_v(self, v):
+        """Returns theano expression for free energy of visible vector `v` in an mcRBM
+        """
+        return TT.add(*self.free_energy_terms_given_v(v))
+
+    def expected_h_g_given_v(self, v):
+        """Returns tuple (`h`, `g`) of theano expression conditional expectations in an mcRBM.
 
-     1.  Free energy related to covariance hiddens
-     2.  Free energy related to mean hiddens
-     3.  Free energy related to L2-Norm of `v`
-     4.  Free energy related to projection of `v` onto biases `a`
-    """
-    U, W, a, b, c = rbm
-    t0 = -TT.sum(TT.nnet.softplus(hidden_cov_units_preactivation_given_v(rbm, v)),axis=1)
-    t1 = -TT.sum(TT.nnet.softplus(c + dot(v,W)), axis=1)
-    t2 =  0.5 * TT.sum(v**2, axis=1)
-    t3 = -TT.dot(v, a)
-    return [t0, t1, t2, t3]
+        `h` is the conditional on the covariance units.
+        `g` is the conditional on the mean units.
+        
+        """
+        h = TT.nnet.sigmoid(self.hidden_cov_units_preactivation_given_v(v))
+        g = TT.nnet.sigmoid(self.c + dot(v,self.W))
+        return (h, g)
+
+    def n_visible_units(self):
+        """Return the number of visible units of this RBM
+
+        For an RBM made from shared variables, this will return an integer,
+        for a purely symbolic RBM this will return a theano expression.
+        
+        """
+        try:
+            return self.W.value.shape[0]
+        except AttributeError:
+            return self.W.shape[0]
+
+    def n_hidden_cov_units(self):
+        """Return the number of hidden units for the covariance in this RBM
 
-def free_energy_given_v(rbm, v):
-    """Returns theano expression for free energy of visible vector `v` in an mcRBM
-    """
-    return sum(free_energy_terms_given_v(rbm,v))
+        For an RBM made from shared variables, this will return an integer,
+        for a purely symbolic RBM this will return a theano expression.
+        
+        """
+        try:
+            return self.U.value.shape[1]
+        except AttributeError:
+            return self.U.shape[1]
+
+    def n_hidden_mean_units(self):
+        """Return the number of hidden units for the mean in this RBM
 
-def expected_h_g_given_v(rbm, v):
-    """Returns tuple (`h`, `g`) of theano expression conditional expectations in an mcRBM.
+        For an RBM made from shared variables, this will return an integer,
+        for a purely symbolic RBM this will return a theano expression.
+        
+        """
+        try:
+            return self.W.value.shape[1]
+        except AttributeError:
+            return self.W.shape[1]
+
+    def CD1_sampler(self, v, n_particles, n_visible=None, rng=8923984):
+        """Return a symbolic negative-phase particle obtained by simulating the Hamiltonian
+        associated with the energy function.
+        """
+        #TODO: why not expose all HMC arguments somehow?
+        if not hasattr(rng, 'randn'):
+            rng = np.random.RandomState(rng)
+        if n_visible is None:
+            n_visible = self.n_visible_units()
 
-    `h` is the conditional on the covariance units.
-    `g` is the conditional on the mean units.
-    
-    """
-    (U, W, a, b, c) = rbm
-    h = TT.nnet.sigmoid(hidden_cov_units_preactivation_given_v(rbm, v))
-    g = nnet.sigmoid(c + dot(v,W))
-    return (h, g)
+        # create a dummy hmc object because we want to use *some* of it
+        hmc = HMC_sampler.new_from_shared_positions(
+                shared_positions=v, # v is not shared, so some functionality will not work
+                energy_fn=self.free_energy_given_v,
+                seed=int(rng.randint(2**30)),
+                shared_positions_shape=(n_particles,n_visible),
+                compile_simulate=False)
+        updates = dict(hmc.updates())
+        final_p = updates.pop(v)
+        return hmc, final_p, updates
+
+    def sampler(self, n_particles, n_visible=None, rng=7823748):
+        """Return an `HMC_sampler` that will draw samples from the distribution over visible
+        units specified by this RBM.
 
-def n_visible_units(rbm):
-    """Return the number of visible units of this RBM
+        :param n_particles: this many parallel chains will be simulated.
+        :param rng: seed or numpy RandomState object to initialize particles, and to drive the simulation.
+        """
+        #TODO: why not expose all HMC arguments somehow?
+        #TODO: Consider returning a sample kwargs for passing to HMC_sampler?
+        if not hasattr(rng, 'randn'):
+            rng = np.random.RandomState(rng)
+        if n_visible is None:
+            n_visible = self.n_visible_units()
+        rval = HMC_sampler.new_from_shared_positions(
+            shared_positions = sharedX(
+                rng.randn(
+                    n_particles,
+                    n_visible),
+                name='particles'),
+            energy_fn=self.free_energy_given_v,
+            seed=int(rng.randint(2**30)))
+        return rval
+
+    if 0:
+        def as_feedforward_layer(self, v):
+            """Return a dictionary with keys: inputs, outputs and params
+
+            The inputs is [v]
+
+            The outputs is :math:`[E[h|v], E[g|v]]` where `h` is the covariance hidden units and `g` is
+            the mean hidden units.
+
+            The params are ``[U, W, b, c]``, the model parameters that enter into the conditional
+            expectations.
+
+            :TODO: add an optional parameter to return only one of the expectations.
+
+            """
+            return dict(
+                    inputs = [v],
+                    outputs = list(self.expected_h_g_given_v(v)),
+                    params = [self.U, self.W, self.b, self.c],
+                    )
+
+    def params(self):
+        """Return the elements of [U,W,a,b,c] that are shared variables
 
-    For an RBM made from shared variables, this will return an integer,
-    for a purely symbolic RBM this will return a theano expression.
-    
-    """
-    W = rbm[1]
-    try:
-        return W.value.shape[0]
-    except AttributeError:
-        return W.shape[0]
+        WRITEME : a *prescriptive* definition of this method suitable for mention in the API
+        doc.
+        
+        """
+        return list(self._params)
 
-def sampler(rbm, n_particles, n_visible=None, rng=7823748):
-    """Return an `HMC_sampler` that will draw samples from the distribution over visible
-    units specified by this RBM.
+    @classmethod
+    def alloc(cls, n_I, n_K, n_J, rng = 8923402190,
+            U_range=0.02,
+            W_range=0.05,
+            a_ival=0,
+            b_ival=2,
+            c_ival=-2):
+        """
+        Return an instance of this class with randomly-initialized shared variable parameters.
+
+        :param n_I: input dimensionality
+        :param n_K: number of covariance hidden units
+        :param n_J: number of mean filters (linear)
+        :param rng: seed or numpy RandomState object to initialize parameters
+        
+        :note:
+        Constants for initial ranges and values taken from train_mcRBM.py.
+        """
+        if not hasattr(rng, 'randn'):
+            rng = np.random.RandomState(rng)
 
-    :param n_particles: this many parallel chains will be simulated.
-    :param rng: seed or numpy RandomState object to initialize particles, and to drive the simulation.
-    """
-    if not hasattr(rng, 'randn'):
-        rng = np.random.RandomState(rng)
-    if n_visible is None:
-        n_visible = n_visible_units(rbm)
-    rval = HMC_sampler(
-        positions = [shared(
-            rng.randn(
-                n_particles,
-                n_visible).astype(floatX),
-            name='particles')],
-        energy_fn = lambda p : free_energy_given_v(rbm, p[0]),
-        seed=int(rng.randint(2**30)))
+        rval =  cls(
+                U = sharedX(U_range * rng.randn(n_I, n_K),'U'),
+                W = sharedX(W_range * rng.randn(n_I, n_J),'W'),
+                a = sharedX(np.ones(n_I)*a_ival,'a'),
+                b = sharedX(np.ones(n_K)*b_ival,'b'),
+                c = sharedX(np.ones(n_J)*c_ival,'c'),)
+        rval._params = [rval.U, rval.W, rval.a, rval.b, rval.c]
+        return rval
+
+def topological_connectivity(out_shape=(12,12), window_shape=(3,3), window_stride=(2,2),
+        **kwargs):
+
+    in_shape = (window_stride[0] * out_shape[0],
+            window_stride[1] * out_shape[1])
+
+    rval = numpy.zeros(in_shape + out_shape, dtype=theano.config.floatX)
+    A,B,C,D = rval.shape
+
+    # for each output position (out_r, out_c)
+    for out_r in range(out_shape[0]):
+        for out_c in range(out_shape[1]):
+            # for each window position (win_r, win_c)
+            for win_r in range(window_shape[0]):
+                for win_c in range(window_shape[1]):
+                    # add 1 to the corresponding input location
+                    in_r = out_r * window_stride[0] + win_r
+                    in_c = out_c * window_stride[1] + win_c
+                    rval[in_r%A, in_c%B, out_r%C, out_c%D] += 1
+
+    # This normalization algorithm is a guess, based on inspection of the matrix loaded from
+    # CVPR2010paper_material/topo2D_3x3_stride2_576filt.mat
+    rval = rval.reshape((A*B, C*D))
+    rval = (rval.T / rval.sum(axis=1)).T
+
+    rval /= rval.sum(axis=0)
     return rval
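A quick shape and normalization check for this pooling matrix (a standalone sketch, not part of the patch):

    import numpy
    from pylearn.algorithms.mcRBM import topological_connectivity

    P = topological_connectivity(out_shape=(12, 12),
                                 window_shape=(3, 3),
                                 window_stride=(2, 2))
    assert P.shape == (24 * 24, 12 * 12)          # 576 filters pooled into 144 cov units
    assert numpy.allclose(P.sum(axis=0), 1.0)     # the final pass makes each column sum to 1
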
 
-#############################
-#
-# Convenient data container
-#
-#############################
-
-class MeanCovRBM(object):
-    """Container for mcRBM parameters
-
-    It provides parameter lookup by name, as well as a heuristic for initializing the
-    parameters for effective learning.
+class mcRBM_withP(mcRBM):
+    """Light-weight class that provides the math related to inference
 
     Attributes:
 
@@ -345,66 +545,261 @@
       - a - the visible bias (theano shared variable)
       - b - the covariance bias (theano shared variable)
       - c - the mean bias (theano shared variable)
-    
-    """
 
-    params = property(lambda s: [s.U, s.W, s.a, s.b, s.c])
+    """
+    def __init__(self, U, W, a, b, c, P):
+        self.P = P
+        super(mcRBM_withP, self).__init__(U,W,a,b,c)
 
-    n_visible = property(lambda s: s.W.value.shape[0])
+    def hidden_cov_units_preactivation_given_v(self, v, small=0.5):
+        """Return argument to the sigmoid that would give mean of covariance hid units
+
+        See the math at the top of this file for what 'adjusted' means.
 
-    def __init__(self, U, W, a, b, c):
-        self.__dict__.update(locals())
-        del self.__dict__['self']
+        return b + 0.5 * dot(dot(adjusted(v), U)**2, P)
+        """
+        unit_v = v / (TT.sqrt(TT.mean(v**2, axis=1)+small)).dimshuffle(0,'x') # adjust row norm
+        return self.b + 0.5 * dot(dot(unit_v, self.U)**2, self.P)
+
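A small numpy check (not part of the patch) that with a square `P = -I`, as built by `mcRBM_withP.alloc`, this preactivation reduces to the plain-mcRBM form `b - 0.5 * dot(adjusted(v), U)**2`:

    import numpy as np

    rng = np.random.RandomState(0)
    v, U, b, small = rng.randn(4, 10), rng.randn(10, 6), rng.randn(6), 0.5
    unit_v = v / np.sqrt((v ** 2).mean(axis=1) + small)[:, None]   # adjust row norm
    with_P = b + 0.5 * np.dot(np.dot(unit_v, U) ** 2, -np.eye(6))
    plain  = b - 0.5 * np.dot(unit_v, U) ** 2
    assert np.allclose(with_P, plain)
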
+    def n_hidden_cov_units(self):
+        """Return the number of hidden units for the covariance in this RBM
 
-    def __getitem__(self, idx):
-        # support unpacking of this container as if it were a tuple
-        return self.params[idx]
+        For an RBM made from shared variables, this will return an integer,
+        for a purely symbolic RBM this will return a theano expression.
+        
+        """
+        try:
+            return self.P.value.shape[1]
+        except AttributeError:
+            return self.P.shape[1]
 
     @classmethod
-    def new_from_dims(cls, n_I, n_K, n_J, rng = 8923402190):
+    def alloc(cls, n_I, n_K, n_J, *args, **kwargs):
         """
-        Return a MeanCovRBM instance with randomly-initialized parameters.
+        Return an instance of this class with randomly-initialized shared variable parameters.
 
         :param n_I: input dimensionality
         :param n_K: number of covariance hidden units
         :param n_J: number of mean filters (linear)
-        :param rng: seed or numpy RandomState object to initialize params
+        :param rng: seed or numpy RandomState object to initialize parameters
+        
+        :note:
+        Constants for initial ranges and values taken from train_mcRBM.py.
         """
+        return cls.alloc_with_P(
+            -numpy.eye(n_K).astype(theano.config.floatX),
+            n_I,
+            n_J,
+            *args, **kwargs)
+
+    @classmethod
+    def alloc_topo_P(cls, n_I, n_J, p_out_shape=(12,12), p_win_shape=(3,3), p_win_stride=(2,2),
+            **kwargs):
+        return cls.alloc_with_P(
+                -topological_connectivity(p_out_shape, p_win_shape, p_win_stride),
+                n_I=n_I, n_J=n_J, **kwargs)
+
+    @classmethod
+    def alloc_with_P(cls, Pval, n_I, n_J, rng = 8923402190,
+            U_range=0.02,
+            W_range=0.05,
+            a_ival=0,
+            b_ival=2,
+            c_ival=-2):
+        n_F, n_K = Pval.shape
         if not hasattr(rng, 'randn'):
             rng = np.random.RandomState(rng)
-
-        def shrd(X,name):
-            return shared(X.astype(floatX), name=name)
+        rval =  cls(
+                U = sharedX(U_range * rng.randn(n_I, n_F),'U'),
+                W = sharedX(W_range * rng.randn(n_I, n_J),'W'),
+                a = sharedX(np.ones(n_I)*a_ival,'a'),
+                b = sharedX(np.ones(n_K)*b_ival,'b'),
+                c = sharedX(np.ones(n_J)*c_ival,'c'),
+                P = sharedX(Pval, 'P'),)
+        rval._params = [rval.U, rval.W, rval.a, rval.b, rval.c, rval.P]
+        return rval
 
-        # initialization taken from train_mcRBM.py
-        return cls(
-                U = shrd(0.02 * rng.randn(n_I, n_K),'U'),
-                W = shrd(0.05 * rng.randn(n_I, n_J),'W'),
-                a = shrd(np.ones(n_I)*(0),'a'),
-                b = shrd(np.ones(n_K)*2,'b'),
-                c = shrd(np.ones(n_J)*(-2),'c'))
+class mcRBMTrainer(object):
+    """Light-weight class encapsulating math for mcRBM training 
+
+    Attributes:
+      - rbm  - an mcRBM instance
+      - sampler - an HMC_sampler instance
+      - normVF - geometrically updated norm of U matrix columns (shared var)
+      - learn_rate - SGD learning rate [un-annealed]
+      - learn_rate_multipliers - the learning rates for each of the parameters of the rbm (in
+        order corresponding to what's returned by ``rbm.params()``)
+      - l1_penalty - float or TensorType scalar to modulate l1 penalty of rbm.U and rbm.W
+      - iter - number of cd_updates (shared var) - used to anneal the effective learn_rate
+      - lr_anneal_start - scalar or TensorType scalar - iter at which time to start decreasing
+            the learning rate proportional to 1/iter
 
-    def __getstate__(self):
-        # unpack shared containers, which may have references to Theano stuff and are not a
-        # long-term stable data type.
-        return dict(
-                U = self.U.value,
-                W = self.W.value,
-                b = self.b.value,
-                c = self.c.value)
+    """
+    # TODO: accept a GD algo as an argument?
+    @classmethod
+    def alloc_for_P(cls, rbm, visible_batch, batchsize, initial_lr_per_example=0.075, rng=234,
+            l1_penalty=0,
+            l1_penalty_start=0,
+            learn_rate_multipliers=None,
+            lr_anneal_start=2000,
+            p_training_start=4000,
+            p_training_lr=0.02,
+            persistent_chains=True
+            ):
+        if learn_rate_multipliers is None:
+            p_lr = sharedX(0.0, 'P_lr_multiplier')
+            learn_rate_multipliers = [2, .2, .02, .1, .02, p_lr]
+        else:
+            p_lr = None
+        rval = cls.alloc(rbm, visible_batch, batchsize, initial_lr_per_example, rng, l1_penalty,
+                l1_penalty_start, learn_rate_multipliers, lr_anneal_start, persistent_chains)
+
+        rval.p_mask = sharedX((rbm.P.value!=0).astype('float32'), 'p_mask')
 
-    def __setstate__(self, dct):
-        d = dict(dct)
-        for key in ['U', 'W', 'a', 'b', 'c']:
-            d[key] = shared(d[key], name=key)
-        self.__init__(**d)
+        rval.p_lr = p_lr
+        rval.p_training_start=p_training_start
+        rval.p_training_lr=p_training_lr
+        return rval
+
+
+    @classmethod
+    def alloc(cls, rbm, visible_batch, batchsize, initial_lr_per_example=0.075, rng=234,
+            l1_penalty=0,
+            l1_penalty_start=0,
+            learn_rate_multipliers=[2, .2, .02, .1, .02],
+            lr_anneal_start=2000,
+            persistent_chains=True
+            ):
+
+        """
+        :param rbm: mcRBM instance to train
+        :param visible_batch: TensorType variable for training data
+        :param batchsize: the number of rows in visible_batch
+        :param initial_lr_per_example: the learning rate (may be annealed)
+        :param rng: seed or RandomState to initialize the PCD sampler
+        :param l1_penalty: see class doc
+        :param learn_rate_multipliers: see class doc
+        :param lr_anneal_start: see class doc
+        """
+        #TODO: :param lr_anneal_iter: the iteration at which 1/t annealing will begin
+
+        #TODO: get batchsize from visible_batch??
+        # allocates shared var for negative phase particles
 
 
-#TODO: put the normalization of U as a global function
+        # TODO: should normVF be initialized to match the size of rbm.U ?
+
+        if (l1_penalty_start > 0) and (l1_penalty != 0.0):
+            effective_l1_penalty = sharedX(0.0, 'effective_l1_penalty')
+        else:
+            effective_l1_penalty = l1_penalty
+
+        if persistent_chains:
+            sampler = rbm.sampler(batchsize, rng=rng)
+        else:
+            sampler = None
+
+        return cls(
+                rbm=rbm,
+                batchsize=batchsize,
+                visible_batch=visible_batch,
+                sampler=sampler,
+                normVF=sharedX(1.0, 'normVF'),
+                learn_rate=sharedX(initial_lr_per_example/batchsize, 'learn_rate'),
+                iter=sharedX(0, 'iter'),
+                effective_l1_penalty=effective_l1_penalty,
+                l1_penalty=l1_penalty,
+                l1_penalty_start=l1_penalty_start,
+                learn_rate_multipliers=learn_rate_multipliers,
+                lr_anneal_start=lr_anneal_start,
+                persistent_chains=persistent_chains,)
+
+    def __init__(self, **kwargs):
+        self.__dict__.update(kwargs)
+
+    def normalize_U(self, new_U):
+        """
+        :param new_U: a proposed new value for rbm.U
+
+        :returns: a pair of TensorType variables: 
+            a corrected new value for U, and a new value for self.normVF
+
+        This is a weird normalization procedure, but the sample code for the paper has it, and
+        it seems to be important.
+        """
+        U_norms = TT.sqrt((new_U**2).sum(axis=0))
+        new_normVF = .95 * self.normVF + .05 * TT.mean(U_norms)
+        return (new_U * new_normVF / U_norms), new_normVF
+
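The same arithmetic in plain numpy (a sketch, not part of the patch), showing that every column of the corrected U ends up with the common length `new_normVF`:

    import numpy as np

    rng = np.random.RandomState(0)
    new_U, normVF = rng.randn(105, 256), 1.0      # normVF starts at sharedX(1.0, 'normVF')
    U_norms = np.sqrt((new_U ** 2).sum(axis=0))
    new_normVF = .95 * normVF + .05 * U_norms.mean()
    U_fixed = new_U * new_normVF / U_norms
    assert np.allclose(np.sqrt((U_fixed ** 2).sum(axis=0)), new_normVF)
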
+    def contrastive_grads(self, neg_v = None):
+        """Return the contrastive divergence gradients on the parameters of self.rbm """
+        if neg_v is None:
+            neg_v = self.sampler.positions
+        return contrastive_grad(
+                free_energy_fn=self.rbm.free_energy_given_v,
+                pos_v=self.visible_batch, 
+                neg_v=neg_v,
+                wrt = self.rbm.params(),
+                other_cost=(l1(self.rbm.U)+l1(self.rbm.W)) * self.effective_l1_penalty)
+
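A sketch of inspecting these gradients outside the training loop (not part of the patch; it assumes only the `alloc` signatures shown in this file):

    import theano
    from theano import tensor as TT
    from pylearn.algorithms.mcRBM import mcRBM, mcRBMTrainer

    rbm = mcRBM.alloc(n_I=105, n_K=256, n_J=100)
    v = TT.matrix('v')
    trainer = mcRBMTrainer.alloc(rbm, v, batchsize=128)
    grads = trainer.contrastive_grads()            # one gradient per trainer.rbm.params()
    grad_norm_fn = theano.function([v], [g.norm(2) for g in grads])
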
+    def cd_updates(self):
+        """
+        Return a dictionary of shared variable updates that implements contrastive divergence
+        learning by stochastic gradient descent with an annealed learning rate.
+        """
+
+        ups = {}
+
+        if self.persistent_chains:
+            grads = self.contrastive_grads()
+            ups.update(dict(self.sampler.updates()))
+        else:
+            cd1_sampler, final_p, cd1_updates = self.rbm.CD1_sampler(self.visible_batch,
+                    self.batchsize)
+            self._last_cd1_sampler = cd1_sampler # hacked in here for the unit test
+            #ignore the cd1_sampler
+            grads = self.contrastive_grads(neg_v = final_p)
+            ups.update(dict(cd1_updates))
 
 
-#TODO: put the learning loop as a global function or class, so that someone could load and *TRAIN* an mcRBM!!!
+        # contrastive divergence updates
+        # TODO: sgd_updates is a particular optimization algo (others are possible)
+        #       parametrize so that the algo is pluggable
+        #       the normalization normVF might be sgd-specific though...
+
+        # TODO: when sgd has an annealing schedule, this should
+        #       go through that mechanism.
+
+        lr = TT.clip(
+                self.learn_rate * TT.cast(self.lr_anneal_start / (self.iter+1), floatX), 
+                0.0, #min
+                self.learn_rate) #max
+
+        ups.update(dict(sgd_updates(
+                    self.rbm.params(),
+                    grads,
+                    stepsizes=[a*lr for a in self.learn_rate_multipliers])))
+
+        ups[self.iter] = self.iter + 1
+
 
-if __name__ == '__main__':
-    import pylearn.algorithms.tests.test_mcRBM
-    pylearn.algorithms.tests.test_mcRBM.test_reproduce_ranzato_hinton_2010(as_unittest=True)
+        # add trainer updates (replace CD update of U)
+        ups[self.rbm.U], ups[self.normVF] = self.normalize_U(ups[self.rbm.U])
+
+        #l1_updates:
+        if (self.l1_penalty_start > 0) and (self.l1_penalty != 0.0):
+            ups[self.effective_l1_penalty] = TT.switch(
+                    self.iter >= self.l1_penalty_start,
+                    self.l1_penalty,
+                    0.0)
+
+        if getattr(self,'p_lr', None):
+            ups[self.p_lr] = TT.switch(self.iter > self.p_training_start,
+                    self.p_training_lr,
+                    0)
+            new_P = ups[self.rbm.P] * self.p_mask
+            no_pos_P = TT.switch(new_P<0, new_P, 0)
+            ups[self.rbm.P] = - no_pos_P / no_pos_P.sum(axis=0) # normalize so that columns sum to 1
+
+        return ups
+
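The effect of the 1/t annealing above can be sanity-checked with a few numbers (a standalone sketch, not part of the patch):

    import numpy as np

    learn_rate = 0.075 / 128          # initial_lr_per_example / batchsize, as in alloc()
    lr_anneal_start = 2000
    for it in (0, 1000, 2000, 4000, 8000):
        lr = np.clip(learn_rate * lr_anneal_start / float(it + 1), 0.0, learn_rate)
        print it, lr
    # flat at learn_rate until roughly iteration lr_anneal_start, then ~1/iter decay
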
--- a/pylearn/algorithms/tests/test_mcRBM.py	Thu Oct 14 23:55:41 2010 -0400
+++ b/pylearn/algorithms/tests/test_mcRBM.py	Thu Oct 14 23:55:55 2010 -0400
@@ -1,42 +1,67 @@
+import sys
+from pylearn.algorithms.mcRBM import *
+import pylearn.datasets.cifar10
+import pylearn.dataset_ops.tinyimages
+
+import pylearn.dataset_ops.cifar10
+from theano import tensor
+from pylearn.shared.layers.logreg import LogisticRegression
 
 
-from pylearn.algorithms.mcRBM import *
+def _default_rbm_alloc(n_I, n_K=256, n_J=100):
+    return mcRBM.alloc(n_I, n_K, n_J)
+
+def _default_trainer_alloc(rbm, train_batch, batchsize, l1_penalty, l1_penalty_start):
+    return mcRBMTrainer.alloc(rbm, train_batch, batchsize, l1_penalty=l1_penalty,
+            l1_penalty_start=l1_penalty_start,persistent_chains=persistent_chains)
 
 
-def test_reproduce_ranzato_hinton_2010(dataset='MAR', as_unittest=True):
-    dataset='MAR'
+def test_reproduce_ranzato_hinton_2010(dataset='MAR', as_unittest=True, n_train_iters=5000,
+        rbm_alloc=_default_rbm_alloc, trainer_alloc=_default_trainer_alloc,
+        lr_per_example=.075,
+        l1_penalty=1e-3,
+        l1_penalty_start=1000,
+        persistent_chains=True,
+        ):
+
+    batchsize = 128
+
     if dataset == 'MAR':
         n_vis=105
         n_patches=10240
+        epoch_size=n_patches
+    elif dataset=='cifar10patches8x8':
+        R,C= 8,8 # the size of image patches
+        n_vis=96 # pca components
+        epoch_size=batchsize*500
+        n_patches=epoch_size*20
+    elif dataset=='tinyimages_patches':
+        R,C=8,8
+        n_vis=81
+        epoch_size=batchsize*500
+        n_patches=epoch_size*20
     else:
         R,C= 16,16 # the size of image patches
         n_vis=R*C
         n_patches=100000
-
-    n_train_iters=5000
-
-    n_burnin_steps=10000
-
-    l1_penalty=1e-3
-    no_l1_epochs = 10
-    effective_l1_penalty=0.0
-
-    epoch_size=n_patches
-    batchsize = 128
-    lr = 0.075 / batchsize
-    s_lr = TT.scalar()
-    s_l1_penalty=TT.scalar()
-    n_K=256
-    n_J=100
-
-    rbm = MeanCovRBM.new_from_dims(n_I=n_vis, n_K=n_K, n_J=n_J) 
-
-    smplr = sampler(rbm, n_particles=batchsize)
+        epoch_size=n_patches
 
     def l2(X):
         return numpy.sqrt((X**2).sum())
+
     if dataset == 'MAR':
         tile = pylearn.dataset_ops.image_patches.save_filters_of_ranzato_hinton_2010
+    elif dataset == 'cifar10patches8x8':
+        def tile(X, fname):
+            _img = pylearn.datasets.cifar10.tile_rasterized_examples(
+                    pylearn.preprocessing.pca.pca_whiten_inverse(
+                        pylearn.dataset_ops.cifar10.random_cifar_patches_pca(
+                            n_vis, None, 'float32', n_patches, R, C,),
+                        X),
+                    img_shape=(R,C))
+            image_tiling.save_tiled_raster_images(_img, fname)
+    elif dataset == 'tinyimages_patches':
+        tile = pylearn.dataset_ops.tinyimages.save_filters
     else:
         def tile(X, fname):
             _img = image_tiling.tile_raster_images(X,
@@ -45,9 +70,16 @@
             image_tiling.save_tiled_raster_images(_img, fname)
 
     batch_idx = TT.iscalar()
+    batch_range =batch_idx * batchsize + np.arange(batchsize)
 
     if dataset == 'MAR':
-        train_batch = pylearn.dataset_ops.image_patches.ranzato_hinton_2010_op(batch_idx * batchsize + np.arange(batchsize))
+        train_batch = pylearn.dataset_ops.image_patches.ranzato_hinton_2010_op(batch_range)
+    elif dataset == 'cifar10patches8x8':
+        train_batch = pylearn.dataset_ops.cifar10.cifar10_patches(
+                batch_range, 'train', n_patches=n_patches, patch_size=(R,C),
+                pca_components=n_vis)
+    elif dataset == 'tinyimages_patches':
+        train_batch = pylearn.dataset_ops.tinyimages.tinydataset_op(batch_range)
     else:
         train_batch = pylearn.dataset_ops.image_patches.image_patches(
                 s_idx = (batch_idx * batchsize + np.arange(batchsize)),
@@ -60,26 +92,36 @@
     if not as_unittest:
         imgs_fn = function([batch_idx], outputs=train_batch)
 
-    grads = contrastive_grad(
-            free_energy_fn=lambda v: free_energy_given_v(rbm, v),
-            pos_v=train_batch, 
-            neg_v=smplr.positions[0],
-            params=list(rbm),
-            other_cost=(l1(rbm.U)+l1(rbm.W)) * s_l1_penalty)
-    sgd_ups = sgd_updates(
-                rbm.params,
-                grads,
-                stepsizes=[2*s_lr, .2*s_lr, .02*s_lr, .1*s_lr, .02*s_lr ])
-    learn_fn = function([batch_idx, s_lr, s_l1_penalty], 
-            outputs=[ 
-                grads[0].norm(2),
-                (sgd_ups[0][1] - sgd_ups[0][0]).norm(2),
-                (sgd_ups[1][1] - sgd_ups[1][0]).norm(2),
-                ],
-            updates = sgd_ups)
+    trainer = trainer_alloc(
+            rbm_alloc(n_I=n_vis),
+            train_batch,
+            batchsize, 
+            initial_lr_per_example=lr_per_example,
+            l1_penalty=l1_penalty,
+            l1_penalty_start=l1_penalty_start,
+            persistent_chains=persistent_chains)
+    rbm=trainer.rbm
+
+    if persistent_chains:
+        grads = trainer.contrastive_grads()
+        learn_fn = function([batch_idx], 
+                outputs=[grads[0].norm(2), grads[0].norm(2), grads[1].norm(2)],
+                updates=trainer.cd_updates())
+    else:
+        learn_fn = function([batch_idx], outputs=[], updates=trainer.cd_updates())
+
+    if persistent_chains:
+        smplr = trainer.sampler
+    else:
+        smplr = trainer._last_cd1_sampler
+
+    if dataset == 'cifar10patches8x8':
+        cPickle.dump(
+                pylearn.dataset_ops.cifar10.random_cifar_patches_pca(
+                    n_vis, None, 'float32', n_patches, R, C,),
+                open('test_mcRBM.pca.pkl','w'))
 
     print "Learning..."
-    normVF=1
     last_epoch = -1
     for jj in xrange(n_train_iters):
         epoch = jj*batchsize / epoch_size
@@ -87,9 +129,6 @@
         print_jj = epoch != last_epoch
         last_epoch = epoch
 
-        if epoch > 10:
-            break
-
         if as_unittest and epoch == 5:
             U = rbm.U.value
             W = rbm.W.value
@@ -117,14 +156,20 @@
         if print_jj:
             if not as_unittest:
                 tile(imgs_fn(jj), "imgs_%06i.png"%jj)
-                tile(smplr.positions[0].value, "sample_%06i.png"%jj)
+                if persistent_chains:
+                    tile(smplr.positions.value, "sample_%06i.png"%jj)
                 tile(rbm.U.value.T, "U_%06i.png"%jj)
                 tile(rbm.W.value.T, "W_%06i.png"%jj)
 
             print 'saving samples', jj, 'epoch', jj/(epoch_size/batchsize)
 
             print 'l2(U)', l2(rbm.U.value),
-            print 'l2(W)', l2(rbm.W.value)
+            print 'l2(W)', l2(rbm.W.value),
+            print 'l1_penalty', 
+            try:
+                print trainer.effective_l1_penalty.value
+            except:
+                print trainer.effective_l1_penalty
 
             print 'U min max', rbm.U.value.min(), rbm.U.value.max(),
             print 'W min max', rbm.W.value.min(), rbm.W.value.max(),
@@ -132,20 +177,16 @@
             print 'b min max', rbm.b.value.min(), rbm.b.value.max(),
             print 'c min max', rbm.c.value.min(), rbm.c.value.max()
 
-            print 'parts min', smplr.positions[0].value.min(), 
-            print 'max',smplr.positions[0].value.max(),
-            print 'HMC step', smplr.stepsize,
-            print 'arate', smplr.avg_acceptance_rate
+            if persistent_chains:
+                print 'parts min', smplr.positions.value.min(), 
+                print 'max',smplr.positions.value.max(),
+            print 'HMC step', smplr.stepsize.value,
+            print 'arate', smplr.avg_acceptance_rate.value
 
-        # Continue HMC chain
-        smplr.simulate()
 
-        # Do CD update
-        l2_of_Ugrad = learn_fn(jj, 
-                lr/max(1, jj/(20*epoch_size/batchsize)),
-                effective_l1_penalty)
+        l2_of_Ugrad = learn_fn(jj)
 
-        if print_jj:
+        if persistent_chains and print_jj:
             print 'l2(U_grad)', float(l2_of_Ugrad[0]),
             print 'l2(U_inc)', float(l2_of_Ugrad[1]),
             print 'l2(W_inc)', float(l2_of_Ugrad[2]),
@@ -155,15 +196,293 @@
             #print 'FE+[2]', float(l2_of_Ugrad[5]),
             #print 'FE+[3]', float(l2_of_Ugrad[6])
 
-        if jj == no_l1_epochs * epoch_size/batchsize:
-            print "Activating L1 weight decay"
-            effective_l1_penalty = 1e-3
+        if not as_unittest:
+            if jj % 2000 == 0:
+                print ''
+                print 'Saving rbm...'
+                cPickle.dump(rbm, open('mcRBM.rbm.%06i.pkl'%jj, 'w'), -1)
+                if persistent_chains:
+                    print 'Saving sampler...'
+                    cPickle.dump(smplr, open('mcRBM.smplr.%06i.pkl'%jj, 'w'), -1)
+
+
+    if not as_unittest:
+        return rbm, smplr
+
+
+def run_classif_experiment(checkpoint):
+
+    R,C=8,8
+    n_vis=74
+    # PRETRAIN
+    #
+    # extract 1 million 8x8 patches from TinyImages
+    # pre-process them the right way
+    # find 74 dims of PCA
+    # filter patches through PCA
+    whitened_patches, pca_dct = pylearn.dataset_ops.tinyimages.main(n_imgs=100000,
+            max_components=n_vis, seed=234)
+    #
+    # Set up mcRBM Trainer
+    # Initialize P using topological 3x3 overlapping patches thing
+    # start learning P matrix after 2 passes through dataset
+    # 
+    rbm_filename = 'mcRBM.rbm.%06i.pkl'%46000
+    try:
+        open(rbm_filename).close()
+        load_mcrbm = True
+    except:
+        load_mcrbm = False
+
+    if load_mcrbm:
+        print 'loading mcRBM from file', rbm_filename
+        rbm = cPickle.load(open(rbm_filename))
+
+    else:
+        print "Training mcRBM"
+        batchsize=128
+        epoch_size=len(whitened_patches)
+        tile = pylearn.dataset_ops.tinyimages.save_filters
+        train_batch = theano.tensor.matrix()
+        trainer = mcRBMTrainer.alloc_for_P(
+                rbm=mcRBM_withP.alloc_topo_P(n_I=n_vis, n_J=81),
+                visible_batch=train_batch,
+                batchsize=batchsize, 
+                initial_lr_per_example=0.05,
+                l1_penalty=1e-3,
+                l1_penalty_start=sys.maxint,
+                p_training_start=2*epoch_size//batchsize,
+                persistent_chains=False)
+        rbm=trainer.rbm
+        learn_fn = function([train_batch], outputs=[], updates=trainer.cd_updates())
+        smplr = trainer._last_cd1_sampler
+
+        ii = 0
+        for i_epoch in range(6):
+            for i_batch in xrange(epoch_size // batchsize):
+                batch_vals = whitened_patches[i_batch*batchsize:(i_batch+1)*batchsize]
+                learn_fn(batch_vals)
+
+                if (ii % 1000) == 0:
+                    #tile(imgs_fn(ii), "imgs_%06i.png"%ii)
+                    tile(rbm.U.value.T, "U_%06i.png"%ii)
+                    tile(rbm.W.value.T, "W_%06i.png"%ii)
+
+                    print 'saving samples', ii, 'epoch', i_epoch, i_batch
+
+                    print 'l2(U)', l2(rbm.U.value),
+                    print 'l2(W)', l2(rbm.W.value),
+                    print 'l1_penalty', 
+                    try:
+                        print trainer.effective_l1_penalty.value
+                    except:
+                        print trainer.effective_l1_penalty
+
+                    print 'U min max', rbm.U.value.min(), rbm.U.value.max(),
+                    print 'W min max', rbm.W.value.min(), rbm.W.value.max(),
+                    print 'a min max', rbm.a.value.min(), rbm.a.value.max(),
+                    print 'b min max', rbm.b.value.min(), rbm.b.value.max(),
+                    print 'c min max', rbm.c.value.min(), rbm.c.value.max()
+
+                    print 'HMC step', smplr.stepsize.value,
+                    print 'arate', smplr.avg_acceptance_rate.value
+                    print 'P min max', rbm.P.value.min(), rbm.P.value.max(),
+                    print 'P_lr', trainer.p_lr.value
+                    print ''
+                    print 'Saving rbm...'
+                    cPickle.dump(rbm, open('mcRBM.rbm.%06i.pkl'%ii, 'w'), -1)
+
+                ii += 1
+
+
+    # extract convolutional features from the CIFAR10 data
+    feat_filename = 'cifar10.features.46000.npy'
+    feat_filename = 'mcrbm_features.npy'
+    try:
+        open(feat_filename).close()
+        load_features = True
+    except:
+        load_features = False
+
+    if load_features:
+        print 'Loading features from', feat_filename
+        all_features = numpy.load(feat_filename, mmap_mode='r')
+    else:
+        batchsize=100
+        feat_idx = tensor.lscalar()
+        feat_idx_range = feat_idx * batchsize + tensor.arange(batchsize)
+        train_batch_x, train_batch_y = pylearn.dataset_ops.cifar10.cifar10(
+                feat_idx_range, 
+                split='all', 
+                dtype='uint8', 
+                rasterized=False,
+                color='rgb')
+
+        WINDOW_SIZE=8
+        WINDOW_STRIDE=4
+
+        # put these into shared vars because support for big matrix constants is bad,
+        # (comparing them is slow)
+        pca_eigvecs = shared(pca_dct['eig_vecs'].astype('float32'))
+        pca_eigvals = shared(pca_dct['eig_vals'].astype('float32'))
+        pca_mean    = shared(pca_dct['mean'].astype('float32'))
+
+        def theano_pca_whiten(X):
+            #copying preprepcessing.pca.pca_whiten
+            return tensor.true_div(
+                tensor.dot(X-pca_mean, pca_eigvecs),
+                tensor.sqrt(pca_eigvals)+1e-8)
+
+        h_list = []
+        g_list = []
+        for r_offset in range(0, 32-WINDOW_SIZE+1, WINDOW_STRIDE):
+            for c_offset in range(0, 32-WINDOW_SIZE+1, WINDOW_STRIDE):
+                window = train_batch_x[:, r_offset:r_offset+WINDOW_SIZE,
+                        c_offset:c_offset+WINDOW_SIZE, :]
+                assert window.dtype=='uint8'
 
-        # weird normalization technique...
-        # It constrains all the columns of the matrix to have the same length
-        # But the matrix itself is re-scaled to have an arbitrary abslute size.
-        U = rbm.U.value
-        U_norms = np.sqrt((U*U).sum(axis=0))
-        assert len(U_norms) == n_K
-        normVF = .95 * normVF + .05 * np.mean(U_norms)
-        rbm.U.value = rbm.U.value * normVF/U_norms
+                #rasterize the patches
+                raster_window = tensor.flatten(tensor.cast(window, 'float32'),2)
+
+                #subtract off the mean of each image
+                raster_window = raster_window - raster_window.mean(axis=1).reshape((batchsize,1))
+
+                h,g = rbm.expected_h_g_given_v(theano_pca_whiten(raster_window))
+
+                h_list.append(h)
+                g_list.append(g)
+
+        hg = tensor.concatenate(h_list + g_list, axis=1)
+
+        feat_fn = function([feat_idx], hg)
+        features = numpy.empty((60000, 11025), dtype='float32')
+        for i in xrange(60000//batchsize):
+            if i % 100 == 0:
+                print("feature batch %i"%i)
+            features[i*batchsize:(i+1)*batchsize] = feat_fn(i)
+
+        print("saving features to %s"%feat_filename)
+        numpy.save(feat_filename, features)
+        all_features = features
+        del features
+
+
+    # CLASSIFY FEATURES
+
+    if 0:
+        # nothing to load
+        pass
+    else:
+        batchsize=10
+        x_i = tensor.matrix()
+        y_i = tensor.ivector()
+        lr = tensor.scalar()
+        l1_regularization = float(sys.argv[1]) #1.e-3
+        l2_regularization = float(sys.argv[2]) #1.e-3*0
+
+        feature_logreg = LogisticRegression.new(x_i, 
+                n_in = 11025, n_out=10,
+                dtype=x_i.dtype)
+
+        traincost = feature_logreg.nll(y_i).sum()
+        traincost = traincost + abs(feature_logreg.w).sum() * l1_regularization
+        traincost = traincost + (feature_logreg.w**2).sum() * l2_regularization
+        train_logreg_fn = function([x_i, y_i, lr], 
+                [feature_logreg.nll(y_i).mean(),
+                    feature_logreg.errors(y_i).mean()],
+                updates=pylearn.gd.sgd.sgd_updates(
+                    params=feature_logreg.params,
+                    grads=tensor.grad(traincost, feature_logreg.params),
+                    stepsizes=[lr,lr]))
+
+        all_labels = pylearn.dataset_ops.cifar10.all_data_labels('uint8')[1]
+        pylearn.dataset_ops.cifar10.all_data_labels.forget() # clear memo cache
+        assert len(all_labels)==60000
+        train_labels = all_labels[:40000]
+        valid_labels = all_labels[40000:50000]
+        test_labels = all_labels[50000:60000]
+        train_features = all_features[:40000]
+        valid_features = all_features[40000:50000]
+        test_features = all_features[50000:60000]
+
+        print "Computing mean and std.dev"
+        train_mean = train_features.mean(axis=0)
+        train_std = train_features.std(axis=0)+1e-4
+
+
+        for epoch in xrange(20):
+            print 'epoch', epoch
+            # validate
+
+            l01s = []
+            nlls = []
+            for i in xrange(10000/batchsize):
+                x_i = valid_features[i*batchsize:(i+1)*batchsize]
+                y_i = valid_labels[i*batchsize:(i+1)*batchsize]
+
+                #lr=0.0 -> no learning, safe for validation set
+                nll, l01 = train_logreg_fn((x_i-train_mean)/train_std, y_i, 0.0) 
+                nlls.append(nll)
+                l01s.append(l01)
+            print 'validate log_reg', numpy.mean(nlls), numpy.mean(l01s)
+
+            # test
+
+            l01s = []
+            nlls = []
+            for i in xrange(10000/batchsize):
+                x_i = test_features[i*batchsize:(i+1)*batchsize]
+                y_i = test_labels[i*batchsize:(i+1)*batchsize]
+
+                #lr=0.0 -> no learning, safe for the test set
+                nll, l01 = train_logreg_fn((x_i-train_mean)/train_std, y_i, 0.0) 
+                nlls.append(nll)
+                l01s.append(l01)
+            print 'test log_reg', numpy.mean(nlls), numpy.mean(l01s)
+
+            #train
+            l01s = []
+            nlls = []
+            for i in xrange(40000/batchsize):
+                x_i = train_features[i*batchsize:(i+1)*batchsize]
+                y_i = train_labels[i*batchsize:(i+1)*batchsize]
+                nll, l01 = train_logreg_fn((x_i-train_mean)/train_std, y_i, 0.00003)
+                nlls.append(nll)
+                l01s.append(l01)
+            print 'train log_reg', numpy.mean(nlls), numpy.mean(l01s)
+
+
+
+
+import pickle as cPickle
+#import cPickle
+if __name__ == '__main__':
+    if 0: 
+        #learning 16 x 16 pinwheel filters from official cifar patches (MAR)
+        rbm,smplr = test_reproduce_ranzato_hinton_2010(
+                as_unittest=False,
+                n_train_iters=5000,
+                rbm_alloc=lambda n_I : mcRBM_withP.alloc_topo_P(n_I, n_J=81),
+                trainer_alloc=mcRBMTrainer.alloc_for_P,
+                dataset='MAR'
+                )
+
+    if 0:
+        # pretraining settings
+        rbm,smplr = test_reproduce_ranzato_hinton_2010(
+                as_unittest=False,
+                n_train_iters=60000,
+                rbm_alloc=lambda n_I : mcRBM_withP.alloc_topo_P(n_I, n_J=81),
+                trainer_alloc=mcRBMTrainer.alloc_for_P,
+                lr_per_example=0.05,
+                dataset='tinyimages_patches',
+                l1_penalty=1e-3,
+                l1_penalty_start=30000,
+                #l1_penalty_start=350, #DEBUG
+                persistent_chains=False
+                )
+
+    if 1:
+        def checkpoint():
+            return checkpoint
+        run_classif_experiment(checkpoint=checkpoint)
--- a/pylearn/dataset_ops/cifar10.py	Thu Oct 14 23:55:41 2010 -0400
+++ b/pylearn/dataset_ops/cifar10.py	Thu Oct 14 23:55:55 2010 -0400
@@ -21,11 +21,20 @@
     dict = cPickle.load(fo)
     fo.close()
     data, labels = numpy.asarray(dict['data'], dtype=dtype), numpy.asarray(dict['labels'], dtype='int32')
-    if dtype in ('float32', 'float64'):
+    if str(dtype) in ('float32', 'float64'):
         data /= 255
     return data, labels
 
 @memo
+def all_data_labels(dtype='uint8'):
+    train_batch_data, train_batch_labels = zip(*[ _unpickle( os.path.join(data_root(), 'cifar10', 
+        'cifar-10-batches-py', 'data_batch_%i'%i), dtype) for i in range(1,6)])
+    test_batch_data, test_batch_labels = test_data_labels(dtype)
+    data = numpy.vstack(list(train_batch_data)+[test_batch_data])
+    labels = numpy.hstack(list(train_batch_labels)+[test_batch_labels])
+    return data, labels
+
+@memo
 def train_data_labels(dtype='uint8'):
     batch_data, batch_labels = zip(*[ _unpickle( os.path.join(data_root(), 'cifar10', 
         'cifar-10-batches-py', 'data_batch_%i'%i), dtype) for i in range(1,6)])
@@ -40,6 +49,7 @@
 def forget():
     train_data_labels.forget()
     test_data_labels.forget()
+    all_data_labels.forget()
 
 
 # functions for TensorFnDataset
@@ -56,9 +66,21 @@
     return test_data_labels(dtype)[0]
 def test_labels():
     return test_data_labels()[1]
+def all_data(dtype):
+    if dtype!='uint8':
+        raise ValueError()
+    return all_data_labels()[0]
+def all_labels():
+    return all_data_labels()[1]
 
 
-def cifar10(s_idx, split, dtype='float64', rasterized=False, color='grey'):
+def cifar10(s_idx, split, dtype='float64', rasterized=False, color='grey',
+        split_options = {'train':(train_data, train_labels),
+                'valid': (valid_data, valid_labels),
+                'test': (test_data, test_labels),
+                'all': (all_data, all_labels),
+                }
+            ):
     """ 
     Returns a pair (img, label) of theano expressions for cifar-10 samples
 
@@ -76,10 +98,6 @@
 
     """
 
-    split_options = {'train':(train_data, train_labels),
-            'valid': (valid_data, valid_labels),
-            'test': (test_data, test_labels)}
-
     if split not in split_options:
         raise ValueError('invalid split option', (split, split_options.keys()))
 
@@ -95,14 +113,15 @@
     x = x_op(s_idx)
     y = y_op(s_idx)
 
-    # Y = 0.3R + 0.59G + 0.11B from
-    # http://gimp-savvy.com/BOOK/index.html?node54.html
-    rgb_dtype = 'float32'
-    if dtype == 'float64':
-        rgb_dtype = dtype
-    r = numpy.asarray(.3, dtype=rgb_dtype)
-    g = numpy.asarray(.59, dtype=rgb_dtype)
-    b = numpy.asarray(.11, dtype=rgb_dtype)
+    if color=='grey':
+        # Y = 0.3R + 0.59G + 0.11B from
+        # http://gimp-savvy.com/BOOK/index.html?node54.html
+        rgb_dtype = 'float32'
+        if dtype == 'float64':
+            rgb_dtype = dtype
+        r = numpy.asarray(.3, dtype=rgb_dtype)
+        g = numpy.asarray(.59, dtype=rgb_dtype)
+        b = numpy.asarray(.11, dtype=rgb_dtype)
 
     if x.ndim == 1:
         if rasterized:
@@ -148,8 +167,8 @@
                     x = theano.tensor.cast(x, 'uint8')
                 x.reshape((N, 32, 32))
             elif color=='rgb':
-                # the strides aren't what you'd expect between channels,
-                # but theano is all about weird strides
+                # note: the strides aren't what you'd expect between channels,
+                # but a copy of the data would correct that.
                 x = x.reshape((N,3,32,32)).dimshuffle(0, 2, 3, 1)
             else:
                 raise NotImplemented('color', color)
@@ -160,6 +179,76 @@
 
 nclasses = 10
 
+import pylearn.datasets.image_patches
+import pylearn.preprocessing.pca
+
+@memo
+def random_cifar_patches(dtype, N,R,C, centered=True):
+    #These used to be arguments, but optional arguments don't work well with the cache
+    # because the cache doesn't [yet] look up what they are
+    rng_seed=89234
+    channel_rank=2
+
+    rng=numpy.random.RandomState(rng_seed)
+    imgs = train_data_labels(dtype)[0][:40000].reshape((40000,3,32,32)).transpose((0,2,3,1))
+    forget() #un-cache the original images
+    #import pdb; pdb.set_trace()
+    patches = pylearn.datasets.image_patches.extract_random_patches(imgs, N,R,C, rng)
+    orig_shape = patches.shape
+
+    # center individual examples
+    patches = patches.reshape((orig_shape[0], R*C*3))
+    patches -= patches.mean(axis=1).reshape((N, 1))
+    patches = patches.reshape(orig_shape)
+
+    if channel_rank==4:
+        pass
+    elif channel_rank==2:
+        # put the channels the cifar10 way :/
+        patches = patches.transpose((0,3,1,2)).copy()
+    else:
+        raise NotImplementedError()
+    if centered:
+        patches -= patches.mean(axis=0)
+    return patches
+
+@memo
+def random_cifar_patches_pca(max_components, max_energy_fraction, dtype, N,R,C,*args):
+    pca, _ = pylearn.preprocessing.pca.pca_from_examples(
+            random_cifar_patches(dtype,N,R,C,*args).reshape((N,R*C*3)),
+            max_components, max_energy_fraction, x_centered=True)
+    return pca
+
+@memo
+def whitened_random_cifar_patches(max_components, max_energy_fraction, dtype,N,R,C,*args):
+    pca = random_cifar_patches_pca(max_components, max_energy_fraction, dtype,N,R,C,*args)
+    patches = random_cifar_patches(dtype,N,R,C,*args).reshape((N,R*C*3))
+    random_cifar_patches.forget() #un-cache the original patches
+    return pylearn.preprocessing.pca.pca_whiten(pca, patches).astype(dtype)
+
+def cifar10_patches(s_idx, split, dtype='float32', rasterized=True, color='rgb', 
+        n_patches=1000, patch_size=(8,8), pca_components=80):
+    """
+    Return a Theano expression for whitened random patches from the CIFAR-10 training set.
+    """
+    if split != 'train': raise NotImplementedError()
+    if dtype != 'float32':raise NotImplementedError()
+    if color != 'rgb': raise NotImplementedError()
+    if s_idx.ndim != 1: raise NotImplementedError()
+
+    x_op = TensorFnDataset(dtype, (False,), 
+            (whitened_random_cifar_patches, (
+                pca_components,None,dtype,n_patches, patch_size[0], patch_size[1])),
+            (patch_size[0],patch_size[1],3))
+    x = x_op(s_idx%n_patches)
+
+    if rasterized:
+        x = x.flatten(2)
+    else:
+        raise NotImplementedError()
+
+    return x
+
 def glviewer(split='train'):
     from glviewer import GlViewer
     i = theano.tensor.iscalar()
--- a/pylearn/dataset_ops/image_patches.py	Thu Oct 14 23:55:41 2010 -0400
+++ b/pylearn/dataset_ops/image_patches.py	Thu Oct 14 23:55:55 2010 -0400
@@ -30,7 +30,8 @@
 def image_patches(s_idx, dims,
         split='train', dtype=theano.config.floatX, rasterized=False,
         center=True,
-        unitvar=True):
+        unitvar=True,
+        fn=get_dataset):
     N,R,C=dims
 
     if split != 'train':
@@ -39,7 +40,7 @@
     if not rasterized:
         raise NotImplementedError()
 
-    op = TensorFnDataset(dtype, bcast=(False,), fn=(get_dataset, (N,R,C,dtype,center,unitvar)), single_shape=(R*C,))
+    op = TensorFnDataset(dtype, bcast=(False,), fn=(fn, (N,R,C,dtype,center,unitvar)), single_shape=(R*C,))
     x = op(s_idx%N)
     if x.ndim == 1:
         if not rasterized:
@@ -87,7 +88,8 @@
         split='train', 
         dtype=theano.config.floatX, rasterized=True,
         center=True,
-        unitvar=True):
+        unitvar=True,
+        fn=ranzato_hinton_2010_whitened_patches):
     N = 10240
 
     if split != 'train':
@@ -104,7 +106,7 @@
 
     op = TensorFnDataset(dtype,
             bcast=(False,), 
-            fn=ranzato_hinton_2010_whitened_patches,
+            fn=fn,
             single_shape=(105,))
     x = op(s_idx%N)
     return x
--- a/pylearn/dataset_ops/memo.py	Thu Oct 14 23:55:41 2010 -0400
+++ b/pylearn/dataset_ops/memo.py	Thu Oct 14 23:55:55 2010 -0400
@@ -9,9 +9,6 @@
 def memo(f):
     #TODO: support kwargs to rval.  This requires looking up the names of f's parameters to 
     #      construct a proper key.
-
-    #TODO: use weak references instead of a normal dict so that the cache doesn't prevent
-    #      garbage collection
     cache = {}
     def rval(*args):
         if args not in cache:
@@ -24,5 +21,6 @@
     rval.forget = forget
     rval.__name__ = 'memo@%s'%f.__name__
     rval.cache = cache
+    rval.__doc__ = f.__doc__
     return rval
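A minimal usage sketch for the decorator (not part of the patch; `expensive` is a made-up example function):

    from pylearn.dataset_ops.memo import memo

    @memo
    def expensive(key):
        print 'computing', key
        return key * 2

    expensive(3)          # computes and caches
    expensive(3)          # served from the cache, no print
    expensive.forget()    # drop the cache, as cifar10.forget() above does
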
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/dataset_ops/tinyimages.py	Thu Oct 14 23:55:55 2010 -0400
@@ -0,0 +1,303 @@
+"""I'm not sure where to put this code.
+
+THIS IS NOT POLISHED LIBRARY CODE YET.
+
+"""
+
+__authors__ = "James Bergstra"
+__copyright__ = "(c) 2010, Universite de Montreal"
+__license__ = "3-clause BSD License"
+__contact__ = "bergstrj@iro.umontreal.ca"
+
+
+import cPickle, logging, sys
+import numpy
+from pylearn.datasets import tinyimages, image_patches
+import pylearn.preprocessing.pca
+import theano
+from pylearn.io import image_tiling
+
+from .protocol import TensorFnDataset # protocol.py __init__.py
+from .memo import memo
+
+
+#
+# This part of the file (until main()) is for generating a dataset of image patches from the
+# tinyimages dataset.  These patches are used in the pretraining stage of the mcRBM training
+# algorithm.
+#
+# Since this 'dataset' is really a cached-to-disk preprocessing of raw material from
+# tinyimages, it is not a real dataset (with a standard disk location under
+# PYLEARN_DATA_ROOT).
+#
+# Hopefully the upcoming pylearn library proposal will have a policy on how/where this sort of
+# pre-processed data should be stored.  For now it is stored in the current working directory.
+#
+
+_raw_patch_file = 'tinydataset_raw.npy'
+_pca_file       = 'tinydataset_pca.pkl'
+_whitened_file  = 'tinydataset_whitened.npy'
+
+def normalize_channels(X, max_scale=5):
+    """Map images from (0,1) to all reals so that each channel of each image has zero mean,
+    [maximum] unit variance.  
+
+    Channels will not be scaled by more than max_scale, so the output variance might be smaller
+    than 1.
+    """
+    n_imgs,n_rows,n_cols,n_channels = X.shape
+    X = X.copy()
+    # ensure that we're working with floats on (0,1)
+    if not str(X.dtype).startswith('float'):
+        raise TypeError()
+    if X.min() < 0:
+        raise ValueError('min out of bounds')
+    if X.max() > 1:
+        raise ValueError('max out of bounds')
+    assert n_channels==3
+    imaxscale = 1.0 / max_scale
+    def centre(imgstack):
+        a,b,c = imgstack.shape
+        flat = imgstack.reshape((a,b*c))
+        flat -= flat.mean(axis=1).reshape((a,1))
+        flat /= numpy.maximum(flat.std(axis=1).reshape((a,1)),imaxscale)
+        return flat.reshape((a,b,c))
+    X[:,:,:,0]=centre(X[:,:,:,0])
+    X[:,:,:,1]=centre(X[:,:,:,1])
+    X[:,:,:,2]=centre(X[:,:,:,2])
+    return X
+
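A self-contained sketch on random data (not part of the patch) of what this normalization guarantees per image channel:

    import numpy
    from pylearn.dataset_ops.tinyimages import normalize_channels

    rng = numpy.random.RandomState(0)
    raw_patches = rng.randint(0, 256, size=(100, 8, 8, 3)).astype('uint8')
    X = numpy.asarray(raw_patches, dtype='float32') / 255.0
    Xn = normalize_channels(X)
    assert abs(Xn[0, :, :, 0].mean()) < 1e-4     # each channel of each patch is centred
    assert Xn[0, :, :, 0].std() <= 1.0 + 1e-4    # and scaled to at most unit variance
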
+def save_filters(X, fname, min_dynamic_range=1e-8, data_path=None):
+    """
+    Save filters X (encoded as whitened images) in the original image space.
+    """
+    dct = load_pca_dct()
+    pca = dct['eig_vals'], dct['eig_vecs']
+
+    _img = image_tiling.tile_raster_images(
+            pylearn.preprocessing.pca.pca_whiten_inverse(pca, X),
+            img_shape=(8,8),
+            min_dynamic_range=1e-6)
+    image_tiling.save_tiled_raster_images(_img, fname)
+
+def extract_patches(n_imgs=1000*100, n_patches_per_image=10, patch_shape=(8,8), rng=numpy.random.RandomState(234)):
+    """
+    Extract `n_patches_per_image` patches from each of the first `n_imgs` TinyImages.
+    """
+    R,C=patch_shape
+
+    dataset = numpy.empty((n_imgs*n_patches_per_image, R, C, 3), dtype='uint8')
+
+    assert n_imgs < tinyimages.n_images
+
+    image_stream = tinyimages.image_generator()
+
+    i = 0
+    while i < n_imgs:
+        y = image_stream.next()
+        yy = image_patches.extract_random_patches(
+                y.reshape((1,32,32,3)),
+                n_patches_per_image, 
+                R,C,
+                rng)
+        ii = i*n_patches_per_image
+        dataset[ii:ii+n_patches_per_image] = yy
+        i += 1
+    return dataset
+
+def compute_pca_dct(X, use_only=100000, max_components=128, max_energy_fraction=.99):
+
+    # each image channel is adjusted here
+    ### X = normalize_channels(numpy.asarray(data[:use_only], dtype='float32')/255)
+
+
+    # rasterize images
+    X = X.reshape((X.shape[0], X.shape[1]* X.shape[2]* X.shape[3]))
+
+    # switch to floats
+    X = X.astype('float32')
+
+    # subtract off each image mean (ignoring channels) #TODO: IS THIS A GOOD IDEA?
+    X = X - X.mean(axis=1).reshape((X.shape[0], 1))
+
+    # subtract off global mean as part of pca
+    data_mean = X.mean(axis=0)
+    X = X - data_mean
+
+    # calculating pca
+    (eig_vals,eig_vecs), _ = pylearn.preprocessing.pca.pca_from_examples(
+            X, max_components, max_energy_fraction, x_centered=True)
+
+    print "Keeping %i principle components" % len(eig_vals)
+
+    return dict(
+            mean=data_mean,
+            eig_vecs=eig_vecs,
+            eig_vals=eig_vals)
+
+def whiten_patches(raw_patches, pca_dct):
+    """
+    Whiten `raw_patches` (in batches) with the PCA parameters in `pca_dct` and return the result.
+    """
+
+    rval = numpy.empty((raw_patches.shape[0], len(pca_dct['eig_vals'])), dtype='float32')
+
+    print 'allocated output of size', rval.shape
+
+    b = 100 #batchsize
+    i = 0
+    while i < len(rval):
+        xi = numpy.asarray(raw_patches[i:i+b], dtype='float32')
+        # rasterize
+        xi = xi.reshape((xi.shape[0], xi.shape[1]*xi.shape[2]*xi.shape[3]))
+        # remove image mean
+        xi = xi - xi.mean(axis=1).reshape((xi.shape[0], 1))
+        # remove pixel means
+        xi -= pca_dct['mean']
+        rval[i:i+b] = pylearn.preprocessing.pca.pca_whiten((pca_dct['eig_vals'], pca_dct['eig_vecs']), xi)
+        i += b
+    return rval
+
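A self-contained sketch on random data (not part of the patch), mirroring how `main()` and `save_filters` combine these helpers with `pca_whiten_inverse`:

    import numpy
    import pylearn.preprocessing.pca
    from pylearn.dataset_ops.tinyimages import compute_pca_dct, whiten_patches

    rng = numpy.random.RandomState(0)
    raw_patches = rng.randint(0, 256, size=(500, 8, 8, 3)).astype('uint8')
    pca_dct = compute_pca_dct(raw_patches, max_components=32)
    white = whiten_patches(raw_patches, pca_dct)      # (500, n_kept_components) float32
    back = pylearn.preprocessing.pca.pca_whiten_inverse(
            (pca_dct['eig_vals'], pca_dct['eig_vecs']), white) + pca_dct['mean']
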
+def main(n_imgs=1000, n_patches_per_image=10, max_components=128, seed=234):
+    if 0: #do this to render the dataset to the screen
+        sys.exit(glviewer())
+
+    rng = numpy.random.RandomState(seed)
+
+    try:
+        open(_raw_patch_file).close() #fails if file not present
+        load_raw_patches = True
+    except:
+        load_raw_patches = False
+
+    if load_raw_patches:
+        print 'loading raw patches from', _raw_patch_file
+        raw_patches = numpy.load(_raw_patch_file, mmap_mode='r')
+    else:
+        print 'extracting raw patches'
+        raw_patches = extract_patches(rng=rng, n_imgs=n_imgs,
+                n_patches_per_image=n_patches_per_image)
+        rng.shuffle(raw_patches)
+        print 'saving raw patches to', _raw_patch_file
+        numpy.save(open(_raw_patch_file, 'wb'), raw_patches)
+
+    try:
+        open(_pca_file).close()
+        load_pca = True
+    except:
+        load_pca = False
+
+    if load_pca:
+        print 'loading pca from', _pca_file
+        pca_dct = cPickle.load(open(_pca_file))
+    else:
+        print 'computing pca'
+        pca_dct = compute_pca_dct(raw_patches, max_components=max_components)
+        print 'saving pca to', _pca_file
+        cPickle.dump(pca_dct, open(_pca_file, 'wb'))
+
+    try:
+        open(_whitened_file).close()
+        load_patches = True
+    except:
+        load_patches = False
+
+    if load_patches:
+        print 'loading whitened patches from', _whitened_file
+        whitened_patches = numpy.load(_whitened_file, mmap_mode='r')
+    else:
+        print 'computing whitened data'
+        whitened_patches = whiten_patches(raw_patches, pca_dct)
+        print 'saving', _whitened_file
+        numpy.save(_whitened_file, whitened_patches)
+
+    return whitened_patches, pca_dct
+
+#
+# This part of the file defines an op-constructor that uses the pre-processed cache / dataset generated
+#
+
+@memo
+def load_whitened(path=_whitened_file):
+    """
+    Replacement for dataset_ops.image_patches.ranzato_hinton_2010_op
+    """
+    try:
+        return numpy.load(path, mmap_mode='r')
+    except:
+        print >> sys.stderr, "Maybe you need to run 'python -m pylearn.dataset_ops.tinyimages'?"
+        raise
+
+@memo
+def load_pca_dct(path=_pca_file):
+    return cPickle.load(open(path))
+
+def tinydataset_op(s_idx,
+        split='train', 
+        fn=load_whitened):
+
+    n_examples,n_dim = fn().shape 
+
+    if split != 'train':
+        raise NotImplementedError('train/test/valid splits for randomly sampled image patches?')
+
+    op = TensorFnDataset('float32', bcast=(False,), fn=fn, single_shape=(n_dim,))
+    x = op(s_idx%n_examples)
+    return x
+
+
+def save_filters(X, fname):
+    dct = load_pca_dct()
+    eigs = dct['eig_vals'], dct['eig_vecs']
+    mean = dct['mean']
+    rasterized = pylearn.preprocessing.pca.pca_whiten_inverse(eigs, X)+mean
+    _img = image_tiling.tile_raster_images(
+            (rasterized[:,::3], rasterized[:,1::3], rasterized[:,2::3], None),
+            img_shape=(8,8),
+            min_dynamic_range=1e-6)
+    image_tiling.save_tiled_raster_images(_img, fname)
+
+def glviewer(split='train'):
+    from glviewer import GlViewer
+    #i = theano.tensor.iscalar()
+    #f = theano.function([i], mnist(i, split, dtype='uint8', rasterized=False)[0])
+    data = numpy.load(_raw_patch_file, mmap_mode='r')
+    print 'RAW', data.shape
+    data = numpy.load(_whitened_file, mmap_mode='r')
+    print 'WHI', data.shape
+
+    if 1: # check the raw data
+        data = numpy.load(_raw_patch_file, mmap_mode='r')
+        data = data.reshape((data.shape[0], data.shape[1]*data.shape[2], data.shape[3]))
+        def f(i):
+            j = i*5000
+            jj = j + 5000
+            return image_tiling.tile_raster_images(
+                    (data[j:jj,:,0], data[j:jj,:,1], data[j:jj,:,2], None),
+                    img_shape=(8,8))
+    if 0: # check the whitened data
+        dct = load_pca_dct()
+        eigs = dct['eig_vals'], dct['eig_vecs']
+        mean = dct['mean']
+        data = numpy.load(_whitened_file, mmap_mode='r')
+        def f(i):
+            j = i*5000
+            jj = j + 5000
+            X = data[j:jj]
+            print 'j', j, jj
+            rasterized = pylearn.preprocessing.pca.pca_whiten_inverse(eigs, X)+mean
+            _img = image_tiling.tile_raster_images(
+                    (rasterized[:,::3], rasterized[:,1::3], rasterized[:,2::3], None),
+                    img_shape=(8,8),
+                    min_dynamic_range=1e-6)
+            return _img
+    GlViewer(f).main()
+
+
+
+if __name__=='__main__':
+    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
+    sys.exit(main())
+
+
--- a/pylearn/datasets/cifar10.py	Thu Oct 14 23:55:41 2010 -0400
+++ b/pylearn/datasets/cifar10.py	Thu Oct 14 23:55:55 2010 -0400
@@ -89,18 +89,20 @@
 def first_1k(dtype='uint8', ntrain=1000, nvalid=200, ntest=200):
     return cifar10(dtype, ntrain, nvalid, ntest)
 
-def tile_rasterized_examples(X):
+def tile_rasterized_examples(X, img_shape=(32,32)):
     """Returns an ndarray that is ready to be passed to `image_tiling.save_tiled_raster_images`
 
     This function is for the `x` matrices in the cifar dataset, or for the weight matrices
     (filters) used to multiply them.
     """
+    ndim = img_shape[0]*img_shape[1]
+    assert ndim *3 == X.shape[1], (ndim, X.shape)
     X = X.astype('float32')
-    r = X[:,:1024]
-    g = X[:,1024:2048]
-    b = X[:,2048:]
+    r = X[:,:ndim]
+    g = X[:,ndim:ndim*2]
+    b = X[:,ndim*2:]
     from pylearn.io.image_tiling import tile_raster_images
-    rval = tile_raster_images((r,g,b,None), img_shape=(32,32))
+    rval = tile_raster_images((r,g,b,None), img_shape=img_shape)
     return rval
 
 
--- a/pylearn/datasets/config.py	Thu Oct 14 23:55:41 2010 -0400
+++ b/pylearn/datasets/config.py	Thu Oct 14 23:55:55 2010 -0400
@@ -5,12 +5,12 @@
 """
 
 import os, sys, logging
-_logger = logging.getLogger('pylearn.datasets.config')
-def debug(*msg): _logger.debug(' '.join(str(m) for m in msg))
-def info(*msg): _logger.info(' '.join(str(m) for m in msg))
-def warn(*msg): _logger.warn(' '.join(str(m) for m in msg))
-def warning(*msg): _logger.warning(' '.join(str(m) for m in msg))
-def error(*msg): _logger.error(' '.join(str(m) for m in msg))
+def _logger():  return logging.getLogger('pylearn.datasets.config')
+def debug(*msg): _logger().debug(' '.join(str(m) for m in msg))
+def info(*msg): _logger().info(' '.join(str(m) for m in msg))
+def warn(*msg): _logger().warn(' '.join(str(m) for m in msg))
+def warning(*msg): _logger().warning(' '.join(str(m) for m in msg))
+def error(*msg): _logger().error(' '.join(str(m) for m in msg))
 
 
 def env_get(key, default, key2 = None):
--- a/pylearn/datasets/image_patches.py	Thu Oct 14 23:55:41 2010 -0400
+++ b/pylearn/datasets/image_patches.py	Thu Oct 14 23:55:55 2010 -0400
@@ -83,7 +83,7 @@
 def extract_random_patches(img_stack, N, R,C, rng):
     """Return subimages from the img_stack
 
-    :param img_stack: a 3D ndarray (n_images, rows, cols) or a list of 2D images.
+    :param img_stack: a 3D or 4D ndarray with shape (n_images, rows, cols[, channels])
     :param N: number of patches to extract
     :param R: number of rows in patch
     :param C: number of cols in patch
@@ -94,21 +94,25 @@
     image in the stack therefore would be sampled less frequently than patches from a smaller
     image in the stack.
 
-    To use ZCA whitening, extract patches from the raw data, and pass it to
-    preprocessing.pca.zca_whitening.
+    :hint:
+        To use ZCA whitening, extract patches from the raw data, and pass it to
+        preprocessing.pca.zca_whitening.
     """
-    rval = numpy.empty((N,R,C), dtype=img_stack[0].dtype)
+
+    rval = numpy.empty((N,R,C)+img_stack.shape[3:], dtype=img_stack[0].dtype)
 
     L = len(img_stack)
     img_idxlist = rng.randint(L,size=N)
+    offsets_R = rng.randint(img_stack.shape[1]-R+1, size=N)
+    offsets_C = rng.randint(img_stack.shape[2]-C+1, size=N)
 
-    for n, img_idxlist in enumerate(img_idxlist):
-        img_idx = rng.randint(L)
-        img_n = img_stack[img_idx]
-        offset_R = rng.randint(img_n.shape[0]-R+1)
-        offset_C = rng.randint(img_n.shape[1]-C+1)
-        rval[n] = img_n[offset_R:offset_R+R,offset_C:offset_C+C]
-
+    for n, (l,r,c) in enumerate(zip(img_idxlist, offsets_R, offsets_C)):
+        #img_idx = rng.randint(L)
+        #offset_R = rng.randint(img_n.shape[0]-R+1)
+        #offset_C = rng.randint(img_n.shape[1]-C+1)
+        #img_n = img_stack[l]
+        #rval[n] = img_n[offset_R:offset_R+R,offset_C:offset_C+C]
+        rval[n] = img_stack[l,r:r+R,c:c+C]
     return rval
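+
+# Hypothetical usage sketch: the image stack below is random data; only
+# `extract_random_patches` above is real.
+def _demo_extract_random_patches():
+    import numpy
+    rng = numpy.random.RandomState(42)
+    img_stack = rng.rand(10, 32, 32)                  # 10 greyscale 32x32 images
+    patches = extract_random_patches(img_stack, N=100, R=8, C=8, rng=rng)
+    assert patches.shape == (100, 8, 8)
+    return patches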
 
 
--- a/pylearn/datasets/tinyimages.py	Thu Oct 14 23:55:41 2010 -0400
+++ b/pylearn/datasets/tinyimages.py	Thu Oct 14 23:55:55 2010 -0400
@@ -5,10 +5,12 @@
 __license__ = "3-clause BSD License"
 __contact__ = "bergstrj@iro.umontreal.ca"
 
-import os, sys
+import logging, os, sys
 import PIL.Image
 import numpy
 
+logger = logging.getLogger('pylearn.datasets.tinyimages')
+
 def sorted_listdir(*path):
     r = os.listdir(os.path.join(*path))
     r.sort()
@@ -27,8 +29,7 @@
                 yield path, letter, label, img
 
 def load_image(path):
-    """
-    """
+    """Return the image at `path` as a numpy ndarray """
     rval = numpy.asarray(PIL.Image.open(path))
     return rval
 
@@ -40,10 +41,17 @@
     Be careful with this generator because the dataset in total is close to
     20GB!
     """
+    n_colour_conversions = 0
+    n_yielded = 0
     for p in iterate_over_filenames(path=_original):
-        y = load_image(*p)
-        assert y.shape == (32,32,3)
-        assert y.dtype == numpy.uint8
+        y = load_image(os.path.join(*p))
+        n_yielded += 1
+        if y.shape == (32,32):
+            logger.info("put %i'th/%i images in colour"%(n_colour_conversions, n_yielded))
+            y = numpy.asarray([y,y,y]).transpose((1,2,0)).copy()
+            n_colour_conversions += 1
+        assert y.shape == (32,32,3), (p,y.shape)
+        assert y.dtype == numpy.uint8, (p,y.dtype)
         yield y
 
 def load_first_N(N):
@@ -53,14 +61,20 @@
         yield it.next()
         i +=1
 
-if __name__ == '__main__':
-    if 0:
-        def iter_len(x):
-            i = 0
-            for xx in x:
-                i += 1
-            return i
-        print 'got %i files' % iter_len(iterate_over_filenames())
+n_images = 1608356 
+
+def main():
+    def iter_len(x):
+        i = 0
+        for xx in x:
+            i += 1
+        return i
+    n_files = iter_len(iterate_over_filenames())
+    print 'got %i files' % n_files
+    assert n_images == n_files
 
     for p in load_first_N(10):
         load_image(os.path.join(*p))
+
+if __name__ == '__main__':
+    sys.exit(main())
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/formulas/activations.py	Thu Oct 14 23:55:55 2010 -0400
@@ -0,0 +1,256 @@
+"""
+Activation functions for artificial neural units.
+
+"""
+
+__authors__   = "Razvan Pascanu, .."
+__copyright__ = "(c) 2010, Universite de Montreal"
+__license__   = "3-clause BSD License"
+__contact__   = "Razvan Pascanu <r.pascanu@gmail.com>"
+
+import theano
+from theano import tensor
+
+import tags
+
+
+
+@tags.tags('activation', 'unary',
+           'sigmoid', 'logistic',
+           'non-negative', 'increasing')
+def sigmoid(x):
+    """
+    Return a symbolic variable representing the sigmoid (logistic)
+    function of the input x.
+
+    .. math::
+        \\textrm{sigmoid}(x) = \\frac{1}{1 + e^{-x}}
+
+    The image of :math:`\\textrm{sigmoid}(x)` is the open interval (0,
+    1), *in theory*. *In practice*, due to rounding errors in floating
+    point representations, :math:`\\textrm{sigmoid}(x)` will lie in the
+    closed range [0, 1].
+
+    :param x: tensor-like (a Theano variable with type theano.Tensor,
+              or a value that can be converted to one) :math:`\in
+              \mathbb{R}^n`
+
+    :return: a Theano variable with the same shape as the input, where
+             the sigmoid function is mapped to each element of the
+             input x.
+    """
+    return theano.tensor.nnet.sigmoid(x)
+
+
+
+@tags.tags('activation', 'unary',
+           'tanh', 'hyperbolic tangent',
+           'odd', 'increasing')
+def tanh(x):
+    """
+    Return a symbolic variable representing the tanh (hyperbolic
+    tangent) of the input x.
+
+    .. math::
+        \\textrm{tanh}(x) = \\frac{e^{2x} - 1}{e^{2x} + 1}
+
+    The image of :math:`\\textrm{tanh}(x)` is the open interval (-1,
+    1), *in theory*. *In practice*, due to rounding errors in floating
+    point representations, :math:`\\textrm{tanh}(x)` will lie in the
+    closed range [-1, 1].
+
+    :param x: tensor-like (a Theano variable with type theano.Tensor,
+              or a value that can be converted to one) :math:`\in
+              \mathbb{R}^n`
+
+    :return: a Theano variable with the same shape as the input, where
+             the tanh function is mapped to each element of the input
+             x.
+    """
+    return theano.tensor.tanh(x)
+
+
+
+@tags.tags('activation', 'unary',
+           'tanh', 'hyperbolic tangent', 'normalized',
+           'odd', 'increasing')
+def tanh_normalized(x):
+    """
+    Return a symbolic variable representing a normalized tanh
+    (hyperbolic tangent) of the input x.
+    TODO: where does 1.759 come from? (it looks like the 1.7159 constant recommended
+    in LeCun et al., "Efficient BackProp"); why is it normalized like that?
+
+    .. math::
+        \\textrm{tanh\_normalized}(x) = 1.759\\textrm{ tanh}\left(\\frac{2x}{3}\\right)
+
+    The image of :math:`\\textrm{tanh\_normalized}(x)` is the open
+    interval (-1.759, 1.759), *in theory*. *In practice*, due to
+    rounding errors in floating point representations,
+    :math:`\\textrm{tanh\_normalized}(x)` will lie in the approximative
+    closed range [-1.759, 1.759]. The exact bound depends on the
+    precision of the floating point representation.
+
+    :param x: tensor-like (a Theano variable with type theano.Tensor,
+              or a value that can be converted to one) :math:`\in
+              \mathbb{R}^n`
+
+    :return: a Theano variable with the same shape as the input, where
+             the tanh\_normalized function is mapped to each element of
+             the input x.
+    """
+    return 1.759*theano.tensor.tanh(0.6666*x)
+
+
+
+@tags.tags('activation', 'unary',
+           'abs_tanh', 'abs', 'tanh', 'hyperbolic tangent',
+           'non-negative', 'even')
+def abs_tanh(x):
+    """
+    Return a symbolic variable representing the absolute value of the
+    hyperbolic tangent of x.
+
+    .. math::
+        \\textrm{abs\_tanh}(x) = |\\textrm{tanh}(x)|
+
+    The image of :math:`\\textrm{abs\_tanh}(x)` is the interval [0, 1),
+    *in theory*. *In practice*, due to rounding errors in floating
+    point representations, :math:`\\textrm{abs\_tanh}(x)` will lie in
+    the range [0, 1].
+
+    :param x: tensor-like (a Theano variable with type theano.Tensor,
+              or a value that can be converted to one) :math:`\in
+              \mathbb{R}^n`
+
+    :return: a Theano variable with the same shape as the input, where
+             the abs_tanh function is mapped to each element of the
+             input x.
+    """
+    return theano.tensor.abs_(theano.tensor.tanh(x))
+
+
+
+@tags.tags('activation', 'unary',
+           'abs_tanh', 'abs', 'tanh', 'hyperbolic tangent', 'normalized',
+           'non-negative', 'even')
+def abs_tanh_normalized(x):
+    """
+    Return a symbolic variable representing the absolute value of a
+    normalized tanh (hyperbolic tangent) of the input x.
+    TODO: where does 1.759 come from? why is it normalized like that?
+
+    .. math::
+        \\textrm{abs\_tanh\_normalized}(x) = \left|1.759\\textrm{ tanh}\left(\\frac{2x}{3}\\right)\\right|
+
+    The image of :math:`\\textrm{abs\_tanh\_normalized}(x)` is the range
+    [0, 1.759), *in theory*. *In practice*, due to rounding errors in
+    floating point representations,
+    :math:`\\textrm{abs\_tanh\_normalized}(x)` will lie in the
+    approximative closed range [0, 1.759]. The exact upper bound
+    depends on the precision of the floating point representation.
+
+    :param x: tensor-like (a Theano variable with type theano.Tensor,
+              or a value that can be converted to one) :math:`\in
+              \mathbb{R}^n`
+
+    :return: a Theano variable with the same shape as the input, where
+             the abs_tanh_normalized function is mapped to each
+             element of the input x.
+    """
+    return theano.tensor.abs_(1.759*theano.tensor.tanh(0.6666*x))
+
+
+
+@tags.tags('activation','softsign')
+def softsign_act(input):
+    """
+    Returns a symbolic variable that computes the softsign of ``input``.
+    
+    .. math::
+                f(input) = \\frac{input}{1.0 + |input|}
+
+    :type input:  tensor-like
+    :param input: input tensor to which softsign should be applied
+    :rtype:       Theano variable
+    :return:      tensor obtained after applying the softsign function
+
+    """
+    return input/(1.0 + tensor.abs_(input))
+
+@tags.tags('activation','softsign','abs')
+def abssoftsign_act(input):
+    """
+    Returns a symbolic variable that computes the absolute value of the
+    softsign function on the input tensor ``input``.
+
+    .. math::
+                f(input) = \left| \\frac{input}{1.0 +|input|} \\right|
+
+    :type input:  tensor-like
+    :param input: input tensor to which softsign should be applied
+    :rtype:       Tensor variable
+    :return:      tensor obtained by taking the absolute value of softsign 
+                  of the input
+    """
+    return tensor.abs_(input)/(1.0 + tensor.abs_(input))
+
+
+@tags.tags('activation','rectifier')
+def rectifier_act(input):
+    """
+    Returns a symbolic variable that equals ``input`` where ``input`` is
+    positive and 0 elsewhere (the rectifier activation).
+
+    .. math::
+                f(input) = \\begin{cases}
+                            input & \\text{if } input > 0 \\\\
+                            0     & \\text{otherwise}
+                           \\end{cases}
+
+    :type input:  tensor-like
+    :param input: input tensor to which the rectifier activation function
+                  will be applied
+    :rtype:       Tensor variable
+    :return:      non-negative tensor equal to the input where the input is
+                  positive and to 0 elsewhere
+
+    """
+    return input*(input>=0)
+
+@tags.tags('activation','softplus')
+def softplus_act(input):
+    """
+    Returns a symbolic variable that computes the softplus of ``input``.
+    Note : (TODO) rescale in order to have a steady state regime close to 0 
+           at initialization.
+
+    .. math::
+                f(input) = ln \left( 1 + e^{input} \\right)
+
+    :type input:  tensor-like
+    :param input: input tensor to which the softplus should be applied
+    :rtype:       Theano variable
+    :return:      tensor obtained by applying the softplus function to the input
+    """
+    return tensor.nnet.softplus(input)
+
+@tags.tags('activation','abs')
+def abs_act(input):
+    """
+    Returns the symbolic variable that represents the absolute value of
+    ``input``.
+
+    .. math::
+                f(input) = |input|
+
+    :type input:  tensor-like
+    :param input: input tensor
+    :rtype:       Theano variable
+    :return:      tensor that represents the absolute value of the input
+
+
+    """
+    return theano.tensor.abs_(input)
+
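+# Hypothetical usage sketch: the formulas above build symbolic expressions, so
+# they are typically compiled with theano.function before use.  The toy input
+# below is made up for the example.
+def _demo_activations():
+    import numpy
+    x = tensor.matrix('x')
+    f = theano.function([x], [sigmoid(x), tanh_normalized(x), softplus_act(x)])
+    return f(numpy.zeros((2, 3), dtype=theano.config.floatX))
+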
+
--- a/pylearn/formulas/costs.py	Thu Oct 14 23:55:41 2010 -0400
+++ b/pylearn/formulas/costs.py	Thu Oct 14 23:55:55 2010 -0400
@@ -1,22 +1,169 @@
 """
-This script defines a few often used cost functions.
+
+TODO: make sure the stabilization optimizations are applied.
+TODO: add tests.
+TODO: check that this works for nd tensors.
 """
+
+#"""
+#Common training criteria.
+#"""
 import theano
 import theano.tensor as T
+
 from tags import tags
 
+__authors__   = "Frederic Bastien, Nicolas Boulanger-Lewandowski, .."
+__copyright__ = "(c) 2010, Universite de Montreal"
+__license__   = "3-clause BSD License"
+__contact__   = "theano-user <theano-users@googlegroups.com>"
+
 @tags('cost','binary','cross-entropy')
 def binary_crossentropy(output, target):
     """ Compute the crossentropy of binary output wrt binary target.
 
     .. math::
-                L_{CE} \equiv t\log(o) + (1-t)\log(1-o) 
+                L_{CE} \equiv - t\log(o) - (1-t)\log(1-o)
 
     :type output: Theano variable
     :param output: Binary output or prediction :math:`\in[0,1]`
     :type target: Theano variable
     :param target: Binary target usually :math:`\in\{0,1\}`
+
+    :note: no stabilization optimization needed for a generic output variable
     """
-    return -(target * tensor.log(output) + (1.0 - target) * tensor.log(1.0 - output))
+    return -(target * T.log(output) + (1.0 - target) * T.log(1.0 - output))
 
 
+@tags('cost','binary','cross-entropy', 'sigmoid')
+def sigmoid_crossentropy(output, target):
+    """ crossentropy of a sigmoid activation
+
+    .. math::
+                L_{CE} \equiv t\log(\sigma(a)) + (1-t)\log(1-\sigma(a))
+
+    :type output: Theano variable
+    :param output: Output before activation
+    :type target: Theano variable
+    :param target: Target
+
+    :note: no stabilization done. 
+    """
+    return target * (- T.log(1.0 + T.exp(-output))) + (1.0 - target) * (- T.log(1.0 + T.exp(output)))
+
+@tags('cost','binary','cross-entropy', 'tanh')
+def tanh_crossentropy(output, target):
+    """ crossentropy of a tanh activation
+
+    .. math::
+                L_{CE} \equiv t\log(\\frac{1+\\tanh(a)}2) + (1-t)\log(\\frac{1-\\tanh(a)}2)
+
+    :type output: Theano variable
+    :param output: Output before activation
+    :type target: Theano variable
+    :param target: Target
+
+    :note: no stabilization done. 
+    """
+    return sigmoid_crossentropy(2.0*output, target)
+
+@tags('cost','binary','cross-entropy', 'tanh', 'abs')
+def abstanh_crossentropy(output, target):
+    """ crossentropy of a absolute value tanh activation
+
+    .. math::
+                L_{CE} \equiv t\log(\\frac{1+\\tanh(|a|)}2) + (1-t)\log(\\frac{1-\\tanh(|a|)}2)
+
+    :type output: Theano variable
+    :param output: Output before activation
+    :type target: Theano variable
+    :param target: Target
+
+    :note: no stabilization done. 
+    """
+    return tanh_crossentropy(T.abs_(output), target)
+
+@tags('cost','binary','cross-entropy', 'tanh', 'normalized')
+def normtanh_crossentropy(output, target):
+    """ crossentropy of a "normalized" tanh activation (LeCun)
+
+    .. math::
+                L_{CE} \equiv t\log(\\frac{1+\\tanh(0.6666a)}2) + (1-t)\log(\\frac{1-\\tanh(0.6666a)}2)
+
+    :type output: Theano variable
+    :param output: Output before activation
+    :type target: Theano variable
+    :param target: Target
+
+    :note: no stabilization done. 
+    """
+    return tanh_crossentropy(0.6666*output, target)
+
+@tags('cost','binary','cross-entropy', 'tanh', 'normalized', 'abs')
+def absnormtanh_cross_entropy(output, target):
+    """ crossentropy of a "absolute normalized" tanh activation
+
+    .. math::
+                L_{CE} \equiv t\log(\\frac{1+\\tanh(0.6666*|a|)}2) + (1-t)\log(\\frac{1-\\tanh(0.6666*|a|)}2)
+
+    :type output: Theano variable
+    :param output: Output before activation
+    :type target: Theano variable
+    :param target: Target
+
+    :note: no stabilization done. 
+    """
+    return normtanh_crossentropy(T.abs_(output), target)
+
+def cross_entropy(output_act, output, target, act=None):
+    """ Execute the cross entropy with a sum on the last dimension and a mean on the first dimension.
+
+    If act is in 'sigmoid', 'tanh', 'tanhnorm', 'abstanh', 'abstanhnorm' we 
+    call the specialized version.
+    
+    .. math::
+        mean(sum(sqr(output-target),axis=-1),axis=0)
+
+    :type output_act: Theano variable
+    :param output_act: Output after activation
+    :type output: Theano variable
+    :param output: Output before activation
+    :type target:Theano variable
+    :param target: Target
+    :type act: str or None
+    :param act: The type of activation used
+    """
+    if act in ['sigmoid','tanh','tanhnorm','abstanh','abstanhnorm']:
+        if act == 'sigmoid':
+            return sigmoid_crossentropy(output, target)
+        if act == 'tanh':
+            return tanh_crossentropy(output, target)
+        if act == 'tanhnorm':
+            return normtanh_crossentropy(output, target)
+        if act == 'abstanh':
+            return abstanh_crossentropy(output, target)
+        if act == 'abstanhnorm':
+            return absnormtanh_cross_entropy(output, target)
+    elif act is None:
+        XE = target * T.log(output_act) + (1 - target) * T.log(1 - output_act)
+        return -T.mean(T.sum(XE, axis=-1),axis=0)
+    else:
+        raise Exception("cross_entropy() Expected parameter act to be in ['sigmoid','tanh','tanhnorm','abstanh','abstanhnorm', None]")
+
+def quadratic_cost(output, target):
+    """ The quadratic cost of output again target with a sum on the last dimension and a mean on the first dimension.
+    
+    .. math::
+        mean(sum(sqr(output-target),axis=-1),axis=0)
+
+    :type output: Theano variable
+    :param output: The value that we want to compare against the target
+    :type target:Theano variable
+    :param target: The value that we consider correct
+    """
+    return T.mean(T.sum(T.sqr(output - target), axis=-1),axis=0)
+
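+# Hypothetical usage sketch: compiling one of the costs above.  The
+# pre-activation and target matrices are made up for the example.
+def _demo_costs():
+    import numpy
+    pre_act = T.matrix('pre_act')        # output *before* the sigmoid
+    target = T.matrix('target')
+    cost = T.mean(T.sum(sigmoid_crossentropy(pre_act, target), axis=-1), axis=0)
+    f = theano.function([pre_act, target], cost)
+    x = numpy.zeros((4, 3), dtype=theano.config.floatX)
+    return f(x, x)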
+
+# This file seems like it has some overlap with theano.tensor.nnet.  Which functions should go
+# in which file?
+
--- a/pylearn/formulas/noise.py	Thu Oct 14 23:55:41 2010 -0400
+++ b/pylearn/formulas/noise.py	Thu Oct 14 23:55:55 2010 -0400
@@ -1,12 +1,14 @@
 """
+Noise functions used to train Denoising Auto-Associators.
 
-This script define the different symbolic noise functions.
+Functions in this module often include a `noise_lvl` argument that controls the amount of noise
+that the function applies.
 The noise contract is simple: noise_lvl is a symbolic variable going from 0 to 1.
-0: no changement.
-1: max noise.
+0: no change.
+1: maximum noise.
 """
 import theano
-from tags import tags
+import tags
 s="""
 * A latex mathematical description of the formulas(for picture representation in generated documentation)
 * Tags(for searching):
@@ -19,37 +21,58 @@
 * Tell the domaine, range of the input/output(range should use the english notation of including or excluding)
 """
 
-@tags('noise','binomial','salt')
-def binomial_noise(theano_rng,inp,noise_lvl):
-    """ This add binomial noise to inp. Only the salt part of pepper and salt.
+@tags.tags('noise','binomial','salt')
+def binomial_noise(theano_rng, input, noise_lvl, noise_value=0):
+    """
+    Return `input` with randomly-chosen elements set to `noise_value` (0 by default).
+
+    TODO: MATH DEFINITION
 
-    :type inp: Theano Variable
-    :param inp: The input that we want to add noise
+    :type input: Theano tensor variable
+    :param input: the tensor to corrupt
     :type noise_lvl: float
-    :param noise_lvl: The % of noise. Between 0(no noise) and 1.
+    :param noise_lvl: The probability of replacing each element with `noise_value`.
+    :type noise_value: Theano scalar variable
+    :param noise_value: The value written where noise is applied (0 by default).
     """
-    return theano_rng.binomial( size = inp.shape, n = 1, p =  1 - noise_lvl, dtype=theano.config.floatX) * inp
+    mask = theano_rng.binomial(
+            size = input.shape,
+            n = 1,
+            p =  1 - noise_lvl,
+            dtype=input.dtype)
+    value = theano.tensor.as_tensor_variable(noise_value)
+    if value.type.ndim != 0:
+        raise Exception('binomial_noise only supports a scalar noise_value')
+    if noise_value == 0:
+        return mask * input
+    else:
+        # where the mask is 0, write noise_value instead of the input
+        return mask * input + noise_value * (1 - mask)
 
 
-@tags('noise','binomial NLP','pepper','salt')
+@tags.tags('noise','binomial NLP','pepper','salt')
 def pepper_and_salt_noise(theano_rng,inp,noise_lvl):
     """ This add pepper and salt noise to inp
-    
-    :type inp: Theano Variable
+
+    :type inp: Theano variable
     :param inp: The input that we want to add noise
     :type noise_lvl: tuple(float,float)
-    :param noise_lvl: The % of noise for the salt and pepper. Between 0(no noise) and 1.
+    :param noise_lvl: The probability of changing each element to zero or one. 
+                      (prob of salt, prob of pepper)
+
+    :note: The sum of the probability of salt and the probability of pepper should be less than 1.
     """
-    return theano_rng.binomial( size = inp.shape, n = 1, p =  1 - noise_lvl[0], dtype=theano.config.floatX) * inp \
-                        + (inp==0) * theano_rng.binomial( size = inp.shape, n = 1, p =  noise_lvl[1], dtype=theano.config.floatX)
+    assert inp.dtype in ['float32','float64']
+    return theano_rng.binomial( size = inp.shape, n = 1, p =  1 - noise_lvl[0], dtype=inp.dtype) * inp \
+                        + (inp==0) * theano_rng.binomial( size = inp.shape, n = 1, p =  noise_lvl[1], dtype=inp.dtype)
 
-@tags('noise','gauss','gaussian')
+@tags.tags('noise','gauss','gaussian')
 def gaussian_noise(theano_rng,inp,noise_lvl):
     """ This add gaussian NLP noise to inp
 
-    :type inp: Theano Variable
+    :type inp: Theano variable
     :param inp: The input that we want to add noise
     :type noise_lvl: float
     :param noise_lvl: The standard deviation of the gaussian.
     """
-    return theano_rng.normal( size = inp.shape, std = noise_lvl, dtype=theano.config.floatX) + inp
+    assert inp.dtype in ['float32','float64']
+    return theano_rng.normal( size = inp.shape, std = noise_lvl, dtype=inp.dtype) + inp
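+
+# Hypothetical usage sketch: corrupting a minibatch with binomial (masking)
+# noise, as when preparing denoising auto-associator inputs.  The data below
+# is made up for the example.
+def _demo_binomial_noise():
+    import numpy
+    from theano.tensor.shared_randomstreams import RandomStreams
+    theano_rng = RandomStreams(seed=123)
+    x = theano.tensor.matrix('x')
+    corrupted = binomial_noise(theano_rng, x, noise_lvl=0.25)
+    f = theano.function([x], corrupted)
+    data = numpy.ones((2, 5), dtype=theano.config.floatX)
+    return f(data)       # roughly a quarter of the entries are zeroed out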
--- a/pylearn/gd/README.txt	Thu Oct 14 23:55:41 2010 -0400
+++ b/pylearn/gd/README.txt	Thu Oct 14 23:55:55 2010 -0400
@@ -1,2 +1,21 @@
 
 see __init__.py
+
+
+TODO: - add an SGD with an annealed learning rate (the annealing schedule is
+implemented via shared variables and an update schedule)
+
+Wishlist:
+    - sgd 
+    - momentum
+    - annealing schedule
+    - conjugate methods
+    - l-bfgs
+    - hessian free
+    - tonga
+    - something by Nicol Schraudolph (sp?)
+
+Theano needs a lazy if for these to be efficient.
+
+
+Early stopping heuristics as update expressions too?
--- a/pylearn/preprocessing/pca.py	Thu Oct 14 23:55:41 2010 -0400
+++ b/pylearn/preprocessing/pca.py	Thu Oct 14 23:55:55 2010 -0400
@@ -9,6 +9,9 @@
 
 #TODO: estimate number of principle components by cross-validation (early stopping)
 
+#TODO: include the original feature means in the `pca` tuple object so that the full transform
+# can be saved, applied to new datasets, and approximately inverted.
+
 import numpy
 import scipy.linalg
 
@@ -59,7 +62,7 @@
         if max_energy_fraction < 1.0:
             energy = 0
             i = 0
-            while energy < max_energy_fraction * vartot:
+            while (energy < max_energy_fraction * vartot) and (i < len(w)):
                 energy += w[i]
                 i += 1
             if i < len(w):
@@ -109,6 +112,15 @@
     pca_of_X /= numpy.sqrt(eigvals)+eps
     return pca_of_X
 
+def pca_whiten_inverse((eigvals, eigvecs), whitened_X, eps=1e-8):
+    """
+    Return an approximate inverse of the `pca_whiten` transform.
+
+    The inverse is not perfect because pca_whitening discards the least-significant components
+    of the data.
+    """
+    return numpy.dot(whitened_X * (numpy.sqrt(eigvals)+eps), eigvecs.T)
+
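+# Hypothetical round-trip sketch: whiten centered data by hand (projecting on the
+# eigenvectors is assumed here; the scaling by sqrt(eigvals)+eps mirrors what
+# pca_whiten does above), then undo it with `pca_whiten_inverse`.  The data and
+# decomposition below are made up for the example.
+def _demo_pca_whiten_inverse():
+    rng = numpy.random.RandomState(0)
+    Xc = rng.randn(200, 10)
+    Xc -= Xc.mean(axis=0)
+    eigvals, eigvecs = scipy.linalg.eigh(numpy.cov(Xc.T))
+    eps = 1e-8
+    whitened = numpy.dot(Xc, eigvecs) / (numpy.sqrt(eigvals) + eps)
+    Xr = pca_whiten_inverse((eigvals, eigvecs), whitened)
+    assert numpy.allclose(Xc, Xr)
+    return Xr
+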
 def zca_whiten((eigvals, eigvecs), centered_X):
     """Return the PCA of X but rotated back into the original vector space.
 
--- a/pylearn/sampling/hmc.py	Thu Oct 14 23:55:41 2010 -0400
+++ b/pylearn/sampling/hmc.py	Thu Oct 14 23:55:55 2010 -0400
@@ -1,203 +1,249 @@
+
 """Hybrid / Hamiltonian Monte Carlo Sampling
 
 This algorithm is described in Radford Neal's PhD Thesis, pages 63--70.
 
+:note:
+
+  The 'mass' of the so-called particles is taken to be 1, so that the kinetic energy (K)
+  is half the sum of squared velocities (p):
+
+    :math:`K = \sum_i p_i^2 / 2`
+
+
+The 'leap-frog' algorithm that alternately advances the velocity and the position is
+implemented as a single Theano expression graph, using `theano.scan` inside
+`simulate_dynamics`.
+
+One call to the sampler's `simulate` function does the following:
+
+1. samples a random initial velocity for each particle,
+2. simulates `n_steps` of leap-frog dynamics starting from the current positions,
+3. performs a Metropolis-Hastings accept/reject for each particle by comparing the
+   Hamiltonian at the start and at the end of the trajectory, and
+4. updates the shared positions, the step size and the running average acceptance rate
+   (see `mcmc_move` and `mcmc_updates`).
+
+
 """
 import sys
 import logging
+import numpy
 import numpy as np
 from theano import function, shared
 from theano import tensor as TT
-import theano.sparse #installs the sparse shared var handler
+import theano
+from theano.printing import Print
+
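+# Debugging aid: shadow theano.printing.Print with a no-op so that the
+# Print('...') wrappers below do not actually print anything.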
+def Print(msg):
+    return lambda x: x
+
+def kinetic_energy(velocities, masses):
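+    """Return the kinetic energy of each particle, 0.5 * sum_i v_i**2 (unit mass only)."""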
+    if masses != 1:
+        raise NotImplementedError()
+    return 0.5 * (velocities**2).sum(axis=1)
+
+def metropolis_hastings_accept(energy_prev, energy_next, s_rng, shape=None):
+    """
+    Return acceptance of moves - energy_prev and energy_next are vectors of the energy of each
+    particle.
+    """
+    if shape is None:
+        shape = energy_prev.shape
+    ediff = Print('diff')(energy_prev - energy_next)
+    return (TT.exp(ediff) - s_rng.uniform(size=shape)) >= 0
 
+def hamiltonian(pos, vel, energy_fn):
+    """Return a vector of energies - sum of kinetic and potential energy
+    """
+    # assuming mass is 1
+    return energy_fn(pos) + kinetic_energy(vel, 1)
+
+def simulate_dynamics(initial_p, initial_v, stepsize, n_steps, energy_fn):
+    """Return final (position, velocity) of `n_step` trajectory
+    """
+    def leapfrog(pos, vel, step):
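+        # One full leap-frog step: move the velocity against the energy gradient,
+        # then advance the position with the updated velocity.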
+        egy = energy_fn(pos)
+        dE_dpos = TT.grad(egy.sum(), pos)
+        new_vel = vel - step * dE_dpos
+        new_pos = pos + step * new_vel
+        return [new_pos, new_vel],{}
+
+    (final_p, final_v), scan_updates = theano.scan(leapfrog, 
+            outputs_info=[
+                dict(initial=initial_p+ 0.5*stepsize*initial_v,
+                    return_steps=1),
+                dict(initial=initial_v,
+                    return_steps=1),
+                ],
+            non_sequences=[stepsize],
+            n_steps=n_steps)
 
-#TODO: 
-#  Consider how to refactor this code so that the key functions /functionality are provided as
-#  theano expressions??
-#  It could be limiting that the implementation requires that the position be a list of shared
-#  variables, and basically uses procedural logic on top of that.
+    if scan_updates:
+        raise NotImplementedError((
+                'TODO: check the scan updates to make sure that the s_rng is'
+                ' not being updated incorrectly'))
+    # undo half of the last leap-frog step
+    final_p = final_p - 0.5* stepsize * final_v
+    return final_p, final_v
+
+def mcmc_move(s_rng, positions, energy_fn, stepsize, n_steps, positions_shape=None):
+    """Return new position
+    """
+    if positions_shape is None:
+        positions_shape = positions.shape
+
+    batchsize = positions_shape[0]
+
+    initial_v = s_rng.normal(size=positions_shape)
+
+    final_p, final_v = simulate_dynamics(
+            initial_p = positions, 
+            initial_v = initial_v,
+            stepsize = stepsize,
+            n_steps = n_steps,
+            energy_fn = energy_fn)
 
-#TODO: 
-# Consider a heuristic for updating the *MASS* of the particles.  We might want the mass to be
-# such that the momentum is in the same range as the gradient on the energy.  Look at Radford's
-# recent book chapter, maybe there are hints. (2010).
+    accept = metropolis_hastings_accept(
+            energy_prev = Print('ep')(hamiltonian(positions, initial_v, energy_fn)),
+            energy_next = Print('en')(hamiltonian(final_p, final_v, energy_fn)),
+            s_rng=s_rng, shape=(batchsize,))
+    
+    return Print('accept')(accept), final_p
+
+def mcmc_updates(shrd_pos, shrd_stepsize, shrd_avg_acceptance_rate, final_p, accept, 
+        target_acceptance_rate,
+        stepsize_inc,
+        stepsize_dec,
+        stepsize_min,
+        stepsize_max,
+        avg_acceptance_slowness
+        ):
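+    """Return the shared-variable updates for one simulation step: keep the accepted
+    positions, adapt the step size toward the target acceptance rate, and update the
+    running average acceptance rate.
+    """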
+    return [
+            (shrd_pos,
+                TT.switch(
+                    accept.dimshuffle(0, *(('x',)*(final_p.ndim-1))),
+                    final_p,
+                    shrd_pos)),
+            (shrd_stepsize, 
+                TT.clip(
+                    TT.switch( 
+                        shrd_avg_acceptance_rate > target_acceptance_rate,
+                        shrd_stepsize * stepsize_inc,
+                        shrd_stepsize * stepsize_dec,
+                        ),
+                    stepsize_min,
+                    stepsize_max)),
+            (shrd_avg_acceptance_rate,
+                Print('arate')(TT.add(
+                    avg_acceptance_slowness * shrd_avg_acceptance_rate,
+                    (1.0 - avg_acceptance_slowness) * accept.mean()))),
+            ]
 
 class HMC_sampler(object):
-    """Batch-wise Hybrid Monte-Carlo sampler
-
+    """Convenience wrapper for HMC
 
     The `draw` function advances the Markov chain and returns the current sample by calling
     `simulate` and then copying the value of the shared position variable.
 
-
-    :note:
-
-      The 'mass' of so-called particles is taken to be 1, so that kinetic energy (K) is the sum
-      of squared velocities (p).
-
-        :math:`K = \sum_i p_i^2 / 2`
-
     """
 
     # Constants taken from Marc'Aurelio's 'train_mcRBM.py' file found in the code online for his
     # paper.
-    stepsize_dec = 0.98
-    stepsize_min = 0.001
-    stepsize_max = 0.25
-    stepsize_inc = 1.02
-    avg_acceptance_slowness = 0.9  # used in geometric avg. 1.0 would be not moving at all
-    n_steps=20
+
+    def __init__(self, **kwargs):
+        # add things to __dict__
+        self.__dict__.update(kwargs)
 
-    def __init__(self, positions, energy_fn, 
-            velocity=None,
-            initial_stepsize=0.01,
-            target_acceptance_rate=0.9,
-            seed=12345,
-            dtype=theano.config.floatX):
+    @classmethod
+    def new_from_shared_positions(cls, shared_positions, energy_fn, 
+            initial_stepsize=0.01, target_acceptance_rate=.9, n_steps=20,
+            stepsize_dec = 0.98,
+            stepsize_min = 0.001,
+            stepsize_max = 0.25,
+            stepsize_inc = 1.02,
+            avg_acceptance_slowness = 0.9, # used in geometric avg. 1.0 would be not moving at all
+            seed=12345, dtype=theano.config.floatX,
+            shared_positions_shape=None, 
+            compile_simulate=True):
         """
-        :param positions: list of shared ndarray variables.
-
-        :param energy: 
-
+        :param shared_positions: theano ndarray shared var with many particle [initial] positions
+        :param energy_fn:
             callable such that energy_fn(positions) 
             returns theano vector of energies.  
             The len of this vector is the batchsize.
 
             The sum of this energy vector must be differentiable (with theano.tensor.grad) with
             respect to the positions for HMC sampling to work.
-
         """
-
-        self.stepsize = initial_stepsize
-        batchsize = positions[0].value.shape[0]
-        self.target_acceptance_rate = target_acceptance_rate
-        self.avg_acceptance_rate = self.target_acceptance_rate
-        self.s_rng = TT.shared_randomstreams.RandomStreams(seed)
-        self.positions = positions
-        if velocity is None:
-            self.velocity = [shared(np.zeros_like(q.value)) for q in self.positions]
-        else:
-            self.velocity = velocity
-
-        self.start_positions = [shared(np.zeros_like(q.value)) for q in self.positions]
-        self.initial_hamiltonian = shared(np.zeros(batchsize).astype(dtype) + float('inf'))
-
-        energy = energy_fn(positions)
-        # assuming mass is 1
-        kinetic_energy = 0.5 * sum([(p**2).sum(axis=1) for p in self.velocity])
-        hamiltonian = energy + kinetic_energy
-
-        dE_dpos = TT.grad(energy.sum(), self.positions)
-
-        s_stepsize = TT.scalar('stepsize')
-
-        self.velocity_step = function([s_stepsize],
-                [dE_dpos[0].norm(2)],
-                updates=[(p, p - s_stepsize*dE_dq) for p,dE_dq in zip(self.velocity,dE_dpos)])
-        self.position_step = function([s_stepsize], [],
-                updates=[(q, q + s_stepsize*p) for (q,p) in zip(self.positions, self.velocity)])
+        # allocate shared vars
 
-        self.save_start_positions = function([], [],
-                updates=[(self.initial_hamiltonian, hamiltonian)] + \
-                        [(sq, q) for sq, q in zip(self.start_positions, self.positions)])
-
-        # sample the initial velocity from a
-        # gaussian with mean 0.
-        # Note: I think the fact that this distribution is symmetric about zero justifies not
-        # sampling forward versus backward dynamics.
-        self.randomize_velocity = function([], [],
-                updates=[(p, self.s_rng.normal(size=p.value.shape)) for p in self.velocity])
-
-
-        # accept-reject according to Metropolis algo
-
-        accept = TT.exp(self.initial_hamiltonian - hamiltonian) - self.s_rng.uniform(size=(batchsize,)) >= 0
+        if shared_positions_shape==None:
+            shared_positions_shape = shared_positions.value.shape
+        batchsize = shared_positions_shape[0]
 
-        self.accept_reject_positions = function([], accept.mean(),
-                updates=[
-                    (self.initial_hamiltonian, 
-                        TT.switch(accept, hamiltonian, self.initial_hamiltonian))] + [
-                    (q, 
-                        TT.switch(accept.dimshuffle(0, *(('x',)*(sq.ndim-1))), q, sq)) 
-                        for sq, q in zip(self.start_positions, self.positions)])
-
-    def simulate(self, n_steps=None):
-        if n_steps is None:
-            n_steps = self.n_steps
-
-        # updates self.velocity with new random numbers
-        self.randomize_velocity()
-        self.save_start_positions()
+        stepsize = shared(numpy.asarray(initial_stepsize).astype(theano.config.floatX), 'hmc_stepsize')
+        avg_acceptance_rate = shared(target_acceptance_rate, 'avg_acceptance_rate')
+        s_rng = TT.shared_randomstreams.RandomStreams(seed)
 
-        if 0:
-            # not necessary for random initial direction
-            if np.random.rand() > 0.5:
-                step_amount = self.stepsize
-            else:
-                step_amount = -self.stepsize
+        accept, final_p = mcmc_move(
+                s_rng, 
+                shared_positions, 
+                energy_fn,
+                stepsize, 
+                n_steps,
+                shared_positions_shape)
+        simulate_updates = mcmc_updates(
+                shared_positions,
+                stepsize,
+                avg_acceptance_rate, 
+                final_p=final_p, 
+                accept=accept,
+                stepsize_min=stepsize_min,
+                stepsize_max=stepsize_max,
+                stepsize_inc=stepsize_inc,
+                stepsize_dec=stepsize_dec,
+                target_acceptance_rate=target_acceptance_rate,
+                avg_acceptance_slowness=avg_acceptance_slowness)
+        if compile_simulate:
+            simulate = function([], [], updates=simulate_updates)
         else:
-            step_amount = self.stepsize
-
-        if 0:
-            print "initial",
-            print "kinetic E", self.prev_energy.value ,
-            print (self.velocity[0].value**2).sum(axis=1),
-            print (self.velocity[0].value**2).sum(axis=1) + self.prev_energy.value
-
-
-        # Note on the order of leap-frog steps:
-        #
-        # the leap-frog algorithm can start with either position or velocity updates first,
-        # but one of them has to run an extra time (because of two half-steps).
-        # The position_step is cheap to evaluate, whereas the velocity_step is expensive,
-        # so we evaluate the position_step the extra time.
-        #
-        # At the same time, we cannot drop the last half-step update of the position, because
-        # the position is actually the terms we care about.
-
-        #opening half-step in leap-frog algorithm
-        self.position_step(step_amount/2.0)
+            simulate = None
+        return cls(
+                positions=shared_positions,
+                stepsize=stepsize,
+                stepsize_min=stepsize_min,
+                stepsize_max=stepsize_max,
+                avg_acceptance_rate=avg_acceptance_rate,
+                target_acceptance_rate=target_acceptance_rate,
+                s_rng=s_rng,
+                _updates=simulate_updates,
+                simulate=simulate)
 
-        # full leap-frog steps
-        for ss in range(n_steps):
-            self.velocity_step(step_amount)
-            if ss == n_steps-1:
-                # closing half-step in the leap-frog algorithm
-                self.position_step(step_amount/2.0)
-            else:
-                self.position_step(step_amount)
-
-        acceptance_rate = self.accept_reject_positions()
-        self.avg_acceptance_rate = self.avg_acceptance_slowness * self.avg_acceptance_rate \
-                + (1.0 - self.avg_acceptance_slowness) * acceptance_rate
-
-        if self.avg_acceptance_rate < self.target_acceptance_rate:
-            self.stepsize = max(self.stepsize*self.stepsize_dec,self.stepsize_min)
-        else:
-            self.stepsize = min(self.stepsize*self.stepsize_inc,self.stepsize_max)
-
-        if 0:
-            print "final kinetic E", self.prev_energy.value ,
-            print (self.velocity[0].value**2).sum(axis=1),
-            print (self.velocity[0].value**2).sum(axis=1) + self.prev_energy.value
-
-
-        # post-condition: 
-        # self.positions contains a new sample from our markov chain
-
-        # it is not returned from this function because accessing the .value of a shared
-        # variable can require making a copy
-        # see `draw()` or `get_position` for that behaviour.
-
-    def get_position(self):
-        return [q.value for q in self.positions]
-
-    def draw(self, n_steps=None):
+    def draw(self, **kwargs):
         """Return the current sample in the Markov chain as a list of numpy arrays
 
         The size of the arrays will match the size of the `initial_position` argument to
         __init__.
+
+        The `kwargs` dictionary is passed to the shared variable (self.positions) `get_value()`
+        function.  So for example, to avoid copying the shared variable value, consider passing
+        `borrow=True`.
         """
-        self.simulate(n_steps=n_steps)
-        return self.get_position()
+        self.simulate()
+        return self.positions.get_value(**kwargs)
+
+    def updates(self):
+        """Returns the update expressions required to simulate the Markov Chain
 
+        :TODO: :WRITEME: *prescriptive* definition of what this method does (API)
+        """
+        return list(self._updates)
+
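+# Hypothetical usage sketch: sampling a 2-D standard Gaussian with 3 parallel
+# chains.  The energy function and sizes below are made up for the example
+# (the real test lives in pylearn/sampling/tests/test_hmc.py).
+def _demo_hmc_sampler():
+    rng = numpy.random.RandomState(123)
+    positions = shared(rng.randn(3, 2).astype(theano.config.floatX))
+    def gaussian_energy(pos):
+        return 0.5 * (pos ** 2).sum(axis=1)
+    sampler = HMC_sampler.new_from_shared_positions(positions, gaussian_energy)
+    samples = numpy.asarray([sampler.draw() for _ in range(100)])
+    return samples        # shape (100, 3, 2)
+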
+#TODO: 
+# Consider a heuristic for updating the *MASS* of the particles.  We might want the mass to be
+# such that the momentum is in the same range as the gradient on the energy.  Look at Radford's
+# recent book chapter, maybe there are hints. (2010).
+
--- a/pylearn/sampling/tests/test_hmc.py	Thu Oct 14 23:55:41 2010 -0400
+++ b/pylearn/sampling/tests/test_hmc.py	Thu Oct 14 23:55:55 2010 -0400
@@ -15,15 +15,15 @@
     mu = np.asarray([5, 9.5], dtype=theano.config.floatX)
 
     def gaussian_energy(xlist):
-        x, = xlist
+        x = xlist
         return 0.5 * (TT.dot((x-mu),cov_inv)*(x-mu)).sum(axis=1)
 
 
     position = shared(rng.randn(batchsize, 2).astype(theano.config.floatX))
-    sampler = sampler_cls([position], gaussian_energy)
+    sampler = sampler_cls(position, gaussian_energy)
 
     print 'initial position', position.value
-    print 'initial stepsize', sampler.stepsize
+    print 'initial stepsize', sampler.stepsize.value
 
     # DRAW SAMPLES
 
@@ -35,31 +35,31 @@
 
     # TEST THAT THEY ARE FROM THE RIGHT DISTRIBUTION
 
-    # samples.shape == (1000, 1, 3, 2)
+    # samples.shape == (1000, 3, 2)
 
     print 'target mean:', mu
-    print 'empirical mean: ', samples.mean(axis=0)[0]
+    print 'empirical mean: ', samples.mean(axis=0)
     #assert np.all(abs(mu - samples.mean(axis=0)) < 1)
 
 
-    print 'final stepsize', sampler.stepsize
-    print 'final acceptance_rate', sampler.avg_acceptance_rate
+    print 'final stepsize', sampler.stepsize.value
+    print 'final acceptance_rate', sampler.avg_acceptance_rate.value
 
     print 'target cov', cov
-    s = samples[:,0,0,:]
-    empirical_cov = np.cov(samples[:,0,0,:].T)
+    s = samples[:,0,:]
+    empirical_cov = np.cov(samples[:,0,:].T)
     print ''
     print 'cov/empirical_cov', cov/empirical_cov
-    empirical_cov = np.cov(samples[:,0,1,:].T)
+    empirical_cov = np.cov(samples[:,1,:].T)
     print 'cov/empirical_cov', cov/empirical_cov
-    empirical_cov = np.cov(samples[:,0,2,:].T)
+    empirical_cov = np.cov(samples[:,2,:].T)
     print 'cov/empirical_cov', cov/empirical_cov
     return sampler
 
 def test_hmc():
     print ('HMC')
-    sampler = _sampler_on_2d_gaussian(HMC_sampler, burnin=3000/20, n_samples=90000/20)
+    sampler = _sampler_on_2d_gaussian(HMC_sampler.new_from_shared_positions, burnin=3000/20, n_samples=90000/20)
-    assert abs(sampler.avg_acceptance_rate - sampler.target_acceptance_rate) < .1
+    assert abs(sampler.avg_acceptance_rate.value - sampler.target_acceptance_rate) < .1
-    assert sampler.stepsize >= sampler.stepsize_min
-    assert sampler.stepsize <= sampler.stepsize_max
+    assert sampler.stepsize.value >= sampler.stepsize_min
+    assert sampler.stepsize.value <= sampler.stepsize_max