changeset 943:0181459b53a1

merge
author James Bergstra <bergstrj@iro.umontreal.ca>
date Wed, 11 Aug 2010 13:16:05 -0400
parents fdd648c7c583 (current diff) a75bf0aca18f (diff)
children 1529c84e460f
files pylearn/shared/layers/sgd.py
diffstat 52 files changed, 3185 insertions(+), 95 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/.build/PLACEHOLDER	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,1 @@
+sphinx doesn't like it when this directory isn't available
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/.static/PLACEHOLDER	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,1 @@
+sphinx doesn't like it when this directory isn't available
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/.templates/PLACEHOLDER	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,1 @@
+sphinx doesn't like it when this directory isn't available
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/.templates/layout.html	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,24 @@
+{% extends "!layout.html" %}
+
+{%- block extrahead %}
+{{ super() }}
+<script type="text/javascript">
+  var _gaq = _gaq || [];
+  _gaq.push(['_setAccount', 'UA-168290-9']);
+  _gaq.push(['_trackPageview']);
+</script>
+{% endblock %}
+
+{% block footer %}
+{{ super() }}
+<script type="text/javascript">
+  (function() {
+    var ga = document.createElement('script');
+    ga.src = ('https:' == document.location.protocol ?
+              'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
+    ga.setAttribute('async', 'true');
+    document.documentElement.firstChild.appendChild(ga);
+  })();
+</script>
+{% endblock %}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/LICENSE.txt	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,30 @@
+.. _license:
+
+LICENSE
+=======
+
+Copyright (c) 2008--2009, Theano Development Team
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Theano nor the names of its contributors may be
+      used to endorse or promote products derived from this software without
+      specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/api/epydoc.conf	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,152 @@
+# TODO:
+#   Get all graphs to work!
+
+
+[epydoc] # Epydoc section marker (required by ConfigParser)
+
+# The list of objects to document.  Objects can be named using
+# dotted names, module filenames, or package directory names.
+# Aliases for this option include "objects" and "values".
+modules: pylearn
+
+# The type of output that should be generated.  Should be one
+# of: html, text, latex, dvi, ps, pdf.
+output: html
+
+# An integer indicating how verbose epydoc should be.  The default
+# value is 0; negative values will suppress warnings and errors;
+# positive values will give more verbose output.
+verbosity: 1
+
+# A boolean value indicating that Epydoc should show a traceback
+# in case of an unexpected error. By default tracebacks are not shown.
+debug: 1
+
+# If True, don't try to use colors or cursor control when doing
+# textual output. The default False assumes a rich text prompt
+simple-term: 0
+
+
+### Generation options
+
+# The default markup language for docstrings, for modules that do
+# not define __docformat__.  Defaults to epytext.
+docformat: epytext
+
+# Whether or not parsing should be used to examine objects.
+parse: yes
+
+# Whether or not introspection should be used to examine objects.
+introspect: yes
+
+# Don't examine in any way the modules whose dotted name match this
+# regular expression pattern.
+#exclude
+
+# Don't perform introspection on the modules whose dotted name match this
+# regular expression pattern.
+#exclude-introspect
+
+# Don't perform parsing on the modules whose dotted name match this
+# regular expression pattern.
+#exclude-parse
+
+# The format for showing inheritance objects.
+# It should be one of: 'grouped', 'listed', 'included'.
+inheritance: grouped
+
+# Whether or not to include private variables.  (Even if included,
+# private variables will be hidden by default.)
+private: yes
+
+# Whether or not to list each module's imports.
+imports: yes
+
+# Whether or not to include syntax highlighted source code in
+# the output (HTML only).
+sourcecode: yes
+
+# Whether or not to include a page with the Epydoc log, containing
+# the effective options at the time of generation and the reported logs.
+include-log: yes
+
+
+### Output options
+
+# The documented project's name.
+name: Pylearn
+
+# The CSS stylesheet for HTML output.  Can be the name of a builtin
+# stylesheet, or the name of a file.
+css: white
+
+# The documented project's URL.
+url: http://deeplearning.net/software/pylearn/
+
+# HTML code for the project link in the navigation bar.  If left
+# unspecified, the project link will be generated based on the
+# project's name and URL.
+#link: <a href="somewhere">My Cool Project</a>
+
+# The "top" page for the documentation.  Can be a URL, the name
+# of a module or class, or one of the special names "trees.html",
+# "indices.html", or "help.html"
+#top: os.path
+
+# An alternative help file.  The named file should contain the
+# body of an HTML file; navigation bars will be added to it.
+#help: my_helpfile.html
+
+# Whether or not to include a frames-based table of contents.
+#frames: yes
+frames: no
+
+# Whether each class should be listed in its own section when
+# generating LaTeX or PDF output.
+separate-classes: no
+
+
+### API linking options
+
+# Define a new API document.  A new interpreted text role
+# will be created
+#external-api: epydoc
+
+# Use the records in this file to resolve objects in the API named NAME.
+#external-api-file: epydoc:api-objects.txt
+
+# Use this URL prefix to configure the string returned for external API.
+#external-api-root: epydoc:http://epydoc.sourceforge.net/api
+# external-api: wiki doc
+# external-api-root: wiki:http://lgcm.iro.umontreal.ca/theano/wiki/ doc:http://lgcm.iro.umontreal.ca/auto_theano/doc/
+# external-api-file: wiki:wiki.idx doc:doc/doc.idx
+
+### Graph options
+
+# The list of graph types that should be automatically included
+# in the output.  Graphs are generated using the Graphviz "dot"
+# executable.  Graph types include: "classtree", "callgraph",
+# "umlclass".  Use "all" to include all graph types
+graph: all
+
+# The path to the Graphviz "dot" executable, used to generate
+# graphs.
+dotpath: /usr/bin/dot
+
+# The name of one or more pstat files (generated by the profile
+# or hotshot module).  These are used to generate call graphs.
+#pstat: autotest.pstat
+
+# Specify the font used to generate Graphviz graphs.
+# (e.g., helvetica or times).
+graph-font: Helvetica
+
+# Specify the font size used to generate Graphviz graphs.
+graph-font-size: 10
+
+
+### Return value options
+
+# The condition upon which Epydoc should exit with a non-zero
+# exit status. Possible values are error, warning, docstring_warning
+#fail-on: error
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/conf.py	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,191 @@
+# -*- coding: utf-8 -*-
+#
+# theano documentation build configuration file, created by
+# sphinx-quickstart on Tue Oct  7 16:34:06 2008.
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# The contents of this file are pickled, so don't put values in the namespace
+# that aren't pickleable (module imports are okay, they're removed automatically).
+#
+# All configuration values have a default value; values that are commented out
+# serve to show the default value.
+
+import sys, os
+
+# If your extensions are in another directory, add it here. If the directory
+# is relative to the documentation root, use os.path.abspath to make it
+# absolute, like shown here.
+#sys.path.append(os.path.abspath('some/directory'))
+
+# General configuration
+# ---------------------
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo', 'ext']
+
+todo_include_todos = True
+
+try:
+    from sphinx.ext import pngmath
+    extensions.append('sphinx.ext.pngmath')
+except ImportError:
+    pass
+
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['.templates']
+
+# The suffix of source filenames.
+source_suffix = '.txt'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General substitutions.
+project = 'Pylearn'
+copyright = '2008--2009, LISA lab'
+
+# The default replacements for |version| and |release|, also used in various
+# other places throughout the built documents.
+#
+# The short X.Y version.
+version = '0.1'
+# The full version, including alpha/beta/rc tags.
+release = '0.1'
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+today_fmt = '%B %d, %Y'
+
+# List of documents that shouldn't be included in the build.
+#unused_docs = []
+
+# List of directories, relative to source directories, that shouldn't be searched
+# for source files.
+exclude_dirs = ['images', 'scripts', 'api']
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+
+# Options for HTML output
+# -----------------------
+
+# The style sheet to use for HTML and HTML Help pages. A file of that name
+# must exist either in Sphinx' static/ path, or in one of the custom paths
+# given in html_static_path.
+#html_style = 'default.css'
+html_theme = 'sphinxdoc'
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (within the static path) to place at the top of
+# the sidebar.
+#html_logo = 'images/theano_logo-200x67.png'
+html_logo = 'images/logo_pylearn_200x57.png'
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['.static', 'images']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_use_modindex = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, the reST sources are included in the HTML build as _sources/<name>.
+#html_copy_source = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = ''
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'pylearndoc'
+
+
+# Options for LaTeX output
+# ------------------------
+
+# The paper size ('letter' or 'a4').
+#latex_paper_size = 'letter'
+
+# The font size ('10pt', '11pt' or '12pt').
+latex_font_size = '11pt'
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, document class [howto/manual]).
+latex_documents = [
+  ('index', 'pylearn.tex', 'pylearn Documentation',
+   'LISA lab, University of Montreal', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = 'images/snake_theta2-trans.png'
+latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# Additional stuff for the LaTeX preamble.
+#latex_preamble = ''
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_use_modindex = True
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/ext.py	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,75 @@
+
+import sys
+import re
+import os
+from docutils import nodes, utils
+from docutils.parsers.rst import roles
+import epydoc.docwriter.xlink as xlink
+
+#def role_fn(name, rawtext, text, lineno, inliner,
+#            options={}, content=[]):
+#    node = nodes.reference(rawtext, text, refuri = "http://pylearn.org/theano/wiki/%s" % text)
+#    return [node], []
+
+
+_TARGET_RE = re.compile(r'^(.*?)\s*<(?:URI:|URL:)?([^<>]+)>$') 
+def create_api_role(name, problematic):
+    """
+    Create and register a new role to create links for an API documentation.
+
+    Create a role called `name`, which will use the URL resolver registered as
+    ``name`` in `api_register` to create a link for an object.
+
+    :Parameters:
+      `name` : `str`
+        name of the role to create.
+      `problematic` : `bool`
+        if True, the registered role will create problematic nodes in
+        case of failed references. If False, a warning will be raised
+        anyway, but the output will appear as an ordinary literal.
+    """
+    def resolve_api_name(n, rawtext, text, lineno, inliner,
+                options={}, content=[]):
+
+        # Check if there's separate text & targets 
+        m = _TARGET_RE.match(text) 
+        if m: text, target = m.groups() 
+        else: target = text 
+        
+        # node in monotype font
+        text = utils.unescape(text)
+        node = nodes.literal(rawtext, text, **options)
+
+        # Get the resolver from the register and create an url from it.
+        try:
+            url = xlink.api_register[name].get_url(target)
+        except IndexError, exc:
+            msg = inliner.reporter.warning(str(exc), line=lineno)
+            if problematic:
+                prb = inliner.problematic(rawtext, text, msg)
+                return [prb], [msg]
+            else:
+                return [node], []
+
+        if url is not None:
+            node = nodes.reference(rawtext, '', node, refuri=url, **options)
+        return [node], []
+
+    roles.register_local_role(name, resolve_api_name)
+
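+# Usage sketch (assuming the role ends up registered under the name 'api',
+# as setup() below does): in the reST sources one can then write
+#   :api:`pylearn.io.seriestables.ErrorSeries`
+# and the dotted name is resolved through api-objects.txt into a link.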
+
+def setup(app):
+
+    try:
+        xlink.set_api_file('api', os.path.join(app.outdir, 'api', 'api-objects.txt'))
+        apiroot = os.getenv('PYLEARN_API_ROOT')
+        if not apiroot:
+            apiroot = os.path.join(os.path.realpath('api'), '')
+        xlink.set_api_root('api', apiroot)
+        #xlink.create_api_role('api', True)
+        create_api_role('api', True)
+    except IOError:
+        print >>sys.stderr, 'WARNING: Could not find api file! API links will not work.'
+
+    #app.add_role("wiki", role_fn)
+
Binary file doc/images/logo_pylearn_200x57.png has changed
Binary file doc/images/vitables_example_series.png has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/index.txt	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,32 @@
+
+Welcome
+=======
+
+Pylearn is a Python library for machine learning, built on top of Theano, our 
+library for defining, optimizing and evaluating mathematical expressions
+involving multi-dimensional arrays.
+
+This documentation is under construction, but you can already access the
+automatically-generated API doc, along with more extensive explanations for 
+some modules.
+
+Download
+========
+
+We recommend the latest development version, available via::
+
+    hg clone http://hg.assembla.com/pylearn Pylearn
+
+The ``pylearn`` subfolder should be on your ``$PYTHONPATH``.
+
+Documentation
+=============
+
+For the moment, the following documentation is available.
+
+* :doc:`io.SeriesTables module <seriestables>` -- Saves error series and other statistics during training
+* `API <api/>`_ -- The automatically-generated API documentation
+
+You can download the latest `PDF documentation <http://deeplearning.net/software/pylearn/pylearn.pdf>`_, rather than reading it online.
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/scripts/docgen.py	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,89 @@
+import sys
+import os
+import shutil
+import inspect
+
+from epydoc import docintrospecter 
+from epydoc.apidoc import RoutineDoc
+
+import getopt
+from collections import defaultdict
+
+if __name__ == '__main__':
+
+    # make sure we're in the right directory
+    this_file_directory = os.path.abspath(os.path.dirname(__file__))
+    pylearn_root = os.path.join(os.path.join(this_file_directory, ".."), "..")
+
+    #pylearn_root = "/".join(sys.path[0].split("/")[:-2])
+
+    options = defaultdict(bool)
+    options.update(dict([x, y or True] for x, y in getopt.getopt(sys.argv[1:], 'o:', ['epydoc', 'rst', 'help', 'nopdf'])[0]))
+    if options['--help']:
+        print 'Usage: %s [OPTIONS]' % sys.argv[0]
+        print '  -o <dir>: output the html files in the specified dir'
+        print '  --rst: only compile the doc (requires sphinx)'
+        print '  --nopdf: do not produce a PDF file from the doc, only HTML'
+        print '  --epydoc: only compile the api documentation (requires epydoc)'
+        print '  --help: this help'
+        sys.exit(0)
+
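+    # build everything when neither or both of --epydoc/--rst are given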
+    options['--all'] = not (bool(options['--epydoc']) ^ bool(options['--rst']))
+
+    def mkdir(path):
+        try:
+            os.mkdir(path)
+        except OSError:
+            pass
+
+    outdir = options['-o'] or (pylearn_root + '/html')
+    mkdir(outdir)
+    os.chdir(outdir)
+    mkdir("doc")
+    mkdir("api")
+
+    # Make sure the appropriate 'theano' directory is in the PYTHONPATH
+    pythonpath = os.environ.get('PYTHONPATH', '')
+    pythonpath = pylearn_root + ':' + pythonpath
+    os.environ['PYTHONPATH'] = pythonpath
+
+    if options['--all'] or options['--epydoc']:
+        from epydoc.cli import cli
+        sys.path[0:0] = [pylearn_root]
+
+        #Generate HTML doc
+
+        ## This causes problems with the subsequent generation of sphinx doc
+        #sys.argv[:] = ['', '--config', '%s/doc/api/epydoc.conf' % pylearn_root, '-o', 'api']
+        #cli()
+        ## So we use this instead
+        os.system("epydoc --config %s/doc/api/epydoc.conf -o api" % pylearn_root)
+
+        # Generate PDF doc
+        # TODO
+
+    if options['--all'] or options['--rst']:
+        import sphinx
+        sys.path[0:0] = [os.path.join(pylearn_root, 'doc')]
+        sphinx.main(['', '-E', os.path.join(pylearn_root, 'doc'), '.'])
+
+        if not options['--nopdf']:
+            # Generate latex file in a temp directory
+            import tempfile
+            workdir = tempfile.mkdtemp()
+            sphinx.main(['', '-E', '-b', 'latex',
+                os.path.join(pylearn_root, 'doc'), workdir])
+            # Compile to PDF
+            os.chdir(workdir)
+            os.system('make')
+            try:
+                shutil.copy(os.path.join(workdir, 'pylearn.pdf'), outdir)
+                os.chdir(outdir)
+                shutil.rmtree(workdir)
+            except OSError, e:
+                print 'OSError:', e
+            except IOError, e:
+                print 'IOError:', e
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/seriestables.txt	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,291 @@
+.. SeriesTables documentation master file, created by
+   sphinx-quickstart on Wed Mar 10 17:56:41 2010.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Introduction to ``SeriesTables``
+--------------------------------
+
+SeriesTables was created to make it easier to **record scalar data series**, notably the **evolution of errors (training, valid, test) during training**. There are other common use cases I foresee, such as **recording basic statistics (mean, min/max, variance) of parameters** during training, to diagnose problems.
+
+I also think that if such recording is easily accessible, it might lead us to record other statistics, such as stats concerning activations in the network (i.e. to diagnose unit saturation problems).
+
+Each **element of a series is indexed and timestamped**. By default, the index is named "epoch", which means that an epoch number is stored with each row (but this can easily be customized). By default, the timestamp at row creation time is also stored, along with the CPU clock() time. This makes it possible to plot error series against either epoch or training time.
+
+Series are saved in HDF5 files, which I'll introduce briefly.
+
+Introduction to PyTables and HDF5
+---------------------------------
+
+HDF5_ is a file format intended for the storage of big numerical datasets. In practice, for our purposes, you'll create a single ``.h5`` file, in which many tables, corresponding to different series, will be stored. Datasets in a single file are organized hierarchically, in the equivalent of "folders" called "groups". The "files" in the analogy would be our tables.
+
+.. _HDF5: http://www.hdfgroup.org/HDF5/
+
+A useful property of HDF5 is that metadata is stored along with the data itself. Notably, the table names and column names are kept inside the file. We can also attach more complex data, such as a title, or even complex objects (which will be pickled), as attributes.
+
+PyTables_ is a Python library to use the HDF5 format.
+
+.. _PyTables: http://www.pytables.org/moin/HowToUse
+
+Here's a basic Python session in which I create a new file and store a few rows in a single table:
+
+>>> import tables
+>>> 
+>>> hdf5_file = tables.openFile("mytables.h5", "w")
+>>> 
+>>> # Create a new subgroup under the root group "/"
+... mygroup = hdf5_file.createGroup("/", "mygroup")
+>>> 
+>>> # Define the type of data we want to store
+... class MyDescription(tables.IsDescription):
+...     int_column_1 = tables.Int32Col(pos=0)
+...     float_column_1 = tables.Float32Col(pos=1)
+... 
+>>> # Create a table under mygroup
+... mytable = hdf5_file.createTable("/mygroup", "mytable", MyDescription)
+>>> 
+>>> newrow = mytable.row
+>>> 
+>>> # a first row
+... newrow["int_column_1"] = 15
+>>> newrow["float_column_1"] = 30.0
+>>> newrow.append()
+>>> 
+>>> # and a second row
+... newrow["int_column_1"] = 16
+>>> newrow["float_column_1"] = 32.0
+>>> newrow.append()
+>>> 
+>>> # make sure we write to disk
+... hdf5_file.flush()
+>>> 
+>>> hdf5_file.close()
+
+
+And here's a session in which I reload the data and explore it:
+
+>>> import tables
+>>> 
+>>> hdf5_file = tables.openFile("mytables.h5", "r")
+>>> 
+>>> mytable = hdf5_file.getNode("/mygroup", "mytable")
+>>> 
+>>> # tables can be "sliced" this way
+... mytable[0:2]
+array([(15, 30.0), (16, 32.0)], 
+      dtype=[('int_column_1', '<i4'), ('float_column_1', '<f4')])
+>>> 
+>>> # or we can access columns individually
+... mytable.cols.int_column_1[0:2]
+array([15, 16], dtype=int32)
+
+
+Using ``SeriesTables``: a basic example
+---------------------------------------
+
+Here's a very basic example usage:
+
+>>> import tables
+>>> from pylearn.io.seriestables import *
+>>> 
+>>> tables_file = tables.openFile("series.h5", "w")
+>>> 
+>>> error_series = ErrorSeries(error_name="validation_error", \
+...                         table_name="validation_error", \
+...                         hdf5_file=tables_file)
+>>> 
+>>> error_series.append((1,), 32.0)
+>>> error_series.append((2,), 28.0)
+>>> error_series.append((3,), 26.0)
+
+I can then open the file ``series.h5``, which will contain a table named ``validation_error`` with a column named ``epoch`` and another named ``validation_error``. There will also be ``timestamp`` and ``cpuclock`` columns, as this is the default behavior. The table rows will correspond to the data added with ``append()`` above.
+
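+As a quick sanity check, here is a minimal sketch of reading those rows back
+with PyTables (this assumes the table is created directly under the root
+group; adjust the node path if your setup differs):
+
+.. code-block:: python
+
+    import tables
+
+    h5f = tables.openFile("series.h5", "r")
+    table = h5f.getNode("/", "validation_error")
+    # columns are named after the index ("epoch") and the error series itself,
+    # plus the default "timestamp" and "cpuclock" columns
+    print table.cols.epoch[:]
+    print table.cols.validation_error[:]
+    h5f.close()
+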
+Indices
+.......
+
+You may notice that the first parameter in ``append()`` is a tuple. This is because the *index* may have multiple levels. The index is a way for rows to have an order.
+
+In the default case for ErrorSeries, the index only has an "epoch", so the tuple only has one element. But in the ErrorSeries(...) constructor, you could have specified the ``index_names`` parameter, e.g. ``('epoch','minibatch')``, which would let you use both the epoch and the minibatch as the index.
+
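+For instance, a series indexed by both epoch and minibatch could be created
+and filled like this (reusing the ``tables_file`` handle from the example
+above; the appended values are made up):
+
+.. code-block:: python
+
+    training_error = ErrorSeries(error_name="training_error",
+                                 table_name="training_error",
+                                 hdf5_file=tables_file,
+                                 index_names=('epoch', 'minibatch'))
+
+    # the index tuple now supplies one value per index name
+    training_error.append((1, 0), 0.95)
+    training_error.append((1, 1), 0.90)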
+
+Summary of the most useful classes
+----------------------------------
+
+By default, for each of these series, there are also columns for timestamp and CPU clock() value when append() is called. This can be changed with the store_timestamp and store_cpuclock parameters of their constructors.
+
+ErrorSeries
+  This records one floating point (32 bit) value along with an index in a new table. 
+
+AccumulatorSeriesWrapper
+  This wraps another Series and calls its ``append()`` method when its own ``append()`` has been called N times, N being a parameter given when constructing the ``AccumulatorSeriesWrapper``. A simple use case: say you want to store the mean of the training error every 100 minibatches. You create an ErrorSeries, wrap it with an Accumulator and then call its ``append()`` for every minibatch. It will collect the errors, wait until it has 100, then take the mean (with ``numpy.mean``), store it in the ErrorSeries, and start over again (a condensed sketch follows this list).
+  Other "reducing" functions can be used instead of the mean.
+
+BasicStatisticsSeries
+  This stores the mean, the min, the max and the standard deviation of arrays you pass to its ``append()`` method. This is useful, notably, to see how the weights (and other parameters) evolve during training without actually storing the parameters themselves.
+
+SharedParamsStatisticsWrapper
+  This wraps a few BasicStatisticsSeries. It is specifically designed so you can pass it a list of shared (as in theano.shared) parameter arrays. Each array will get its own table, under a new HDF5 group. You can name each table, e.g. "layer1_b", "layer1_W", etc.
+
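+Here is the condensed sketch of the accumulator pattern referred to above
+(``h5f`` is an open HDF5 file as in the other examples; the loop variables and
+the ``train()`` call are placeholders):
+
+.. code-block:: python
+
+    base = ErrorSeries(error_name="training_error",
+                       table_name="training_error",
+                       hdf5_file=h5f,
+                       index_names=('epoch', 'minibatch'))
+
+    # stores the mean of every 100 appended values into `base`
+    training_series = AccumulatorSeriesWrapper(base_series=base,
+                                               reduce_every=100)
+
+    for minibatch_index in range(number_of_minibatches):
+        cost = train(minibatch_index)   # placeholder training step
+        training_series.append((epoch, minibatch_index), cost)
+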
+Example of real usage
+---------------------
+
+The following is a function where I create the series used to record errors and statistics about parameters in a stacked denoising autoencoder script:
+
+.. code-block:: python
+
+	def create_series(num_hidden_layers):
+
+		# Replace series we don't want to save with DummySeries, e.g.
+		# series['training_error'] = DummySeries()
+
+		series = {}
+
+		basedir = os.getcwd()
+
+		h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w")
+
+		# training error is accumulated over 100 minibatches,
+		# then the mean is computed and saved in the training_base series
+		training_base = \
+					ErrorSeries(error_name="training_error",
+						table_name="training_error",
+						hdf5_file=h5f,
+						index_names=('epoch','minibatch'),
+						title="Training error (mean over 100 minibatches)")
+
+		# this series wraps training_base, performs accumulation
+		series['training_error'] = \
+					AccumulatorSeriesWrapper(base_series=training_base,
+						reduce_every=100)
+
+		# valid and test are not accumulated/mean, saved directly
+		series['validation_error'] = \
+					ErrorSeries(error_name="validation_error",
+						table_name="validation_error",
+						hdf5_file=h5f,
+						index_names=('epoch',))
+
+		series['test_error'] = \
+					ErrorSeries(error_name="test_error",
+						table_name="test_error",
+						hdf5_file=h5f,
+						index_names=('epoch',))
+
+		# next we want to store the parameters statistics
+		# so first we create the names for each table, based on 
+		# position of each param in the array
+		param_names = []
+		for i in range(num_hidden_layers):
+			param_names += ['layer%d_W'%i, 'layer%d_b'%i, 'layer%d_bprime'%i]
+		param_names += ['logreg_layer_W', 'logreg_layer_b']
+
+		
+		series['params'] = SharedParamsStatisticsWrapper(
+							new_group_name="params",
+							base_group="/",
+							arrays_names=param_names,
+							hdf5_file=h5f,
+							index_names=('epoch',))
+
+		return series
+
+Then, here's an example of append() usage for each of these series, wrapped in pseudocode:
+
+.. code-block:: python
+
+	series = create_series(num_hidden_layers=3)
+	
+	...
+
+	for epoch in range(num_epochs):
+		for mb_index in range(num_minibatches):
+			train_error = finetune(mb_index)
+			series['training_error'].append((epoch, mb_index), train_error)
+
+		valid_error = compute_validation_error()
+		series['validation_error'].append((epoch,), valid_error)
+
+		test_error = compute_test_error()
+		series['test_error'].append((epoch,), test_error)
+
+		# suppose all_params is a list [layer1_W, layer1_b, ...]
+		# where each element is a shared (as in theano.shared) array
+		series['params'].append((epoch,), all_params)
+
+Other targets for appending (e.g. printing to stdout)
+-----------------------------------------------------
+
+SeriesTables was created with an HDF5 file in mind, but often, for debugging,
+it's useful to be able to redirect the series elsewhere, notably the standard
+output. A mechanism was added to do just that.
+
+You create an ``AppendTarget`` instance (or more than one) and pass it as an
+argument to the Series constructor. For example, to print every appended row
+to the standard output, you use ``StdoutAppendTarget``.
+
+If you want to skip appending to the HDF5 file entirely, this is also
+possible. You simply specify ``skip_hdf5_append=True`` in the constructor. You
+still need to pass in a valid HDF5 file, though, even though nothing will be
+written to it (for, err, legacy reasons).
+
+Here's an example:
+
+.. code-block:: python
+
+	def create_series(num_hidden_layers):
+
+		# Replace series we don't want to save with DummySeries, e.g.
+		# series['training_error'] = DummySeries()
+
+		series = {}
+
+		basedir = os.getcwd()
+
+		h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w")
+
+		# Here we create the new target, with a message prepended
+		# before every row is printed to stdout
+		stdout_target = \
+			StdoutAppendTarget( \
+				prepend='\n-----------------\nValidation error',
+				indent_str='\t')
+
+		# Notice here we won't even write to the HDF5 file
+		series['validation_error'] = \
+			ErrorSeries(error_name="validation_error",
+				table_name="validation_error",
+				hdf5_file=h5f,
+				index_names=('epoch',),
+				other_targets=[stdout_target],
+				skip_hdf5_append=True)
+
+		return series
+
+		
+Now calls to series['validation_error'].append() will print output like this
+to stdout::
+
+	----------------
+	Validation error
+		timestamp : 1271202144
+		cpuclock : 0.12
+		epoch : 1
+		validation_error : 30.0
+
+	----------------
+	Validation error
+		timestamp : 1271202144
+		cpuclock : 0.12
+		epoch : 2
+		validation_error : 26.0
+
+
+Visualizing in vitables
+-----------------------
+
+vitables_ is a program with which you can easily explore an HDF5 ``.h5`` file. Here's a screenshot in which I visualize the series produced by the preceding example:
+
+.. _vitables: http://vitables.berlios.de/
+
+.. image:: images/vitables_example_series.png
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/algorithms/sigmoid_output_SdA.py	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,540 @@
+"""
+ This tutorial introduces stacked denoising auto-encoders (SdA) using Theano.
+
+ Denoising autoencoders are the building blocks for SdA. 
+ They are based on auto-encoders such as the ones used in Bengio et al. 2007.
+ An autoencoder takes an input x and first maps it to a hidden representation
+ y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting 
+ latent representation y is then mapped back to a "reconstructed" vector 
+ z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b').  The weight 
+ matrix W' can optionally be constrained such that W' = W^T, in which case 
+ the autoencoder is said to have tied weights. The network is trained to
+ minimize the reconstruction error (the error between x and z).
+
+ For the denoising autoencoder, during training, first x is corrupted into 
+ \tilde{x}, where \tilde{x} is a partially destroyed version of x by means 
+ of a stochastic mapping. Afterwards y is computed as before (using 
+ \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction 
+ error is now measured between z and the uncorrupted input x, which is 
+ computed as the cross-entropy : 
+      - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)]
+
+
+ References :
+   - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and 
+   Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103,
+   2008
+   - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise
+   Training of Deep Networks, Advances in Neural Information Processing 
+   Systems 19, 2007
+
+"""
+
+import numpy, time, cPickle, gzip, sys, os
+
+import theano
+import theano.tensor as T
+from theano.tensor.shared_randomstreams import RandomStreams
+
+from logistic_sgd import load_data
+from mlp import HiddenLayer
+from dA import dA
+
+
+
+class BinaryLogisticRegressions(object):
+    """Multiple 2-class Logistic Regressions Class
+
+    The logistic regressions are fully described by a weight matrix :math:`W` 
+    and bias vector :math:`b`. Classification is done by projecting data 
+    points onto a set of hyperplanes, the distance to which is used to 
+    determine a class membership probability. 
+    """
+
+
+
+
+    def __init__(self, input, n_in, n_out):
+        """ Initialize the parameters of the logistic regression
+
+        :type input: theano.tensor.TensorType
+        :param input: symbolic variable that describes the input of the 
+                      architecture (one minibatch)
+        
+        :type n_in: int
+        :param n_in: number of input units, the dimension of the space in 
+                     which the datapoints lie
+
+        :type n_out: int
+        :param n_out: number of output units, the dimension of the space in 
+                      which the labels lie
+
+        """ 
+
+        # initialize the weights W to zeros, as a matrix of shape (n_in, n_out)
+        self.W = theano.shared(value=numpy.zeros((n_in,n_out), dtype = theano.config.floatX),
+                                name='W')
+        # initialize the biases b as a vector of n_out 0s
+        self.b = theano.shared(value=numpy.zeros((n_out,), dtype = theano.config.floatX),
+                               name='b')
+
+
+        # compute vector of class-membership probabilities in symbolic form
+        self.p_y_given_x = T.nnet.sigmoid(T.dot(input, self.W)+self.b)
+
+        # compute prediction as class whose probability is maximal in 
+        # symbolic form
+        self.y_pred=T.argmax(self.p_y_given_x, axis=1)
+
+        # parameters of the model
+        self.params = [self.W, self.b]
+
+
+
+
+
+    def negative_log_likelihood(self, y):
+        """Return the mean of the negative log-likelihood of the prediction
+        of this model under a given target distribution.
+
+        .. math::
+
+            \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
+            -\frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \sum_{k}
+            \left[ y^{(i)}_k \log p^{(i)}_k + (1-y^{(i)}_k) \log(1-p^{(i)}_k) \right], \quad p^{(i)} = \sigma(x^{(i)} W + b)
+
+        :type y: theano.tensor.TensorType
+        :param y: corresponds to a vector that gives for each example the
+                  correct label
+
+        Note: we use the mean instead of the sum so that
+              the learning rate is less dependent on the batch size
+        """
+        return -T.mean(T.sum( y*T.log(self.p_y_given_x) + (1-y)*T.log(1-self.p_y_given_x), axis=1 ) )
+
+
+    def errors(self, y):
+        """Return a float representing the number of errors in the minibatch 
+        over the total number of examples of the minibatch ; zero one
+        loss over the size of the minibatch
+
+        :type y: theano.tensor.TensorType
+        :param y: corresponds to a vector that gives for each example the 
+                  correct label
+        """
+
+        # check if y has same dimension of y_pred 
+        if y.ndim != self.y_pred.ndim:
+            raise TypeError('y should have the same shape as self.y_pred', 
+                ('y', y.type, 'y_pred', self.y_pred.type))
+        # check if y is of the correct datatype        
+        if y.dtype.startswith('int'):
+            # the T.neq operator returns a vector of 0s and 1s, where 1
+            # represents a mistake in prediction
+            return T.mean(T.neq(self.y_pred, y))
+        else:
+            raise NotImplementedError()
+
+
+
+class SdA(object):
+    """Stacked denoising auto-encoder class (SdA)
+
+    A stacked denoising autoencoder model is obtained by stacking several
+    dAs. The hidden layer of the dA at layer `i` becomes the input of 
+    the dA at layer `i+1`. The first layer dA gets as input the input of 
+    the SdA, and the hidden layer of the last dA represents the output. 
+    Note that after pretraining, the SdA is dealt with as a normal MLP, 
+    the dAs are only used to initialize the weights.
+    """
+
+    def __init__(self, numpy_rng, theano_rng = None, n_ins = 784, 
+                 hidden_layers_sizes = [500,500], n_outs = 10, 
+                 corruption_levels = [0.1, 0.1]):
+        """ This class is made to support a variable number of layers. 
+
+        :type numpy_rng: numpy.random.RandomState
+        :param numpy_rng: numpy random number generator used to draw initial 
+                    weights
+
+        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
+        :param theano_rng: Theano random generator; if None is given one is 
+                           generated based on a seed drawn from `numpy_rng`
+
+        :type n_ins: int
+        :param n_ins: dimension of the input to the sdA
+
+        :type hidden_layers_sizes: list of ints
+        :param hidden_layers_sizes: intermediate layer sizes, must contain
+                               at least one value
+
+        :type n_outs: int
+        :param n_outs: dimension of the output of the network
+        
+        :type corruption_levels: list of float
+        :param corruption_levels: amount of corruption to use for each 
+                                  layer
+        """
+        
+        self.sigmoid_layers = []
+        self.dA_layers      = []
+        self.params         = []
+        self.n_layers       = len(hidden_layers_sizes)
+
+        assert self.n_layers > 0
+
+        if not theano_rng:
+            theano_rng = RandomStreams(numpy_rng.randint(2**30))
+        # allocate symbolic variables for the data
+        self.x  = T.matrix('x')  # the data is presented as rasterized images
+        self.y  = T.ivector('y') # the labels are presented as 1D vector of 
+                                 # [int] labels
+
+        # The SdA is an MLP, for which all weights of intermediate layers
+        # are shared with a different denoising autoencoder.
+        # We will first construct the SdA as a deep multilayer perceptron,
+        # and when constructing each sigmoidal layer we also construct a
+        # denoising autoencoder that shares weights with that layer.
+        # During pretraining we will train these autoencoders (which will
+        # lead to changing the weights of the MLP as well).
+        # During finetuning we will finish training the SdA by doing
+        # stochastic gradient descent on the MLP.
+
+        for i in xrange( self.n_layers ):
+            # construct the sigmoidal layer
+
+            # the size of the input is either the number of hidden units of 
+            # the layer below or the input size if we are on the first layer
+            if i == 0 :
+                input_size = n_ins
+            else:
+                input_size = hidden_layers_sizes[i-1]
+
+            # the input to this layer is either the activation of the hidden
+            # layer below or the input of the SdA if you are on the first
+            # layer
+            if i == 0 : 
+                layer_input = self.x
+            else:
+                layer_input = self.sigmoid_layers[-1].output
+
+            sigmoid_layer = HiddenLayer(rng   = numpy_rng, 
+                                           input = layer_input, 
+                                           n_in  = input_size, 
+                                           n_out = hidden_layers_sizes[i],
+                                           activation = T.nnet.sigmoid)
+            # add the layer to our list of layers 
+            self.sigmoid_layers.append(sigmoid_layer)
+            # it's arguably a philosophical question...
+            # but we are going to only declare that the parameters of the
+            # sigmoid_layers are parameters of the SdA;
+            # the visible biases in the dA are parameters of those
+            # dAs, but not of the SdA
+            self.params.extend(sigmoid_layer.params)
+        
+            # Construct a denoising autoencoder that shares weights with this
+            # layer
+            dA_layer = dA(numpy_rng = numpy_rng, theano_rng = theano_rng, input = layer_input, 
+                          n_visible = input_size, 
+                          n_hidden  = hidden_layers_sizes[i],  
+                          W = sigmoid_layer.W, bhid = sigmoid_layer.b)
+            self.dA_layers.append(dA_layer)        
+
+        
+        # We now need to add a logistic layer on top of the MLP
+        #self.logLayer = LogisticRegression(\
+        #                 input = self.sigmoid_layers[-1].output,\
+        #                 n_in = hidden_layers_sizes[-1], n_out = n_outs)
+
+        self.logLayer = BinaryLogisticRegressions(\
+                         input = self.sigmoid_layers[-1].output,\
+                         n_in = hidden_layers_sizes[-1], n_out = n_outs)
+        
+        self.params.extend(self.logLayer.params)
+        # construct a function that implements one step of finetuning
+
+        # compute the cost for second phase of training, 
+        # defined as the negative log likelihood 
+        #self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
+        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
+        
+        # compute the gradients with respect to the model parameters
+        # symbolic variable that points to the number of errors made on the
+        # minibatch given by self.x and self.y
+        self.errors = self.logLayer.errors(self.y)
+
+    def pretraining_functions(self, train_set_x, batch_size):
+        ''' Generates a list of functions, each of them implementing one 
+        step in training the dA corresponding to the layer with the same index.
+        The function will require as input the minibatch index, and to train
+        a dA you just need to iterate, calling the corresponding function on 
+        all minibatch indexes.
+
+        :type train_set_x: theano.tensor.TensorType
+        :param train_set_x: Shared variable that contains all datapoints used
+                            for training the dA
+
+        :type batch_size: int
+        :param batch_size: size of a [mini]batch
+
+        :type learning_rate: float
+        :param learning_rate: learning rate used during training for any of 
+                              the dA layers
+        '''
+
+        # index to a [mini]batch
+        index            = T.lscalar('index')   # index to a minibatch
+        corruption_level = T.scalar('corruption')    # amount of corruption to use
+        learning_rate    = T.scalar('lr')    # learning rate to use
+        # number of batches
+        n_batches = train_set_x.value.shape[0] / batch_size
+        # beginning of a batch, given `index`
+        batch_begin = index * batch_size
+        # ending of a batch given `index`
+        batch_end = batch_begin+batch_size
+
+        pretrain_fns = []
+        for dA in self.dA_layers:
+            # get the cost and the updates list
+            cost,updates = dA.get_cost_updates( corruption_level, learning_rate)
+            # compile the theano function    
+            fn = theano.function( inputs = [index, 
+                              theano.Param(corruption_level, default = 0.2),
+                              theano.Param(learning_rate, default = 0.1)], 
+                    outputs = cost, 
+                    updates = updates,
+                    givens  = {self.x :train_set_x[batch_begin:batch_end]})
+            # append `fn` to the list of functions
+            pretrain_fns.append(fn)
+
+        return pretrain_fns
+ 
+
+    def build_finetune_functions(self, datasets, batch_size, learning_rate):
+        '''Generates a function `train` that implements one step of 
+        finetuning, a function `validate` that computes the error on 
+        a batch from the validation set, and a function `test` that 
+        computes the error on a batch from the testing set
+
+        :type datasets: list of pairs of theano.tensor.TensorType
+        :param datasets: It is a list that contains all the datasets;
+                         it has to contain three pairs, `train`,
+                         `valid`, `test` in this order, where each pair
+                         is formed of two Theano variables, one for the 
+                         datapoints, the other for the labels
+
+        :type batch_size: int
+        :param batch_size: size of a minibatch
+
+        :type learning_rate: float
+        :param learning_rate: learning rate used during finetune stage
+        '''
+
+        (train_set_x, train_set_y) = datasets[0]
+        (valid_set_x, valid_set_y) = datasets[1]
+        (test_set_x , test_set_y ) = datasets[2]
+
+        # compute number of minibatches for training, validation and testing
+        n_valid_batches = valid_set_x.value.shape[0] / batch_size
+        n_test_batches  = test_set_x.value.shape[0]  / batch_size
+
+        index   = T.lscalar('index')    # index to a [mini]batch 
+
+        # compute the gradients with respect to the model parameters
+        gparams = T.grad(self.finetune_cost, self.params)
+
+        # compute list of fine-tuning updates
+        updates = {}
+        for param, gparam in zip(self.params, gparams):
+            updates[param] = param - gparam*learning_rate
+
+        train_fn = theano.function(inputs = [index], 
+              outputs =   self.finetune_cost, 
+              updates = updates,
+              givens  = {
+                self.x : train_set_x[index*batch_size:(index+1)*batch_size],
+                self.y : train_set_y[index*batch_size:(index+1)*batch_size]})
+
+        test_score_i = theano.function([index], self.errors,
+                 givens = {
+                   self.x: test_set_x[index*batch_size:(index+1)*batch_size],
+                   self.y: test_set_y[index*batch_size:(index+1)*batch_size]})
+
+        valid_score_i = theano.function([index], self.errors,
+              givens = {
+                 self.x: valid_set_x[index*batch_size:(index+1)*batch_size],
+                 self.y: valid_set_y[index*batch_size:(index+1)*batch_size]})
+
+        # Create a function that scans the entire validation set
+        def valid_score():
+            return [valid_score_i(i) for i in xrange(n_valid_batches)]
+
+        # Create a function that scans the entire test set
+        def test_score():
+            return [test_score_i(i) for i in xrange(n_test_batches)]
+
+        return train_fn, valid_score, test_score
+
+
+
+
+
+
+def test_SdA( finetune_lr = 0.1, pretraining_epochs = 15, \
+              pretrain_lr = 0.05, training_epochs = 1000, \
+              dataset='../data/mnist.pkl.gz', batch_size = 1):
+    """
+    Demonstrates how to train and test a stacked denoising autoencoder.
+
+    This is demonstrated on MNIST.
+
+    :type finetune_lr: float
+    :param finetune_lr: learning rate used in the finetune stage
+    (factor for the stochastic gradient)
+
+    :type pretraining_epochs: int
+    :param pretraining_epochs: number of epochs to do pretraining
+
+    :type pretrain_lr: float
+    :param pretrain_lr: learning rate to be used during pre-training
+
+    :type training_epochs: int
+    :param training_epochs: maximal number of epochs to run the finetuning optimizer
+
+    :type dataset: string
+    :param dataset: path to the pickled dataset
+
+    """
+
+    datasets = load_data(dataset)
+
+    train_set_x, train_set_y = datasets[0]
+    valid_set_x, valid_set_y = datasets[1]
+    test_set_x , test_set_y  = datasets[2]
+
+
+    # compute number of minibatches for training, validation and testing
+    n_train_batches = train_set_x.value.shape[0] / batch_size
+
+    # numpy random generator
+    numpy_rng = numpy.random.RandomState(123)
+    print '... building the model'
+    # construct the stacked denoising autoencoder class
+    sda = SdA( numpy_rng = numpy_rng, n_ins = 28*28, 
+                      hidden_layers_sizes = [1000,1000,1000],
+                      n_outs = 10)
+    
+
+    #########################
+    # PRETRAINING THE MODEL #
+    #########################
+    print '... getting the pretraining functions'
+    pretraining_fns = sda.pretraining_functions( 
+                                        train_set_x   = train_set_x, 
+                                        batch_size    = batch_size ) 
+
+    print '... pre-training the model'
+    start_time = time.clock()  
+    ## Pre-train layer-wise 
+    corruption_levels = [.1,.1,.0]
+    for i in xrange(sda.n_layers):
+        # go through pretraining epochs 
+        for epoch in xrange(pretraining_epochs):
+            # go through the training set
+            c = []
+            for batch_index in xrange(n_train_batches):
+                c.append( pretraining_fns[i](index = batch_index, 
+                         corruption = corruption_levels[i], 
+                         lr = pretrain_lr ) )
+            print 'Pre-training layer %i, epoch %d, cost '%(i,epoch),numpy.mean(c)
+ 
+    end_time = time.clock()
+
+    print >> sys.stderr, ('The pretraining code for file '+os.path.split(__file__)[1]+' ran for %.2fm expected 4.58m in our buildbot' % ((end_time-start_time)/60.))
+    
+    ########################
+    # FINETUNING THE MODEL #
+    ########################
+
+    # get the training, validation and testing function for the model
+    print '... getting the finetuning functions'
+    train_fn, validate_model, test_model = sda.build_finetune_functions ( 
+                datasets = datasets, batch_size = batch_size, 
+                learning_rate = finetune_lr) 
+
+    print '... finetuning the model'
+    # early-stopping parameters
+    patience              = 10*n_train_batches # look at this many examples regardless
+    patience_increase     = 2.    # wait this much longer when a new best is 
+                                  # found
+    improvement_threshold = 0.995 # a relative improvement of this much is 
+                                  # considered significant
+    validation_frequency  = min(n_train_batches, patience/2)
+                                  # go through this many 
+                                  # minibatches before checking the network
+                                  # on the validation set; in this case we 
+                                  # check every epoch 
+
+
+    best_params          = None
+    best_validation_loss = float('inf')
+    test_score           = 0.
+    start_time = time.clock()
+
+    done_looping = False
+    epoch = 0
+
+    while (epoch < training_epochs) and (not done_looping):
+        for minibatch_index in xrange(n_train_batches):
+            minibatch_avg_cost = train_fn(minibatch_index)
+            iter    = epoch * n_train_batches + minibatch_index
+
+            if (iter+1) % validation_frequency == 0:
+                validation_losses = validate_model()
+                this_validation_loss = numpy.mean(validation_losses)
+                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
+                   (epoch, minibatch_index+1, n_train_batches, \
+                    this_validation_loss*100.))
+
+
+                # if we got the best validation score until now
+                if this_validation_loss < best_validation_loss:
+
+                    #improve patience if loss improvement is good enough
+                    if this_validation_loss < best_validation_loss *  \
+                                                improvement_threshold :
+                        patience = max(patience, iter * patience_increase)
+
+                    # save best validation score and iteration number
+                    best_validation_loss = this_validation_loss
+                    best_iter = iter
+
+                    # test it on the test set
+                    test_losses = test_model()
+                    test_score = numpy.mean(test_losses)
+                    print(('     epoch %i, minibatch %i/%i, test error of best '
+                          'model %f %%') % 
+                             (epoch, minibatch_index+1, n_train_batches,
+                              test_score*100.))
+
+
+            if patience <= iter :
+                done_looping = True
+                break
+        epoch = epoch + 1
+
+    end_time = time.clock()
+    print(('Optimization complete with best validation score of %f %%, '
+           'with test performance %f %%') %  
+                 (best_validation_loss * 100., test_score*100.))
+    print >> sys.stderr, ('The training code for file '+os.path.split(__file__)[1]+' ran for %.2fm expected 3.91m in our buildbot' % ((end_time-start_time)/60.))
+
+
+
+
+
+
+if __name__ == '__main__':
+    test_SdA()
+
+
--- a/pylearn/algorithms/stopper.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/algorithms/stopper.py	Wed Aug 11 13:16:05 2010 -0400
@@ -100,7 +100,9 @@
 
         starting = self.iter < self.initial_wait
         waiting = self.iter < (self.patience * self.best_iter)
-        times_up = (time.time() - self.start_time) > self.hard_limit_seconds if self.hard_limit_seconds != None else False
+        if self.hard_limit_seconds is not None:
+            times_up = (time.time() - self.start_time) > self.hard_limit_seconds
+        else: times_up = False
         if (starting or waiting) and not times_up:
             # continue to iterate
             self.iter += 1
--- a/pylearn/algorithms/tests/test_daa.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/algorithms/tests/test_daa.py	Wed Aug 11 13:16:05 2010 -0400
@@ -6,12 +6,14 @@
 import time
 
 import pylearn.algorithms.logistic_regression
-from theano.compile.mode import default_mode
-
-def test_train_daa(mode = default_mode):
+from theano import config
+from pylearn.algorithms.stacker import Stacker
+from pylearn.algorithms.daa import SigmoidXEDenoisingAA
+from pylearn.algorithms.regressor import BinRegressor
+def test_train_daa(mode = config.mode):
 
     ndaa = 3
-    daa = models.Stacker([(models.SigmoidXEDenoisingAA, 'hidden')] * ndaa + [(models.BinRegressor, 'output')],
+    daa = Stacker([(SigmoidXEDenoisingAA, 'hidden')] * ndaa + [(BinRegressor, 'output')],
                          regularize = False)
 
     model = daa.make([4, 20, 20, 20, 1],
@@ -39,7 +41,7 @@
 def test_train_daa2(mode = theano.Mode('c|py', 'fast_run')):
 
     ndaa = 3
-    daa = models.Stacker([(models.SigmoidXEDenoisingAA, 'hidden')] * ndaa + [(pylearn.algorithms.logistic_regression.Module_Nclass, 'pred')],
+    daa = Stacker([(SigmoidXEDenoisingAA, 'hidden')] * ndaa + [(pylearn.algorithms.logistic_regression.Module_Nclass, 'pred')],
                          regularize = False)
 
     model = daa.make([4] + [20] * ndaa + [10],
--- a/pylearn/algorithms/tests/test_exponential_mean.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/algorithms/tests/test_exponential_mean.py	Wed Aug 11 13:16:05 2010 -0400
@@ -1,4 +1,5 @@
 import theano, numpy
+from theano.compile.debugmode import DebugMode
 from pylearn.algorithms import exponential_mean
 
 def test_mean():
@@ -50,6 +51,9 @@
     assert i > rows_to_test
 
 def test_dynamic_normalizer():
+    mode = theano.compile.mode.get_default_mode()
+    if isinstance(mode,DebugMode):
+        mode = 'FAST_RUN'
     x = theano.tensor.dvector()
 
     rows_to_test = 100
@@ -76,7 +80,7 @@
 
     M.f = theano.Method([x], [D.output, M.dn_mean.curval, M.dn_var.curval, M.x_mean.curval] , updates)
 
-    m = M.make()
+    m = M.make(mode=mode)
     m.dn.initialize()
     m.dn_mean.initialize()
     m.dn_var.initialize()
--- a/pylearn/algorithms/tests/test_sgd.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/algorithms/tests/test_sgd.py	Wed Aug 11 13:16:05 2010 -0400
@@ -1,6 +1,11 @@
 import theano
+from theano.compile.debugmode import DebugMode
 from pylearn.algorithms import sgd
 
+mode = theano.compile.mode.get_default_mode()
+if isinstance(mode,DebugMode):
+    mode = 'FAST_RUN'
+
 def test_sgd0():
 
     x = theano.tensor.dscalar('x')
@@ -8,7 +13,7 @@
 
     M = sgd.StochasticGradientDescent([x], (1.0 - x * y)**2, [y], stepsize=0.01)
     M.y = y
-    m = M.make()
+    m = M.make(mode=mode)
     m.y = 5.0
     for i in xrange(100):
         c = m.step_cost(3.0)
@@ -26,7 +31,7 @@
     M = sgd.StochasticGradientDescent([x], (1.0 - x * y)**2, [y], stepsize=lr)
     M.y = y
     M.lr = lr
-    m = M.make()
+    m = M.make(mode=mode)
     m.y = 5.0
     m.lr = 0.01
     for i in xrange(100):
@@ -54,7 +59,7 @@
 
     M = sgd.StochasticGradientDescent([x], (1.0 - x * y)**2, [y])
     M.y = y
-    m = M.make()
+    m = M.make(mode=mode)
     m.y = 5.0
     #there should be a learning rate here by default
     assert m.stepsize is None
--- a/pylearn/dataset_ops/gldataset.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/dataset_ops/gldataset.py	Wed Aug 11 13:16:05 2010 -0400
@@ -17,8 +17,8 @@
 import numpy
 
 import theano
-from theano.compile.sandbox import shared
-from theano.compile.sandbox import pfunc as function
+from theano.compile import shared
+from theano.compile import pfunc as function
 
 _logger = logging.getLogger('gldataset')
 def debug(*msg): _logger.debug(' '.join(str(m) for m in msg))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/dataset_ops/majorminer.py	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,63 @@
+from __future__ import absolute_import
+
+import os
+import numpy
+
+import theano
+import theano.sparse
+import scipy.sparse
+
+from ..datasets.majorminer import Meta
+
+_meta = None
+
+class MajorMiner(theano.Op):
+    """Meta-information of major-miner dataset"""
+
+    def __init__(self, meta=None):
+        global _meta
+        # on construction we make sure a *global* configuration is set
+        # this is done because self.* might get pickled and we don't want to pickle
+        # the whole dataset
+        if _meta is None:
+            if meta is None: _meta = Meta()
+            else: _meta = meta
+        else:
+            if meta is None: pass # no problem, we use global _meta
+            else: raise NotImplementedError('global MajorMiner meta-information already set')
+
+    def __eq__(self, other):
+        return type(self) == type(other)
+    def __hash__(self):
+        return hash(type(self))
+
+    def make_node(self, idx):
+        _idx = theano.tensor.as_tensor_variable(idx, ndim=0)
+        return theano.Apply(self, 
+                [_idx], 
+                [theano.sparse.csr_matrix('MajorMiner.tag_counts'),
+                 theano.generic('MajorMiner.track_path')])
+    def perform(self, node, (idx,), out_storage):
+        global _meta
+        lil = scipy.sparse.lil_matrix((1, len(_meta.tags)), dtype='int8')
+
+        for tag_id, count in _meta.track_tags[idx]:
+            lil[0,tag_id] = count
+
+        out_storage[0][0] = lil.tocsr()
+        out_storage[1][0] = _meta.tracks[idx]
+
+    def grad(self, inputs, output):
+        return [None for i in inputs]
+
+
+def test_basic():
+    a = theano.tensor.lvector()
+    f = theano.function([a], MajorMiner()(a))
+    print 'f([0]):', f([0])
+    rval_0_1 = f([0,1])
+    rval_0_8 = f([0,8])
+
+    assert rval_0_1[1][0] == rval_0_8[1][0] #compare strings
+    assert rval_0_1[1][1] != rval_0_8[1][1] #track 1 != track 8
+
--- a/pylearn/dataset_ops/protocol.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/dataset_ops/protocol.py	Wed Aug 11 13:16:05 2010 -0400
@@ -50,12 +50,12 @@
         single_broadcastable = tuple(single_broadcastable)
         single_type = theano.tensor.Tensor(
                 broadcastable=single_broadcastable, 
-                dtype=dtype,
-                shape=single_shape)
+                dtype=dtype)
+                #shape=single_shape)
         batch_type = theano.tensor.Tensor(
                 broadcastable=(False,)+single_type.broadcastable,
-                dtype=dtype,
-                shape=(batch_size,)+single_type.shape)
+                dtype=dtype)
+                #shape=(batch_size,)+single_type.shape)
         super(TensorDataset, self).__init__(single_type, batch_type)
 
 class TensorFnDataset(TensorDataset):
--- a/pylearn/dataset_ops/shapeset1.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/dataset_ops/shapeset1.py	Wed Aug 11 13:16:05 2010 -0400
@@ -14,14 +14,14 @@
         if dtype.startswith('uint') or dtype.startswith('int'):
             x *= 255
         _train_cache[dtype] = numpy.asarray(x, dtype=dtype)
-        _train_cache['lbl'] = y
+        _train_cache['lbl'] = numpy.asarray(y, dtype='int32')
     return _train_cache[dtype]
 def train_lbl():
     if 'lbl' not in _train_cache:
         x, y = head_train()
         # cache x in some format now that it's read (it isn't that big).
         _train_cache[x.dtype] = x 
-        _train_cache['lbl'] = y
+        _train_cache['lbl'] = numpy.asarray(y, dtype='int32')
     return _train_cache['lbl']
 _valid_cache = {}
 def valid_img(dtype):
@@ -30,14 +30,14 @@
         if dtype.startswith('uint') or dtype.startswith('int'):
             x *= 255
         _valid_cache[dtype] = numpy.asarray(x, dtype=dtype)
-        _valid_cache['lbl'] = y
+        _valid_cache['lbl'] = numpy.asarray(y, dtype='int32')
     return _valid_cache[dtype]
 def valid_lbl():
     if 'lbl' not in _valid_cache:
         x, y = head_valid()
         # cache x in some format now that it's read (it isn't that big).
         _valid_cache[x.dtype] = x 
-        _valid_cache['lbl'] = y
+        _valid_cache['lbl'] = numpy.asarray(y, dtype='int32')
     return _valid_cache['lbl']
 _test_cache = {}
 def test_img(dtype):
@@ -46,14 +46,14 @@
         if dtype.startswith('uint') or dtype.startswith('int'):
             x *= 255
         _test_cache[dtype] = numpy.asarray(x, dtype=dtype)
-        _test_cache['lbl'] = y
+        _test_cache['lbl'] = numpy.asarray(y, dtype='int32')
     return _test_cache[dtype]
 def test_lbl():
     if 'lbl' not in _test_cache:
         x, y = head_test()
         # cache x in some format now that it's read (it isn't that big).
         _test_cache[x.dtype] = x 
-        _test_cache['lbl'] = y
+        _test_cache['lbl'] = numpy.asarray(y, dtype='int32')
     return _test_cache['lbl']
 
 _split_fns = dict(
@@ -77,7 +77,7 @@
 
     x = TensorFnDataset(dtype=dtype, bcast=(False,), fn=(x_fn, (dtype,)),
             single_shape=(1024,))(s_idx)
-    y = TensorFnDataset(dtype='int64', bcast=(), fn=y_fn)(s_idx)
+    y = TensorFnDataset(dtype='int32', bcast=(), fn=y_fn)(s_idx)
     if x.ndim == 1:
         if not rasterized:
             x = x.reshape((32,32))
--- a/pylearn/dataset_ops/tests/test_cifar10.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/dataset_ops/tests/test_cifar10.py	Wed Aug 11 13:16:05 2010 -0400
@@ -1,7 +1,7 @@
 import unittest
 import numpy
 import theano
-from theano.compile.sandbox import pfunc, shared
+from theano.compile import pfunc, shared
 from theano import tensor
 
 from pylearn.dataset_ops.cifar10 import cifar10, forget
--- a/pylearn/datasets/MNIST.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/datasets/MNIST.py	Wed Aug 11 13:16:05 2010 -0400
@@ -1,7 +1,6 @@
 """
 Various routines to load/access MNIST data.
 """
-from __future__ import absolute_import
 
 import os
 import numpy
--- a/pylearn/datasets/cifar10.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/datasets/cifar10.py	Wed Aug 11 13:16:05 2010 -0400
@@ -7,19 +7,46 @@
 import numpy
 import cPickle
 
+import logging
+_logger = logging.getLogger('pylearn.datasets.cifar10')
+
 from pylearn.datasets.config import data_root # config
-from pylearn.datasets.dataset import Dataset
+from pylearn.datasets.dataset import Dataset # dataset.py
 
 def unpickle(file):
-    path = os.path.join(data_root(), 'cifar10', 'cifar-10-batches-py')
-    fname = os.path.join(path, file)
-    print 'loading file %s' % fname
+    fname = os.path.join(data_root(),
+            'cifar10', 
+            'cifar-10-batches-py',
+            file)
+    _logger.info('loading file %s' % fname)
     fo = open(fname, 'rb')
     dict = cPickle.load(fo)
     fo.close()
     return dict
 
-class cifar10():
+class cifar10(object):
+    """
+
+    This class gives access to meta-data of cifar10 dataset.
+    The constructor loads it from <data>/cifar10/cifar-10-batches-py/
+    where <data> is the pylearn data root (os.getenv('PYLEARN_DATA_ROOT')).
+
+    Attributes:
+
+    self.img_shape - the unrasterized image shape of each row in all.x
+    self.img_size - the number of pixels in (aka length of) each row
+    self.n_classes - the number of labels in the dataset (10)
+
+    self.all.x    matrix - all train and test images as rasterized rows
+    self.all.y    vector - all train and test labels as integers
+    self.train.x  matrix - first ntrain rows of all.x
+    self.train.y  vector - first ntrain elements of all.y
+    self.valid.x  matrix - rows ntrain to ntrain+nvalid of all.x
+    self.valid.y  vector - elements ntrain to ntrain+nvalid of all.y
+    self.test.x   matrix - rows ntrain+valid to end of all.x
+    self.test.y   vector - elements ntrain+valid to end of all.y
+
+    """
 
     def __init__(self, dtype='uint8', ntrain=40000, nvalid=10000, ntest=10000):
         assert ntrain + nvalid <= 50000
@@ -44,6 +71,8 @@
 
             nloaded += 10000
             if nloaded >= ntrain + nvalid + ntest: break;
+
+        self.all = Dataset.Obj(x=x, y=y)
         
         self.train = Dataset.Obj(x=x[0:ntrain], y=y[0:ntrain])
         self.valid = Dataset.Obj(x=x[ntrain:ntrain+nvalid],
--- a/pylearn/datasets/dataset.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/datasets/dataset.py	Wed Aug 11 13:16:05 2010 -0400
@@ -108,6 +108,13 @@
 
     img_shape = None # (rows, cols)
 
+    """
+    When inputs 'x' must somehow be preprocessed, 'preprocess' is a function
+    that will take care of it.
+    A cleaner (transparent) alternative would be for x to wrap the data intelligently.
+    """
+    preprocess = None
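+    # e.g. nist_sd.train_valid_test() sets preprocess = nist_to_float_01, so a
+    # caller can map the raw uint8 images into [0, 1] with
+    # dataset.preprocess(dataset.train.x)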
+
 
     """
     TIMESERIES
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/majorminer.py	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,157 @@
+"""
+Load the MajorMiner dataset
+"""
+
+import logging, os,sys
+from .config import data_root
+_logger = logging.getLogger('pylearn.datasets.majorminer')
+
+def three_column(tagfile=None, trackroot=None, expected_tagfile_len=51556):
+    """Load meta-information of major-miner dataset
+
+    Data is stored as a three-column file:
+
+        <tag> <count> <mp3 path>
+
+    This function returns the parsed file as a list of 3-tuples.
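+    For example, the tab-separated line "rock<TAB>3<TAB>artist/track.mp3" is
+    parsed into ('rock', 3, os.path.join(trackroot, 'artist/track.mp3')).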
+    
+    """
+    if tagfile is None:
+        tagfile = os.path.join(data_root(), 'majorminer', 'three_column.txt')
+        _logger.info('Majorminer loading %s'%tagfile)
+
+    if trackroot is None:
+        trackroot = os.path.join(data_root(), 'majorminer')
+        _logger.info('Majorminer using trackroot %s'%trackroot)
+
+    tag_count_track = []
+
+    for line in open(tagfile):
+        if line:
+            tag, count, track = line[:-1].split('\t')
+            tag_count_track.append((tag, int(count), os.path.join(trackroot, track)))
+
+    if expected_tagfile_len:
+        if len(tag_count_track) != expected_tagfile_len:
+            raise Exception('Wrong number of files listed')
+
+    return tag_count_track
+
+try:
+    import mad
+except ImportError:
+    pass
+
+def remove_bad_tracks(three_col, min_seconds=8):
+    """Heuristically filter the three_col data to contain only valid tracks
+    """
+    bad_tracks = set()
+    all_tracks = set()
+
+    silent_tracks = []
+    missing_in_action = []
+    too_short = []
+
+    try:
+        _file = mad.MadFile
+        test_len = True
+    except:
+        _file = file
+        test_len = False
+
+
+    for tag, count, track in three_col:
+        if track in all_tracks:
+            continue
+        all_tracks.add(track)
+        if tag in set(['silence', 'end', 'nothing']):
+            bad_tracks.add(track)
+            silent_tracks.append(track)
+            _logger.debug("silent file: %s" % track)
+            continue
+
+        try:
+            t = _file(track)
+        except IOError:
+            bad_tracks.add(track)
+            missing_in_action.append(track)
+            _logger.debug("missing file: %s"% track)
+            # it is normal to have 2
+            #if len(missing_in_action) > 5:
+                #raise Exception('Too many missing files:', missing_in_action)
+            continue
+
+        if test_len and t.total_time() < min_seconds*1000:
+            # too short
+            bad_tracks.add(track)
+            _logger.debug("short file: %f %s" %(t.total_time(), track))
+            too_short.append((track, t.total_time()))
+            # it is normal to have maybe 10?
+            #if len(too_short) > 40:
+                #raise Exception('Too many short files:', too_short)
+            continue
+
+    if silent_tracks:
+        _logger.warning("removed %i silent files"% len(silent_tracks))
+
+    if missing_in_action:
+        _logger.warning("missing %i files"% len(missing_in_action))
+
+    if too_short:
+        _logger.warning("discarded %i files less than %f seconds long"%(
+            len(too_short), min_seconds))
+
+    _logger.info("kept %i of %i tracks"% (len(all_tracks)-len(bad_tracks),
+        len(all_tracks)))
+
+    # return a cleaned three_column list
+    rval = []
+    for tag, count, track in three_col:
+        if track not in bad_tracks:
+            rval.append((tag, count, track))
+    return rval
+
+
+
+def list_tracks(three_col):
+    tracks = list(set(tup[2] for tup in three_col))
+    tracks.sort()
+    return tracks
+
+def list_tags(three_col):
+    tags = list(set(tup[0] for tup in three_col))
+    tags.sort()
+    return tags
+
+def track_tags(three_col, tracks, tags):
+    """Return the count of each tag for each track
+    [ [(tag_id, count), (tag_id, count), ...],   <---- for tracks[0]
+      [(tag_id, count), (tag_id, count), ...],   <---- for tracks[1]
+      ...
+    ]
+    """
+    tag_id = dict(((t,i) for i,t in enumerate(tags)))
+    track_id = dict(((t,i) for i,t in enumerate(tracks)))
+    rval = [[] for t in tracks]
+    for tag, count, track in three_col:
+        rval[track_id[track]].append((tag_id[tag], count))
+    return rval
+
+
+
+class Meta(object):
+    def __init__(self, tagfile=None, trackroot=None, expected_tagfile_len=51556,
+            filter_broken=True):
+        self.three_column = three_column(tagfile, trackroot, expected_tagfile_len)
+        if filter_broken:
+            self.three_column = remove_bad_tracks(self.three_column)
+        self.tracks = list_tracks(self.three_column)
+        self.tags = list_tags(self.three_column)
+        self.track_tags = track_tags(self.three_column, self.tracks, self.tags)
+
+        _logger.info('MajorMiner meta-information: %i tracks, %i tags' %(
+            len(self.tracks), len(self.tags)))
+
+        #for tt in self.track_tags:
+        #    print tt
+
--- a/pylearn/datasets/miniblocks.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/datasets/miniblocks.py	Wed Aug 11 13:16:05 2010 -0400
@@ -5,7 +5,7 @@
 
 from pylearn.datasets import Dataset
 
-def miniblocks(reweight=None):
+def miniblocks(reweight=None, use_inverse=False):
     # If 'reweight' is not None, then it is an integer N such that each
     # sample is duplicated k times, with k taken uniformly in {1, 2, ..., N}.
     # Some adjustment is made to ensure the dataset size is a multiple of its
@@ -50,6 +50,7 @@
     set = Dataset()
     set.train = Dataset.Obj(x = input, y = target)
     set.test = Dataset.Obj(x = input, y = target)
+    set.img_shape = (4,4)
 
     return set
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/nist_digits.py	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,66 @@
+"""
+Provides a Dataset to access the nist digits dataset. 
+"""
+
+import os, numpy
+from pylearn.io import filetensor as ft
+from pylearn.datasets.config import data_root # config
+from pylearn.datasets.dataset import Dataset
+
+from pylearn.datasets.nist_sd import nist_to_float_11, nist_to_float_01
+
+
+def load(dataset = 'train', attribute = 'data'):
+  """Load the filetensor corresponding to the set and attribute.
+
+  :param dataset: str that is 'train', 'valid' or 'test'
+  :param attribute: str that is 'data' or 'labels'
+  """
+  fn = 'digits_' + dataset + '_' + attribute + '.ft'
+  fn = os.path.join(data_root(), 'nist', 'by_class', 'digits', fn)
+
+  fd = open(fn)
+  data = ft.read(fd)
+  fd.close()
+
+  return data
+
+def train_valid_test(ntrain=285661, nvalid=58646, ntest=58646, path=None,
+    range = '01'):
+  """
+  Load the nist digits dataset as a Dataset.
+
+  @note: the examples are uint8 and the labels are int32.
+  @todo: possibility of loading part of the data.
+  """
+  rval = Dataset()
+
+  # 
+  rval.n_classes = 10
+  rval.img_shape = (32,32)
+
+  if range == '01':
+    rval.preprocess = nist_to_float_01
+  elif range == '11':
+    rval.preprocess = nist_to_float_11
+  else:
+    raise ValueError('Nist Digits dataset does not support range = %s' % range)
+  print "Nist Digits dataset: using preproc will provide inputs in the %s range." \
+      % range
+
+  # train
+  examples = load(dataset = 'train', attribute = 'data')
+  labels = load(dataset = 'train', attribute = 'labels')
+  rval.train = Dataset.Obj(x=examples[:ntrain], y=labels[:ntrain])
+
+  # valid
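+  # note: the validation slice always starts at row 285661 of the 'train' file
+  # (the size of the canonical training split), independently of ntrain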
+  rval.valid = Dataset.Obj(x=examples[285661:285661+nvalid], y=labels[285661:285661+nvalid])
+
+  # test
+  examples = load(dataset = 'test', attribute = 'data')
+  labels = load(dataset = 'test', attribute = 'labels')
+  rval.test = Dataset.Obj(x=examples[:ntest], y=labels[:ntest])
+  
+  return rval
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/nist_sd.py	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,71 @@
+"""
+Provides a Dataset to access the nist digits_reshuffled dataset. 
+"""
+
+import os, numpy
+from pylearn.io import filetensor as ft
+from pylearn.datasets.config import data_root # config
+from pylearn.datasets.dataset import Dataset
+
+def nist_to_float_11(x):
+  return (x - 128.0)/ 128.0
+
+def nist_to_float_01(x):
+  return x / 255.0
+
+def load(dataset = 'train', attribute = 'data'):
+  """Load the filetensor corresponding to the set and attribute.
+
+  :param dataset: str that is 'train', 'valid' or 'test'
+  :param attribute: str that is 'data' or 'labels'
+  """
+  fn = 'digits_reshuffled_' + dataset + '_' + attribute + '.ft'
+  fn = os.path.join(data_root(), 'nist', 'by_class', 'digits_reshuffled', fn)
+
+  fd = open(fn)
+  data = ft.read(fd)
+  fd.close()
+
+  return data
+
+def train_valid_test(ntrain=285661, nvalid=58646, ntest=58646, path=None,
+    range = '01'):
+  """
+  Load the nist reshuffled digits dataset as a Dataset.
+
+  @note: the examples are uint8 and the labels are int32.
+  @todo: possibility of loading part of the data.
+  """
+  rval = Dataset()
+
+  # 
+  rval.n_classes = 10
+  rval.img_shape = (32,32)
+
+  if range == '01':
+    rval.preprocess = nist_to_float_01
+  elif range == '11':
+    rval.preprocess = nist_to_float_11
+  else:
+    raise ValueError('Nist SD dataset does not support range = %s' % range)
+  print "Nist SD dataset: using preproc will provide inputs in the %s range." \
+      % range
+
+  # train
+  examples = load(dataset = 'train', attribute = 'data')
+  labels = load(dataset = 'train', attribute = 'labels')
+  rval.train = Dataset.Obj(x=examples[:ntrain], y=labels[:ntrain])
+
+  # valid
+  examples = load(dataset = 'valid', attribute = 'data')
+  labels = load(dataset = 'valid', attribute = 'labels')
+  rval.valid = Dataset.Obj(x=examples[:nvalid], y=labels[:nvalid])
+
+  # test
+  examples = load(dataset = 'test', attribute = 'data')
+  labels = load(dataset = 'test', attribute = 'labels')
+  rval.test = Dataset.Obj(x=examples[:ntest], y=labels[:ntest])
+  
+  return rval
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/datasets/peaked_modes.py	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,1 @@
+test_modes.py
\ No newline at end of file
--- a/pylearn/datasets/shapeset1.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/datasets/shapeset1.py	Wed Aug 11 13:16:05 2010 -0400
@@ -2,14 +2,12 @@
 Routines to load/access Shapeset1
 """
 
-from __future__ import absolute_import
-
 import os
 import numpy
 
-from ..io.amat import AMat
-from .config import data_root
-from .dataset import Dataset
+from pylearn.io.amat import AMat
+from pylearn.datasets.config import data_root
+from pylearn.datasets.dataset import Dataset
 
 def _head(path, n):
     dat = AMat(path=path, head=n)
--- a/pylearn/io/audio.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/io/audio.py	Wed Aug 11 13:16:05 2010 -0400
@@ -1,10 +1,148 @@
-
+import subprocess, sys
 import numpy
 import theano
 
 from wavread import WavRead, wav_read_int16, wav_read_double
+import mad
 
-try: #define audioread and company only if pygmy.audio can be imported
+def gen_mp3(madfile, dtype, scale):
+    printed = False
+
+    while True:
+        b = madfile.read()
+        if b is None:
+            break
+        b = numpy.frombuffer(b, dtype='int16')
+        #print len(b), b.min(), b.max()
+        if not printed:
+            bb = b.reshape((len(b)/2,2))
+            print bb[1000:1020]
+            #print 'first 10 mp3samples', b[:10]
+            #print b[:10] * (1.0 / 2**15)
+            printed = True
+        n = len(b)
+        assert not (n%2)
+        yield scale*numpy.asarray(b, dtype=dtype).reshape((n/2, 2)) #cast and reshape
+
+class AudioRead(theano.Op):
+    #TODO: add the samplerate as an output
+    """Read an mp3 (other formats not implemented yet)
+
+    Depends on 'madplay' being on system path.
+
+    input - filename
+    output - the contents of the audiofile in pcm format
+    
+    """
+    def __init__(self, channels=2, sr=22050, dtype=theano.config.floatX):
+        """
+        :param channels: output this many channels
+        :param sr: output will be encoded at this samplerate
+        :param dtype: output will have this dtype
+        """
+        self.dtype = dtype
+        if dtype not in ('float32', 'float64', 'int16'):
+            raise NotImplementedError('dtype', dtype)
+        self.channels = channels
+        self.sr = sr
+
+    def __eq__(self, other):
+        return (type(self) == type(other)) and self.dtype == other.dtype \
+                and self.channels == other.channels and self.sr == other.sr
+
+    def __hash__(self):
+        return hash(type(self)) ^ hash(self.dtype) ^ hash(self.channels) ^ hash(self.sr)
+
+    def make_node(self, path):
+        bcast = (False,) *self.channels
+        otype = theano.tensor.TensorType(broadcastable=bcast, dtype=self.dtype)
+        return theano.Apply(self, [path], [otype(),])
+
+    def perform(self, node, (path,), (data_storage, )):
+        if path.upper().endswith('.MP3'):
+            cmd = ['madplay']
+            cmd.extend(['--sample-rate', str(self.sr)])
+            cmd.extend(['-o', 'raw:/dev/stdout'])
+            cmd.extend(['-d',])
+            if self.channels==1:
+                cmd.extend(['--mono'])
+            elif self.channels==2:
+                cmd.extend(['--stereo'])
+            else:
+                raise NotImplementedError("weird number of channels", self.channels)
+            cmd.append(path)
+
+            proc = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
+            proc_stdout, proc_stderr = proc.communicate()
+            assert proc.returncode is not None # process should be finished
+            if proc.returncode:
+                print >> sys.stderr, proc_stderr
+                raise Exception('cmd %s returned code %i'%(' '.join(cmd),proc.returncode))
+
+            int16samples= numpy.frombuffer(proc_stdout, dtype='int16')
+            if self.dtype == 'float32':
+                typedsamples = numpy.asarray(int16samples, dtype='float32') / numpy.float32(2**15)
+            elif self.dtype == 'float64':
+                typedsamples = int16samples * (1.0/2**15)
+            elif self.dtype == 'int16':
+                typedsamples = int16samples
+            else:
+                raise NotImplementedError()
+
+            if self.channels==2:
+                typedsamples = typedsamples.reshape((len(typedsamples)/2,2))
+        else: 
+            #TODO: if extension is .wav use the 'wave' module in the stdlib
+            #      see test_audioread below for usage
+            raise NotImplementedError()
+
+        assert typedsamples.dtype == self.dtype
+        assert len(typedsamples.shape) == self.channels, (typedsamples.shape, self.channels)
+        data_storage[0] = typedsamples
+
+    def grad(self, inputs, g_output):
+        return [None for i in inputs]
+
+
+def test_audioread():
+    #
+    # Not really a unit test because it depends on files that are probably not around anymore.
+    # Still, the basic idea is to decode externally, and compare with wavread.
+    #
+
+    mp3path = "/home/bergstra/data/majorminer/mp3/Mono/Formica Blues/03 Slimcea Girl_003.20_003.30.mp3"
+
+    dstorage = [None]
+    AudioRead(channels=1, dtype='float32', sr=44100).perform(None, (mp3path,), (dstorage, ))
+    mp3samples = dstorage[0]
+
+    wavpath = "/home/bergstra/tmp/blah2.wav"
+    import wave, numpy
+    wavfile = wave.open(wavpath)
+    assert wavfile.getsampwidth()==2 # bytes
+    wavsamples = numpy.frombuffer(
+            wavfile.readframes(wavfile.getnframes()),
+            dtype='int16')
+    wavsamples = wavsamples.reshape((wavfile.getnframes(), wavfile.getnchannels()))
+    wavsamples_as_float = numpy.asarray(wavsamples, dtype='float32') / 2**15
+
+    print 'wavsamples 1000:1020:', wavsamples[1000:1020].mean(axis=1)
+    print 'mp3samples 1000:1020:', mp3samples[1000:1020]*2**15
+    print 'wavsample range', wavsamples.min(), wavsamples.max()
+    print 'mp3sample range', mp3samples.min(), mp3samples.max()
+
+    print mp3samples.shape, mp3samples.dtype
+    print wavsamples.shape, wavsamples.dtype
+
+    #assert mp3samples.shape == wavsamples.shape
+    #assert mp3samples.dtype == wavsamples_as_float.dtype
+
+    #print wavsamples_as_float[:5]
+    #print mp3samples[:5]
+
+
+
+if 0: ### OLD CODE USING PYGMY
     import pygmy.audio
 
     class AudioRead(theano.Op):
@@ -42,6 +180,7 @@
         def make_node(self, path):
             out_type = theano.tensor.dvector if self.mono else theano.tensor.dmatrix
             return theano.Apply(self, [path], [out_type(), theano.tensor.dscalar()])
+
         def perform(self, node, (path,), (data_storage, sr_storage)):
             data, sr, dz = pygmy.audio.audioread(path, 
                     mono=self.mono, 
@@ -64,6 +203,3 @@
 
     audioread = AudioRead()
     audioread_mono = AudioRead(mono=True)
-except ImportError:
-    pass
-
--- a/pylearn/io/filetensor.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/io/filetensor.py	Wed Aug 11 13:16:05 2010 -0400
@@ -183,6 +183,7 @@
 
     """
     magic_t, elsize, ndim, dim, dim_size = _read_header(f,debug)
+    f_start = f.tell()
 
     rval = None
     if subtensor is None:
@@ -192,8 +193,8 @@
             raise NotImplementedError('slice with step', subtensor.step)
         if subtensor.start not in (None, 0):
             bytes_per_row = _prod(dim[1:]) * elsize
-            raise NotImplementedError('slice with start', subtensor.start)
-        dim[0] = min(dim[0], subtensor.stop)
+            f.seek(f_start+subtensor.start * bytes_per_row)
+        dim[0] = min(dim[0], subtensor.stop) - (subtensor.start or 0)
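+        # when a start offset was given, the seek above already skipped those
+        # rows, so dim[0] must count only the rows numpy.fromfile will read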
         rval = numpy.fromfile(f, dtype=magic_t, count=_prod(dim)).reshape(dim)
     else:
         raise NotImplementedError('subtensor access not written yet:', subtensor) 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/io/seriestables/__init__.py	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,2 @@
+from series import ErrorSeries, BasicStatisticsSeries, AccumulatorSeriesWrapper, SeriesArrayWrapper, SharedParamsStatisticsWrapper, DummySeries, StdoutAppendTarget, AppendTarget
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/io/seriestables/series.py	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,688 @@
+import tables
+
+import numpy
+import time
+
+##############################################################################
+# Utility functions to create IsDescription objects (pytables data types)
+
+'''
+The way these "IsDescription constructors" work is simple: write the
+code as if it were in a file, then exec()ute it, leaving us with
+a local-scoped LocalDescription which may be used to call createTable.
+
+It's a small hack, but it's necessary as the names of the columns
+are retrieved based on the variable name, which we can't programmatically set
+otherwise.
+'''
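+# For instance, _get_description_with_n_ints_n_floats(('epoch',), ('error',))
+# exec()s (roughly) the following source:
+#
+#     class LocalDescription(tables.IsDescription):
+#         timestamp = tables.Time32Col(pos=0)
+#         cpuclock = tables.Float64Col(pos=1)
+#         epoch = tables.Int64Col(pos=2)
+#         error = tables.Float32Col(pos=3)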
+
+def _get_description_timestamp_cpuclock_columns(store_timestamp, store_cpuclock, pos=0):
+    toexec = ""
+
+    if store_timestamp:
+        toexec += "\ttimestamp = tables.Time32Col(pos="+str(pos)+")\n"
+        pos += 1
+
+    if store_cpuclock:
+        toexec += "\tcpuclock = tables.Float64Col(pos="+str(pos)+")\n"
+        pos += 1
+
+    return toexec, pos
+
+def _get_description_n_ints(int_names, int_width=64, pos=0):
+    """
+    Begins construction of a class inheriting from IsDescription
+    to construct an HDF5 table with index columns named with int_names.
+
+    See Series().__init__ to see how those are used.
+    """
+    int_constructor = "tables.Int64Col"
+    if int_width == 32:
+        int_constructor = "tables.Int32Col"
+    elif not int_width in (32, 64):
+        raise "int_width must be left unspecified, or should equal 32 or 64"
+
+    toexec = ""
+
+    for n in int_names:
+        toexec += "\t" + n + " = " + int_constructor + "(pos=" + str(pos) + ")\n"
+        pos += 1
+
+    return toexec, pos
+
+def _get_description_with_n_ints_n_floats(int_names, float_names, 
+                        int_width=64, float_width=32,
+                        store_timestamp=True, store_cpuclock=True):
+    """
+    Constructs a class to be used when constructing a table with PyTables.
+
+    This is useful to construct a series with an index with multiple levels.
+    E.g. if you want to index your "validation error" with "epoch" first, then
+    "minibatch_index" second, you'd use two "int_names".
+
+    Parameters
+    ----------
+    int_names : tuple of str
+        Names of the int (e.g. index) columns
+    float_names : tuple of str
+        Names of the float (e.g. error) columns
+    int_width : {'32', '64'}
+        Type of ints.
+    float_width : {'32', '64'}
+        Type of floats.
+    store_timestamp : bool
+        See __init__ of Series
+    store_cpuclock : bool
+        See __init__ of Series
+
+    Returns
+    -------
+    A class object, to pass to createTable()
+    """
+
+    toexec = "class LocalDescription(tables.IsDescription):\n"
+
+    toexec_, pos = _get_description_timestamp_cpuclock_columns(store_timestamp, store_cpuclock)
+    toexec += toexec_
+
+    toexec_, pos = _get_description_n_ints(int_names, int_width=int_width, pos=pos)
+    toexec += toexec_
+
+    float_constructor = "tables.Float32Col"
+    if float_width == 64:
+        float_constructor = "tables.Float64Col"
+    elif not float_width in (32, 64):
+        raise "float_width must be left unspecified, or should equal 32 or 64"
+
+    for n in float_names:
+        toexec += "\t" + n + " = " + float_constructor + "(pos=" + str(pos) + ")\n"
+        pos += 1
+
+    exec(toexec)
+
+    return LocalDescription
+
+
+##############################################################################
+# Generic target helpers, other than HDF5 itself
+
+class AppendTarget(object):
+    def __init__(self):
+        pass
+
+    def append(self, table, row):
+        pass
+
+class StdoutAppendTarget(AppendTarget):
+    '''
+    Every append() translates into the row being printed on stdout,
+    each field on a line of the form "column_name : value"
+    '''
+    def __init__(self, prepend=None, indent_str='\t'):
+        '''
+        Parameters
+        ----------
+        prepend : str
+            String to prepend before each "append()" is dumped on stdout.
+        indent_str : str
+            Chars to prepend to each line
+        '''
+        self.prepend = prepend
+        self.indent_str = indent_str
+
+    def append(self, table, row):
+        if not self.prepend:
+            print table._v_pathname
+        else:
+            print self.prepend
+        pretty_print_row(table, row, self.indent_str)
+
+def pretty_print_row(table, row, indent):
+    for key in table.colnames:
+        print indent, key, ":", row[key]
+
+class CallbackAppendTarget(AppendTarget):
+    '''
+    Mostly to be used for tests.
+    '''
+    def __init__(self, callback):
+        self.callback = callback
+
+    def append(self, table, row):
+        self.callback(table, row)
+
+##############################################################################
+# Series classes
+
+# Shortcut to allow passing a single int as index, instead of a tuple
+def _index_to_tuple(index):
+    if type(index) == tuple:
+        return index
+
+    if type(index) == list:
+        index = tuple(index)
+        return index
+
+    try:
+        if index % 1 > 0.001 and index % 1 < 0.999:
+            raise
+        idx = long(index)
+        return (idx,)
+    except:
+        raise TypeError("index must be a tuple of integers, or at least a single integer")
+
+class Series(object):
+    """
+    Base Series class, with minimal arguments and type checks. 
+
+    Yet it cannot be used by itself (its append() method raises NotImplementedError).
+    """
+
+    def __init__(self, table_name, hdf5_file, index_names=('epoch',), 
+                    title="", hdf5_group='/', 
+                    store_timestamp=True, store_cpuclock=True,
+                    other_targets=[], skip_hdf5_append=False):
+        """Basic arguments each Series must get.
+
+        Parameters
+        ----------
+        table_name : str
+            Name of the table to create under group "hdf5_group" (another
+            parameter). No spaces, i.e. follow variable naming restrictions.
+        hdf5_file : open HDF5 file
+            File opened with openFile() in PyTables (ie. return value of 
+            openFile).
+        index_names : tuple of str
+            Columns to use as index for elements in the series, other 
+            example would be ('epoch', 'minibatch'). This would then allow
+            you to call append(index, element) with index made of two ints,
+            one for epoch index, one for minibatch index in epoch.
+        title : str
+            Title to attach to this table as metadata. Can contain spaces 
+            and be longer than the table_name.
+        hdf5_group : str
+            Path of the group (kind of a file) in the HDF5 file under which
+            to create the table.
+        store_timestamp : bool
+            Whether to create a column for timestamps and store them with 
+            each record.
+        store_cpuclock : bool
+            Whether to create a column for cpu clock and store it with 
+            each record.
+        other_targets : list of AppendTarget instances
+            Each target's append(table, row) method is called for every
+            record appended to this series.
+        """
+
+        #########################################
+        # checks
+
+        if type(table_name) != str:
+            raise TypeError("table_name must be a string")
+        if table_name == "":
+            raise ValueError("table_name must not be empty")
+
+        if not isinstance(hdf5_file, tables.file.File):
+            raise TypeError("hdf5_file must be an open HDF5 file (use tables.openFile)")
+        #if not ('w' in hdf5_file.mode or 'a' in hdf5_file.mode):
+        #    raise ValueError("hdf5_file must be opened in write or append mode")
+
+        if type(index_names) != tuple:
+            raise TypeError("index_names must be a tuple of strings." + \
+                    "If you have only one element in the tuple, don't forget " +\
+                    "to add a comma, e.g. ('epoch',).")
+        for name in index_names:
+            if type(name) != str:
+                raise TypeError("index_names must only contain strings, but also"+\
+                        "contains a "+str(type(name))+".")
+
+        if type(title) != str:
+            raise TypeError("title must be a string, even if empty")
+
+        if type(hdf5_group) != str:
+            raise TypeError("hdf5_group must be a string")
+
+        if type(store_timestamp) != bool:
+            raise TypeError("store_timestamp must be a bool")
+
+        if type(store_cpuclock) != bool:
+            raise TypeError("store_timestamp must be a bool")
+
+        if type(other_targets) != list:
+            raise TypeError("other_targets must be a list")
+        else:
+            for t in other_targets:
+                if not isinstance(t, AppendTarget):
+                    raise TypeError("other_targets elements must be instances of AppendTarget")
+
+        if type(skip_hdf5_append) != bool:
+            raise TypeError("skip_hdf5_append must be a bool")
+
+        #########################################
+
+        self.table_name = table_name
+        self.hdf5_file = hdf5_file
+        self.index_names = index_names
+        self.title = title
+        self.hdf5_group = hdf5_group
+
+        self.store_timestamp = store_timestamp
+        self.store_cpuclock = store_cpuclock
+
+        self.other_targets = other_targets
+        self.skip_hdf5_append = skip_hdf5_append
+
+    def append(self, index, element):
+        raise NotImplementedError
+
+    def _timestamp_cpuclock(self, newrow):
+        if self.store_timestamp:
+            newrow["timestamp"] = time.time()
+
+        if self.store_cpuclock:
+            newrow["cpuclock"] = time.clock()
+
+class DummySeries():
+    """
+    To put in a series dictionary instead of a real series, to do nothing
+    when we don't want a given series to be saved.
+
+    E.g. if we'd normally have a "training_error" series in a dictionary
+    of series, the training loop would have something like this somewhere:
+
+        series["training_error"].append((15,), 20.0)
+
+    but if we don't want to save the training errors this time, we simply
+    do
+
+        series["training_error"] = DummySeries()
+    """
+    def append(self, index, element):
+        pass
+
+class ErrorSeries(Series):
+    """
+    Most basic Series: saves a single float (called an Error as this is
+    the most common use case I foresee) along with an index (epoch, for
+    example) and a timestamp/cpu clock for each of these floats.
+    """
+
+    def __init__(self, error_name, table_name, 
+                    hdf5_file, index_names=('epoch',), 
+                    title="", hdf5_group='/', 
+                    store_timestamp=True, store_cpuclock=True,
+                    other_targets=[], skip_hdf5_append=False):
+        """
+        For most parameters, see Series.__init__
+
+        Parameters
+        ----------
+        error_name : str
+            In the HDF5 table, column name for the error float itself.
+        """
+
+        # most type/value checks are performed in Series.__init__
+        Series.__init__(self, table_name, hdf5_file, index_names, title, 
+                            hdf5_group=hdf5_group,
+                            store_timestamp=store_timestamp,
+                            store_cpuclock=store_cpuclock,
+                            other_targets=other_targets,
+                            skip_hdf5_append=skip_hdf5_append)
+
+        if type(error_name) != str:
+            raise TypeError("error_name must be a string")
+        if error_name == "":
+            raise ValueError("error_name must not be empty")
+
+        self.error_name = error_name
+
+        self._create_table()
+
+    def _create_table(self):
+       table_description = _get_description_with_n_ints_n_floats( \
+                                  self.index_names, (self.error_name,),
+                                  store_timestamp=self.store_timestamp,
+                                  store_cpuclock=self.store_cpuclock)
+
+       self._table = self.hdf5_file.createTable(self.hdf5_group,
+                            self.table_name, 
+                            table_description,
+                            title=self.title)
+
+
+    def append(self, index, error):
+        """
+        Parameters
+        ----------
+        index : tuple of int
+            Following index_names passed to __init__, e.g. (12, 15) if 
+            index_names were ('epoch', 'minibatch_size').
+            A single int (not tuple) is acceptable if index_names has a single 
+            element.
+            An array will be cast to a tuple, as a convenience.
+
+        error : float
+            Next error in the series.
+        """
+        index = _index_to_tuple(index)
+
+        if len(index) != len(self.index_names):
+            raise ValueError("index provided does not have the right length (expected " \
+                            + str(len(self.index_names)) + " got " + str(len(index)))
+
+        # other checks are implicit when calling newrow[..] =,
+        # which should throw an error if not of the right type
+
+        newrow = self._table.row
+
+        # Columns for index in table are based on index_names
+        for col_name, value in zip(self.index_names, index):
+            newrow[col_name] = value
+        newrow[self.error_name] = error
+
+        # adds timestamp and cpuclock to newrow if necessary
+        self._timestamp_cpuclock(newrow)
+
+        for t in self.other_targets:
+            t.append(self._table, newrow)
+
+        if not self.skip_hdf5_append:
+            newrow.append()
+
+            self.hdf5_file.flush()
+
+# Does not inherit from Series because it does not itself need to
+# access the hdf5_file and does not need a series_name (provided
+# by the base_series.)
+class AccumulatorSeriesWrapper():
+    '''
+    Wraps a Series by accumulating the elements passed to its append()
+    method and "reducing" them (e.g. calling numpy.mean(list)) once every
+    "reduce_every" calls.
+    '''
+
+    def __init__(self, base_series, reduce_every, reduce_function=numpy.mean):
+        """
+        Parameters
+        ----------
+        base_series : Series
+            This object must have an append(index, value) function.
+
+        reduce_every : int
+            Apply the reduction function (e.g. mean()) every time we get this 
+            number of elements. E.g. if this is 100, then every 100 numbers 
+            passed to append(), we'll take the mean and call append(this_mean) 
+            on the BaseSeries.
+
+        reduce_function : function
+            Must take as input an array of "elements", as passed to (this 
+            accumulator's) append(). Basic case would be to take an array of 
+            floats and sum them into one float, for example.
+        """
+        self.base_series = base_series
+        self.reduce_function = reduce_function
+        self.reduce_every = reduce_every
+
+        self._buffer = []
+
+    
+    def append(self, index, element):
+        """
+        Parameters
+        ----------
+        index : tuple of int
+            The index used is the one of the last element reduced. E.g. if
+            you accumulate over the first 1000 minibatches, the index
+            passed to the base_series.append() function will be 1000.
+            A single int (not tuple) is acceptable if index_names has a single 
+            element.
+            An array will be cast to a tuple, as a convenience.
+
+        element : float
+            Element that will be accumulated.
+        """
+        self._buffer.append(element)
+
+        if len(self._buffer) == self.reduce_every:
+            reduced = self.reduce_function(self._buffer)
+            self.base_series.append(index, reduced)
+            self._buffer = []
+
+        # The >= case should never happen, except if lists
+        # were appended by accessing _buffer externally (when it's
+        # intended to be private), which should be a red flag.
+        assert len(self._buffer) < self.reduce_every
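+    # Usage sketch (names below are placeholders): record only the mean training
+    # cost once every 100 minibatches instead of every single value:
+    #
+    #     acc = AccumulatorSeriesWrapper(base_series=train_cost_series,
+    #                                    reduce_every=100)
+    #     acc.append((iteration,), minibatch_cost)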
+
+# Outside of class to fix an issue with exec in Python 2.6.
+# My sorries to the god of pretty code.
+def _BasicStatisticsSeries_construct_table_toexec(index_names, store_timestamp, store_cpuclock):
+    toexec = "class LocalDescription(tables.IsDescription):\n"
+
+    toexec_, pos = _get_description_timestamp_cpuclock_columns(store_timestamp, store_cpuclock)
+    toexec += toexec_
+
+    toexec_, pos = _get_description_n_ints(index_names, pos=pos)
+    toexec += toexec_
+
+    toexec += "\tmean = tables.Float32Col(pos=" + str(pos) + ")\n"
+    toexec += "\tmin = tables.Float32Col(pos=" + str(pos+1) + ")\n"
+    toexec += "\tmax = tables.Float32Col(pos=" + str(pos+2) + ")\n"
+    toexec += "\tstd = tables.Float32Col(pos=" + str(pos+3) + ")\n"
+   
+    # This creates "LocalDescription", which we may then use
+    exec(toexec)
+
+    return LocalDescription
+
+# Defaults functions for BasicStatsSeries. These can be replaced.
+_basic_stats_functions = {'mean': lambda(x): numpy.mean(x),
+                    'min': lambda(x): numpy.min(x),
+                    'max': lambda(x): numpy.max(x),
+                    'std': lambda(x): numpy.std(x)}
+
+class BasicStatisticsSeries(Series):
+    
+    def __init__(self, table_name, hdf5_file, 
+                    stats_functions=_basic_stats_functions, 
+                    index_names=('epoch',), title="", hdf5_group='/', 
+                    store_timestamp=True, store_cpuclock=True,
+                    other_targets=[], skip_hdf5_append=False):
+        """
+        For most parameters, see Series.__init__
+
+        Parameters
+        ----------
+        table_name : str
+            Not optional here; see Series.__init__.
+
+        stats_functions : dict, optional
+            Dictionary with a function for each key "mean", "min", "max", 
+            "std". The function must take whatever is passed to append(...) 
+            and return a single number (float).
+        """
+
+        # Most type/value checks performed in Series.__init__
+        Series.__init__(self, table_name, hdf5_file, index_names, title, 
+                            store_timestamp=store_timestamp,
+                            store_cpuclock=store_cpuclock,
+                            other_targets=other_targets,
+                            skip_hdf5_append=skip_hdf5_append)
+
+        if type(hdf5_group) != str:
+            raise TypeError("hdf5_group must be a string")
+
+        if type(stats_functions) != dict:
+            # just a basic check. We'll suppose caller knows what he's doing.
+            raise TypeError("stats_functions must be a dict")
+
+        self.hdf5_group = hdf5_group
+
+        self.stats_functions = stats_functions
+
+        self._create_table()
+
+    def _create_table(self):
+        table_description = \
+                _BasicStatisticsSeries_construct_table_toexec( \
+                    self.index_names,
+                    self.store_timestamp, self.store_cpuclock)
+
+        self._table = self.hdf5_file.createTable(self.hdf5_group,
+                         self.table_name, table_description)
+
+    def append(self, index, array):
+        """
+        Parameters
+        ----------
+        index : tuple of int
+            Following index_names passed to __init__, e.g. (12, 15) 
+            if index_names were ('epoch', 'minibatch_size')
+            A single int (not tuple) is acceptable if index_names has a single 
+            element.
+            An array will be cast to a tuple, as a convenience.
+
+        array
+            Is of whatever type the stats_functions passed to
+            __init__ can take. Default is anything numpy.mean(),
+            min(), max(), std() can take. 
+        """
+        index = _index_to_tuple(index)
+
+        if len(index) != len(self.index_names):
+            raise ValueError("index provided does not have the right length (expected " \
+                            + str(len(self.index_names)) + " got " + str(len(index)))
+
+        newrow = self._table.row
+
+        for col_name, value in zip(self.index_names, index):
+            newrow[col_name] = value
+
+        newrow["mean"] = self.stats_functions['mean'](array)
+        newrow["min"] = self.stats_functions['min'](array)
+        newrow["max"] = self.stats_functions['max'](array)
+        newrow["std"] = self.stats_functions['std'](array)
+
+        self._timestamp_cpuclock(newrow)
+
+        for t in self.other_targets:
+            t.append(self._table, newrow)
+
+        if not self.skip_hdf5_append:
+            newrow.append()
+
+            self.hdf5_file.flush()
+
+class SeriesArrayWrapper():
+    """
+    Simply redistributes any number of elements to sub-series, calling each
+    sub-series' respective append().
+
+    To use if you have many elements to append in similar series, e.g. if you 
+    have an array containing [train_error, valid_error, test_error], and 3 
+    corresponding series, this allows you to simply pass this array of 3 
+    values to append() instead of passing each element to each individual 
+    series in turn.
+    """
+
+    def __init__(self, base_series_list):
+        """
+        Parameters
+        ----------
+        base_series_list : array or tuple of Series
+            You must have previously created and configured each of those
+            series, then put them in an array. This array must follow the
+            same order as the array passed as ``elements`` parameter of
+            append().
+        """
+        self.base_series_list = base_series_list
+
+    def append(self, index, elements):
+        """
+        Parameters
+        ----------
+        index : tuple of int
+            See for example ErrorSeries.append()
+
+        elements : array or tuple
+            Array or tuple of elements that will be passed down to
+            the base_series passed to __init__, in the same order.
+        """
+        if len(elements) != len(self.base_series_list):
+            raise ValueError("not enough or too much elements provided (expected " \
+                            + str(len(self.base_series_list)) + " got " + str(len(elements)))
+
+        for series, el in zip(self.base_series_list, elements):
+            series.append(index, el)
+
+class SharedParamsStatisticsWrapper(SeriesArrayWrapper):
+    '''
+    Save mean, min/max, std of shared parameters placed in an array.
+
+    Here "shared" means "theano.shared", which means elements of the
+    array will have a .value to use for numpy.mean(), etc.
+
+    This inherits from SeriesArrayWrapper, which provides the append()
+    method.
+    '''
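+    # Usage sketch (hypothetical names): track per-epoch statistics of two
+    # theano.shared parameters W and b:
+    #
+    #     stats = SharedParamsStatisticsWrapper(arrays_names=('W', 'b'),
+    #                                           new_group_name='params_stats',
+    #                                           hdf5_file=h5f)
+    #     stats.append((epoch,), [W, b])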
+
+    def __init__(self, arrays_names, new_group_name, hdf5_file,
+                    base_group='/', index_names=('epoch',), title="",
+                    store_timestamp=True, store_cpuclock=True,
+                    other_targets=[], skip_hdf5_append=False):
+        """
+        For other parameters, see Series.__init__
+
+        Parameters
+        ----------
+        arrays_names : array or tuple of str
+            Name of each array, in order of the array passed to append(). E.g. 
+            ('layer1_b', 'layer1_W', 'layer2_b', 'layer2_W')
+
+        new_group_name : str
+            Name of a new HDF5 group which will be created under base_group to 
+            store the new series.
+
+        base_group : str
+            Path of the group under which to create the new group which will
+            store the series.
+
+        title : str
+            Here the title is attached to the new group, not a table.
+
+        store_timestamp : bool
+            Here timestamp and cpuclock are stored in *each* table
+
+        store_cpuclock : bool
+            Here timestamp and cpuclock are stored in *each* table
+        """
+
+        # most other checks done when calling BasicStatisticsSeries
+        if type(new_group_name) != str:
+            raise TypeError("new_group_name must be a string")
+        if new_group_name == "":
+            raise ValueError("new_group_name must not be empty")
+
+        base_series_list = []
+
+        new_group = hdf5_file.createGroup(base_group, new_group_name, title=title)
+
+        stats_functions = {'mean': lambda(x): numpy.mean(x.value),
+                    'min': lambda(x): numpy.min(x.value),
+                    'max': lambda(x): numpy.max(x.value),
+                    'std': lambda(x): numpy.std(x.value)}
+
+        for name in arrays_names:
+            base_series_list.append(
+                        BasicStatisticsSeries(
+                                table_name=name,
+                                hdf5_file=hdf5_file,
+                                index_names=index_names,
+                                stats_functions=stats_functions,
+                                hdf5_group=new_group._v_pathname,
+                                store_timestamp=store_timestamp,
+                                store_cpuclock=store_cpuclock,
+                                other_targets=other_targets,
+                                skip_hdf5_append=skip_hdf5_append))
+
+        SeriesArrayWrapper.__init__(self, base_series_list)
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/io/seriestables/test_series.py	Wed Aug 11 13:16:05 2010 -0400
@@ -0,0 +1,390 @@
+import tempfile
+
+import numpy
+import numpy.random
+
+from jobman import DD
+
+import tables
+
+from series import *
+import series
+
+#################################################
+# Utils
+
+def compare_floats(f1,f2):
+    if abs(f1 - f2) < 1e-3:
+        return True
+    return False
+
+def compare_lists(it1, it2, floats=False):
+    if len(it1) != len(it2):
+        return False
+
+    for el1,  el2 in zip(it1, it2):
+        if floats:
+            if not compare_floats(el1,el2):
+                return False
+        elif el1 != el2:
+            return False
+
+    return True
+
+#################################################
+# Basic Series class tests
+
+def test_Series_types():
+    pass
+
+#################################################
+# ErrorSeries tests
+
+def test_ErrorSeries_common_case(h5f=None):
+    if not h5f:
+        h5f_path = tempfile.NamedTemporaryFile().name
+        h5f = tables.openFile(h5f_path, "w")
+
+    validation_error = series.ErrorSeries(error_name="validation_error", table_name="validation_error",
+                                hdf5_file=h5f, index_names=('epoch','minibatch'),
+                                title="Validation error indexed by epoch and minibatch")
+
+    # (1,1), (1,2) etc. are (epoch, minibatch) index
+    validation_error.append((1,1), 32.0)
+    validation_error.append((1,2), 30.0)
+    validation_error.append((2,1), 28.0)
+    validation_error.append((2,2), 26.0)
+
+    h5f.close()
+
+    h5f = tables.openFile(h5f_path, "r")
+    
+    table = h5f.getNode('/', 'validation_error')
+
+    assert compare_lists(table.cols.epoch[:], [1,1,2,2])
+    assert compare_lists(table.cols.minibatch[:], [1,2,1,2])
+    assert compare_lists(table.cols.validation_error[:], [32.0, 30.0, 28.0, 26.0])
+    assert len(table) == 4
+
+
+def test_ErrorSeries_with_group(h5f=None):
+    if not h5f:
+        h5f_path = tempfile.NamedTemporaryFile().name
+        h5f = tables.openFile(h5f_path, "w")
+
+    new_group = h5f.createGroup('/','generic_errors')
+
+    validation_error = series.ErrorSeries(error_name="validation_error", table_name="validation_error",
+                                hdf5_file=h5f, index_names=('epoch','minibatch'),
+                                hdf5_group=new_group._v_pathname,
+                                title="Validation error indexed by epoch and minibatch")
+
+    # (1,1), (1,2) etc. are (epoch, minibatch) index
+    validation_error.append((1,1), 32.0)
+    validation_error.append((1,2), 30.0)
+    validation_error.append((2,1), 28.0)
+    validation_error.append((2,2), 26.0)
+
+    h5f.close()
+
+    h5f = tables.openFile(h5f_path, "r")
+    
+    table = h5f.getNode('/generic_errors', 'validation_error')
+
+    assert compare_lists(table.cols.epoch[:], [1,1,2,2])
+    assert compare_lists(table.cols.minibatch[:], [1,2,1,2])
+    assert compare_lists(table.cols.validation_error[:], [32.0, 30.0, 28.0, 26.0])
+    assert len(table) == 4
+
+
+def test_ErrorSeries_no_index(h5f=None):
+    if not h5f:
+        h5f_path = tempfile.NamedTemporaryFile().name
+        h5f = tables.openFile(h5f_path, "w")
+
+    validation_error = series.ErrorSeries(error_name="validation_error",
+                                table_name="validation_error",
+                                hdf5_file=h5f, 
+                                # empty tuple
+                                index_names=tuple(),
+                                title="Validation error with no index")
+
+    # no index columns, so each append() gets an empty tuple as its index
+    validation_error.append(tuple(), 32.0)
+    validation_error.append(tuple(), 30.0)
+    validation_error.append(tuple(), 28.0)
+    validation_error.append(tuple(), 26.0)
+
+    h5f.close()
+
+    h5f = tables.openFile(h5f_path, "r")
+    
+    table = h5f.getNode('/', 'validation_error')
+
+    assert compare_lists(table.cols.validation_error[:], [32.0, 30.0, 28.0, 26.0])
+    assert not ("epoch" in dir(table.cols))
+
+def test_ErrorSeries_notimestamp(h5f=None):
+    if not h5f:
+        h5f_path = tempfile.NamedTemporaryFile().name
+        h5f = tables.openFile(h5f_path, "w")
+
+    validation_error = series.ErrorSeries(error_name="validation_error", table_name="validation_error",
+                                hdf5_file=h5f, index_names=('epoch','minibatch'),
+                                title="Validation error indexed by epoch and minibatch", 
+                                store_timestamp=False)
+
+    # (1,1), (1,2) etc. are (epoch, minibatch) index
+    validation_error.append((1,1), 32.0)
+
+    h5f.close()
+
+    h5f = tables.openFile(h5f_path, "r")
+    
+    table = h5f.getNode('/', 'validation_error')
+
+    assert compare_lists(table.cols.epoch[:], [1])
+    assert not ("timestamp" in dir(table.cols))
+    assert "cpuclock" in dir(table.cols)
+
+def test_ErrorSeries_nocpuclock(h5f=None):
+    if not h5f:
+        h5f_path = tempfile.NamedTemporaryFile().name
+        h5f = tables.openFile(h5f_path, "w")
+
+    validation_error = series.ErrorSeries(error_name="validation_error", table_name="validation_error",
+                                hdf5_file=h5f, index_names=('epoch','minibatch'),
+                                title="Validation error indexed by epoch and minibatch", 
+                                store_cpuclock=False)
+
+    # (1,1), (1,2) etc. are (epoch, minibatch) index
+    validation_error.append((1,1), 32.0)
+
+    h5f.close()
+
+    h5f = tables.openFile(h5f_path, "r")
+    
+    table = h5f.getNode('/', 'validation_error')
+
+    assert compare_lists(table.cols.epoch[:], [1])
+    assert not ("cpuclock" in dir(table.cols))
+    assert "timestamp" in dir(table.cols)
+
+def test_AccumulatorSeriesWrapper_common_case(h5f=None):
+    if not h5f:
+        h5f_path = tempfile.NamedTemporaryFile().name
+        h5f = tables.openFile(h5f_path, "w")
+
+    validation_error = ErrorSeries(error_name="accumulated_validation_error",
+                                table_name="accumulated_validation_error",
+                                hdf5_file=h5f,
+                                index_names=('epoch','minibatch'),
+                                title="Validation error, summed every 3 minibatches, indexed by epoch and minibatch")
+
+    accumulator = AccumulatorSeriesWrapper(base_series=validation_error,
+                                    reduce_every=3, reduce_function=numpy.sum)
+
+    # (1,1), (1,2) etc. are (epoch, minibatch) index
+    accumulator.append((1,1), 32.0)
+    accumulator.append((1,2), 30.0)
+    accumulator.append((2,1), 28.0)
+    accumulator.append((2,2), 26.0)
+    accumulator.append((3,1), 24.0)
+    accumulator.append((3,2), 22.0)
+
+    h5f.close()
+
+    h5f = tables.openFile(h5f_path, "r")
+    
+    table = h5f.getNode('/', 'accumulated_validation_error')
+
+    assert compare_lists(table.cols.epoch[:], [2,3])
+    assert compare_lists(table.cols.minibatch[:], [1,2])
+    assert compare_lists(table.cols.accumulated_validation_error[:], [90.0,72.0], floats=True)
+
+def test_BasicStatisticsSeries_common_case(h5f=None):
+    if not h5f:
+        h5f_path = tempfile.NamedTemporaryFile().name
+        h5f = tables.openFile(h5f_path, "w")
+
+    stats_series = BasicStatisticsSeries(table_name="b_vector_statistics",
+                                hdf5_file=h5f, index_names=('epoch','minibatch'),
+                                title="Basic statistics for b vector indexed by epoch and minibatch")
+
+    # (1,1), (1,2) etc. are (epoch, minibatch) index
+    stats_series.append((1,1), [0.15, 0.20, 0.30])
+    stats_series.append((1,2), [-0.18, 0.30, 0.58])
+    stats_series.append((2,1), [0.18, -0.38, -0.68])
+    stats_series.append((2,2), [0.15, 0.02, 1.9])
+
+    h5f.close()
+
+    h5f = tables.openFile(h5f_path, "r")
+    
+    table = h5f.getNode('/', 'b_vector_statistics')
+
+    assert compare_lists(table.cols.epoch[:], [1,1,2,2])
+    assert compare_lists(table.cols.minibatch[:], [1,2,1,2])
+    assert compare_lists(table.cols.mean[:], [0.21666667,  0.23333333, -0.29333332,  0.69], floats=True)
+    assert compare_lists(table.cols.min[:], [0.15000001, -0.18000001, -0.68000001,  0.02], floats=True)
+    assert compare_lists(table.cols.max[:], [0.30, 0.58, 0.18, 1.9], floats=True)
+    assert compare_lists(table.cols.std[:], [0.06236095, 0.31382939,  0.35640177, 0.85724366], floats=True)
+
+def test_SharedParamsStatisticsWrapper_commoncase(h5f=None):
+    import numpy.random
+
+    if not h5f:
+        h5f_path = tempfile.NamedTemporaryFile().name
+        h5f = tables.openFile(h5f_path, "w")
+
+    stats = SharedParamsStatisticsWrapper(new_group_name="params", base_group="/",
+                                arrays_names=('b1','b2','b3'), hdf5_file=h5f,
+                                index_names=('epoch','minibatch'))
+
+    b1 = DD({'value':numpy.random.rand(5)})
+    b2 = DD({'value':numpy.random.rand(5)})
+    b3 = DD({'value':numpy.random.rand(5)})
+    stats.append((1,1), [b1,b2,b3])
+
+    h5f.close()
+
+    h5f = tables.openFile(h5f_path, "r")
+
+    b1_table = h5f.getNode('/params', 'b1')
+    b3_table = h5f.getNode('/params', 'b3')
+
+    assert abs(b1_table.cols.mean[0] - numpy.mean(b1.value)) < 1e-3
+    assert abs(b3_table.cols.mean[0] - numpy.mean(b3.value)) < 1e-3
+    assert abs(b1_table.cols.min[0] - numpy.min(b1.value)) < 1e-3
+    assert abs(b3_table.cols.min[0] - numpy.min(b3.value)) < 1e-3
+
+def test_SharedParamsStatisticsWrapper_notimestamp(h5f=None):
+    import numpy.random
+
+    if not h5f:
+        h5f_path = tempfile.NamedTemporaryFile().name
+        h5f = tables.openFile(h5f_path, "w")
+
+    stats = SharedParamsStatisticsWrapper(new_group_name="params", base_group="/",
+                                arrays_names=('b1','b2','b3'), hdf5_file=h5f,
+                                index_names=('epoch','minibatch'),
+                                store_timestamp=False)
+
+    b1 = DD({'value':numpy.random.rand(5)})
+    b2 = DD({'value':numpy.random.rand(5)})
+    b3 = DD({'value':numpy.random.rand(5)})
+    stats.append((1,1), [b1,b2,b3])
+
+    h5f.close()
+
+    h5f = tables.openFile(h5f_path, "r")
+
+    b1_table = h5f.getNode('/params', 'b1')
+    b3_table = h5f.getNode('/params', 'b3')
+
+    assert abs(b1_table.cols.mean[0] - numpy.mean(b1.value)) < 1e-3
+    assert abs(b3_table.cols.mean[0] - numpy.mean(b3.value)) < 1e-3
+    assert abs(b1_table.cols.min[0] - numpy.min(b1.value)) < 1e-3
+    assert abs(b3_table.cols.min[0] - numpy.min(b3.value)) < 1e-3
+
+    assert not ('timestamp' in dir(b1_table.cols))
+
+def test_get_desc():
+    h5f_path = tempfile.NamedTemporaryFile().name
+    h5f = tables.openFile(h5f_path, "w")
+
+    desc = series._get_description_with_n_ints_n_floats(("col1","col2"), ("col3","col4"))
+
+    mytable = h5f.createTable('/', 'mytable', desc)
+
+    # just make sure the columns are there... otherwise this will throw an exception
+    mytable.cols.col1
+    mytable.cols.col2
+    mytable.cols.col3
+    mytable.cols.col4
+
+    try:
+        # this should fail... LocalDescription must be local to
+        # _get_description_with_n_ints_n_floats and must not leak here
+        test = LocalDescription
+        assert False, "LocalDescription should not be visible at module scope"
+    except NameError:
+        # a NameError is exactly what we expect here
+        pass
+
+def test_index_to_tuple_floaterror():
+    try:
+        series._index_to_tuple(5.1)
+        assert False
+    except TypeError:
+        assert True
+
+def test_index_to_tuple_arrayok():
+    tpl = series._index_to_tuple([1,2,3])
+    assert type(tpl) == tuple and tpl[1] == 2 and tpl[2] == 3
+
+def test_index_to_tuple_intbecomestuple():
+    tpl = series._index_to_tuple(32)
+
+    assert type(tpl) == tuple and tpl == (32,)
+
+def test_index_to_tuple_longbecomestuple():
+    tpl = series._index_to_tuple(928374928374928L)
+
+    assert type(tpl) == tuple and tpl == (928374928374928L,)
+
+
+
+
+
+def test_ErrorSeries_appendtarget(h5f=None):
+    if not h5f:
+        h5f_path = tempfile.NamedTemporaryFile().name
+        h5f = tables.openFile(h5f_path, "w")
+
+    validation_errors_from_callback = []
+
+    def my_callback(table, row):
+        validation_errors_from_callback.append(row['validation_error'])
+
+    my_callback_target = CallbackAppendTarget(my_callback)
+
+    validation_error = series.ErrorSeries(error_name="validation_error",
+                                table_name="validation_error",
+                                hdf5_file=h5f, 
+                                index_names=('minibatch',),
+                                title="Validation error appended via a callback target",
+                                other_targets=[my_callback_target],
+                                skip_hdf5_append=True)
+
+    # 2, 3, etc. are single-integer minibatch indices
+    validation_error.append(2, 32.0)
+    validation_error.append(3, 30.0)
+    validation_error.append(4, 28.0)
+    validation_error.append(5, 26.0)
+
+    h5f.close()
+
+    h5f = tables.openFile(h5f_path, "r")
+    
+    table = h5f.getNode('/', 'validation_error')
+
+    # h5f should be empty
+    assert len(table) == 0
+
+    assert compare_lists(validation_errors_from_callback, [32.0,30.0,28.0,26.0])
+
+
+
+
+
+
+if __name__ == '__main__':
+    import tempfile
+    test_get_desc()
+    test_ErrorSeries_common_case()
+    test_BasicStatisticsSeries_common_case()
+    test_AccumulatorSeriesWrapper_common_case()
+    test_SharedParamsStatisticsWrapper_commoncase()
+    test_ErrorSeries_appendtarget()
+
--- a/pylearn/sandbox/test_scan_inputs_groups.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/sandbox/test_scan_inputs_groups.py	Wed Aug 11 13:16:05 2010 -0400
@@ -9,6 +9,7 @@
 import theano.tensor as T
 from pylearn.sandbox.scan_inputs_groups import FillMissing
 import theano.compile.mode as mode_module
+import theano
 
 class TestFillMissing(unittest.TestCase):
     def setUp(self):
@@ -16,9 +17,9 @@
 
         #we need to deactivate the check for NaN values as we have them in the input
         #TODO: add an option that skips the NaN check on inputs only, but still checks outputs.
-        m=mode_module.default_mode
-        if m=="DEBUG_MODE":
-            m=copy.copy(mode_module.predefined_modes[m])
+        m=mode_module.get_default_mode()
+        if isinstance(m,theano.compile.debugmode.DebugMode):
+            m=copy.copy(m)
             m.check_isfinite=False
         self.mode = m
 
--- a/pylearn/shared/layers/__init__.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/shared/layers/__init__.py	Wed Aug 11 13:16:05 2010 -0400
@@ -1,14 +1,14 @@
 # logreg.py
-from .logreg import LogisticRegression
+from pylearn.shared.layers.logreg import LogisticRegression
 
 # sigmoidal_layer.py
-from .sigmoidal_layer import SigmoidalLayer
+from pylearn.shared.layers.sigmoidal_layer import SigmoidalLayer
 
 # exponential_mean.py
-from .exponential_mean import ExponentialMean
+from pylearn.shared.layers.exponential_mean import ExponentialMean
 
 # sgd.py
-from .sgd import StochasticGradientDescent, HalflifeStopper
+from pylearn.shared.layers.sgd import StochasticGradientDescent, HalflifeStopper
 
 # kording
 from kording2004 import Kording2004
--- a/pylearn/shared/layers/exponential_mean.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/shared/layers/exponential_mean.py	Wed Aug 11 13:16:05 2010 -0400
@@ -6,7 +6,7 @@
 import numpy
 import theano
 import theano.tensor
-from theano.compile.sandbox import shared
+from theano.compile import shared
 
 class ExponentialMean(object):
     """Maintain an exponentially-decaying estimate of the mean
--- a/pylearn/shared/layers/kording2004.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/shared/layers/kording2004.py	Wed Aug 11 13:16:05 2010 -0400
@@ -1,8 +1,7 @@
 import numpy
 import theano.tensor
-from hpu.theano_outgoing import mean, var, cov
-
-from .exponential_mean import ExponentialMean # exponential_mean.py
+from theano.tensor.basic import mean
+from pylearn.shared.layers.exponential_mean import ExponentialMean # exponential_mean.py
 
 import logging
 _logger = logging.getLogger('kording2004')
--- a/pylearn/shared/layers/kouh2008.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/shared/layers/kouh2008.py	Wed Aug 11 13:16:05 2010 -0400
@@ -20,8 +20,8 @@
 from theano import tensor
 from theano.tensor.nnet import softplus
 from theano.sandbox.softsign import softsign
-from theano.compile.sandbox import shared
-from .util import add_logging, update_locals
+from theano.compile import shared
+from pylearn.shared.layers.util import add_logging, update_locals
 
 try:
     from PIL import Image
@@ -218,8 +218,14 @@
 
         #softsign's range is (-1, 1)
         # we want filter responses to span (x_low, x_high)
-        x_list = [x_low + (x_high-x_low)*(d(0.5) + d(0.5)*softsign(tensor.dot(input, f_list[i])+b_list[i]))
-                    for i in xrange(n_terms)]
+        if x_low < x_high:
+            x_list = [x_low + (x_high-x_low)*(d(0.5) + d(0.5)*softsign(tensor.dot(input, f_list[i])+b_list[i]))
+                        for i in xrange(n_terms)]
+        else:
+            if x_low == x_high:
+                x_list = [(tensor.dot(input, f_list[i])+b_list[i]) for i in xrange(n_terms)]
+            else: #x_low > x_high
+                x_list = [softplus(tensor.dot(input, f_list[i])+b_list[i]) for i in xrange(n_terms)]
 
         rval = cls.new_expbounds(rng, x_list, n_out, dtype=dtype, params=f_list + b_list,
                 p_range=p_range,
@@ -243,9 +249,12 @@
         if rows is None and cols is None:
             rows = int(numpy.sqrt(n_out))
         if cols is None:
-            cols = n_out // rows + (1 if n_out % rows else 0)
+            cols = n_out // rows
+            if n_out % rows: cols += 1
         if rows is None:
-            rows = n_out // cols + (1 if n_out % cols else 0)
+            rows = n_out // cols
+            if n_out % cols:
+                rows += 1
 
         filter_shape = self.filter_shape
         height = rows * (row_gap + filter_shape[0]) - row_gap
--- a/pylearn/shared/layers/lecun1998.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/shared/layers/lecun1998.py	Wed Aug 11 13:16:05 2010 -0400
@@ -6,13 +6,13 @@
 
 import theano
 from theano import tensor
-from theano.compile.sandbox import shared, pfunc
+from theano.compile import shared, pfunc
 
-from theano.sandbox.conv import ConvOp
-from theano.sandbox.downsample import DownsampleFactorMax
+from theano.tensor.nnet.conv import ConvOp
+from theano.tensor.signal.downsample import DownsampleFactorMax
 
-from .util import update_locals
-from .squash import squash
+from pylearn.shared.layers.util import update_locals
+from pylearn.shared.layers.squash import squash
 
 class LeNetConvPool(object):
     """
--- a/pylearn/shared/layers/logreg.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/shared/layers/logreg.py	Wed Aug 11 13:16:05 2010 -0400
@@ -2,9 +2,9 @@
 """
 import numpy
 import theano
-from theano.compile.sandbox import shared
+from theano.compile import shared
 from theano.tensor import nnet
-from .util import update_locals, add_logging
+from pylearn.shared.layers.util import update_locals, add_logging
 
 class LogisticRegression(object):
     def __init__(self, input, w, b, params=[]):
@@ -15,12 +15,14 @@
         update_locals(self, locals())
 
     @classmethod
-    def new(cls, input, n_in, n_out, dtype=None):
+    def new(cls, input, n_in, n_out, dtype=None, name=None):
         if dtype is None:
             dtype = input.dtype
+        if name is None:
+            name = cls.__name__
         cls._debug('allocating params w, b', n_in, n_out, dtype)
-        w = shared(numpy.zeros((n_in, n_out), dtype=dtype))
-        b = shared(numpy.zeros((n_out,), dtype=dtype))
+        w = shared(numpy.zeros((n_in, n_out), dtype=dtype), name='%s.w'%name)
+        b = shared(numpy.zeros((n_out,), dtype=dtype), name='%s.b'%name)
         return cls(input, w, b, params=[w,b])
 
 
--- a/pylearn/shared/layers/rust2005.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/shared/layers/rust2005.py	Wed Aug 11 13:16:05 2010 -0400
@@ -25,12 +25,12 @@
 import theano
 import theano.tensor
 import theano.tensor.nnet
-from theano.compile.sandbox import shared
+from theano.compile import shared
 from theano.sandbox.softsign import softsign
 from theano.tensor.nnet import softplus
-from theano.sandbox.conv import ConvOp
+from theano.tensor.nnet.conv import ConvOp
 
-from .util import update_locals, add_logging
+from pylearn.shared.layers.util import update_locals, add_logging
 
 def rust2005_act_from_filters(linpart, E_quad, S_quad, eps):
     """Return rust2005 activation from linear filter responses, as well as E and S terms
--- a/pylearn/shared/layers/sandbox/linsvm.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/shared/layers/sandbox/linsvm.py	Wed Aug 11 13:16:05 2010 -0400
@@ -1,6 +1,6 @@
 import numpy
 import theano
-from theano.compile.sandbox import shared
+from theano.compile import shared
 from theano.tensor import nnet
 from .util import update_locals
 
--- a/pylearn/shared/layers/sgd.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/shared/layers/sgd.py	Wed Aug 11 13:16:05 2010 -0400
@@ -4,7 +4,7 @@
 import numpy
 import theano
 from theano import tensor
-from theano.compile.sandbox import shared
+from theano.compile import shared
 
 class StochasticGradientDescent(object):
     """Fixed stepsize gradient descent
@@ -72,6 +72,9 @@
     Significant improvement in the second half of a run is defined as achieving
     `progresh_thresh` proportion of the best score from the first half of the run.
 
+    Instances of this class can be pickled.
+    Future versions should maintain unpickling backward-compatibility.
+
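+    A hedged sketch of checkpointing a stopper with the standard cPickle
+    module (a minimal example; the constructor call mirrors the one below):
+
+    .. code-block:: python
+
+        import cPickle
+        stopper = HalflifeStopper()
+        # round-trip through a byte string, as a checkpoint would do
+        stopper = cPickle.loads(cPickle.dumps(stopper))
+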
     .. code-block:: python
 
         stopper = HalflifeStopper()
--- a/pylearn/shared/layers/sigmoidal_layer.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/shared/layers/sigmoidal_layer.py	Wed Aug 11 13:16:05 2010 -0400
@@ -6,9 +6,9 @@
 
 import theano
 from theano import tensor
-from theano.compile.sandbox import shared, pfunc
-from .util import update_locals, add_logging
-from .squash import squash
+from theano.compile import shared, pfunc
+from pylearn.shared.layers.util import update_locals, add_logging
+from pylearn.shared.layers.squash import squash
 
 
 class SigmoidalLayer(object):
--- a/pylearn/shared/layers/tests/test_kouh2008.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/shared/layers/tests/test_kouh2008.py	Wed Aug 11 13:16:05 2010 -0400
@@ -1,7 +1,8 @@
 import numpy
 import theano.compile.debugmode
+from theano.compile.debugmode import DebugMode
 from theano import tensor
-from theano.compile.sandbox import pfunc
+from theano.compile import pfunc
 from pylearn.shared.layers import LogisticRegression, Kouh2008
 
 def test_dtype():
@@ -9,17 +10,20 @@
     n_out = 10
     n_terms = 3
     rng = numpy.random.RandomState(23455)
-    layer = Kouh2008.new_filters(rng, tensor.dmatrix(), n_in, n_out, n_terms, dtype='float64')
+    layer = Kouh2008.new_filters_expbounds(rng, tensor.dmatrix(), n_in, n_out, n_terms, dtype='float64')
     assert layer.output.dtype =='float64'
-    layer = Kouh2008.new_filters(rng, tensor.fmatrix(), n_in, n_out, n_terms, dtype='float32')
+    layer = Kouh2008.new_filters_expbounds(rng, tensor.fmatrix(), n_in, n_out, n_terms, dtype='float32')
     assert layer.output.dtype =='float32'
 
 def run_w_random(bsize=10, n_iter=200, n_in = 1024, n_out = 100, n_terms=2, dtype='float64'):
+    if isinstance(theano.compile.mode.get_default_mode(),DebugMode):
+        n_iter=2
+        
     x = tensor.dmatrix()
     y = tensor.lvector()
     rng = numpy.random.RandomState(23455)
 
-    layer = Kouh2008.new_filters(rng, x, n_in, n_out, n_terms, dtype='float64')
+    layer = Kouh2008.new_filters_expbounds(rng, x, n_in, n_out, n_terms, dtype='float64')
     out = LogisticRegression.new(layer.output, n_out, 2)
     cost = out.nll(y).sum()
 
@@ -52,7 +56,7 @@
     y = tensor.lvector()
     rng = numpy.random.RandomState(23455)
 
-    layer = Kouh2008.new_filters(rng, x, n_in, n_out, n_terms, dtype='float64')
+    layer = Kouh2008.new_filters_expbounds(rng, x, n_in, n_out, n_terms, dtype='float64')
     out = LogisticRegression.new(layer.output, n_out, 2)
     cost = out.nll(y).sum()
     #joint optimization except for one of the linear filters
@@ -97,11 +101,16 @@
     test_A()
 
 def test_smaller():
-    assert run_w_random(n_in=10, n_out=8) < 6.1
+    rval = run_w_random(n_in=10, n_out=8)
+    if not isinstance(theano.compile.mode.get_default_mode(),DebugMode):
+        assert rval < 6.1
 
 def test_smaller32():
-    assert run_w_random(n_in=10, n_out=8, dtype='float32') < 6.1
+    rval = run_w_random(n_in=10, n_out=8, dtype='float32')
+    if not isinstance(theano.compile.mode.get_default_mode(),DebugMode):
+        assert rval < 6.1
 
 def test_big():
-    assert run_w_random() < 0.1
-
+    rval = run_w_random()
+    if not isinstance(theano.compile.mode.get_default_mode(),DebugMode):
+        assert rval < 0.1
--- a/pylearn/shared/layers/tests/test_lecun1998.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/shared/layers/tests/test_lecun1998.py	Wed Aug 11 13:16:05 2010 -0400
@@ -1,8 +1,12 @@
 from pylearn.shared.layers.lecun1998 import *
 from pylearn.shared.layers import LogisticRegression
+from theano.compile.debugmode import DebugMode
 import theano.sandbox.softsign
 
 def test_w_random(bsize=10, n_iter=100, dtype='float64'):
+    if isinstance(theano.compile.mode.get_default_mode(),DebugMode):
+        n_iter=2
+
     ishape=(28,28)
     fshape=(5,5)
     if dtype == 'float64':
@@ -30,7 +34,8 @@
         print i, 'rval', fN
 
     assert f0 > 6
-    assert fN < .3
+    if not isinstance(theano.compile.mode.get_default_mode(),DebugMode):
+        assert fN < .3
 
 
 def test_squash():
--- a/pylearn/shared/layers/tests/test_sigmoidal_layer.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/shared/layers/tests/test_sigmoidal_layer.py	Wed Aug 11 13:16:05 2010 -0400
@@ -1,7 +1,7 @@
 import numpy
 from pylearn.shared.layers import SigmoidalLayer, LogisticRegression
 from theano import tensor
-from theano.compile.sandbox import shared, pfunc
+from theano.compile import shared, pfunc
 
 def test_w_random(dtype='float64'):
     if dtype == 'float64':
--- a/pylearn/version.py	Fri Jul 16 14:20:48 2010 -0400
+++ b/pylearn/version.py	Wed Aug 11 13:16:05 2010 -0400
@@ -6,6 +6,17 @@
 
 _cache = dict()
 
+def record_versions(results, modules, prefix='version_'):
+    """Bizarre version-recording function... 
+
+       For each module in `modules` it executes
+       result.<prefix><module.__name__> = import_id(module.__name)
+
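+       A hedged usage sketch (assumes `results` is any object that accepts
+       attribute assignment, e.g. a jobman state/DD object):
+
+       .. code-block:: python
+
+           import theano
+           import pylearn
+           # with the default prefix this sets results.version_theano and
+           # results.version_pylearn to whatever import_id() returns
+           record_versions(results, [theano, pylearn])
+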
+       :returns: None
+    """
+    for module in modules:
+        setattr(results, prefix+module.__name__, import_id(module.__name__))
+
 def src_version(module_name):
     """Return compact identifier of module code.
 
@@ -154,8 +165,10 @@
         raise OSError('hg returned %i, maybe %s is not under hg control?', 
                 (id_proc.returncode, dirname))
 
-    care_about = (lambda some_file : True) if filenames is None \
-            else (lambda some_file : some_file in filenames)
+    if filenames is None:
+        care_about = (lambda some_file : True)
+    else:
+        care_about = (lambda some_file : some_file in filenames)
 
     # parse status codes for what we care about
     care_about_mod = False
@@ -214,15 +227,15 @@
     if resource_type == _imp.PY_COMPILED:
         return _import_id_py_compiled(location)
     if resource_type == _imp.C_EXTENSION:
-        raise NoteImplementedError
+        raise NotImplementedError
     if resource_type == _imp.PY_RESOURCE:
-        raise NoteImplementedError
+        raise NotImplementedError
     if resource_type == _imp.PKG_DIRECTORY:
         return _import_id_pkg_directory(location)
     if resource_type == _imp.C_BUILTIN:
-        raise NoteImplementedError
+        raise NotImplementedError
     if resource_type == _imp.PY_FROZEN:
-        raise NoteImplementedError
+        raise NotImplementedError
 
     assert False #the list of resource types above should be exhaustive