pylearn: changeset 934:e0b960ee57f5 (merge)

author    James Bergstra <bergstrj@iro.umontreal.ca>
date      Thu, 15 Apr 2010 10:52:02 -0400
parents   ca9fc8cae5b5 (diff), 1c62fa857cab (current diff)
children  7305246f21f8
files     pylearn/dataset_ops/protocol.py
diffstat  31 files changed, 2419 insertions(+), 38 deletions(-)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/.build/PLACEHOLDER Thu Apr 15 10:52:02 2010 -0400 @@ -0,0 +1,1 @@ +sphinx doesn't like it when this repertory isn't available
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/.static/PLACEHOLDER Thu Apr 15 10:52:02 2010 -0400 @@ -0,0 +1,1 @@ +sphinx doesn't like it when this repertory isn't available
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/.templates/PLACEHOLDER Thu Apr 15 10:52:02 2010 -0400 @@ -0,0 +1,1 @@ +sphinx doesn't like it when this repertory isn't available
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/.templates/layout.html Thu Apr 15 10:52:02 2010 -0400 @@ -0,0 +1,24 @@ +{% extends "!layout.html" %} + +{%- block extrahead %} +{{ super() }} +<script type="text/javascript"> + var _gaq = _gaq || []; + _gaq.push(['_setAccount', 'UA-168290-9']); + _gaq.push(['_trackPageview']); +</script> +{% endblock %} + +{% block footer %} +{{ super() }} +<script type="text/javascript"> + (function() { + var ga = document.createElement('script'); + ga.src = ('https:' == document.location.protocol ? + 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js'; + ga.setAttribute('async', 'true'); + document.documentElement.firstChild.appendChild(ga); + })(); +</script> +{% endblock %} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/LICENSE.txt Thu Apr 15 10:52:02 2010 -0400 @@ -0,0 +1,30 @@ +.. _license: + +LICENSE +======= + +Copyright (c) 2008--2009, Theano Development Team +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Theano nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/api/epydoc.conf Thu Apr 15 10:52:02 2010 -0400 @@ -0,0 +1,152 @@ +# TODO: +# Get all graphs to work! + + +[epydoc] # Epydoc section marker (required by ConfigParser) + +# The list of objects to document. Objects can be named using +# dotted names, module filenames, or package directory names. +# Alases for this option include "objects" and "values". +modules: pylearn + +# The type of output that should be generated. Should be one +# of: html, text, latex, dvi, ps, pdf. +output: html + +# An integer indicating how verbose epydoc should be. The default +# value is 0; negative values will supress warnings and errors; +# positive values will give more verbose output. +verbosity: 1 + +# A boolean value indicating that Epydoc should show a tracaback +# in case of unexpected error. By default don't show tracebacks +debug: 1 + +# If True, don't try to use colors or cursor control when doing +# textual output. The default False assumes a rich text prompt +simple-term: 0 + + +### Generation options + +# The default markup language for docstrings, for modules that do +# not define __docformat__. Defaults to epytext. +docformat: epytext + +# Whether or not parsing should be used to examine objects. +parse: yes + +# Whether or not introspection should be used to examine objects. +introspect: yes + +# Don't examine in any way the modules whose dotted name match this +# regular expression pattern. +#exclude + +# Don't perform introspection on the modules whose dotted name match this +# regular expression pattern. +#exclude-introspect + +# Don't perform parsing on the modules whose dotted name match this +# regular expression pattern. +#exclude-parse + +# The format for showing inheritance objects. +# It should be one of: 'grouped', 'listed', 'included'. +inheritance: grouped + +# Whether or not to inclue private variables. (Even if included, +# private variables will be hidden by default.) +private: yes + +# Whether or not to list each module's imports. +imports: yes + +# Whether or not to include syntax highlighted source code in +# the output (HTML only). +sourcecode: yes + +# Whether or not to includea a page with Epydoc log, containing +# effective option at the time of generation and the reported logs. +include-log: yes + + +### Output options + +# The documented project's name. +name: Pylearn + +# The CSS stylesheet for HTML output. Can be the name of a builtin +# stylesheet, or the name of a file. +css: white + +# The documented project's URL. +url: http://deeplearning.net/software/pylearn/ + +# HTML code for the project link in the navigation bar. If left +# unspecified, the project link will be generated based on the +# project's name and URL. +#link: <a href="somewhere">My Cool Project</a> + +# The "top" page for the documentation. Can be a URL, the name +# of a module or class, or one of the special names "trees.html", +# "indices.html", or "help.html" +#top: os.path + +# An alternative help file. The named file should contain the +# body of an HTML file; navigation bars will be added to it. +#help: my_helpfile.html + +# Whether or not to include a frames-based table of contents. +#frames: yes +frames: no + +# Whether each class should be listed in its own section when +# generating LaTeX or PDF output. +separate-classes: no + + +### API linking options + +# Define a new API document. A new interpreted text role +# will be created +#external-api: epydoc + +# Use the records in this file to resolve objects in the API named NAME. 
+#external-api-file: epydoc:api-objects.txt + +# Use this URL prefix to configure the string returned for external API. +#external-api-root: epydoc:http://epydoc.sourceforge.net/api +# external-api: wiki doc +# external-api-root: wiki:http://lgcm.iro.umontreal.ca/theano/wiki/ doc:http://lgcm.iro.umontreal.ca/auto_theano/doc/ +# external-api-file: wiki:wiki.idx doc:doc/doc.idx + +### Graph options + +# The list of graph types that should be automatically included +# in the output. Graphs are generated using the Graphviz "dot" +# executable. Graph types include: "classtree", "callgraph", +# "umlclass". Use "all" to include all graph types +graph: all + +# The path to the Graphviz "dot" executable, used to generate +# graphs. +dotpath: /usr/bin/dot + +# The name of one or more pstat files (generated by the profile +# or hotshot module). These are used to generate call graphs. +#pstat: autotest.pstat + +# Specify the font used to generate Graphviz graphs. +# (e.g., helvetica or times). +graph-font: Helvetica + +# Specify the font size used to generate Graphviz graphs. +graph-font-size: 10 + + +### Return value options + +# The condition upon which Epydoc should exit with a non-zero +# exit status. Possible values are error, warning, docstring_warning +#fail-on: error
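The epydoc configuration above is consumed by doc/scripts/docgen.py (added later in this changeset). As a rough sketch, the API pages can also be built directly against it, assuming epydoc and the Graphviz "dot" binary are installed and the repository root is the working directory; the output directory name below is an assumption:

    import os
    # mirrors the call made in doc/scripts/docgen.py; 'html/api' is an assumed output path
    os.system("epydoc --config doc/api/epydoc.conf -o html/api")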
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/conf.py Thu Apr 15 10:52:02 2010 -0400 @@ -0,0 +1,191 @@ +# -*- coding: utf-8 -*- +# +# theano documentation build configuration file, created by +# sphinx-quickstart on Tue Oct 7 16:34:06 2008. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# The contents of this file are pickled, so don't put values in the namespace +# that aren't pickleable (module imports are okay, they're removed automatically). +# +# All configuration values have a default value; values that are commented out +# serve to show the default value. + +import sys, os + +# If your extensions are in another directory, add it here. If the directory +# is relative to the documentation root, use os.path.abspath to make it +# absolute, like shown here. +#sys.path.append(os.path.abspath('some/directory')) + +# General configuration +# --------------------- + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo', 'ext'] + +todo_include_todos = True + +try: + from sphinx.ext import pngmath + extensions.append('sphinx.ext.pngmath') +except ImportError: + pass + + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['.templates'] + +# The suffix of source filenames. +source_suffix = '.txt' + +# The master toctree document. +master_doc = 'index' + +# General substitutions. +project = 'Pylearn' +copyright = '2008--2009, LISA lab' + +# The default replacements for |version| and |release|, also used in various +# other places throughout the built documents. +# +# The short X.Y version. +version = '0.1' +# The full version, including alpha/beta/rc tags. +release = '0.1' + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +today_fmt = '%B %d, %Y' + +# List of documents that shouldn't be included in the build. +#unused_docs = [] + +# List of directories, relative to source directories, that shouldn't be searched +# for source files. +exclude_dirs = ['images', 'scripts', 'api'] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + + +# Options for HTML output +# ----------------------- + +# The style sheet to use for HTML and HTML Help pages. A file of that name +# must exist either in Sphinx' static/ path, or in one of the custom paths +# given in html_static_path. +#html_style = 'default.css' +html_theme = 'sphinxdoc' + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (within the static path) to place at the top of +# the sidebar. 
+#html_logo = 'images/theano_logo-200x67.png' +html_logo = 'images/logo_pylearn_200x57.png' + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['.static', 'images'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_use_modindex = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, the reST sources are included in the HTML build as _sources/<name>. +#html_copy_source = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = '' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'pylearndoc' + + +# Options for LaTeX output +# ------------------------ + +# The paper size ('letter' or 'a4'). +#latex_paper_size = 'letter' + +# The font size ('10pt', '11pt' or '12pt'). +latex_font_size = '11pt' + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, document class [howto/manual]). +latex_documents = [ + ('index', 'pylearn.tex', 'pylearn Documentation', + 'LISA lab, University of Montreal', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = 'images/snake_theta2-trans.png' +latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# Additional stuff for the LaTeX preamble. +#latex_preamble = '' + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_use_modindex = True
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/ext.py Thu Apr 15 10:52:02 2010 -0400 @@ -0,0 +1,75 @@ + +import sys +import re +import os +from docutils import nodes, utils +from docutils.parsers.rst import roles +import epydoc.docwriter.xlink as xlink + +#def role_fn(name, rawtext, text, lineno, inliner, +# options={}, content=[]): +# node = nodes.reference(rawtext, text, refuri = "http://pylearn.org/theano/wiki/%s" % text) +# return [node], [] + + +_TARGET_RE = re.compile(r'^(.*?)\s*<(?:URI:|URL:)?([^<>]+)>$') +def create_api_role(name, problematic): + """ + Create and register a new role to create links for an API documentation. + + Create a role called `name`, which will use the URL resolver registered as + ``name`` in `api_register` to create a link for an object. + + :Parameters: + `name` : `str` + name of the role to create. + `problematic` : `bool` + if True, the registered role will create problematic nodes in + case of failed references. If False, a warning will be raised + anyway, but the output will appear as an ordinary literal. + """ + def resolve_api_name(n, rawtext, text, lineno, inliner, + options={}, content=[]): + + # Check if there's separate text & targets + m = _TARGET_RE.match(text) + if m: text, target = m.groups() + else: target = text + + # node in monotype font + text = utils.unescape(text) + node = nodes.literal(rawtext, text, **options) + + # Get the resolver from the register and create an url from it. + try: + url = xlink.api_register[name].get_url(target) + except IndexError, exc: + msg = inliner.reporter.warning(str(exc), line=lineno) + if problematic: + prb = inliner.problematic(rawtext, text, msg) + return [prb], [msg] + else: + return [node], [] + + if url is not None: + node = nodes.reference(rawtext, '', node, refuri=url, **options) + return [node], [] + + roles.register_local_role(name, resolve_api_name) + + +def setup(app): + + try: + xlink.set_api_file('api', os.path.join(app.outdir, 'api', 'api-objects.txt')) + apiroot = os.getenv('PYLEARN_API_ROOT') + if not apiroot: + apiroot = os.path.join(os.path.realpath('api'), '') + xlink.set_api_root('api', apiroot) + #xlink.create_api_role('api', True) + create_api_role('api', True) + except IOError: + print >>sys.stderr, 'WARNING: Could not find api file! API links will not work.' + + #app.add_role("wiki", role_fn) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/index.txt Thu Apr 15 10:52:02 2010 -0400 @@ -0,0 +1,32 @@ + +Welcome +======= + +Pylearn is a Python library for machine learning, built on top of Theano, our +library for defining, optimizing and evaluating mathematical expressions +involving multi-dimensional arrays. + +This documentation is under construction, but you can already access the +automatically-generated API doc, along with more extensive explanations for +some modules. + +Download +======== + +We recommend the latest development version, available via:: + + hg clone http://hg.assembla.com/pylearn Pylearn + +The ``pylearn`` subfolder should be on your ``$PYTHONPATH``. + +Documentation +============= + +For the moment, the following documentation is available. + +* :doc:`io.SeriesTables module <seriestables>` -- Saves error series and other statistics during training +* `API <api/>`_ -- The automatically-generated API documentation + +You can download the latest `PDF documentation <http://deeplearning.net/software/pylearn/pylearn.pdf>`_, rather than reading it online. + +
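A minimal sanity check after cloning, sketched as a hint only; the clone location below is an assumption, and the directory added to sys.path must be the one that contains the ``pylearn`` package:

    import sys
    sys.path.insert(0, "/path/to/Pylearn")   # assumed location of the 'hg clone' above
    import pylearn
    print pylearn.__file__                   # should point inside the clone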
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/scripts/docgen.py Thu Apr 15 10:52:02 2010 -0400 @@ -0,0 +1,89 @@ +import sys +import os +import shutil +import inspect + +from epydoc import docintrospecter +from epydoc.apidoc import RoutineDoc + +import getopt +from collections import defaultdict + +if __name__ == '__main__': + + # make sure we're in the right directory + this_file_directory = os.path.abspath(os.path.dirname(__file__)) + pylearn_root = os.path.join(os.path.join(this_file_directory, ".."), "..") + + #pylearn_root = "/".join(sys.path[0].split("/")[:-2]) + + options = defaultdict(bool) + options.update(dict([x, y or True] for x, y in getopt.getopt(sys.argv[1:], 'o:', ['epydoc', 'rst', 'help', 'nopdf'])[0])) + if options['--help']: + print 'Usage: %s [OPTIONS]' % sys.argv[0] + print ' -o <dir>: output the html files in the specified dir' + print ' --rst: only compile the doc (requires sphinx)' + print ' --nopdf: do not produce a PDF file from the doc, only HTML' + print ' --epydoc: only compile the api documentation (requires epydoc)' + print ' --help: this help' + sys.exit(0) + + options['--all'] = not (bool(options['--epydoc']) ^ bool(options['--rst'])) + + def mkdir(path): + try: + os.mkdir(path) + except OSError: + pass + + outdir = options['-o'] or (pylearn_root + '/html') + mkdir(outdir) + os.chdir(outdir) + mkdir("doc") + mkdir("api") + + # Make sure the appropriate 'theano' directory is in the PYTHONPATH + pythonpath = os.environ.get('PYTHONPATH', '') + pythonpath = pylearn_root + ':' + pythonpath + os.environ['PYTHONPATH'] = pythonpath + + if options['--all'] or options['--epydoc']: + from epydoc.cli import cli + sys.path[0:0] = [pylearn_root] + + #Generate HTML doc + + ## This causes problems with the subsequent generation of sphinx doc + #sys.argv[:] = ['', '--config', '%s/doc/api/epydoc.conf' % pylearn_root, '-o', 'api'] + #cli() + ## So we use this instead + os.system("epydoc --config %s/doc/api/epydoc.conf -o api" % pylearn_root) + + # Generate PDF doc + # TODO + + if options['--all'] or options['--rst']: + import sphinx + sys.path[0:0] = [os.path.join(pylearn_root, 'doc')] + sphinx.main(['', '-E', os.path.join(pylearn_root, 'doc'), '.']) + + if not options['--nopdf']: + # Generate latex file in a temp directory + import tempfile + workdir = tempfile.mkdtemp() + sphinx.main(['', '-E', '-b', 'latex', + os.path.join(pylearn_root, 'doc'), workdir]) + # Compile to PDF + os.chdir(workdir) + os.system('make') + try: + shutil.copy(os.path.join(workdir, 'pylearn.pdf'), outdir) + os.chdir(outdir) + shutil.rmtree(workdir) + except OSError, e: + print 'OSError:', e + except IOError, e: + print 'IOError:', e + + +
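Per the --help text above, typical invocations look like the following (sketched with os.system, the way the script itself shells out; the output path is an assumption):

    import os
    os.system("python doc/scripts/docgen.py --rst --nopdf")        # Sphinx HTML only, skip the PDF
    os.system("python doc/scripts/docgen.py --epydoc")             # API documentation only
    os.system("python doc/scripts/docgen.py -o /tmp/pylearn_html") # everything, custom output directory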
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/seriestables.txt Thu Apr 15 10:52:02 2010 -0400 @@ -0,0 +1,291 @@ +.. SeriesTables documentation master file, created by + sphinx-quickstart on Wed Mar 10 17:56:41 2010. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Introduction to ``SeriesTables`` +-------------------------------- + +SeriesTables was created to make it easier to **record scalar data series**, such as, notably, the **evolution of errors (training, valid, test) during training**. There are other common usecases I foresee, such as **recording basic statistics (mean, min/max, variance) of parameters** during training, to diagnose problems. + +I also think that if such recording is easily accessible, it might lead us to record other statistics, such as stats concerning activations in the network (i.e. to diagnose unit saturation problems). + +Each **element of a series is indexed and timestamped**. By default, for example, the index is named "epoch", which means that with each row an epoch number is stored (but this can be easily customized). By default, the timestamp at row creation time will also be stored, along with the CPU clock() time. This is to allow graphs plotting error series against epoch or training time. + +Series are saved in HDF5 files, which I'll introduce briefly. + +Introduction to PyTables and HDF5 +--------------------------------- + +HDF5_ is a file format intended for storage of big numerical datasets. In practice, for our concern, you'll create a single ``.h5`` file, in which many tables, corresponding to different series, will be stored. Datasets in a single file are organized hierarchically, in the equivalent of "folders" called "groups". The "files" in the analogy would be our tables. + +.. _HDF5: http://www.hdfgroup.org/HDF5/ + +A useful property of HDF5 is that metadata is stored along with the data itself. Notably, we have the table names and column names inside the file. We can also attach more complex data, such as title, or even complex objects (which will be pickled), as attributes. + +PyTables_ is a Python library to use the HDF5 format. + +.. _PyTables: http://www.pytables.org/moin/HowToUse + +Here's a basic Python session in which I create a new file and store a few rows in a single table: + +>>> import tables +>>> +>>> hdf5_file = tables.openFile("mytables.h5", "w") +>>> +>>> # Create a new subgroup under the root group "/" +... mygroup = hdf5_file.createGroup("/", "mygroup") +>>> +>>> # Define the type of data we want to store +... class MyDescription(tables.IsDescription): +... int_column_1 = tables.Int32Col(pos=0) +... float_column_1 = tables.Float32Col(pos=1) +... +>>> # Create a table under mygroup +... mytable = hdf5_file.createTable("/mygroup", "mytable", MyDescription) +>>> +>>> newrow = mytable.row +>>> +>>> # a first row +... newrow["int_column_1"] = 15 +>>> newrow["float_column_1"] = 30.0 +>>> newrow.append() +>>> +>>> # and a second row +... newrow["int_column_1"] = 16 +>>> newrow["float_column_1"] = 32.0 +>>> newrow.append() +>>> +>>> # make sure we write to disk +... hdf5_file.flush() +>>> +>>> hdf5_file.close() + + +And here's a session in which I reload the data and explore it: + +>>> import tables +>>> +>>> hdf5_file = tables.openFile("mytables.h5", "r") +>>> +>>> mytable = hdf5_file.getNode("/mygroup", "mytable") +>>> +>>> # tables can be "sliced" this way +... 
mytable[0:2] +array([(15, 30.0), (16, 32.0)], + dtype=[('int_column_1', '<i4'), ('float_column_1', '<f4')]) +>>> +>>> # or we can access columns individually +... mytable.cols.int_column_1[0:2] +array([15, 16], dtype=int32) + + +Using ``SeriesTables``: a basic example +--------------------------------------- + +Here's a very basic example usage: + +>>> import tables +>>> from pylearn.io.seriestables import * +>>> +>>> tables_file = tables.openFile("series.h5", "w") +>>> +>>> error_series = ErrorSeries(error_name="validation_error", \ +... table_name="validation_error", \ +... hdf5_file=tables_file) +>>> +>>> error_series.append((1,), 32.0) +>>> error_series.append((2,), 28.0) +>>> error_series.append((3,), 26.0) + +I can then open the file ``series.h5``, which will contain a table named ``validation_error`` with a column name ``epoch`` and another named ``validation_error``. There will also be ``timestamp`` and ``cpuclock`` columns, as this is the default behavior. The table rows will correspond to the data added with ``append()`` above. + +Indices +....... + +You may notice that the first parameter in ``append()`` is a tuple. This is because the *index* may have multiple levels. The index is a way for rows to have an order. + +In the default case for ErrorSeries, the index only has an "epoch", so the tuple only has one element. But in the ErrorSeries(...) constructor, you could have specified the ``index_names`` parameter, e.g. ``('epoch','minibatch')``, which would allow you to specify both the epoch and the minibatch as index. + + +Summary of the most useful classes +---------------------------------- + +By default, for each of these series, there are also columns for timestamp and CPU clock() value when append() is called. This can be changed with the store_timestamp and store_cpuclock parameters of their constructors. + +ErrorSeries + This records one floating point (32 bit) value along with an index in a new table. + +AccumulatorSeriesWrapper + This wraps another Series and calls its ``append()`` method when its own ``append()`` as been called N times, N being a parameter when constructing the ``AccumulatorSeriesWrapper``. A simple use case: say you want to store the mean of the training error every 100 minibatches. You create an ErrorSeries, wrap it with an Accumulator and then call its ``append()`` for every minibatch. It will collect the errors, wait until it has 100, then take the mean (with ``numpy.mean``) and store it in the ErrorSeries, and start over again. + Other "reducing" functions can be used instead of "mean". + +BasicStatisticsSeries + This stores the mean, the min, the max and the standard deviation of arrays you pass to its ``append()`` method. This is useful, notably, to see how the weights (and other parameters) evolve during training without actually storing the parameters themselves. + +SharedParamsStatisticsWrapper + This wraps a few BasicStatisticsSeries. It is specifically designed so you can pass it a list of shared (as in theano.shared) parameter arrays. Each array will get its own table, under a new HDF5 group. You can name each table, e.g. "layer1_b", "layer1_W", etc. + +Example of real usage +--------------------- + +The following is a function where I create the series used to record errors and statistics about parameters in a stacked denoising autoencoder script: + +.. code-block:: python + + def create_series(num_hidden_layers): + + # Replace series we don't want to save with DummySeries, e.g. 
+ # series['training_error'] = DummySeries() + + series = {} + + basedir = os.getcwd() + + h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w") + + # training error is accumulated over 100 minibatches, + # then the mean is computed and saved in the training_base series + training_base = \ + ErrorSeries(error_name="training_error", + table_name="training_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Training error (mean over 100 minibatches)") + + # this series wraps training_base, performs accumulation + series['training_error'] = \ + AccumulatorSeriesWrapper(base_series=training_base, + reduce_every=100) + + # valid and test are not accumulated/mean, saved directly + series['validation_error'] = \ + ErrorSeries(error_name="validation_error", + table_name="validation_error", + hdf5_file=h5f, + index_names=('epoch',)) + + series['test_error'] = \ + ErrorSeries(error_name="test_error", + table_name="test_error", + hdf5_file=h5f, + index_names=('epoch',)) + + # next we want to store the parameters statistics + # so first we create the names for each table, based on + # position of each param in the array + param_names = [] + for i in range(num_hidden_layers): + param_names += ['layer%d_W'%i, 'layer%d_b'%i, 'layer%d_bprime'%i] + param_names += ['logreg_layer_W', 'logreg_layer_b'] + + + series['params'] = SharedParamsStatisticsWrapper( + new_group_name="params", + base_group="/", + arrays_names=param_names, + hdf5_file=h5f, + index_names=('epoch',)) + + return series + +Then, here's an example of append() usage for each of these series, wrapped in pseudocode: + +.. code-block:: python + + series = create_series(num_hidden_layers=3) + + ... + + for epoch in range(num_epochs): + for mb_index in range(num_minibatches): + train_error = finetune(mb_index) + series['training_error'].append((epoch, mb_index), train_error) + + valid_error = compute_validation_error() + series['validation_error'].append((epoch,), valid_error) + + test_error = compute_test_error() + series['test_error'].append((epoch,), test_error) + + # suppose all_params is a list [layer1_W, layer1_b, ...] + # where each element is a shared (as in theano.shared) array + series['params'].append((epoch,), all_params) + +Other targets for appending (e.g. printing to stdout) +----------------------------------------------------- + +SeriesTables was created with an HDF5 file in mind, but often, for debugging, +it's useful to be able to redirect the series elsewhere, notably the standard +output. A mechanism was added to do just that. + +What you do is you create a ``AppendTarget`` instance (or more than one) and +pass it as an argument to the Series constructor. For example, to print every +row appended to the standard output, you use StdoutAppendTarget. + +If you want to skip appending to the HDF5 file entirely, this is also +possible. You simply specify ``skip_hdf5_append=True`` in the constructor. You +still need to pass in a valid HDF5 file, though, even though nothing will be +written to it (for, err, legacy reasons). + +Here's an example: + +.. code-block:: python + + def create_series(num_hidden_layers): + + # Replace series we don't want to save with DummySeries, e.g. 
+ # series['training_error'] = DummySeries() + + series = {} + + basedir = os.getcwd() + + h5f = tables.openFile(os.path.join(basedir, "series.h5"), "w") + + # Here we create the new target, with a message prepended + # before every row is printed to stdout + stdout_target = \ + StdoutAppendTarget( \ + prepend='\n-----------------\nValidation error', + indent_str='\t') + + # Notice here we won't even write to the HDF5 file + series['validation_error'] = \ + ErrorSeries(error_name="validation_error", + table_name="validation_error", + hdf5_file=h5f, + index_names=('epoch',), + other_targets=[stdout_target], + skip_hdf5_append=True) + + return series + + +Now calls to series['validation_error'].append() will print to stdout outputs +like:: + + ---------------- + Validation error + timestamp : 1271202144 + cpuclock : 0.12 + epoch : 1 + validation_error : 30.0 + + ---------------- + Validation error + timestamp : 1271202144 + cpuclock : 0.12 + epoch : 2 + validation_error : 26.0 + + +Visualizing in vitables +----------------------- + +vitables_ is a program with which you can easily explore an HDF5 ``.h5`` file. Here's a screenshot in which I visualize series produced for the preceding example: + +.. _vitables: http://vitables.berlios.de/ + +.. image:: images/vitables_example_series.png +
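Not covered above: once training is done, the recorded series can be read back with plain PyTables, like any other table. A small sketch, following the file and table names from the basic example (the expected values correspond to the three append() calls there):

    import tables

    h5 = tables.openFile("series.h5", "r")
    table = h5.getNode("/", "validation_error")
    print table.cols.epoch[:]              # [1 2 3]
    print table.cols.validation_error[:]   # [ 32.  28.  26.]
    h5.close()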
--- a/pylearn/algorithms/tests/test_daa.py Thu Apr 15 10:50:10 2010 -0400 +++ b/pylearn/algorithms/tests/test_daa.py Thu Apr 15 10:52:02 2010 -0400 @@ -6,12 +6,14 @@ import time import pylearn.algorithms.logistic_regression -from theano.compile.mode import default_mode - -def test_train_daa(mode = default_mode): +from theano import config +from pylearn.algorithms.stacker import Stacker +from pylearn.algorithms.daa import SigmoidXEDenoisingAA +from pylearn.algorithms.regressor import BinRegressor +def test_train_daa(mode = config.mode): ndaa = 3 - daa = models.Stacker([(models.SigmoidXEDenoisingAA, 'hidden')] * ndaa + [(models.BinRegressor, 'output')], + daa = Stacker([(SigmoidXEDenoisingAA, 'hidden')] * ndaa + [(BinRegressor, 'output')], regularize = False) model = daa.make([4, 20, 20, 20, 1], @@ -39,7 +41,7 @@ def test_train_daa2(mode = theano.Mode('c|py', 'fast_run')): ndaa = 3 - daa = models.Stacker([(models.SigmoidXEDenoisingAA, 'hidden')] * ndaa + [(pylearn.algorithms.logistic_regression.Module_Nclass, 'pred')], + daa = Stacker([(SigmoidXEDenoisingAA, 'hidden')] * ndaa + [(pylearn.algorithms.logistic_regression.Module_Nclass, 'pred')], regularize = False) model = daa.make([4] + [20] * ndaa + [10],
--- a/pylearn/algorithms/tests/test_exponential_mean.py Thu Apr 15 10:50:10 2010 -0400 +++ b/pylearn/algorithms/tests/test_exponential_mean.py Thu Apr 15 10:52:02 2010 -0400 @@ -1,4 +1,5 @@ import theano, numpy +from theano.compile.debugmode import DebugMode from pylearn.algorithms import exponential_mean def test_mean(): @@ -50,6 +51,9 @@ assert i > rows_to_test def test_dynamic_normalizer(): + mode = theano.compile.mode.get_default_mode() + if isinstance(mode,DebugMode): + mode = 'FAST_RUN' x = theano.tensor.dvector() rows_to_test = 100 @@ -76,7 +80,7 @@ M.f = theano.Method([x], [D.output, M.dn_mean.curval, M.dn_var.curval, M.x_mean.curval] , updates) - m = M.make() + m = M.make(mode=mode) m.dn.initialize() m.dn_mean.initialize() m.dn_var.initialize()
--- a/pylearn/algorithms/tests/test_sgd.py Thu Apr 15 10:50:10 2010 -0400 +++ b/pylearn/algorithms/tests/test_sgd.py Thu Apr 15 10:52:02 2010 -0400 @@ -1,6 +1,11 @@ import theano +from theano.compile.debugmode import DebugMode from pylearn.algorithms import sgd +mode = theano.compile.mode.get_default_mode() +if isinstance(mode,DebugMode): + mode = 'FAST_RUN' + def test_sgd0(): x = theano.tensor.dscalar('x') @@ -8,7 +13,7 @@ M = sgd.StochasticGradientDescent([x], (1.0 - x * y)**2, [y], stepsize=0.01) M.y = y - m = M.make() + m = M.make(mode=mode) m.y = 5.0 for i in xrange(100): c = m.step_cost(3.0) @@ -26,7 +31,7 @@ M = sgd.StochasticGradientDescent([x], (1.0 - x * y)**2, [y], stepsize=lr) M.y = y M.lr = lr - m = M.make() + m = M.make(mode=mode) m.y = 5.0 m.lr = 0.01 for i in xrange(100): @@ -54,7 +59,7 @@ M = sgd.StochasticGradientDescent([x], (1.0 - x * y)**2, [y]) M.y = y - m = M.make() + m = M.make(mode=mode) m.y = 5.0 #there should be a learning rate here by default assert m.stepsize is None
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/dataset_ops/majorminer.py Thu Apr 15 10:52:02 2010 -0400 @@ -0,0 +1,63 @@ +from __future__ import absolute_import + +import os +import numpy + +import theano +import theano.sparse +import scipy.sparse + +from ..datasets.majorminer import Meta + +_meta = None + +class MajorMiner(theano.Op): + """Meta-information of major-miner dataset""" + + def __init__(self, meta=None): + global _meta + # on construction we make sure a *global* configuration is set + # this is done because self.* might get pickled and we don't want to pickle + # the whole dataset + if _meta is None: + if meta is None: _meta = Meta() + else: _meta = meta + else: + if meta is None: pass # no problem, we use global _meta + else: raise NotImplementedError('global MajorMiner meta-information already set') + + def __eq__(self, other): + return type(self) == type(other) + def __hash__(self): + return hash(type(self)) + + def make_node(self, idx): + _idx = theano.tensor.as_tensor_variable(idx, ndim=0) + return theano.Apply(self, + [_idx], + [theano.sparse.csr_matrix('MajorMiner.tag_counts'), + theano.generic('MajorMiner.track_path')]) + def perform(self, node, (idx,), out_storage): + global _meta + lil = scipy.sparse.lil_matrix((1, len(_meta.tags)), dtype='int8') + + for tag_id, count in _meta.track_tags[idx]: + lil[0,tag_id] = count + + out_storage[0][0] = lil.tocsr() + out_storage[1][0] = _meta.tracks[idx] + + def grad(self, inputs, output): + return [None for i in inputs] + + +def test_basic(): + a = theano.tensor.lvector() + f = theano.function([a], MajorMiner()(a)) + print 'f([0]):', f([0]) + rval_0_1 = f([0,1]) + rval_0_8 = f([0,8]) + + assert rval_0_1[1][0] == rval_0_8[1][0] #compare strings + assert rval_0_1[1][1] != rval_0_8[1][1] #track 1 != track 8 +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/majorminer.py Thu Apr 15 10:52:02 2010 -0400 @@ -0,0 +1,157 @@ +""" +Load the MajorMiner dataset +""" + +import logging, os,sys +from .config import data_root +_logger = logging.getLogger('pylearn.datasets.majorminer') + +def three_column(tagfile=None, trackroot=None, expected_tagfile_len=51556): + """Load meta-information of major-miner dataset + + Data is stored as a three-column file: + + <tag> <count> <mp3 path> + + This function returns the parsed file as a list of 3-tuples. + + """ + if tagfile is None: + tagfile = os.path.join(data_root(), 'majorminer', 'three_column.txt') + _logger.info('Majorminer loading %s'%tagfile) + + if trackroot is None: + trackroot = os.path.join(data_root(), 'majorminer') + _logger.info('Majorminer using trackroot %s'%tagfile) + + tag_count_track = [] + + for line in open(tagfile): + if line: + tag, count, track = line[:-1].split('\t') + tag_count_track.append((tag, int(count), os.path.join(trackroot, track))) + + if expected_tagfile_len: + if len(tag_count_track) != expected_tagfile_len: + raise Exception('Wrong number of files listed') + + return tag_count_track + +try: + import mad +except ImportError: + pass + +def remove_bad_tracks(three_col, min_seconds=8): + """Heuristically filter the three_col data to contain only valid tracks + """ + bad_tracks = set() + all_tracks = set() + + silent_tracks = [] + missing_in_action = [] + too_short = [] + + try: + _file = mad.MadFile + test_len = True + except: + _file = file + test_len = False + + + for tag, count, track in three_col: + if track in all_tracks: + continue + all_tracks.add(track) + if tag in set(['silence', 'end', 'nothing']): + bad_tracks.add(track) + silent_tracks.append(track) + _logger.debug("silent file: %s" % track) + continue + + try: + t = _file(track) + except IOError: + bad_tracks.add(track) + missing_in_action.append(track) + _logger.debug("missing file: %s"% track) + # it is normal to have 2 + #if len(missing_in_action) > 5: + #raise Exception('Too many missing files:', missing_in_action) + continue + + if test_len and t.total_time() < min_seconds*1000: + # too short + bad_tracks.add(track) + _logger.debug("short file: %f %s" %(t.total_time(), track)) + too_short.append((track, t.total_time())) + # it is normal to have maybe 10? + #if len(too_short) > 40: + #raise Exception('Too many short files:', too_short) + continue + + if silent_tracks: + _logger.warning("removed %i silent files"% len(silent_tracks)) + + if missing_in_action: + _logger.warning("missing %i files"% len(missing_in_action)) + + if too_short: + _logger.warning("discarded %i files less than %f seconds long"%( + len(too_short), min_seconds)) + + _logger.info("kept %i of %i tracks"% (len(all_tracks)-len(bad_tracks), + len(all_tracks))) + + # return a cleaned three_column list + rval = [] + for tag, count, track in three_col: + if track not in bad_tracks: + rval.append((tag, count, track)) + return rval + + + +def list_tracks(three_col): + tracks = list(set(tup[2] for tup in three_col)) + tracks.sort() + return tracks + +def list_tags(three_col): + tags = list(set(tup[0] for tup in three_col)) + tags.sort() + return tags + +def track_tags(three_col, tracks, tags): + """Return the count of each tag for each track + [ [(tag_id, count), (tag_id, count), ...], <---- for tracks[0] + [(tag_id, count), (tag_id, count), ...], <---- for tracks[1] + ... 
+ ] + """ + tag_id = dict(((t,i) for i,t in enumerate(tags))) + track_id = dict(((t,i) for i,t in enumerate(tracks))) + rval = [[] for t in tracks] + for tag, count, track in three_col: + rval[track_id[track]].append((tag_id[tag], count)) + return rval + + + +class Meta(object): + def __init__(self, tagfile=None, trackroot=None, expected_tagfile_len=51556, + filter_broken=True): + self.three_column = three_column(tagfile, trackroot, expected_tagfile_len) + if filter_broken: + self.three_column = remove_bad_tracks(self.three_column) + self.tracks = list_tracks(self.three_column) + self.tags = list_tags(self.three_column) + self.track_tags = track_tags(self.three_column, self.tracks, self.tags) + + _logger.info('MajorMiner meta-information: %i tracks, %i tags' %( + len(self.tracks), len(self.tags))) + + #for tt in self.track_tags: + # print tt +
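A minimal usage sketch for this module, assuming the MajorMiner data sits under data_root()/majorminer as the defaults above expect:

    from pylearn.datasets.majorminer import Meta

    meta = Meta()                 # parses three_column.txt and filters out broken tracks
    print len(meta.tracks), "tracks,", len(meta.tags), "tags"
    print meta.track_tags[0][:3]  # first few (tag_id, count) pairs for the first track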
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/datasets/nist_digits.py Thu Apr 15 10:52:02 2010 -0400 @@ -0,0 +1,66 @@ +""" +Provides a Dataset to access the nist digits dataset. +""" + +import os, numpy +from pylearn.io import filetensor as ft +from pylearn.datasets.config import data_root # config +from pylearn.datasets.dataset import Dataset + +from pylearn.datasets.nist_sd import nist_to_float_11, nist_to_float_01 + + +def load(dataset = 'train', attribute = 'data'): + """Load the filetensor corresponding to the set and attribute. + + :param dataset: str that is 'train', 'valid' or 'test' + :param attribute: str that is 'data' or 'labels' + """ + fn = 'digits_' + dataset + '_' + attribute + '.ft' + fn = os.path.join(data_root(), 'nist', 'by_class', 'digits', fn) + + fd = open(fn) + data = ft.read(fd) + fd.close() + + return data + +def train_valid_test(ntrain=285661, nvalid=58646, ntest=58646, path=None, + range = '01'): + """ + Load the nist digits dataset as a Dataset. + + @note: the examples are uint8 and the labels are int32. + @todo: possibility of loading part of the data. + """ + rval = Dataset() + + # + rval.n_classes = 10 + rval.img_shape = (32,32) + + if range == '01': + rval.preprocess = nist_to_float_01 + elif range == '11': + rval.preprocess = nist_to_float_11 + else: + raise ValueError('Nist Digits dataset does not support range = %s' % range) + print "Nist Digits dataset: using preproc will provide inputs in the %s range." \ + % range + + # train + examples = load(dataset = 'train', attribute = 'data') + labels = load(dataset = 'train', attribute = 'labels') + rval.train = Dataset.Obj(x=examples[:ntrain], y=labels[:ntrain]) + + # valid + rval.valid = Dataset.Obj(x=examples[285661:285661+nvalid], y=labels[285661:285661+nvalid]) + + # test + examples = load(dataset = 'test', attribute = 'data') + labels = load(dataset = 'test', attribute = 'labels') + rval.test = Dataset.Obj(x=examples[:ntest], y=labels[:ntest]) + + return rval + +
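A usage sketch, assuming the NIST filetensor files are present under data_root()/nist/by_class/digits:

    from pylearn.datasets.nist_digits import train_valid_test

    data = train_valid_test(range='01')   # data.preprocess maps the uint8 inputs into [0, 1]
    print data.n_classes, data.img_shape  # 10 (32, 32)
    print data.train.x.shape, data.train.y.shape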
--- a/pylearn/io/audio.py Thu Apr 15 10:50:10 2010 -0400 +++ b/pylearn/io/audio.py Thu Apr 15 10:52:02 2010 -0400 @@ -1,10 +1,148 @@ - +import subprocess, sys import numpy import theano from wavread import WavRead, wav_read_int16, wav_read_double +import mad -try: #define audioread and company only if pygmy.audio can be imported +def gen_mp3(madfile, dtype, scale): + printed = False + + while True: + b = madfile.read() + if b is None: + break + b = numpy.frombuffer(b, dtype='int16') + #print len(b), b.min(), b.max() + if not printed: + bb = b.reshape((len(b)/2,2)) + print bb[1000:1020] + #print 'first 10 mp3samples', b[:10] + #print b[:10] * (1.0 / 2**15) + printed = True + n = len(b) + assert not (n%2) + yield scale*numpy.asarray(b, dtype=dtype).reshape((n/2, 2)) #cast and reshape + +class AudioRead(theano.Op): + #TODO: add the samplerate as an output + """Read an mp3 (other formats not implemented yet) + + Depends on 'madplay' being on system path. + + input - filename + output - the contents of the audiofile in pcm format + + """ + def __init__(self, channels=2, sr=22050, dtype=theano.config.floatX): + """ + :param channels: output this many channels + :param sr: output will be encoded at this samplerate + :param dtype: output will have this dtype + """ + self.dtype = dtype + if dtype not in ('float32', 'float64', 'int16'): + raise NotImplementedError('dtype', dtype) + self.channels = channels + self.sr = sr + + def __eq__(self, other): + return (type(self) == type(other)) and self.dtype == other.dtype \ + and self.channels == other.channels and self.sr == other.sr + + def __hash__(self): + return hash(type(self)) ^ hash(self.dtype) ^ hash(self.channels) ^ hash(self.sr) + + def make_node(self, path): + bcast = (False,) *self.channels + otype = theano.tensor.TensorType(broadcastable=bcast, dtype=self.dtype) + return theano.Apply(self, [path], [otype(),]) + + def perform(self, node, (path,), (data_storage, )): + if path.upper().endswith('.MP3'): + cmd = ['madplay'] + cmd.extend(['--sample-rate', str(self.sr)]) + cmd.extend(['-o', 'raw:/dev/stdout']) + cmd.extend(['-d',]) + if self.channels==1: + cmd.extend(['--mono']) + elif self.channels==2: + cmd.extend(['--stereo']) + else: + raise NotImplementedError("weird number of channels", self.channels) + cmd.append(path) + + proc = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE) + proc_stdout, proc_stderr = proc.communicate() + assert proc.returncode is not None # process should be finished + if proc.returncode: + print >> sys.stderr, proc_stderr + raise Exception('cmd %s returned code %i'%(' '.join(cmd),proc.returncode)) + + int16samples= numpy.frombuffer(proc_stdout, dtype='int16') + if self.dtype == 'float32': + typedsamples = numpy.asarray(int16samples, dtype='float32') / numpy.float32(2**15) + elif self.dtype == 'float64': + typedsamples = int16samples * (1.0/2**15) + elif self.dtype == 'int16': + typedsamples = int16samples + else: + raise NotImplementedError() + + if self.channels==2: + typedsamples = typedsamples.reshape((len(typedsamples)/2,2)) + else: + #TODO: if extension is .wav use the 'wave' module in the stdlib + # see test_audioread below for usage + raise NotImplementedError() + + assert typedsamples.dtype == self.dtype + assert len(typedsamples.shape) == self.channels, (typedsamples.shape, self.channels) + data_storage[0] = typedsamples + + def grad(self, inputs, g_output): + return [None for i in inputs] + + +def test_audioread(): + # + # Not really a unit test because it depends on files that are 
probably not around anymore. + # Still, the basic idea is to decode externally, and compare with wavread. + # + + mp3path = "/home/bergstra/data/majorminer/mp3/Mono/Formica Blues/03 Slimcea Girl_003.20_003.30.mp3" + + dstorage = [None] + AudioRead(channels=1, dtype='float32', sr=44100).perform(None, (mp3path,), (dstorage, )) + mp3samples = dstorage[0] + + wavpath = "/home/bergstra/tmp/blah2.wav" + import wave, numpy + wavfile = wave.open(wavpath) + assert wavfile.getsampwidth()==2 # bytes + wavsamples = numpy.frombuffer( + wavfile.readframes(wavfile.getnframes()), + dtype='int16') + wavsamples = wavsamples.reshape((wavfile.getnframes(), wavfile.getnchannels())) + wavsamples_as_float = numpy.asarray(wavsamples, dtype='float32') / 2**15 + + print 'wavsamples 1000:1020:', wavsamples[1000:1020].mean(axis=1) + print 'mp3samples 1000:1020:', mp3samples[1000:1020]*2**15 + print 'wavsample range', wavsamples.min(), wavsamples.max() + print 'mp3sample range', mp3samples.min(), mp3samples.max() + + print mp3samples.shape, mp3samples.dtype + print wavsamples.shape, wavsamples.dtype + + #assert mp3samples.shape == wavsamples.shape + #assert mp3samples.dtype == wavsamples_as_float.dtype + + #print wavsamples_as_float[:5] + #print mp3samples[:5] + + + +if 0: ### OLD CODE USING PYGMY import pygmy.audio class AudioRead(theano.Op): @@ -42,6 +180,7 @@ def make_node(self, path): out_type = theano.tensor.dvector if self.mono else theano.tensor.dmatrix return theano.Apply(self, [path], [out_type(), theano.tensor.dscalar()]) + def perform(self, node, (path,), (data_storage, sr_storage)): data, sr, dz = pygmy.audio.audioread(path, mono=self.mono, @@ -64,6 +203,3 @@ audioread = AudioRead() audioread_mono = AudioRead(mono=True) -except ImportError: - pass -
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/io/seriestables/__init__.py Thu Apr 15 10:52:02 2010 -0400 @@ -0,0 +1,2 @@ +from series import ErrorSeries, BasicStatisticsSeries, AccumulatorSeriesWrapper, SeriesArrayWrapper, SharedParamsStatisticsWrapper, DummySeries, StdoutAppendTarget, AppendTarget +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/io/seriestables/series.py Thu Apr 15 10:52:02 2010 -0400 @@ -0,0 +1,684 @@ +import tables + +import numpy +import time + +############################################################################## +# Utility functions to create IsDescription objects (pytables data types) + +''' +The way these "IsDescription constructor" work is simple: write the +code as if it were in a file, then exec()ute it, leaving us with +a local-scoped LocalDescription which may be used to call createTable. + +It's a small hack, but it's necessary as the names of the columns +are retrieved based on the variable name, which we can't programmatically set +otherwise. +''' + +def _get_description_timestamp_cpuclock_columns(store_timestamp, store_cpuclock, pos=0): + toexec = "" + + if store_timestamp: + toexec += "\ttimestamp = tables.Time32Col(pos="+str(pos)+")\n" + pos += 1 + + if store_cpuclock: + toexec += "\tcpuclock = tables.Float64Col(pos="+str(pos)+")\n" + pos += 1 + + return toexec, pos + +def _get_description_n_ints(int_names, int_width=64, pos=0): + """ + Begins construction of a class inheriting from IsDescription + to construct an HDF5 table with index columns named with int_names. + + See Series().__init__ to see how those are used. + """ + int_constructor = "tables.Int64Col" + if int_width == 32: + int_constructor = "tables.Int32Col" + elif not int_width in (32, 64): + raise "int_width must be left unspecified, or should equal 32 or 64" + + toexec = "" + + for n in int_names: + toexec += "\t" + n + " = " + int_constructor + "(pos=" + str(pos) + ")\n" + pos += 1 + + return toexec, pos + +def _get_description_with_n_ints_n_floats(int_names, float_names, + int_width=64, float_width=32, + store_timestamp=True, store_cpuclock=True): + """ + Constructs a class to be used when constructing a table with PyTables. + + This is useful to construct a series with an index with multiple levels. + E.g. if you want to index your "validation error" with "epoch" first, then + "minibatch_index" second, you'd use two "int_names". + + Parameters + ---------- + int_names : tuple of str + Names of the int (e.g. index) columns + float_names : tuple of str + Names of the float (e.g. error) columns + int_width : {'32', '64'} + Type of ints. + float_width : {'32', '64'} + Type of floats. 
+ store_timestamp : bool + See __init__ of Series + store_cpuclock : bool + See __init__ of Series + + Returns + ------- + A class object, to pass to createTable() + """ + + toexec = "class LocalDescription(tables.IsDescription):\n" + + toexec_, pos = _get_description_timestamp_cpuclock_columns(store_timestamp, store_cpuclock) + toexec += toexec_ + + toexec_, pos = _get_description_n_ints(int_names, int_width=int_width, pos=pos) + toexec += toexec_ + + float_constructor = "tables.Float32Col" + if float_width == 64: + float_constructor = "tables.Float64Col" + elif not float_width in (32, 64): + raise "float_width must be left unspecified, or should equal 32 or 64" + + for n in float_names: + toexec += "\t" + n + " = " + float_constructor + "(pos=" + str(pos) + ")\n" + pos += 1 + + exec(toexec) + + return LocalDescription + + +############################################################################## +# Generic target helpers, other than HDF5 itself + +class AppendTarget(object): + def __init__(self): + pass + + def append(self, table, row): + pass + +class StdoutAppendTarget(AppendTarget): + ''' + Every append() translates into the row being printed on stdout, + each field on a line of the form "column_name : value" + ''' + def __init__(self, prepend='\n', indent_str='\t'): + ''' + Parameters + ---------- + prepend : str + String to prepend before each "append()" is dumped on stdout. + indent_str : str + Chars to prepend to each line + ''' + self.prepend = prepend + self.indent_str = indent_str + + def append(self, table, row): + print self.prepend + pretty_print_row(table, row, self.indent_str) + +def pretty_print_row(table, row, indent): + for key in table.colnames: + print indent, key, ":", row[key] + +class CallbackAppendTarget(AppendTarget): + ''' + Mostly to be used for tests. + ''' + def __init__(self, callback): + self.callback = callback + + def append(self, table, row): + self.callback(table, row) + +############################################################################## +# Series classes + +# Shortcut to allow passing a single int as index, instead of a tuple +def _index_to_tuple(index): + if type(index) == tuple: + return index + + if type(index) == list: + index = tuple(index) + return index + + try: + if index % 1 > 0.001 and index % 1 < 0.999: + raise + idx = long(index) + return (idx,) + except: + raise TypeError("index must be a tuple of integers, or at least a single integer") + +class Series(object): + """ + Base Series class, with minimal arguments and type checks. + + Yet cannot be used by itself (it's append() method raises an error) + """ + + def __init__(self, table_name, hdf5_file, index_names=('epoch',), + title="", hdf5_group='/', + store_timestamp=True, store_cpuclock=True, + other_targets=[], skip_hdf5_append=False): + """Basic arguments each Series must get. + + Parameters + ---------- + table_name : str + Name of the table to create under group "hd5_group" (other + parameter). No spaces, ie. follow variable naming restrictions. + hdf5_file : open HDF5 file + File opened with openFile() in PyTables (ie. return value of + openFile). + index_names : tuple of str + Columns to use as index for elements in the series, other + example would be ('epoch', 'minibatch'). This would then allow + you to call append(index, element) with index made of two ints, + one for epoch index, one for minibatch index in epoch. + title : str + Title to attach to this table as metadata. Can contain spaces + and be longer then the table_name. 
+ hdf5_group : str + Path of the group (kind of a file) in the HDF5 file under which + to create the table. + store_timestamp : bool + Whether to create a column for timestamps and store them with + each record. + store_cpuclock : bool + Whether to create a column for cpu clock and store it with + each record. + other_targets : list of str or AppendTarget instances + + """ + + ######################################### + # checks + + if type(table_name) != str: + raise TypeError("table_name must be a string") + if table_name == "": + raise ValueError("table_name must not be empty") + + if not isinstance(hdf5_file, tables.file.File): + raise TypeError("hdf5_file must be an open HDF5 file (use tables.openFile)") + #if not ('w' in hdf5_file.mode or 'a' in hdf5_file.mode): + # raise ValueError("hdf5_file must be opened in write or append mode") + + if type(index_names) != tuple: + raise TypeError("index_names must be a tuple of strings." + \ + "If you have only one element in the tuple, don't forget " +\ + "to add a comma, e.g. ('epoch',).") + for name in index_names: + if type(name) != str: + raise TypeError("index_names must only contain strings, but also"+\ + "contains a "+str(type(name))+".") + + if type(title) != str: + raise TypeError("title must be a string, even if empty") + + if type(hdf5_group) != str: + raise TypeError("hdf5_group must be a string") + + if type(store_timestamp) != bool: + raise TypeError("store_timestamp must be a bool") + + if type(store_cpuclock) != bool: + raise TypeError("store_timestamp must be a bool") + + if type(other_targets) != list: + raise TypeError("other_targets must be a list") + else: + for t in other_targets: + if not isinstance(t, AppendTarget): + raise TypeError("other_targets elements must be instances of AppendTarget") + + if type(skip_hdf5_append) != bool: + raise TypeError("skip_hdf5_append must be a bool") + + ######################################### + + self.table_name = table_name + self.hdf5_file = hdf5_file + self.index_names = index_names + self.title = title + self.hdf5_group = hdf5_group + + self.store_timestamp = store_timestamp + self.store_cpuclock = store_cpuclock + + self.other_targets = other_targets + self.skip_hdf5_append = skip_hdf5_append + + def append(self, index, element): + raise NotImplementedError + + def _timestamp_cpuclock(self, newrow): + if self.store_timestamp: + newrow["timestamp"] = time.time() + + if self.store_cpuclock: + newrow["cpuclock"] = time.clock() + +class DummySeries(): + """ + To put in a series dictionary instead of a real series, to do nothing + when we don't want a given series to be saved. + + E.g. if we'd normally have a "training_error" series in a dictionary + of series, the training loop would have something like this somewhere: + + series["training_error"].append((15,), 20.0) + + but if we don't want to save the training errors this time, we simply + do + + series["training_error"] = DummySeries() + """ + def append(self, index, element): + pass + +class ErrorSeries(Series): + """ + Most basic Series: saves a single float (called an Error as this is + the most common use case I foresee) along with an index (epoch, for + example) and timestamp/cpu.clock for each of these floats. 
+ """ + + def __init__(self, error_name, table_name, + hdf5_file, index_names=('epoch',), + title="", hdf5_group='/', + store_timestamp=True, store_cpuclock=True, + other_targets=[], skip_hdf5_append=False): + """ + For most parameters, see Series.__init__ + + Parameters + ---------- + error_name : str + In the HDF5 table, column name for the error float itself. + """ + + # most type/value checks are performed in Series.__init__ + Series.__init__(self, table_name, hdf5_file, index_names, title, + store_timestamp=store_timestamp, + store_cpuclock=store_cpuclock, + other_targets=other_targets, + skip_hdf5_append=skip_hdf5_append) + + if type(error_name) != str: + raise TypeError("error_name must be a string") + if error_name == "": + raise ValueError("error_name must not be empty") + + self.error_name = error_name + + self._create_table() + + def _create_table(self): + table_description = _get_description_with_n_ints_n_floats( \ + self.index_names, (self.error_name,), + store_timestamp=self.store_timestamp, + store_cpuclock=self.store_cpuclock) + + self._table = self.hdf5_file.createTable(self.hdf5_group, + self.table_name, + table_description, + title=self.title) + + + def append(self, index, error): + """ + Parameters + ---------- + index : tuple of int + Following index_names passed to __init__, e.g. (12, 15) if + index_names were ('epoch', 'minibatch_size'). + A single int (not tuple) is acceptable if index_names has a single + element. + An array will be casted to a tuple, as a convenience. + + error : float + Next error in the series. + """ + index = _index_to_tuple(index) + + if len(index) != len(self.index_names): + raise ValueError("index provided does not have the right length (expected " \ + + str(len(self.index_names)) + " got " + str(len(index))) + + # other checks are implicit when calling newrow[..] =, + # which should throw an error if not of the right type + + newrow = self._table.row + + # Columns for index in table are based on index_names + for col_name, value in zip(self.index_names, index): + newrow[col_name] = value + newrow[self.error_name] = error + + # adds timestamp and cpuclock to newrow if necessary + self._timestamp_cpuclock(newrow) + + for t in self.other_targets: + t.append(self._table, newrow) + + if not self.skip_hdf5_append: + newrow.append() + + self.hdf5_file.flush() + +# Does not inherit from Series because it does not itself need to +# access the hdf5_file and does not need a series_name (provided +# by the base_series.) +class AccumulatorSeriesWrapper(): + ''' + Wraps a Series by accumulating objects passed its Accumulator.append() + method and "reducing" (e.g. calling numpy.mean(list)) once in a while, + every "reduce_every" calls in fact. + ''' + + def __init__(self, base_series, reduce_every, reduce_function=numpy.mean): + """ + Parameters + ---------- + base_series : Series + This object must have an append(index, value) function. + + reduce_every : int + Apply the reduction function (e.g. mean()) every time we get this + number of elements. E.g. if this is 100, then every 100 numbers + passed to append(), we'll take the mean and call append(this_mean) + on the BaseSeries. + + reduce_function : function + Must take as input an array of "elements", as passed to (this + accumulator's) append(). Basic case would be to take an array of + floats and sum them into one float, for example. 
+ """ + self.base_series = base_series + self.reduce_function = reduce_function + self.reduce_every = reduce_every + + self._buffer = [] + + + def append(self, index, element): + """ + Parameters + ---------- + index : tuple of int + The index used is the one of the last element reduced. E.g. if + you accumulate over the first 1000 minibatches, the index + passed to the base_series.append() function will be 1000. + A single int (not tuple) is acceptable if index_names has a single + element. + An array will be casted to a tuple, as a convenience. + + element : float + Element that will be accumulated. + """ + self._buffer.append(element) + + if len(self._buffer) == self.reduce_every: + reduced = self.reduce_function(self._buffer) + self.base_series.append(index, reduced) + self._buffer = [] + + # The >= case should never happen, except if lists + # were appended by accessing _buffer externally (when it's + # intended to be private), which should be a red flag. + assert len(self._buffer) < self.reduce_every + +# Outside of class to fix an issue with exec in Python 2.6. +# My sorries to the god of pretty code. +def _BasicStatisticsSeries_construct_table_toexec(index_names, store_timestamp, store_cpuclock): + toexec = "class LocalDescription(tables.IsDescription):\n" + + toexec_, pos = _get_description_timestamp_cpuclock_columns(store_timestamp, store_cpuclock) + toexec += toexec_ + + toexec_, pos = _get_description_n_ints(index_names, pos=pos) + toexec += toexec_ + + toexec += "\tmean = tables.Float32Col(pos=" + str(pos) + ")\n" + toexec += "\tmin = tables.Float32Col(pos=" + str(pos+1) + ")\n" + toexec += "\tmax = tables.Float32Col(pos=" + str(pos+2) + ")\n" + toexec += "\tstd = tables.Float32Col(pos=" + str(pos+3) + ")\n" + + # This creates "LocalDescription", which we may then use + exec(toexec) + + return LocalDescription + +# Defaults functions for BasicStatsSeries. These can be replaced. +_basic_stats_functions = {'mean': lambda(x): numpy.mean(x), + 'min': lambda(x): numpy.min(x), + 'max': lambda(x): numpy.max(x), + 'std': lambda(x): numpy.std(x)} + +class BasicStatisticsSeries(Series): + + def __init__(self, table_name, hdf5_file, + stats_functions=_basic_stats_functions, + index_names=('epoch',), title="", hdf5_group='/', + store_timestamp=True, store_cpuclock=True, + other_targets=[], skip_hdf5_append=False): + """ + For most parameters, see Series.__init__ + + Parameters + ---------- + series_name : str + Not optional here. Will be prepended with "Basic statistics for " + + stats_functions : dict, optional + Dictionary with a function for each key "mean", "min", "max", + "std". The function must take whatever is passed to append(...) + and return a single number (float). + """ + + # Most type/value checks performed in Series.__init__ + Series.__init__(self, table_name, hdf5_file, index_names, title, + store_timestamp=store_timestamp, + store_cpuclock=store_cpuclock, + other_targets=other_targets, + skip_hdf5_append=skip_hdf5_append) + + if type(hdf5_group) != str: + raise TypeError("hdf5_group must be a string") + + if type(stats_functions) != dict: + # just a basic check. We'll suppose caller knows what he's doing. 
+ raise TypeError("stats_functions must be a dict") + + self.hdf5_group = hdf5_group + + self.stats_functions = stats_functions + + self._create_table() + + def _create_table(self): + table_description = \ + _BasicStatisticsSeries_construct_table_toexec( \ + self.index_names, + self.store_timestamp, self.store_cpuclock) + + self._table = self.hdf5_file.createTable(self.hdf5_group, + self.table_name, table_description) + + def append(self, index, array): + """ + Parameters + ---------- + index : tuple of int + Following index_names passed to __init__, e.g. (12, 15) + if index_names were ('epoch', 'minibatch_size') + A single int (not tuple) is acceptable if index_names has a single + element. + An array will be casted to a tuple, as a convenience. + + array + Is of whatever type the stats_functions passed to + __init__ can take. Default is anything numpy.mean(), + min(), max(), std() can take. + """ + index = _index_to_tuple(index) + + if len(index) != len(self.index_names): + raise ValueError("index provided does not have the right length (expected " \ + + str(len(self.index_names)) + " got " + str(len(index))) + + newrow = self._table.row + + for col_name, value in zip(self.index_names, index): + newrow[col_name] = value + + newrow["mean"] = self.stats_functions['mean'](array) + newrow["min"] = self.stats_functions['min'](array) + newrow["max"] = self.stats_functions['max'](array) + newrow["std"] = self.stats_functions['std'](array) + + self._timestamp_cpuclock(newrow) + + for t in self.other_targets: + t.append(self._table, newrow) + + if not self.skip_hdf5_append: + newrow.append() + + self.hdf5_file.flush() + +class SeriesArrayWrapper(): + """ + Simply redistributes any number of elements to sub-series to respective + append()s. + + To use if you have many elements to append in similar series, e.g. if you + have an array containing [train_error, valid_error, test_error], and 3 + corresponding series, this allows you to simply pass this array of 3 + values to append() instead of passing each element to each individual + series in turn. + """ + + def __init__(self, base_series_list): + """ + Parameters + ---------- + base_series_list : array or tuple of Series + You must have previously created and configured each of those + series, then put them in an array. This array must follow the + same order as the array passed as ``elements`` parameter of + append(). + """ + self.base_series_list = base_series_list + + def append(self, index, elements): + """ + Parameters + ---------- + index : tuple of int + See for example ErrorSeries.append() + + elements : array or tuple + Array or tuple of elements that will be passed down to + the base_series passed to __init__, in the same order. + """ + if len(elements) != len(self.base_series_list): + raise ValueError("not enough or too much elements provided (expected " \ + + str(len(self.base_series_list)) + " got " + str(len(elements))) + + for series, el in zip(self.base_series_list, elements): + series.append(index, el) + +class SharedParamsStatisticsWrapper(SeriesArrayWrapper): + ''' + Save mean, min/max, std of shared parameters place in an array. + + Here "shared" means "theano.shared", which means elements of the + array will have a .value to use for numpy.mean(), etc. + + This inherits from SeriesArrayWrapper, which provides the append() + method. 
+ ''' + + def __init__(self, arrays_names, new_group_name, hdf5_file, + base_group='/', index_names=('epoch',), title="", + store_timestamp=True, store_cpuclock=True, + other_targets=[], skip_hdf5_append=False): + """ + For other parameters, see Series.__init__ + + Parameters + ---------- + array_names : array or tuple of str + Name of each array, in order of the array passed to append(). E.g. + ('layer1_b', 'layer1_W', 'layer2_b', 'layer2_W') + + new_group_name : str + Name of a new HDF5 group which will be created under base_group to + store the new series. + + base_group : str + Path of the group under which to create the new group which will + store the series. + + title : str + Here the title is attached to the new group, not a table. + + store_timestamp : bool + Here timestamp and cpuclock are stored in *each* table + + store_cpuclock : bool + Here timestamp and cpuclock are stored in *each* table + """ + + # most other checks done when calling BasicStatisticsSeries + if type(new_group_name) != str: + raise TypeError("new_group_name must be a string") + if new_group_name == "": + raise ValueError("new_group_name must not be empty") + + base_series_list = [] + + new_group = hdf5_file.createGroup(base_group, new_group_name, title=title) + + stats_functions = {'mean': lambda(x): numpy.mean(x.value), + 'min': lambda(x): numpy.min(x.value), + 'max': lambda(x): numpy.max(x.value), + 'std': lambda(x): numpy.std(x.value)} + + for name in arrays_names: + base_series_list.append( + BasicStatisticsSeries( + table_name=name, + hdf5_file=hdf5_file, + index_names=index_names, + stats_functions=stats_functions, + hdf5_group=new_group._v_pathname, + store_timestamp=store_timestamp, + store_cpuclock=store_cpuclock, + other_targets=other_targets, + skip_hdf5_append=skip_hdf5_append)) + + SeriesArrayWrapper.__init__(self, base_series_list) + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/io/seriestables/test_series.py Thu Apr 15 10:52:02 2010 -0400
@@ -0,0 +1,359 @@
+import tempfile
+
+import numpy
+import numpy.random
+
+from jobman import DD
+
+import tables
+
+from series import *
+import series
+
+#################################################
+# Utils
+
+def compare_floats(f1,f2):
+ if abs(f1-f2) < 1e-3:
+ return True
+ return False
+
+def compare_lists(it1, it2, floats=False):
+ if len(it1) != len(it2):
+ return False
+
+ for el1, el2 in zip(it1, it2):
+ if floats:
+ if not compare_floats(el1,el2):
+ return False
+ elif el1 != el2:
+ return False
+
+ return True
+
+#################################################
+# Basic Series class tests
+
+def test_Series_types():
+ pass
+
+#################################################
+# ErrorSeries tests
+
+def test_ErrorSeries_common_case(h5f=None):
+ if not h5f:
+ h5f_path = tempfile.NamedTemporaryFile().name
+ h5f = tables.openFile(h5f_path, "w")
+
+ validation_error = series.ErrorSeries(error_name="validation_error", table_name="validation_error",
+ hdf5_file=h5f, index_names=('epoch','minibatch'),
+ title="Validation error indexed by epoch and minibatch")
+
+ # (1,1), (1,2) etc. are (epoch, minibatch) index
+ validation_error.append((1,1), 32.0)
+ validation_error.append((1,2), 30.0)
+ validation_error.append((2,1), 28.0)
+ validation_error.append((2,2), 26.0)
+
+ h5f.close()
+
+ h5f = tables.openFile(h5f_path, "r")
+
+ table = h5f.getNode('/', 'validation_error')
+
+ assert compare_lists(table.cols.epoch[:], [1,1,2,2])
+ assert compare_lists(table.cols.minibatch[:], [1,2,1,2])
+ assert compare_lists(table.cols.validation_error[:], [32.0, 30.0, 28.0, 26.0])
+ assert len(table) == 4
+
+def test_ErrorSeries_no_index(h5f=None):
+ if not h5f:
+ h5f_path = tempfile.NamedTemporaryFile().name
+ h5f = tables.openFile(h5f_path, "w")
+
+ validation_error = series.ErrorSeries(error_name="validation_error",
+ table_name="validation_error",
+ hdf5_file=h5f,
+ # empty tuple
+ index_names=tuple(),
+ title="Validation error with no index")
+
+ # no index columns here, so append() receives an empty tuple
+ validation_error.append(tuple(), 32.0)
+ validation_error.append(tuple(), 30.0)
+ validation_error.append(tuple(), 28.0)
+ validation_error.append(tuple(), 26.0)
+
+ h5f.close()
+
+ h5f = tables.openFile(h5f_path, "r")
+
+ table = h5f.getNode('/', 'validation_error')
+
+ assert compare_lists(table.cols.validation_error[:], [32.0, 30.0, 28.0, 26.0])
+ assert not ("epoch" in dir(table.cols))
+
+def test_ErrorSeries_notimestamp(h5f=None):
+ if not h5f:
+ h5f_path = tempfile.NamedTemporaryFile().name
+ h5f = tables.openFile(h5f_path, "w")
+
+ validation_error = series.ErrorSeries(error_name="validation_error", table_name="validation_error",
+ hdf5_file=h5f, index_names=('epoch','minibatch'),
+ title="Validation error indexed by epoch and minibatch",
+ store_timestamp=False)
+
+ # (1,1), (1,2) etc.
are (epoch, minibatch) index + validation_error.append((1,1), 32.0) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + table = h5f.getNode('/', 'validation_error') + + assert compare_lists(table.cols.epoch[:], [1]) + assert not ("timestamp" in dir(table.cols)) + assert "cpuclock" in dir(table.cols) + +def test_ErrorSeries_nocpuclock(h5f=None): + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + validation_error = series.ErrorSeries(error_name="validation_error", table_name="validation_error", + hdf5_file=h5f, index_names=('epoch','minibatch'), + title="Validation error indexed by epoch and minibatch", + store_cpuclock=False) + + # (1,1), (1,2) etc. are (epoch, minibatch) index + validation_error.append((1,1), 32.0) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + table = h5f.getNode('/', 'validation_error') + + assert compare_lists(table.cols.epoch[:], [1]) + assert not ("cpuclock" in dir(table.cols)) + assert "timestamp" in dir(table.cols) + +def test_AccumulatorSeriesWrapper_common_case(h5f=None): + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + validation_error = ErrorSeries(error_name="accumulated_validation_error", + table_name="accumulated_validation_error", + hdf5_file=h5f, + index_names=('epoch','minibatch'), + title="Validation error, summed every 3 minibatches, indexed by epoch and minibatch") + + accumulator = AccumulatorSeriesWrapper(base_series=validation_error, + reduce_every=3, reduce_function=numpy.sum) + + # (1,1), (1,2) etc. are (epoch, minibatch) index + accumulator.append((1,1), 32.0) + accumulator.append((1,2), 30.0) + accumulator.append((2,1), 28.0) + accumulator.append((2,2), 26.0) + accumulator.append((3,1), 24.0) + accumulator.append((3,2), 22.0) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + table = h5f.getNode('/', 'accumulated_validation_error') + + assert compare_lists(table.cols.epoch[:], [2,3]) + assert compare_lists(table.cols.minibatch[:], [1,2]) + assert compare_lists(table.cols.accumulated_validation_error[:], [90.0,72.0], floats=True) + +def test_BasicStatisticsSeries_common_case(h5f=None): + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + stats_series = BasicStatisticsSeries(table_name="b_vector_statistics", + hdf5_file=h5f, index_names=('epoch','minibatch'), + title="Basic statistics for b vector indexed by epoch and minibatch") + + # (1,1), (1,2) etc. 
are (epoch, minibatch) index + stats_series.append((1,1), [0.15, 0.20, 0.30]) + stats_series.append((1,2), [-0.18, 0.30, 0.58]) + stats_series.append((2,1), [0.18, -0.38, -0.68]) + stats_series.append((2,2), [0.15, 0.02, 1.9]) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + table = h5f.getNode('/', 'b_vector_statistics') + + assert compare_lists(table.cols.epoch[:], [1,1,2,2]) + assert compare_lists(table.cols.minibatch[:], [1,2,1,2]) + assert compare_lists(table.cols.mean[:], [0.21666667, 0.23333333, -0.29333332, 0.69], floats=True) + assert compare_lists(table.cols.min[:], [0.15000001, -0.18000001, -0.68000001, 0.02], floats=True) + assert compare_lists(table.cols.max[:], [0.30, 0.58, 0.18, 1.9], floats=True) + assert compare_lists(table.cols.std[:], [0.06236095, 0.31382939, 0.35640177, 0.85724366], floats=True) + +def test_SharedParamsStatisticsWrapper_commoncase(h5f=None): + import numpy.random + + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + stats = SharedParamsStatisticsWrapper(new_group_name="params", base_group="/", + arrays_names=('b1','b2','b3'), hdf5_file=h5f, + index_names=('epoch','minibatch')) + + b1 = DD({'value':numpy.random.rand(5)}) + b2 = DD({'value':numpy.random.rand(5)}) + b3 = DD({'value':numpy.random.rand(5)}) + stats.append((1,1), [b1,b2,b3]) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + b1_table = h5f.getNode('/params', 'b1') + b3_table = h5f.getNode('/params', 'b3') + + assert b1_table.cols.mean[0] - numpy.mean(b1.value) < 1e-3 + assert b3_table.cols.mean[0] - numpy.mean(b3.value) < 1e-3 + assert b1_table.cols.min[0] - numpy.min(b1.value) < 1e-3 + assert b3_table.cols.min[0] - numpy.min(b3.value) < 1e-3 + +def test_SharedParamsStatisticsWrapper_notimestamp(h5f=None): + import numpy.random + + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + stats = SharedParamsStatisticsWrapper(new_group_name="params", base_group="/", + arrays_names=('b1','b2','b3'), hdf5_file=h5f, + index_names=('epoch','minibatch'), + store_timestamp=False) + + b1 = DD({'value':numpy.random.rand(5)}) + b2 = DD({'value':numpy.random.rand(5)}) + b3 = DD({'value':numpy.random.rand(5)}) + stats.append((1,1), [b1,b2,b3]) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + b1_table = h5f.getNode('/params', 'b1') + b3_table = h5f.getNode('/params', 'b3') + + assert b1_table.cols.mean[0] - numpy.mean(b1.value) < 1e-3 + assert b3_table.cols.mean[0] - numpy.mean(b3.value) < 1e-3 + assert b1_table.cols.min[0] - numpy.min(b1.value) < 1e-3 + assert b3_table.cols.min[0] - numpy.min(b3.value) < 1e-3 + + assert not ('timestamp' in dir(b1_table.cols)) + +def test_get_desc(): + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + desc = series._get_description_with_n_ints_n_floats(("col1","col2"), ("col3","col4")) + + mytable = h5f.createTable('/', 'mytable', desc) + + # just make sure the columns are there... otherwise this will throw an exception + mytable.cols.col1 + mytable.cols.col2 + mytable.cols.col3 + mytable.cols.col4 + + try: + # this should fail... 
LocalDescription must be local to get_desc_etc + test = LocalDescription + assert False + except: + assert True + + assert True + +def test_index_to_tuple_floaterror(): + try: + series._index_to_tuple(5.1) + assert False + except TypeError: + assert True + +def test_index_to_tuple_arrayok(): + tpl = series._index_to_tuple([1,2,3]) + assert type(tpl) == tuple and tpl[1] == 2 and tpl[2] == 3 + +def test_index_to_tuple_intbecomestuple(): + tpl = series._index_to_tuple(32) + + assert type(tpl) == tuple and tpl == (32,) + +def test_index_to_tuple_longbecomestuple(): + tpl = series._index_to_tuple(928374928374928L) + + assert type(tpl) == tuple and tpl == (928374928374928L,) + + + + + +def test_ErrorSeries_appendtarget(h5f=None): + if not h5f: + h5f_path = tempfile.NamedTemporaryFile().name + h5f = tables.openFile(h5f_path, "w") + + validation_errors_from_callback = [] + + def my_callback(table, row): + validation_errors_from_callback.append(row['validation_error']) + + my_callback_target = CallbackAppendTarget(my_callback) + + validation_error = series.ErrorSeries(error_name="validation_error", + table_name="validation_error", + hdf5_file=h5f, + index_names=('minibatch',), + title="Validation error with no index", + other_targets=[my_callback_target], + skip_hdf5_append=True) + + # (1,1), (1,2) etc. are (epoch, minibatch) index + validation_error.append(2, 32.0) + validation_error.append(3, 30.0) + validation_error.append(4, 28.0) + validation_error.append(5, 26.0) + + h5f.close() + + h5f = tables.openFile(h5f_path, "r") + + table = h5f.getNode('/', 'validation_error') + + # h5f should be empty + assert len(table) == 0 + + assert compare_lists(validation_errors_from_callback, [32.0,30.0,28.0,26.0]) + + + + + + +if __name__ == '__main__': + import tempfile + test_get_desc() + test_ErrorSeries_common_case() + test_BasicStatisticsSeries_common_case() + test_AccumulatorSeriesWrapper_common_case() + test_SharedParamsStatisticsWrapper_commoncase() + test_ErrorSeries_appendtarget() +
--- a/pylearn/sandbox/test_scan_inputs_groups.py Thu Apr 15 10:50:10 2010 -0400 +++ b/pylearn/sandbox/test_scan_inputs_groups.py Thu Apr 15 10:52:02 2010 -0400 @@ -9,6 +9,7 @@ import theano.tensor as T from pylearn.sandbox.scan_inputs_groups import FillMissing import theano.compile.mode as mode_module +import theano class TestFillMissing(unittest.TestCase): def setUp(self): @@ -16,9 +17,9 @@ #we need to desactivate the check for NaN value as we have them in input #TODO: Make an option to don't check NaN value in input only, bug check in output. - m=mode_module.default_mode - if m=="DEBUG_MODE": - m=copy.copy(mode_module.predefined_modes[m]) + m=mode_module.get_default_mode() + if isinstance(m,theano.compile.debugmode.DebugMode): + m=copy.copy(m) m.check_isfinite=False self.mode = m
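The fix above replaces the string comparison against mode_module.default_mode with an isinstance() check on the object returned by get_default_mode(). The same guard is reusable wherever a test has to tolerate DebugMode; a short sketch of the pattern (only the Theano calls shown in the diff are assumed, the helper name is invented):

import copy
import theano
import theano.compile.mode as mode_module

def mode_without_isfinite_check():
    """Return the default compile mode; when it is a DebugMode, copy it
    and disable the isfinite check so NaN-containing inputs are allowed.
    The calls mirror the test setUp above."""
    m = mode_module.get_default_mode()
    if isinstance(m, theano.compile.debugmode.DebugMode):
        m = copy.copy(m)
        m.check_isfinite = False
    return m

# usage: theano.function(inputs, outputs, mode=mode_without_isfinite_check())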
--- a/pylearn/shared/layers/kording2004.py Thu Apr 15 10:50:10 2010 -0400 +++ b/pylearn/shared/layers/kording2004.py Thu Apr 15 10:52:02 2010 -0400 @@ -1,7 +1,6 @@ import numpy import theano.tensor -from hpu.theano_outgoing import mean, var, cov - +from theano.tensor.basic import mean from pylearn.shared.layers.exponential_mean import ExponentialMean # exponential_mean.py import logging
--- a/pylearn/shared/layers/lecun1998.py Thu Apr 15 10:50:10 2010 -0400 +++ b/pylearn/shared/layers/lecun1998.py Thu Apr 15 10:52:02 2010 -0400 @@ -8,8 +8,8 @@ from theano import tensor from theano.compile import shared, pfunc -from theano.sandbox.conv import ConvOp -from theano.sandbox.downsample import DownsampleFactorMax +from theano.tensor.nnet.conv import ConvOp +from theano.tensor.signal.downsample import DownsampleFactorMax from pylearn.shared.layers.util import update_locals from pylearn.shared.layers.squash import squash
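ConvOp and DownsampleFactorMax now live under theano.tensor.nnet.conv and theano.tensor.signal.downsample instead of theano.sandbox. A hedged sketch of a small convolution-plus-pooling graph against a Theano of that era; the shapes and variable names are invented, and the conv2d helper is used rather than instantiating ConvOp directly:

import numpy
import theano
import theano.tensor as T
from theano.tensor.nnet import conv                               # new home of ConvOp
from theano.tensor.signal.downsample import DownsampleFactorMax  # new home of the pooling op

x = T.dtensor4('x')                                   # (batch, channels, rows, cols)
w = theano.shared(numpy.random.randn(6, 1, 5, 5), name='filters')

conv_out = conv.conv2d(x, w)                          # ConvOp under the hood, 'valid' mode
pooled = DownsampleFactorMax((2, 2), ignore_border=True)(conv_out)

f = theano.function([x], pooled)
print f(numpy.random.randn(2, 1, 28, 28)).shape       # expected: (2, 6, 12, 12)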
--- a/pylearn/shared/layers/logreg.py Thu Apr 15 10:50:10 2010 -0400 +++ b/pylearn/shared/layers/logreg.py Thu Apr 15 10:52:02 2010 -0400 @@ -15,12 +15,14 @@ update_locals(self, locals()) @classmethod - def new(cls, input, n_in, n_out, dtype=None): + def new(cls, input, n_in, n_out, dtype=None, name=None): if dtype is None: dtype = input.dtype + if name is None: + name = cls.__name__ cls._debug('allocating params w, b', n_in, n_out, dtype) - w = shared(numpy.zeros((n_in, n_out), dtype=dtype)) - b = shared(numpy.zeros((n_out,), dtype=dtype)) + w = shared(numpy.zeros((n_in, n_out), dtype=dtype), name='%s.w'%name) + b = shared(numpy.zeros((n_out,), dtype=dtype), name='%s.b'%name) return cls(input, w, b, params=[w,b])
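With the new optional name argument, the shared parameters get readable names of the form '<name>.w' and '<name>.b', which show up in Theano's debug printing. A small sketch of the intended call, assuming (as elsewhere in pylearn.shared.layers) that update_locals exposes w and b as attributes; the layer name and sizes are invented:

import theano.tensor as T
from pylearn.shared.layers import LogisticRegression

x = T.dmatrix('x')

# Without name, the class name is used, giving 'LogisticRegression.w' / '.b'.
clf = LogisticRegression.new(x, n_in=784, n_out=10, name='top_layer')

print clf.w.name   # 'top_layer.w'
print clf.b.name   # 'top_layer.b'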
--- a/pylearn/shared/layers/rust2005.py Thu Apr 15 10:50:10 2010 -0400 +++ b/pylearn/shared/layers/rust2005.py Thu Apr 15 10:52:02 2010 -0400 @@ -28,7 +28,7 @@ from theano.compile import shared from theano.sandbox.softsign import softsign from theano.tensor.nnet import softplus -from theano.sandbox.conv import ConvOp +from theano.tensor.nnet.conv import ConvOp from pylearn.shared.layers.util import update_locals, add_logging
--- a/pylearn/shared/layers/tests/test_kouh2008.py Thu Apr 15 10:50:10 2010 -0400 +++ b/pylearn/shared/layers/tests/test_kouh2008.py Thu Apr 15 10:52:02 2010 -0400 @@ -1,5 +1,6 @@ import numpy import theano.compile.debugmode +from theano.compile.debugmode import DebugMode from theano import tensor from theano.compile import pfunc from pylearn.shared.layers import LogisticRegression, Kouh2008 @@ -9,17 +10,20 @@ n_out = 10 n_terms = 3 rng = numpy.random.RandomState(23455) - layer = Kouh2008.new_filters(rng, tensor.dmatrix(), n_in, n_out, n_terms, dtype='float64') + layer = Kouh2008.new_filters_expbounds(rng, tensor.dmatrix(), n_in, n_out, n_terms, dtype='float64') assert layer.output.dtype =='float64' - layer = Kouh2008.new_filters(rng, tensor.fmatrix(), n_in, n_out, n_terms, dtype='float32') + layer = Kouh2008.new_filters_expbounds(rng, tensor.fmatrix(), n_in, n_out, n_terms, dtype='float32') assert layer.output.dtype =='float32' def run_w_random(bsize=10, n_iter=200, n_in = 1024, n_out = 100, n_terms=2, dtype='float64'): + if isinstance(theano.compile.mode.get_default_mode(),DebugMode): + n_iter=2 + x = tensor.dmatrix() y = tensor.lvector() rng = numpy.random.RandomState(23455) - layer = Kouh2008.new_filters(rng, x, n_in, n_out, n_terms, dtype='float64') + layer = Kouh2008.new_filters_expbounds(rng, x, n_in, n_out, n_terms, dtype='float64') out = LogisticRegression.new(layer.output, n_out, 2) cost = out.nll(y).sum() @@ -52,7 +56,7 @@ y = tensor.lvector() rng = numpy.random.RandomState(23455) - layer = Kouh2008.new_filters(rng, x, n_in, n_out, n_terms, dtype='float64') + layer = Kouh2008.new_filters_expbounds(rng, x, n_in, n_out, n_terms, dtype='float64') out = LogisticRegression.new(layer.output, n_out, 2) cost = out.nll(y).sum() #joint optimization except for one of the linear filters @@ -97,11 +101,16 @@ test_A() def test_smaller(): - assert run_w_random(n_in=10, n_out=8) < 6.1 + rval = run_w_random(n_in=10, n_out=8) + if not isinstance(theano.compile.mode.get_default_mode(),DebugMode): + assert rval < 6.1 def test_smaller32(): - assert run_w_random(n_in=10, n_out=8, dtype='float32') < 6.1 + rval = run_w_random(n_in=10, n_out=8, dtype='float32') + if not isinstance(theano.compile.mode.get_default_mode(),DebugMode): + assert rval < 6.1 def test_big(): - assert run_w_random() < 0.1 - + rval = run_w_random() + if not isinstance(theano.compile.mode.get_default_mode(),DebugMode): + assert rval < 0.1
--- a/pylearn/shared/layers/tests/test_lecun1998.py Thu Apr 15 10:50:10 2010 -0400 +++ b/pylearn/shared/layers/tests/test_lecun1998.py Thu Apr 15 10:52:02 2010 -0400 @@ -1,8 +1,12 @@ from pylearn.shared.layers.lecun1998 import * from pylearn.shared.layers import LogisticRegression +from theano.compile.debugmode import DebugMode import theano.sandbox.softsign def test_w_random(bsize=10, n_iter=100, dtype='float64'): + if isinstance(theano.compile.mode.get_default_mode(),DebugMode): + n_iter=2 + ishape=(28,28) fshape=(5,5) if dtype == 'float64': @@ -30,7 +34,8 @@ print i, 'rval', fN assert f0 > 6 - assert fN < .3 + if not isinstance(theano.compile.mode.get_default_mode(),DebugMode): + assert fN < .3 def test_squash():
--- a/pylearn/version.py Thu Apr 15 10:50:10 2010 -0400 +++ b/pylearn/version.py Thu Apr 15 10:52:02 2010 -0400 @@ -227,15 +227,15 @@ if resource_type == _imp.PY_COMPILED: return _import_id_py_compiled(location) if resource_type == _imp.C_EXTENSION: - raise NoteImplementedError + raise NotImplementedError if resource_type == _imp.PY_RESOURCE: - raise NoteImplementedError + raise NotImplementedError if resource_type == _imp.PKG_DIRECTORY: return _import_id_pkg_directory(location) if resource_type == _imp.C_BUILTIN: - raise NoteImplementedError + raise NotImplementedError if resource_type == _imp.PY_FROZEN: - raise NoteImplementedError + raise NotImplementedError assert False #the list of resource types above should be exhaustive
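NoteImplementedError is not defined anywhere, so before this fix those branches died with a NameError rather than signalling "not implemented". A purely demonstrative snippet of the difference:

# Before the fix: the undefined name itself blows up.
try:
    raise NoteImplementedError
except NameError, e:
    print "typo turned the intended exception into:", e

# After the fix: callers see the builtin they can actually catch.
try:
    raise NotImplementedError
except NotImplementedError:
    print "resource type correctly reported as not implemented"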