# HG changeset patch # User Arnaud Bergeron # Date 1284739292 14400 # Node ID aea510b71386ab5e7c48ee0d1c6e61600cbd2ade # Parent 9686c0d9689de73538d69c29d6092b1ce01c6fa2# Parent f2105a06201c7a6973386c5084d679db5009f094 merge diff -r 9686c0d9689d -r aea510b71386 .hgignore --- a/.hgignore Fri Sep 17 12:01:12 2010 -0400 +++ b/.hgignore Fri Sep 17 12:01:32 2010 -0400 @@ -3,3 +3,4 @@ *.swp *.pyc core.* +html \ No newline at end of file diff -r 9686c0d9689d -r aea510b71386 doc/formulas.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/formulas.txt Fri Sep 17 12:01:32 2010 -0400 @@ -0,0 +1,14 @@ + +.. contents:: Formulas categories + +pylearn.formulas.costs +----------------------- +.. automodule:: pylearn.formulas.costs + :members: + +pylearn.formulas.noise +----------------------- +.. automodule:: pylearn.formulas.noise + :members: + + diff -r 9686c0d9689d -r aea510b71386 doc/index.txt --- a/doc/index.txt Fri Sep 17 12:01:12 2010 -0400 +++ b/doc/index.txt Fri Sep 17 12:01:32 2010 -0400 @@ -24,6 +24,7 @@ For the moment, the following documentation is available. +* `Formulas `_ -- Built-in math formulas optimized for speed and robustness * :doc:`io.SeriesTables module ` -- Saves error series and other statistics during training * `API `_ -- The automatically-generated API documentation diff -r 9686c0d9689d -r aea510b71386 doc/v2_planning/API_coding_style.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/v2_planning/API_coding_style.txt Fri Sep 17 12:01:32 2010 -0400 @@ -0,0 +1,253 @@ +========================= + Coding Style Guidelines +========================= + +Main Goals +========== + + * Code should be compatible with Python 2.4 and above (using 2to3 for + conversion to Python 3.x). This may not be possible in the short term + for Theano-dependent code. + + * Code should be easy to read, understand and update by developers and + users. + + * Code should be well-documented and well-tested. + +Python Coding Guidelines +======================== + +Official Guidelines +------------------- + +Source Material +~~~~~~~~~~~~~~~ + +The four main documents describing our Python coding guidelines are: + * `PEP 8 -- Style Guide for Python Code + `_ + * `PEP 257 -- Docstring Conventions + `_ + * `Numpy Docstring Standard + `_ + * `Google Python Style Guide + `_ + + +However, there are a few points mentioned in those documents that we decided +to do differently: + + * Use only one space (not two) after a sentence-ending period in comments. + + * You do not need to add an extra blank line before the closing quotes of + a multi-line docstring. + + .. code-block:: python + + # Good. + """This is a multi-line docstring. + + Which means it has more than one line. + """ + + # Bad. + """This is a multi-line docstring. + + Which means it has more than one line. + + """ + +Excerpts +~~~~~~~~ + +We emphasize here a few important topics that are found in the official +guidelines: + + * Avoid using lists if all you care about is iterating on something. Using + lists: + - uses more memory (and possibly more CPU if the code may break out of + the iteration), + - can lead to ugly code when converted to Python 3 with 2to3, + - can have a different behavior if evaluating elements in the list has + side effects (if you want these side effects, make it explicit by + assigning the list to some variable before iterating on it). + + +------------------------+------------------------+ + | Iterative version | List version | + +========================+========================+ + | .. code-block:: python | .. 
code-block:: python | + | | | + | my_dict.iterkeys | my_dict.keys | + | my_dict.itervalues | my_dict.values | + | my_dict.iteritems | my_dict.items | + +------------------------+------------------------+ + | .. code-block:: python | .. code-block:: python | + | | | + | itertools.ifilter | filter | + | itertools.imap | map | + | itertools.izip | zip | + +------------------------+------------------------+ + | .. code-block:: python | .. code-block:: python | + | | | + | xrange | range | + +------------------------+------------------------+ + + Code example with ``map``: + + .. code-block:: python + + # Good. + for f_x in imap(f, x): + ... + all_f_x = map(f, x) + map(f, x) + # Bad. + for element in map(f, x): + ... + imap(f, x) + + * Generally prefer list comprehensions to ``map`` / ``filter``, as the former are + easier to read. + + .. code-block:: python + + # Good. + non_comments = [line.strip() for line in my_file.readlines() + if not line.startswith('#')] + # Bad. + non_comments = map(str.strip, + ifilter(lambda line: not line.startswith('#'), + my_file.readlines())) + + * Use ``in`` on container objects instead of using class-specific methods: + it is easier to read and may allow you to re-use your code with different + container types. + + .. code-block:: python + + # Good. + has_key = key in my_dict + has_substring = substring in my_string + # Bad. + has_key = my_dict.has_key(key) + has_substring = my_string.find(substring) >= 0 + + +Additional Recommendations +-------------------------- + +Things you should do even if they are not listed in official guidelines: + + * Avoid backslashes whenever possible. They make it more + difficult to edit code, and they are ugly (as well as potentially + dangerous if there are trailing white spaces). + + .. code-block:: python + + # Good. + if (cond_1 and + cond_2 and + cond_3): + ... + # Bad. + if cond_1 and \ + cond_2 and \ + cond_3: + ... + + * When indenting multi-line statements like lists or function arguments, + keep elements of the same level aligned with each other. + The position of the first + element (on the same line or a new line) should be chosen depending on + what is easiest to read (sometimes both can be ok). + + .. code-block:: python + + # Good. + for my_very_long_variable_name in [my_foo, my_bar, my_love, + my_everything]: + ... + for my_very_long_variable_name in [ + my_foo, my_bar, my_love, my_everything]: + ... + # Good iff the list needs to be frequently updated or is easier to + # understand when each element is on its own line. + for my_very_long_variable_name in [ + my_foo, + my_bar, + my_love, + my_everything, + ]: + ... + # Good as long as it does not require more than two lines. + for my_very_long_variable_name in [my_foo, + my_bar]: + ... + # Bad. + for my_very_long_variable_name in [my_foo, my_bar, my_love, + my_everything]: + ... + for my_very_long_variable_name in [my_foo, + my_bar, + my_love, + my_everything]: + ... + + +The ``logging`` Module vs. the ``warning`` Module +================================================= + +The ``logging`` Module +---------------------- + +A central logging facility for Python capable of logging messages of various +categories/urgency and choosing with some granularity which messages are +displayed/suppressed, as well as where they are displayed or written. 
This +includes an ``INFO`` level for innocuous status information, a ``WARNING`` level +for unexpected state that is still recoverable, ``DEBUG`` for detailed +information which is only really of interest when things are going wrong, etc. + +In addition to the `library documentation`_, see this helpful tutorial, +`Python Logging 101`_. + +.. _library documentation: http://docs.python.org/library/logging.html +.. _Python Logging 101: http://plumberjack.blogspot.com/2009/09/python-logging-101.html + +The ``warning`` Module +---------------------- + +The ``warning`` module in the standard library and its main interface, the +``warn()`` function, allows the programmer to issue warnings in situations where +they wish to alert the user to some condition, but the situation is not +urgent enough to throw an exception. By default, a warning issued at a given +line of the code will only be displayed the first time that line is executed. +By default, warnings are written to ``sys.stderr`` but the ``warning`` module +contains flexible facilities for altering the defaults, redirecting, etc. + +Which? When? +------------ + +It is our feeling that the ``logging`` module's ``WARNING`` level be used to log +warnings more meant for *internal*, *developer* consumption, to log situations +where something unexpected happened that may be indicative of a problem but +is several layers of abstraction below what a user of the library would +care about. + +By contrast, the warning module should be used for warnings intended for user +consumption, e.g. alerting them that their version of Pylearn is older than +this plugin requires, so things may not work as expected, or that a given +function/class/method is slated for deprecation in a coming release (early +in the library's lifetime, ``DeprecationWarning`` will likely be the most common +case). The warning message issued through this facility should avoid +referring to Pylearn internals. + +Code Sample +=========== + +The following code sample illustrates many of the coding guidelines one should +follow in Pylearn. + +.. code-block:: python + + import os, sys, time + diff -r 9686c0d9689d -r aea510b71386 doc/v2_planning/api_optimization.txt --- a/doc/v2_planning/api_optimization.txt Fri Sep 17 12:01:12 2010 -0400 +++ b/doc/v2_planning/api_optimization.txt Fri Sep 17 12:01:32 2010 -0400 @@ -23,83 +23,141 @@ only uses Theano for the implementation. -Iterative Interface -------------------- +Theano Interface +----------------- + +The theano interface to optimization algorithms is to ask for a dictionary of +updates that can be used in theano.function. Implementations of iterative +optimization algorithms should be global functions with a signature like +'iterative_optimizer'. -def iterative_optimizer(parameters, - cost=None, - grads=None, - stop=None, - updates=None, - **kwargs): - """ - :param parameters: list or tuple of Theano variables (typically shared vars) - that we want to optimize iteratively. If we're minimizing f(x), then - together, these variables represent 'x'. - - :param cost: scalar-valued Theano variable that computes an exact or noisy estimate of - cost (what are the conditions on the noise?). Some algorithms might - need an exact cost, some algorithms might ignore the cost if the grads - are given. + def iterative_optimizer(parameters, + cost=None, + gradients=None, + stop=None, + updates=None, + **kwargs): + """ + :param parameters: list or tuple of Theano variables + that we want to optimize iteratively. 
If we're minimizing f(x), then + together, these variables represent 'x'. Typically these are shared + variables and their values are the initial values for the minimization + algorithm. - :param grads: list or tuple of Theano variables representing the gradients on - the corresponding parameters. These default to tensor.grad(cost, - parameters). + :param cost: scalar-valued Theano variable that computes an exact or noisy estimate of + cost (what are the conditions on the noise?). Some algorithms might + need an exact cost, some algorithms might ignore the cost if the + gradients are given. - :param stop: a shared variable (scalar integer) that (if provided) will be - updated to say when the iterative minimization algorithm has finished - (1) or requires more iterations (0). + :param gradients: list or tuple of Theano variables representing the gradients on + the corresponding parameters. These default to tensor.grad(cost, + parameters). - :param updates: a dictionary to update with the (var, new_value) items - associated with the iterative algorithm. The default is a new empty - dictionary. A KeyError is raised in case of key collisions. + :param stop: a shared variable (scalar integer) that (if provided) will be + updated to say when the iterative minimization algorithm has finished + (1) or requires more iterations (0). - :param kwargs: algorithm-dependent arguments + :param updates: a dictionary to update with the (var, new_value) items + associated with the iterative algorithm. The default is a new empty + dictionary. A KeyError is raised in case of key collisions. - :returns: a dictionary mapping each parameter to an expression that it - should take in order to carry out the optimization procedure. + :param kwargs: algorithm-dependent arguments - If all the parameters are shared variables, then this dictionary may be - passed as the ``updates`` argument to theano.function. + :returns: a dictionary mapping each parameter to an expression that it + should take in order to carry out the optimization procedure. - There may be more key,value pairs in the dictionary corresponding to - internal variables that are part of the optimization algorithm. + If all the parameters are shared variables, then this dictionary may be + passed as the ``updates`` argument to theano.function. - """ + There may be more key,value pairs in the dictionary corresponding to + internal variables that are part of the optimization algorithm. + + """ -One-shot Interface ------------------- +Numpy Interface +--------------- + +The numpy interface to optimization algorithms is supposed to mimick +scipy's. Its arguments are numpy arrays, and functions that manipulate numpy +arrays. -def minimize(x0, f, df, opt_algo, **kwargs): - """ - Return a point x_new that minimizes function `f` with derivative `df`. + def minimize(x0, f, df, opt_algo, **kwargs): + """ + Return a point x_new with the same type as x0 that minimizes function `f` + with derivative `df`. + + This is supposed to provide an interface similar to scipy's minimize + routines, or MATLAB's. + + :type x0: numpy ndarray or list of numpy ndarrays. + :param x0: starting point for minimization - This is supposed to provide an interface similar to scipy's minimize - routines, or MATLAB's. 
+ :type f: python callable mapping something like x0 to a scalar + :param f: function to minimize + + :type df: python callable mapping something like x0 to the derivative of f at that point + :param df: derivative of `f` + + :param opt_algo: one of the functions that implements the + `iterative_optimizer` interface. - :type x0: numpy ndarray - :param x0: starting point for minimization + :param kwargs: passed through to `opt_algo` + + """ + - :type f: python callable mapping something like x0 to a scalar - :param f: function to minimize +There is also a numpy-based wrapper to the iterative algorithms. +This can be more useful than minimize() because it doesn't hog program +control. Technically minimize() is probably implemented using this +minimize_iterator interface. - :type df: python callable mapping something like x0 to the derivative of f at that point - :param df: derivative of `f` + class minimize_iterator(object): + """ + Attributes + - x - the current best estimate of the minimum + - f - the function being minimized + - df - f's derivative function + - opt_algo - the optimization algorithm at work (a serializable, callable + object with the signature of iterative_optimizer above). - :param opt_algo: one of the functions that implements the - `iterative_optimizer` interface. + """ + def __init__(self, x0, f, df, opt_algo, **kwargs): + """Initialize state (arguments as in minimize()) + """ + def __iter__(self): + return self + def next(self): + """Take a step of minimization and return self raises StopIteration when + the algorithm is finished with minimization - :param kwargs: passed through to `opt_algo` + """ + - """ +Examples +-------- -OD: Could it be more convenient for x0 to be a list? +Simple stochastic gradient descent could be called like this: + + sgd([p], gradients=[g], step_size=.1) + +and this would return + + {p:p-.1*g} + + +Simple stochastic gradient descent with extra updates: -OD: Why make a difference between iterative and one-shot versions? A one-shot - algorithm can be seen as an iterative one that stops after its first - iteration. The difference I see between the two interfaces proposed here - is mostly that one relies on Theano while the other one does not, but - hopefully a non-Theano one can be created by simply wrapping around the - Theano one. + sgd([p], gradients=[g], updates={a:b}, step_size=.1) + +will return + + {a:b, p:p-.1*g} + +If the parameters collide with keys in a given updates dictionary an exception +will be raised: + + sgd([p], gradients=[g], updates={p:b}, step_size=.1) + +will raise a KeyError. diff -r 9686c0d9689d -r aea510b71386 doc/v2_planning/architecture.txt --- a/doc/v2_planning/architecture.txt Fri Sep 17 12:01:12 2010 -0400 +++ b/doc/v2_planning/architecture.txt Fri Sep 17 12:01:32 2010 -0400 @@ -3,13 +3,13 @@ ==================== -Basic Design Approach -===================== +SE + VM Approach +================= -I propose that the basic design of the library follow the Symbolic Expression -(SE) structure + virtual machine (VM) pattern that worked for Theano. +One avenue for the basic design of the library is to follow the Symbolic +Expression (SE) structure + virtual machine (VM) pattern that worked for Theano. 
-So the main things for the library to provide would be: +The main things for the library to provide would be: - a few VMs, some of which can run programs in parallel across processors, hosts, and networks [R6,R8]; @@ -57,4 +57,87 @@ just not essential to choose an API that will guarantee a match, or indeed to choose any explicit API at all. +YB: I agree that lambdas are more flexible, but from the user's point of +view it is really important to know what can swap with what, so that they +can easily plug-and-play. So even if informal, something in the spirit +of an API must be described somewhere, and components should declare +either formally or through comments what functionality 'type' +they can take on. +Encapsulation vs. linearity +--------------------------- + +A while ago, the Apstat crew went to fight "encapsulation" to propose instead +a more "linearized" approach to experiment design. I must admit I didn't +really understand the deep motivations behind this, and after practicing both +styles (encapsulation for PLearn / Theano, linearity @ ARL / Ubisoft), I still +don't. I do find, however, some not-so-deep-but-still-significant advantages +to the linear version, which hopefully can be made clear (along with a +clarification of what the h*** am I talking about) in the following example: + + * Linear version: + my_experiment = pipeline([ + data, + filter_samples, + PCA, + k_fold_split, + neural_net, + evaluation, + ]) + + * Encapsulated version: + my_experiment = evaluation( + data=PCA(filter_samples(data)), + split=k_fold_split, + model=neural_net) + +What I like in the linear version is it is much more easily human-readable +(once you know what it means): you just follow the flow of the experiment by +reading through a single list. +On the other hand, the encapsulated version requires some deeper analysis to +understand what is going on and in which order. +Also, commenting out parts of the processing is simpler in the first case (it +takes a single # in front of an element). +However, linearity tends to break when the experiment is actually not linear, +i.e. the graph of object dependencies is more complex (*). + +I'm just bringing this up because it may be nice to be able to provide the +user with the most intuitive way to design experiments. I actually don't think +those approaches are mutually exclusive, and it could be possible for the +underlying system to use the more flexible / powerful encapsulated +representation, while having the option to write simple scripts in a form that +is easier to understand and manipulate. + +It could also be worth discussing this issue with Xavier / Christian / +Nicolas. + +(*) Note that I cheated a bit in my example above: the graph from the +encapsulated version is not a simple chain, so it is not obvious how to +convert it into the pipeline given in the linear version. It's still possible +though, but this is probably not the place to get into the details. + +RP comment : The way I see it, you could always have everything using the +encapsulation paradigm ( which as you pointed out is a bit more powerful) and +then have linear shortcuts ( functions that take a list of functions and some +inputs and apply them in some order). You will not be able to have a one case +cover all pipeline function, but I think it is sufficient to offer such +options (linear functions) for a few widely used cases .. 
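+
+To make this concrete, here is a minimal, hypothetical sketch (reusing the
+placeholder names from the example above; this is not a proposed API) of such
+a linear shortcut built on top of the encapsulated style, for the purely
+sequential case:
+
+    def pipeline(stages):
+        # Compose callables left to right: pipeline([f, g])(x) == g(f(x)).
+        def run(data):
+            for stage in stages:
+                data = stage(data)
+            return data
+        return run
+
+    # The linear script style...
+    my_experiment = pipeline([filter_samples, PCA, neural_net])
+    # ...is then equivalent to the encapsulated call
+    # neural_net(PCA(filter_samples(data))).
+    result = my_experiment(data)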
+ + +Jobman Compatibility Approach +============================= + +One basic approach for the library is to provide a set of components that are +compatible with remote execution. The emphasis could be not so much on +standardizing the roles and APIs of components, so much as ensuring that they +can be glued together and supports parallel execution on one or more CPUs or +clusters. + +In this approach we would provide a proxy for asynchronous execution +(e.g. "pylearn.call(fn, args, kwargs, backend=default_backend)"), which would +come with constraints on what fn, args, and kwargs can be. Specifically, they +must be picklable, and there are benefits (e.g. automatic function call caching) +associated with them being hashable as well. + + + diff -r 9686c0d9689d -r aea510b71386 doc/v2_planning/coding_style.txt --- a/doc/v2_planning/coding_style.txt Fri Sep 17 12:01:12 2010 -0400 +++ b/doc/v2_planning/coding_style.txt Fri Sep 17 12:01:32 2010 -0400 @@ -8,7 +8,139 @@ - David - Olivier D [leader] +Open for public debate +---------------------- + * Avoid contractions in code comments (particularly in + documentation): "We do not add blue to red because it does not look good" + rather than "We don't add blue to red because it doesn't look good". + OD: I mostly find it to be cleaner (been used to it while writing + scientific articles too). + JB: +1 + + * Imperative vs. third-person comments. + # Return the sum of elements in x. <-- imperative + # Returns the sum of elements in x. <-- third-person + OD: I am used to the imperative form and like it better only because it + typically saves one letter (the 's') and is easier to conjugate. + JB: What about being compatible with markup formats that have a :returns: + tag? + OD: That'd make sense. However, when I wrote the above I hadn't looked + closely at PEP257 yet, and I just noticed the following official + recommendation for one-line docstrings in it: + The docstring is a phrase ending in a period. It prescribes the + function or method's effect as a command ("Do this", "Return that"), not as a + description; e.g. don't write "Returns the pathname ...". + Anyone knows which style is most popular in the open-source + community? + + * OD: I like always doing the following when subclassing + a class A: + class B(A): + def __init__(self, b_arg_1, b_arg_2, **kw): + super(B, self).__init__(**kw) + ... + The point here is that the constructor always allow for extra keyword + arguments (except for the class at the very top of the hierarchy), which + are automatically passed to the parent class. + Pros: + - You do not need to repeat the parent class arguments whenever you + write a new subclass. + - Whenever you add an argument to the parent class, all child classes + can benefit from it without modifying their code. + Cons: + - One needs to look at the parent classes to see what these arguments + are. + - You cannot use a **kw argument in your constructor for your own + selfish purpose. + - I have no clue whether one could do this with multiple inheritance. + - More? + Question: Should we encourage this in Pylearn? + + JB: +0.5 + +Closed for public debate +------------------------ + + * Use imports for packages and modules only. I.e. avoid + from foo import * + from foo import Bar + OD: Overall I agree with this. However we probably want to allow some + exceptions, like: + from itertools import imap, izip + Also, some people may want to have shortcuts like + from theano import tensor as T + but I would prefer to forbid this. 
It is handy when trying stuff in + the interactive interpreter, but in real code it can easily get messy + when you want to copy / paste different pieces of code and they use + different conventions. Typing tensor.* is a bit longer, but a lot more + portable. + JB: I thought that these are nice: + - "from foo import Bar" + - "from foo import Bar, Blah" + What's wrong with them? They keep the code listing short and readable. + I would discourage these forms when symbols 'Bar' and 'Blah' are + ambiguous, in which case the parent module prefix serves to disambiguate + them in the code. + I agree that the "import A as B" form should be discouraged in general, + because that's just confusing and makes code less grep-friendly. + OD: I agree that "from foo import Bar, Blah" is sometimes convenient + (typically when you re-use Bar / Blah many times in the same file), + and would vote in favor of accepting it when it is appropriate. + This guideline was taken from Google's coding recommendation: + "from foo import * or from foo import Bar is very nasty and can + lead to serious maintenance issues because it makes it hard to find + module dependencies." + OD: Decision was taken in committee's meeting to allow + from foo import Bar, Blah + when imported stuff is re-used multiple times in the same file, and + there is no ambiguity. + + * Imports should usually be on separate lines. + OD: I would add an exception, saying it is ok to group multiple imports + from the standard library on a single line, e.g. + import os, sys, time + I just don't see much benefit in putting them on separate lines (for + third-party imports I agree it is best to keep them separate, as it + makes dependencies clearer, and diffs look better when someone adds / + removes an import). Does anyone see a good reason to keep standard + library imports on different lines? + JB: what does 'usually' mean here? The guideline seems vacuous. + OD: Sorry my fault, I did not quote the whole guideline from PEP8. The + 'usually' was because of what followed: + it's okay to say this though: + from subprocess import Popen, PIPE + (which btw contradicts Google's recommendation mentioned previously) + OD: Decision was taken in committee's meeting to allow multiple imports + on the same line for standard library modules (only). + + * The BDFL recommends inserting a blank line between the + last paragraph in a multi-line docstring and its closing quotes, placing + the closing quotes on a line by themselves. This way, Emacs' + fill-paragraph command can be used on it. + OD: I think it is ugly and I have not seen it used much. Any Emacs + user believes it is a must? + OD: Decision was taken in committee's meeting to drop this + recommendation. + + * JB: How should we combine capitalization and underscores to name classes + and functions related to an algorithm like 'SGD' or a model like 'RBM' + whose common name is capitalized? Case in point: How should I name a + Hybrid Monte Carlo Sampler? Should I use the common HMC abbreviation? + OD: This one is answered by PEP8 (search HTTPServerError in it). + You should use: + RBMClassName + rbm_function_name + As far as using abbreviations is concerned: + All identifiers in the Python standard library (...) SHOULD use + English words wherever feasible (in many cases, abbreviations and + technical terms are used which aren't English). + so I guess HMC is ok when using Hybrid Monte Carlo is considered to + make some names too long. 
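+      To make this convention concrete, a short sketch (these names are
+      invented purely for illustration): CapWords for the class, keeping
+      the HMC abbreviation, and lowercase with underscores for the function:
+
+        class HMCSampler(object):
+            pass
+
+        def hmc_sampling_step(sampler):
+            pass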
+ + +Note about warnings +------------------- Fred: This is a refactored thing from James email of what we should put in message that we send to the user: @@ -19,28 +151,29 @@ Existing Python coding style specifications and guidelines ---------------------------------------------------------- - * http://www.python.org/dev/peps/pep-0008/ Style Guide for Python Code - * http://www.python.org/dev/peps/pep-0257/ Docstring Conventions - * http://google-styleguide.googlecode.com/svn/trunk/pyguide.html Google Python Style Guide - * http://www.voidspace.org.uk/python/articles/python_style_guide.shtml - * http://python.net/~goodger/projects/pycon/2007/idiomatic/handout.html - * http://www.cs.caltech.edu/courses/cs11/material/python/misc/python_style_guide.html - * http://barry.warsaw.us/software/STYLEGUIDE.txt - * http://self.maluke.com/style - * http://chandlerproject.org/Projects/ChandlerCodingStyleGuidelines - * http://lists.osafoundation.org/pipermail/dev/2003-March/000479.html - * http://learnpython.pbworks.com/PythonTricks - * http://eikke.com/how-not-to-write-python-code/ - * http://jaynes.colorado.edu/PythonGuidelines.html - * http://docs.djangoproject.com/en/dev/internals/contributing/#coding-style - * http://projects.scipy.org/numpy/wiki/CodingStyleGuidelines + * Must-read + * Official Python coding style guide: http://www.python.org/dev/peps/pep-0008 + * Official docstring conventions: http://www.python.org/dev/peps/pep-0257 + * Google Python Style Guide: http://google-styleguide.googlecode.com/svn/trunk/pyguide.html + * Interesting + * Code Like a Pythonista: http://python.net/~goodger/projects/pycon/2007/idiomatic/handout.html + * Numpy notes on conversion to Python 3: http://projects.scipy.org/numpy/browser/trunk/doc/Py3K.txt + * Can skip + * Python style for university class: http://www.cs.caltech.edu/courses/cs11/material/python/misc/python_style_guide.html + * Mailman coding style: http://barry.warsaw.us/software/STYLEGUIDE.txt + * Some company coding style: http://self.maluke.com/style + * Chandler coding style: http://chandlerproject.org/Projects/ChandlerCodingStyleGuidelines + * Outdated recommendations: http://lists.osafoundation.org/pipermail/dev/2003-March/000479.html + * Mostly some beginners tips: http://learnpython.pbworks.com/PythonTricks + * More beginners tips: http://eikke.com/how-not-to-write-python-code/ + * Cogent coding guidelines: http://jaynes.colorado.edu/PythonGuidelines.html + * Djangoo coding guidelines: http://docs.djangoproject.com/en/dev/internals/contributing/#coding-style + * Numpy documentation style guidelines: http://projects.scipy.org/numpy/wiki/CodingStyleGuidelines + * Some random guy guidelines (nothing special): http://www.voidspace.org.uk/python/articles/python_style_guide.shtml We will probably want to take PEP-8 as starting point, and read what other people think about it / how other coding guidelines differ from it. -Dumi: we should also try to find tools that automate these -processes: pylint, pyflakes, pychecker, pythontidy - OD: Things about PEP 8 I don't like (but it may be just me): * If necessary, you can add an extra pair of parentheses around an @@ -61,27 +194,7 @@ or less been wiped out by HTML's convention of ignoring extra whitespace: see http://en.wikipedia.org/wiki/Sentence_spacing for more detail. I think it's okay to drop this convention in source code.) - - * Imports should usually be on separate lines - --> Can be a lot of lines wasted for no obvious benefit. 
I think this is - mostly useful when you import different modules from different places, - but I would say that for instance for standard modules it would be - better to import them all on a single line (doing multiple lines only - if there are too many of them), e.g. prefer: - import os, sys, time - to - import os - import sys - import time - However, I agree about separating imports between standard lib / 3rd - party, e.g. prefer: - import os, sys, time - import numpy, scipy - to - import numpy, os, scipy, sys, time - (Personal note: preferably order imports by alphabetical order, makes - it easier to quickly see if a specific module is already imported, - and avoids duplicated imports) + OD: Cool, thanks, I guess we can drop it then. * Missing in PEP 8: - How to indent multi-line statements? E.g. do we want @@ -101,12 +214,6 @@ be to go with 2 when it can fit on two lines, and 3 otherwise. Same with lists. - * From PEP 257: The BDFL [3] recommends inserting a blank line between the - last paragraph in a multi-line docstring and its closing quotes, placing - the closing quotes on a line by themselves. This way, Emacs' - fill-paragraph command can be used on it. - --> I have nothing against Emacs, but this is ugly! - Documentation ------------- @@ -136,16 +243,13 @@ Use RST with Sphinx. Task: Provide specific examples on how to document a class, method, and some specific classes like Op (DE). Modify the theano documentation to include that. +OD: May want to check out + http://projects.scipy.org/numpy/wiki/CodingStyleGuidelines * Python versions to be supported Support 2.4 (because some of the clusters are still running 2.4) and write code that can be converted to 3.x with 2to3 in a straightforward way. Task: Write to-do's and to-not-do's to avoid compatibility issues. (OD) -(DWF: Pauli Virtanen and others have put together extensive -documentation in the process of porting NumPy to Py3K, see his notes at -http://projects.scipy.org/numpy/browser/trunk/doc/Py3K.txt -- this is -the most complete resource for complicated combinations of Python and C). - * C coding style How to write C code (in particular for Numpy / Cuda), and how to mix C and @@ -162,6 +266,8 @@ * Automatized code verification Use pychecker & friends to make sure everything is fine. Task: Look into the various options available (DE) +Result: See sections 'Tools to help us out' and 'Automating and enforcing coding +style' * Tests Force people to write tests. Automatic email reminder of code lines not @@ -170,6 +276,7 @@ automatically warn the user when he is using untested stuff (and to remind ourselves we should add a test). Task: See feasibility. (OD) +Result: See section 'Enforcing strict testing policy'. * VIM / Emacs plugins / config files To enforce good coding style automatically. @@ -181,79 +288,9 @@ Have a sample code that showcases everything one should comply to. -Some coding guidlines (work-in-progress from OD) ------------------------------------------------- - - * Avoid using lists if all you care about is iterating on something. 
Using - lists: - - uses more memory (and possibly more CPU if the code may break out of - the iteration) - - can lead to ugly code when converted to Python 3 with 2to3 - - can have a different behavior if evaluating elements in the list has - side effects (if you want these side effects, make it explicit by - assigning the list to some variable before iterating on it) - - Iterative version List version - my_dict.iterkeys() my_dict.keys() - my_dict.itervalues() my_dict.values() - my_dict.iteritems() my_dict.items() - itertools.imap map - itertools.ifilter filter - itertools.izip zip - xrange range - - * Use `in` on container objects instead of using class-specific methods. - It is easier to read and may allow you to use your code with different - container types. - - Yes No - --- -- - key in my_dict my_dict.has_key(key) - sub_string in my_string my_string.find(sub_string) >= 0 - - * (Point to debate) Avoid contractions in code comments (particularly in - documentation): "We do not add blue to red because it does not look - good" rather than "We don't add blue to red because it doesn't look - good". I mostly find it to be cleaner (been used to it while writing - scientific articles too). +Some coding guidelines (work-in-progress from OD) +------------------------------------------------- - * (Point to debate) Imperative vs. third-person comments. I am used to the - imperative form and like it better only because it typically saves one - letter (the 's'): "Return the sum of elements in x" rather than - "Returns the sum of elements in x". - - * (Point to debate) I like always doing the following when subclassing - a class A: - class B(A): - def __init__(self, b_arg_1, b_arg_2, **kw): - super(B, self).__init__(**kw) - ... - The point here is that the constructor always allow for extra keyword - arguments (except for the class at the very top of the hierarchy), which - are automatically passed to the parent class. - Pros: - - You do not need to repeat the parent class arguments whenever you - write a new subclass. - - Whenever you add an argument to the parent class, all child classes - can benefit from it without modifying their code. - Cons: - - One needs to look at the parent classes to see what these arguments - are. - - You cannot use a **kw argument in your constructor for your own - selfish purpose. - - I have no clue whether one could do this with multiple inheritance. - - More? - Question: Should we encourage this in Pylearn? - - * Generally prefer list comprehensions to map / filter, as the former are - easier to read. - Yes: - non_comments = [line.strip() for line in my_file.readlines() - if not line.startswith('#')] - No: - non_comments = map(str.strip, - filter(lambda line: not line.startswith('#'), - my_file.readlines())) * Use the `key` argument instead of `cmp` when sorting (for Python 3 compatibility). @@ -272,6 +309,12 @@ * Code indent must be done with four blank characters (not with tabs). + * Limit lines to 79 characters. + + * Comments should start with a capital letter (unless the first word is a + code identifier) and end with a period (very short inline comments may + ignore this rule). + * Whenever you read / write binary files, specify it in the mode ('rb' for reading, 'wb' for writing). This is important for cross-platform and Python 3 compatibility (e.g. when pickling / unpickling objects). @@ -290,9 +333,251 @@ raise MyException(args) where MyException inherits from Exception. + * Imports should be listed in alphabetical order. 
It makes it easier to
+    verify that something is imported, and avoids duplicated imports.
+
+  * Use absolute imports only. This is compatible across a wider range of
+    Python versions, and avoids confusion about what is being
+    imported.
+
+  * Use a leading underscore '_' for internal attributes / methods,
+    but avoid the double underscore '__' unless you know what you are
+    doing.
+
+  * A script's only top-level code should be something like:
+      if __name__ == '__main__':
+          sys.exit(main())
+
+  * No conditional expressions (not supported in Python 2.4). These are
+    expressions of the form
+      x = y if condition else z
+
+  * Use either "try ... except" or "try ... finally", but do not mix
+    "except" with "finally" (which is not supported in Python 2.4).
+    You can put a try ... except inside a try ... finally if you need both.
+
+  * Do not use the `all` and `any` builtin functions (they are not supported
+    in Python 2.4).
+    You can use numpy.{all,any} instead, or import theano.gof.python25,
+    which defines all and any.
+    OD: I think we should have something like pylearn.compat.{all,any}.
+        numpy.{all,any} are meant to be used on arrays only.
+    OD: As agreed during committee's meeting, we will use
+        theano.gof.python25
+
+  * Do not use the `hashlib` module (not supported in Python 2.4).
+    You can do as in theano.gof.cc:
+    .. code::
+
+      if sys.version_info[:2] >= (2, 5):
+          import hashlib
+          def hash_from_code(msg):
+              return hashlib.md5(msg).hexdigest()
+      else:
+          import md5
+          def hash_from_code(msg):
+              return md5.new(msg).hexdigest()
+    OD: Yep, we could probably come up with such a wrapper in a pylearn.compat
+        module.
+
+  * Do not use mutable arguments as default values. Instead, use a helper
+    function:
+    Yes:
+      def f(array=None):
+          array = pylearn.if_none(array, [])
+    No:
+      def f(array=[]):
+          # Dangerous if `array` is modified down the road.
+
 Mercurial commits
 -----------------
 
   * How to write good commit messages?
+    OD: Check Django's guidelines (link above)
   * Standardize the merge commit text (what is the message from fetch?)
 
+During committee's meeting, Fred mentioned a bug with Assembla links for
+multi-line commits.
+
+Tools to help us out
+---------------------
+
+Dumi:
+
+ * pylint: highly configurable and very popular tool, similar in spirit to lint
+ for C. Can specify a config file, customize/disable warnings and errors, hook
+ it to vim/emacs and include coding style conventions in the check too. A nice
+ feature is that you can include a comment like "# pylint: disable-msg=C0103"
+ in a file and disable a message locally. This is nice and dangerous at the
+ same time. Another cool feature is incremental checking with caching of
+ results, which also allows tracking of progress.
+
+ * pyflakes: pylint alternative that is supposedly faster, but is, I think, more
+ limited in the number of things it is good at: "PyFlakes will tell you when
+ you have forgotten an import, mistyped a variable name, defined two functions
+ with the same name, shadowed a variable from another scope, imported a module
+ twice, or two different modules with the same name, and so on." Most reviews
+ found online praise the speed, but note that pylint is clearly superior in
+ every other respect.
+
+ * pychecker: it actually *imports* each module (not sure if pylint does this).
+ It seems that pylint = pychecker + coding style, and that pylint is more
+ popular.
+
+ * pep8: if all you care about is obeying PEP-8:
+ http://pypi.python.org/pypi/pep8 (includes the actual PEP-8 snippets with the
+ errors found, which is neat). Otherwise, pylint seems like a superset of this.
+
+ * http://www.doughellmann.com/articles/pythonmagazine/completely-different/2008-03-linters/index.html
+ - article from 2008 comparing pylint, pychecker, and pyflakes. The conclusion
+ is to use pylint, more or less.
+
+I say we stick with pylint for now, as it provides a great degree of flexibility
+in a single mature package.
+
+ * vim + pylint: http://www.vim.org/scripts/script.php?script_id=891
+ * emacs + pylint: http://www.emacswiki.org/emacs/PythonProgrammingInEmacs#toc5
+
+Automating and enforcing coding style
+-------------------------------------
+
+Ideally, we would like to have a uniform approach to this, where everyone tests
+against the same tool(s) and uses the same list of disabled warnings, etc.
+
+Dumi: there are several ways of approaching this, independently of the tools used:
+
+ * Create a precommit hook for mercurial, which runs the tool(s) of choice and
+ generates warnings or aborts the commit process. This hook is a simple Python
+ module (well, as simple as we want it to be), which we can include in
+ everyone's hgrc, in the precommit.pylint variable, for instance. An example
+ is http://github.com/jrburke/dvcs_jslint/blob/master/dvcs_jslint.js. The
+ advantage of this approach is that the load is distributed and
+ errors/warnings are caught client-side, before the commit. (A sketch of such a
+ hook is given after the 'Suggested per-file boilerplate' section below.)
+
+ * Another client-side option is to have editor plugins for the various style
+ checkers: vim and emacs can access pylint pretty easily, for instance.
+
+ * Instead of doing this client-side, one can do things server-side. On
+ Assembla, this means using their Webhooks
+ (http://www.assembla.com/spaces/demostuff/webhook_tool), since these are
+ HTTP-based hooks that we would need to tie to our buildbot server (whichever
+ server we choose that to be).
+
+ * I (DE) prefer starting with the client-side approach, as it is easier to
+ implement, has no single point of failure and is deployable fast. We could
+ have a "batch" script that runs our lint tools in conjunction with hg
+ annotate and sends hate-mail once a week to offenders who have somehow
+ slipped things through the cracks. Also, on the server side we could run
+ time-consuming checks (though how such checks would differ from tests is
+ unclear).
+
+Note that:
+
+ * I haven't found anything ready-made online, so we need to write these
+ hooks ourselves.
+ * I think we should make it so that it is not possible to commit things if
+ pylint reports an actual error.
+
+Type checking
+-------------
+
+(Suggested by Francois Savard)
+
+Since you are working on the coding style question, I mention this, to do
+with as you see fit: I like to avoid errors in the ordering of my parameters,
+in the assumptions made about parameters, etc., by doing argument checks.
+This is a bit of a substitute for the static type checking of languages like
+Java.
+
+In Python there is an elegant way to define your own type checkers, value
+checkers, etc., and then pass them as parameters to a function decorator:
+
+http://code.activestate.com/recipes/454322-type-checking-decorator/
+
+(Just one example, since the checks can be more elaborate: they can include
+value checks (> 0, etc.) and be flexible enough not to require a fixed type,
+asking instead that the argument satisfy certain constraints (that it "looks
+like" a float, for instance). I once developed a library to do something like
+this in JavaScript.)
+
+I do not know whether you were planning to talk about this, or whether it is
+worth it, but personally I prefer code over comments that can get out of sync
+with the contents of a method. If you think it is worthwhile, you could
+perhaps define standard type/value checkers so that everyone does not
+redefine their own in their own way.
+
+OD: This was discussed in committee's meeting. We agreed to provide ways to do
+this, but not to enforce its usage.
+
+Consistent inf / nan
+--------------------
+
+OD: Use numpy.inf and numpy.nan rather than float('inf') / float('nan')?
+(This should be slightly more efficient, even if efficiency usually doesn't
+matter here - the main goal would be for everyone to use the same inf / nan to
+make the code consistent.)
+OD: Approved during committee's meeting.
+
+Enforcing strict testing policy
+-------------------------------
+
+The `coverage` third-party module provides a way to gather code coverage
+statistics in the test suite. `nosetests` has a plugin that can be activated
+with the --with-coverage option to use this module.
+It is possible to know which lines specifically lack coverage. However, we
+will probably want to post-process this data to do more than a simple report
+(which no one will care about). This could be done either by parsing nosetests'
+coverage output, or modifying its coverage plugin, or writing our own version
+of it. The main goal would be to identify who is responsible for writing lines
+that are not currently covered (using 'hg annotate'), in order to send email
+notifications.
+
+We should aim at 100% code coverage in tests. This is realistic because
+`coverage` offers ways to ignore coverage for lines we explicitly do not want
+to cover (typically debug code, or AssertionError / NotImplementedError that
+are not supposed to be triggered during normal usage).
+We may need to do some advanced processing, though, to e.g. collect results from
+multiple build bots, if for instance one bot is running tests without GPU
+support, and another one is taking care of the GPU tests.
+
+Code that should be tested but for which no test is currently written would
+also require some decorator / helper function that would trigger a warning at
+run-time (only once per execution). This could be enforced by adopting a
+different policy about lack-of-coverage notification emails, depending on
+whether or not the warning is present:
+
+- if there is no warning, daily email notification (ADD A WARNING!!!)
+- if there is a warning, weekly email notification (ADD A TEST!!!)
+
+Meeting 2010/09/16
+------------------
+
+Tasks to be performed by tomorrow:
+ * OD:
+   * Write down summary of Python coding style recommendations
+   * Start a file that showcases those guidelines
+ * DWF:
+   * Look into recommendations on how to document a class, method, ...
+   * Write recommendations on when to use logging vs. warning
+   * Make public some configuration files / plugins for vim
+   * Come up with official common file header (license in particular)
+
+Suggested per-file boilerplate
+------------------------------
+
+"""Module docstring as the first line, as usual."""
+
+__authors__ = "Olivier Delalleau, Frederic Bastien, David Warde-Farley"
+__copyright__ = "(c) 2010, Université de Montréal"
+__license__ = "3-clause BSD License"
+__contact__ = "Name Of Current Guardian of this file "
+
+We could also pull Mercurial revision info and put it in __version__; this
+seems to be common.
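+
+Pre-commit hook sketch
+----------------------
+
+To make the client-side idea from 'Automating and enforcing coding style'
+concrete (as referenced above), here is a minimal, hypothetical sketch. The
+file and hook names are invented, and it assumes pylint is installed and on
+the PATH. It would be enabled from each developer's hgrc:
+
+  [hooks]
+  pretxncommit.pylint = python:check_style.hook
+
+with a check_style.py module importable by Mercurial:
+
+  import os
+  import subprocess
+
+  def hook(ui, repo, node=None, **kwargs):
+      """Return True (which aborts the commit) if pylint finds errors."""
+      ctx = repo[node]
+      py_files = [os.path.join(repo.root, f)
+                  for f in ctx.files() if f.endswith('.py')]
+      if not py_files:
+          return False
+      # -E restricts pylint to actual errors, ignoring style warnings.
+      status = subprocess.call(['pylint', '-E'] + py_files)
+      if status != 0:
+          ui.warn('pylint found errors; commit aborted\n')
+          return True
+      return False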
+
+Editor setup
+------------
+
+(DWF:) Some enhanced configuration files for Vim that I've put a little bit
+of work into modifying in some cases can be found at:
+
+http://www.iro.umontreal.ca/~lisa/twiki/bin/view.cgi/Divers/VimPythonRecommendations
diff -r 9686c0d9689d -r aea510b71386 doc/v2_planning/dataset.txt
--- a/doc/v2_planning/dataset.txt	Fri Sep 17 12:01:12 2010 -0400
+++ b/doc/v2_planning/dataset.txt	Fri Sep 17 12:01:32 2010 -0400
@@ -175,6 +175,22 @@
 COMMENTS
 ~~~~~~~~
 
+JB asks: How about asking datasets to also provide a visualization mechanism
+for showing / playing individual examples from the dataset, but also other
+external objects that are similar to dataset examples (e.g. filters from a
+weight matrix that filters images)? This doesn't have to be complicated, and it
+can be shared between datasets that exist in one modality (e.g. image datasets
+can all use an image-rendering method).
+
+OD replies: Besides being able to display data without prior knowledge of the
+kind of data inside a dataset, is there any reason to put this within the
+dataset class? If not, it seems to me it may be more appropriate to have a way
+for the dataset to describe the kind of data it holds, and keep the
+visualization code separate from the dataset itself. It would make it easier
+in particular to try different visualization systems, and description of the
+data may turn out to be useful for other reasons (however, it also means we'd
+need to come up with a good way to describe data, which could prove
+difficult).
 
 JB asks: What may be passed as argument to the functions in Dataset, and what
 can be expected in return? Are there side effects (e.g. on the state of the
@@ -258,6 +274,11 @@
 use numpy arrays (for numeric data) or lists (for anything else) to store
 mini-batches' data. So I vote for 'no'.
 
+YB: I agree that a mini-batch should definitely be safely assumed
+to fit in memory. That makes it at least in principle semantically
+different from a dataset. But barring that restriction, it might
+share some of the properties of a dataset.
+
 A dataset is a learner
 ~~~~~~~~~~~~~~~~~~~~~~
 
@@ -324,3 +345,55 @@
 understanding of it, but my feeling is that you need your learner to be
 written in a specific way to achieve this, in which case it may be up to the
 learner to take its input data and store it into a shared variable.
+
+RP comment: Yes, the dataset object alone cannot handle this; the issue is somewhere
+between the dataset and the learner. Or in other words, every time you change
+the data you need to recompile your theano function. So the learner cannot
+only get data from the dataset, it needs to get a shared variable. The learner
+should also be aware when the dataset is changed, to recompile its internal
+functions. I'm not sure which is the best way to do this. My personal feeling
+is that the dataset should be part of the learner. The learner should provide
+a function use_dataset (or replace_dataset). When this function is called,
+all the theano functions in the learner get recompiled based on shared
+variables that the dataset object provides. It sort of fits very well in the
+framework that I have in mind, which was scattered around in learner.txt
+and some of my previous emails. I think it shares a lot with James' concepts,
+since it follows quite closely the concepts behind Theano.
+
+OD asks: Ok, so why would the dataset have to be responsible for providing a
+shared variable? Why wouldn't the learner just create this shared variable
+internally and copy into it the data provided by the dataset?
+
+RP replies: Sure, the learner could take care of all this. Note though that the
+learner should take care to divide the dataset into chunks that fit in the
+GPU memory (in case of a large dataset) and then take care of updating the
+shared variables according to the current chunk. Personally I feel like all
+this data division, management and so on should be done by the dataset.
+It feels more natural that way. For example, assume you have a dataset that
+is composed of a time series and some static data (carre-tech heart beat
+data is a good example). The static data is small enough that you could
+always store it on the GPU, and you would only need to split the time series.
+For the learner to do this (since it gets the same interface from any
+dataset object) would amount to an if-then branch, while for the
+dataset it is just a different class. But I'm happy to have all this GPU stuff
+sent to the learner as well if everybody else believes that is better.
+
+FB comment: I don't understand why you would need to recompile the theano function.
+There are two cases: in the first, the data is in a shared variable. You can directly change the data
+in the shared variable without recompiling the theano fct. The second case is when
+the dataset is in an ordinary theano variable. In that case, the first step in the
+theano fct will be to transfer the dataset to the gpu before computation. If the data
+changes at each call, that will be as efficient as changing the data manually every time
+in the shared variable.
+
+AB: I have an idea about this which kind of fits in the "building a
+theano op" thing that we talked about at the last meeting.
+
+We can just build a theano Op that wraps dataset objects and takes
+care of the details of transferring data to the GPU or otherwise.
+
+I have a prototype interface/implementation in the shared_dataset.py
+file in this directory.
+
+OD: I like AB's approach.
+
diff -r 9686c0d9689d -r aea510b71386 doc/v2_planning/main_plan.txt
--- a/doc/v2_planning/main_plan.txt	Fri Sep 17 12:01:12 2010 -0400
+++ b/doc/v2_planning/main_plan.txt	Fri Sep 17 12:01:32 2010 -0400
@@ -235,7 +235,7 @@
 separate file.
 
 Indexing Convention
-~~~~~~~~~~~~~~~~~~~
+===================
 
 Something to decide on - Fortran-style or C-style indexing.  Although we have
 often used c-style indexing in the past (for efficiency in c!) this is no
diff -r 9686c0d9689d -r aea510b71386 doc/v2_planning/optimization.txt
--- a/doc/v2_planning/optimization.txt	Fri Sep 17 12:01:12 2010 -0400
+++ b/doc/v2_planning/optimization.txt	Fri Sep 17 12:01:32 2010 -0400
@@ -46,10 +46,82 @@
 
 
 
-Proposal for API
-================
+Discussion
+==========
+
+OD asks: Could it be more convenient for x0 to be a list?
+
+JB replies: Yes, but that's not the interface used by other minimize()
+routines (e.g. in scipy). Maybe another list-based interface is required?
+
+OD replies: I think most people would prefer to use a list-based interface, so
+ they don't have to manually pack / unpack multiple arrays of parameters. So I
+ would vote in favor of having both (where the main reason to also provide a
+ non-list interface would be to allow one to easily switch e.g. to scipy's
+ minimize).
+ I would guess the reason scipy's interface is like this is because it makes
+ it easier for the optimization algorithm.
However, this does not really + matter if we are just wrapping a theano-based algorithm (that already has + to handle multiple parameters), and avoiding useless data copies on each call + to f / df can only help speed-wise. +JB replies: Done, I added possibility that x0 is list of ndarrays to the api +doc. + + + +OD asks: Why make a difference between iterative and one-shot versions? A one-shot + algorithm can be seen as an iterative one that stops after its first + iteration. The difference I see between the two interfaces proposed here + is mostly that one relies on Theano while the other one does not, but + hopefully a non-Theano one can be created by simply wrapping around the + Theano one. + +JB replies: Right, it would make more sense to distinguish them by the fact that +one works on Theano objects, and the other on general Python callable functions. +There is room for an iterative numpy interface, but I didn't make it yet. Would +that answer your question? + +OD replies and asks: Partly. Do we really need a non-iterative interface? + +OD: I wish we could get closer to each other the Theano and Numpy interfaces. +It would be nice if we could do something like: -See api_optimization.txt. + # Theano version. + updates = sgd([p], gradients=[g], stop=stop, step_size=.1) + sgd_step = theano.function([input_var, target_var], [], updates=updates) + while not stop.value: + input, target = training_iter.next() + sgd_step(input, target) + + # Numpy version (you can replace *.value by regular numpy arrays). + sgd_step = sgd([p.value], gradients=g_func, stop=stop.value, step_size=.1) + while not stop.value: + input, target = training_iter.next() + sgd_step(input, target) + +where sgd would look something like: -OD: Do we really need a different file? If yes, maybe create a subdirectory to - be able to easily find all files related to optimization? + class sgd(...): + def __init__(self, parameters, cost=None, gradients=None, stop=None, + step_size=None): + # Allow for extra arguments to be provided in self.__call__, that + # are forwarded to the underlying gradients function. + self.gradients = lambda *lst, **kw: gradients(*(parameters + lst), + **kw) + ... 
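+        # (Sketch only: as written, `__call__` below omits `self`, the
+        # assignments of self.parameters / self.step_size are elided, and
+        # izip would need to be imported from itertools.)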
+ + def __call__(*lst, **kw): + grads = self.gradients(*lst, **kw) + for param, grad in izip(self.parameters, grads): + param -= self.step_size * grad + +Then a wrapper to provide a scipy-like interface could be: + + def minimize(x0, f, df, algo, **kw): + stop = numpy.array(0, dtype=numpy.int8) + algo_step = eval(algo)([x0], cost=f, gradients=lambda x: (df(x), ), + stop=stop, **kw) + while not stop: + algo_step() + + diff -r 9686c0d9689d -r aea510b71386 doc/v2_planning/plugin.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/v2_planning/plugin.py Fri Sep 17 12:01:32 2010 -0400 @@ -0,0 +1,314 @@ + +import time +from collections import defaultdict, deque +from copy import copy + +inf = float('inf') + +############# +### EVENT ### +############# + +class Event(object): + + def __init__(self, type, **attributes): + self.type = type + self.__dict__.update(attributes) + self.attributes = dict(type = type, **attributes) + + def match(self, other): + if isinstance(other, Matcher): + return other(self) + else: + oattr = other.attributes + for k, v in self.attributes.iteritems(): + if k in oattr: + v2 = oattr[k] + if isinstance(v2, Matcher): + if not v2(v): return False + else: + if v != v2: return False + return True + + def __str__(self): + return "Event(%s)" % ", ".join("%s=%s" % (k, v) for k, v in self.attributes.iteritems()) + +class Matcher(object): + + def __call__(self, object): + raise NotImplementedError("Implement this!") + +class FnMatcher(Matcher): + + def __init__(self, function): + self.function = function + + def __call__(self, object): + return self.function(object) + +all_events = FnMatcher(lambda _: True) + + + +################ +### SCHEDULE ### +################ + +class Schedule(Matcher): + def __add__(self, i): + return OffsetSchedule(self, i) + def __or__(self, s): + return UnionSchedule(self, to_schedule(s)) + def __and__(self, s): + return IntersectionSchedule(self, to_schedule(s)) + def __sub__(self, i): + return OffsetSchedule(self, -i) + def __ror__(self, s): + return UnionSchedule(to_schedule(s), self) + def __rand__(self, s): + return IntersectionSchedule(to_schedule(s), self) + def __invert__(self): + return NegatedSchedule(self) + +def to_schedule(x): + if x in (None, False): + return never + if x is True: + return always + elif isinstance(x, (list, tuple)): + return reduce(UnionSchedule, x) + else: + return x + + +class ScheduleMix(Schedule): + __n__ = None + def __init__(self, *subschedules): + assert (not self.__n__) or len(subschedules) == self.__n__ + self.subschedules = map(to_schedule, subschedules) + +class UnionSchedule(ScheduleMix): + def __call__(self, time): + return any(s(time) for s in self.subschedules) + +class IntersectionSchedule(ScheduleMix): + def __call__(self, time): + return all(s(time) for s in self.subschedules) + +class DifferenceSchedule(ScheduleMix): + __n__ = 2 + def __call__(self, time): + return self.subschedules[0](time) and not self.subschedules[1](time) + +class NegatedSchedule(ScheduleMix): + __n__ = 1 + def __call__(self, time): + return not self.subschedules[0](time) + +class OffsetSchedule(Schedule): + def __init__(self, schedule, offset): + self.schedule = schedule + self.offset = offset + def __call__(self, time): + if isinstance(time, int): + return self.schedule(time - self.offset) + else: + t1, t2 = time + return self.schedule((t1 - self.offset, t2 - self.offset)) + + +class AlwaysSchedule(Schedule): + def __call__(self, time): + return True + +always = AlwaysSchedule() +never = ~always + +class 
+class IntervalSchedule(Schedule):
+    def __init__(self, step, repeat = inf):
+        self.step = step
+        self.upper_bound = step * (repeat - 1)
+    def __call__(self, time):
+        if isinstance(time, int):
+            if time < 0 or time > self.upper_bound:
+                return False
+            return time % self.step == 0
+        else:
+            t1, t2 = time
+            if t2 < 0 or t1 > self.upper_bound:
+                return False
+            diff = t2 - t1
+            t1m = t1 % self.step
+            t2m = t2 % self.step
+            return (diff >= self.step
+                    or t1m == 0
+                    or t2m == 0
+                    or t1m > t2m)
+
+each = lambda step, repeat = inf: each0(step, repeat) + step
+each0 = IntervalSchedule
+
+
+class RangeSchedule(Schedule):
+    def __init__(self, low = None, high = None):
+        # Explicit None checks, so that 0 is a valid bound.
+        self.low = -inf if low is None else low
+        self.high = inf if high is None else high
+    def __call__(self, time):
+        if isinstance(time, int):
+            return self.low <= time <= self.high
+        else:
+            t1, t2 = time
+            return self.low <= t1 <= self.high \
+                or self.low <= t2 <= self.high
+
+inrange = RangeSchedule
+
+
+class ListSchedule(Schedule):
+    def __init__(self, *schedules):
+        self.schedules = schedules
+    def __call__(self, time):
+        if isinstance(time, int):
+            return time in self.schedules
+        else:
+            t1, t2 = time
+            for t in self.schedules:
+                if t1 <= t <= t2:
+                    return True
+            return False
+
+at = ListSchedule
+
+
+##############
+### PLUGIN ###
+##############
+
+class Plugin(object):
+
+    def attach(self, scheduler):
+        c = copy(self)
+        c.scheduler = scheduler
+        return c
+
+    def __call__(self, event):
+        raise NotImplementedError("Implement this!")
+
+    def fire(self, type, **attributes):
+        event = Event(type, issuer = self, **attributes)
+        self.scheduler.queue(event)
+
+class FnPlugin(Plugin):
+
+    def __init__(self, function):
+        self.function = function
+
+    def __call__(self, event):
+        return self.function(self, event)
+
+class DispatchPlugin(Plugin):
+
+    def __call__(self, event):
+        getattr(self, "on_" + event.type, self.generic)(event)
+
+    def generic(self, event):
+        return
+
+
+#################
+### SCHEDULER ###
+#################
+
+class Scheduler(object):
+
+    def __init__(self):
+        self.plugins = []
+        self.categorized = defaultdict(list)
+        self.event_queue = deque()
+
+    def __call__(self):
+        i = 0
+        evq = self.event_queue
+        self.queue(Event("begin", issuer = self))
+        while True:
+            self.queue(Event("tick", issuer = self, time = i))
+            while evq:
+                event = evq.popleft()
+                candidates = self.categorized[event.type] + self.categorized[None]
+                for event_template, plugin in candidates:
+                    if event.match(event_template):
+                        plugin(event) # note: the plugin might queue more events
+                if event.type == "terminate":
+                    return
+            i += 1
+
+    def schedule_plugin(self, event_template, plugin):
+        plugin = plugin.attach(self)
+        if isinstance(event_template, Matcher) or isinstance(event_template.type, Matcher):
+            # These plugins may execute upon any event type
+            self.categorized[None].append((event_template, plugin))
+        else:
+            self.categorized[event_template.type].append((event_template, plugin))
+        self.plugins.append((event_template, plugin))
+
+    def queue(self, event):
+        self.event_queue.append(event)
+
+
+
+
+@FnPlugin
+def printer(self, event):
+    print event
+
+@FnPlugin
+def stopper(self, event):
+    self.fire("terminate")
+
+@FnPlugin
+def byebye(self, event):
+    print "bye bye!"
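+
+# Illustrative only (not part of the original demo): the Schedule operators
+# defined above compose, e.g. a schedule firing at every multiple of 10 in
+# [0, 100], except at 50:
+#
+#     sched = (each(10) & inrange(0, 100)) & ~at(50)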
+
+
+@FnPlugin
+def waiter(self, event):
+    time.sleep(0.1)
+
+# @FnPlugin
+# def timer(self, event):
+#     if not hasattr(self, 'previous'):
+#         self.beginning = time.time()
+#         self.previous = 0
+#     now = time.time() - self.beginning
+#     inow = int(now)
+#     if inow > self.previous:
+#         self.fire("second", time = inow)
+#     self.previous = now
+
+class Timer(DispatchPlugin):
+
+    def on_begin(self, event):
+        self.beginning = time.time()
+        self.previous = 0
+
+    def on_tick(self, event):
+        now = time.time() - self.beginning
+        inow = int(now)
+        if inow > self.previous:
+            self.fire("second", time = inow)
+        self.previous = now
+
+
+
+sch = Scheduler()
+
+
+sch.schedule_plugin(all_events, Timer())
+sch.schedule_plugin(Event("tick"), waiter) # this means: execute the waiter plugin (a delay) on every "tick" event. Is it confusing to use Event(...)?
+sch.schedule_plugin(Event("second"), printer)
+
+# sch.schedule_plugin(all_events, printer)
+
+sch.schedule_plugin(Event("tick", time = at(100)), stopper)
+sch.schedule_plugin(Event("terminate"), byebye)
+
+sch()
diff -r 9686c0d9689d -r aea510b71386 doc/v2_planning/plugin.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/v2_planning/plugin.txt	Fri Sep 17 12:01:32 2010 -0400
@@ -0,0 +1,181 @@
+
+======================================
+Plugin system for iterative algorithms
+======================================
+
+I would like to propose a plugin system for iterative algorithms in
+Pylearn. Basically, it would be useful to be able to sandwich
+arbitrary behavior in-between two training iterations of an algorithm
+(whenever applicable). I believe many mechanisms are best implemented
+this way: early stopping, saving checkpoints, tracking statistics,
+real time visualization, remote control of the process, or even
+interlacing the training of several models and making them interact
+with each other.
+
+So here is the proposal: essentially, a plugin would be a (schedule,
+timeline, function) tuple.
+
+Schedule
+========
+
+The schedule is some function that takes two "times", t1 and t2, and
+returns True if the plugin should be run in-between these times. The
+indices refer to a "timeline" unit described below (e.g. "real time" or
+"iterations"). The reason why we check a time range [t1, t2] rather than
+some discrete time t is that we do not necessarily want to schedule plugins
+on iteration numbers. For instance, we could want to run a plugin every
+second, or every minute, and then [t1, t2] would be the start time and end
+time of the last iteration - and then we run the plugin whenever a new
+second started in that range (but still on training iteration
+boundaries). Alternatively, we could want to run a plugin every n examples
+seen - but if we use mini-batches, the nth example might be right in the
+middle of a batch.
+
+I've implemented a somewhat elaborate schedule system. `each(10)`
+produces a schedule that returns true whenever a multiple of 10 is in
+the time range. `at(17, 153)` produces one that returns true when 17
+or 153 is in the time range. Schedules can be combined and negated,
+e.g. `each(10) & ~at(20, 30)` (execute at each 10, except at 20 and
+30). So that gives a lot of flexibility as to when you want to do
+things.
+
+Timeline
+========
+
+This would be a string indicating on what "timeline" the schedule is
+supposed to operate. For instance, there could be a "real time"
+timeline, an "algorithm time" timeline, an "iterations" timeline, a
+"number of examples" timeline, and so on. This means you can schedule
+some action to be executed every actual second, or every second of
+training time (ignoring time spent executing plugins), or every
+discrete iteration, or every n examples processed. This might be a
+bloat feature (it was an afterthought to my original design, anyway),
+but I think that there are circumstances where each of these options
+is the best one.
+
+Function
+========
+
+The plugin function would receive some object containing the time
+range, a flag indicating whether the training has started, and a flag
+indicating whether the training is done (which the plugin can set in
+order to stop training), as well as anything pertinent about the model.
+
+Implementation
+==============
+
+I have implemented the feature in plugin.py, in this directory. Simply
+run `python plugin.py` to test it.
+
+
+
+===============
+Revised version
+===============
+
+Taking into account ideas thrown around during the September 16
+meeting, I (OB) have made the following modifications to my original
+proposal:
+
+Event objects
+=============
+
+In the revised framework, an Event is a generic object which can
+contain any attributes you want, with one privileged attribute, the
+'type' attribute, which is a string. I expect the following attributes
+to be used widely:
+
+* type: this is a string describing the abstract semantics of this
+  event ("tick", "second", "millisecond", "batch", etc.)
+
+* issuer: a pointer to the plugin that issued this event. This allows
+  for fine grained filtering in the case where several plugins can
+  fire the same event type
+
+* time: an integer or float index on an abstract timeline. For
+  instance, the "tick" event would have a "time" field, which would be
+  increased by one every time the event is fired. Pretty much all
+  recurrent events should include this.
+
+* data: some data associated to the event. Presumably it doesn't have
+  to be named "data", and more than one data field could be given.
+
+The basic idea is that it should be possible to say: "I want this
+plugin to be executed every tenth time an event of this type is fired
+by this plugin", or any subset of these conditions.
+
+Matching events
+===============
+
+When registering a plugin, you specify a sort of "abstract event" that
+an event must "match" in order to be fed to the plugin. This can be
+done by simply instantiating an event with the fields you want to
+match. I think examples will explain my idea best
+(sch.schedule_plugin = add a plugin to the scheduler):
+
+# Print the error on every parameter update (learner given in the event)
+sch.schedule_plugin(Event("parameter_update"), PrintError())
+# Print the reconstruction error of daa0 whenever it does a parameter update
+sch.schedule_plugin(Event("parameter_update", issuer = daa0), PrintReconstructionError())
+# Save the learner every 10 minutes
+sch.schedule_plugin(Event("minute", time = each(10)), Save(learner))
+
+The events given as first argument to schedule_plugin are not real
+events: they are "template events" meant to be *matched* against the
+real events that will be fired. If the terminology is confusing, it
+would not be a problem to use another class with a better name (for
+example, On("minute", time = each(10)) could be clearer than
+Event(...), I don't know).
+
+Note that fields in these Event objects can be a special kind of
+object, a Matcher, which makes it possible to filter events based on
+arbitrary conditions. My Schedule objects (each, at, etc.) now inherit
+from Matcher. You could easily have a matcher that allows you to match
+issuers that are instances of a certain class, or one that matches every
+single event (I have an example of the latter in plugin.py).
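+
+For instance, the class-instance matcher mentioned above could look like
+this (a sketch; the IssuerInstanceOf name is hypothetical, Matcher is the
+class from plugin.py, and DAA / PrintReconstructionError are the example
+names used earlier):
+
+class IssuerInstanceOf(Matcher):
+    def __init__(self, cls):
+        self.cls = cls
+    def __call__(self, issuer):
+        return isinstance(issuer, self.cls)
+
+# Print the reconstruction error for updates coming from any DAA layer:
+sch.schedule_plugin(Event("parameter_update", issuer = IssuerInstanceOf(DAA)), PrintReconstructionError())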
+
+Plugins
+=======
+
+The plugin class would have the following methods:
+
+* attach(scheduler): tell the plugin that it is being scheduled by the
+  scheduler, store the scheduler in self. The method can return self,
+  or a copy of itself.
+
+* fire(type, **attributes): adds Event(type, issuer = self, **attributes)
+  to the event queue of self.scheduler
+
+Scheduler
+=========
+
+A Scheduler would have a schedule_plugin(event_template, plugin)
+method to add plugins, a queue(event) method to queue a new event, and
+it would be callable.
+
+My current version proceeds as follows:
+
+* Fire Event("begin"). Somewhat equivalent to "tick" at time 0, but I
+  find it cleaner to have a special event to mark the beginning of the
+  event loop.
+* Infinite loop
+  * Fire Event("tick", time = i), where i is the iteration counter
+  * Loop until the queue is empty
+    * Pop event, execute all plugins that respond to it
+    * Check if event.type == "terminate". If so, stop.
+
+Varia
+=====
+
+I've made a very simple implementation of a DispatchPlugin which, upon
+reception of an event, dispatches it to its "on_<event type>" method
+(or calls a fallback). It seems nice. However, in order for it to work
+reliably, it has to be registered on all events, and I'm not sure it
+can scale well to more complex problems where the source of events is
+important.
+
+Implementation
+==============
+
+See plugin.py.
+
diff -r 9686c0d9689d -r aea510b71386 doc/v2_planning/plugin_RP.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/v2_planning/plugin_RP.py	Fri Sep 17 12:01:32 2010 -0400
@@ -0,0 +1,161 @@
+'''
+==================================================
+Plugin system for iterative algorithms, Version B
+==================================================
+
+After the meeting (September 16) we sort of stumbled on
+two possible versions of the plug-in system. This represents
+the second version. It went through a few changes after I saw
+Olivier's code and talked to him.
+
+Concept
+=======
+
+The basic idea behind this version is not to have a list of all
+possible events, but rather to have plugins register for events. By
+specifying which plugin listens to which event produced by which
+plugin, you define a sort of dependency graph. Structuring things
+in such a graph might make the script more intuitive to read.
+
+I will first go through pseudo-code for two examples and then enumerate
+my insights and concepts on the matter.
+
+
+Example : Producer - Consumer that Guillaume described
+======================================================
+
+
+.. code-block::
+'''
+    sch = Schedular()
+
+    @FnPlugin(sch)
+    def producer(self,event):
+        self.fire('stuff', value = 'some text')
+
+    @FnPlugin(sch)
+    def consumer(self,event):
+        print event.value
+
+    @FnPlugin(sch)
+    def prod_consumer(self,event):
+        print event.value
+        self.fire('stuff2', value = 'stuff')
+
+    producer.act( on = Event('begin'), when = once() )
+    producer.act( on = Event('stuff'), when = always() )
+    consumer.act( on = Event('stuff'), when = always() )
+    prod_consumer.act( on = Event('stuff'), when = always() )
+
+    sch.run()
+
+
+
+'''
+Example : Logistic regression
+=============================
+
+Task description
+----------------
+
+Apply a logistic regression network to some dataset. Use early stopping.
+Save the weights every time a new best score is obtained. Print the training
+score after each epoch.
+
+
+Possible script
+---------------
+
+Notes : This would look the same for any other architecture that does not
+involve pre-training (i.e. deep networks). For example, the mlp.
+
+.. code-block::
+'''
+
+sched = Schedular()
+
+# Data / Model Building :
+# I skipped over how to design this part,
+# though I have some ideas
+real_train_data, real_valid_data = load_mnist()
+model = logreg()
+
+# Main Plugins ( already provided in the library );
+# These wrappers also register the plugins
+train_data = create_data_plugin( sched, data = real_train_data)
+valid_data = create_data_plugin( sched, data = real_valid_data)
+train_model = create_train_model(sched, model = model)
+validate_model = create_valid_model(sched, model = model, data = valid_data)
+early_stopper = create_early_stopper(sched)
+
+
+# On the fly plugins ( print random stuff); the main difference between my
+# FnPlugin and Olivier's version is that mine also registers the plugin in sched
+@FnPlugin(sched)
+def print_error(self, event):
+    if event.type == Event('begin'):
+        self.value = []
+    elif event.type == train_model.error():
+        self.value += [event.value]
+    elif event.type == train_data.eod():
+        print 'Error :', numpy.mean(self.value)
+
+@FnPlugin(sched)
+def save_model(self, event):
+    if event.type == early_stopper.new_best_error():
+        cPickle.dump(model.parameters(), open('best_params.pkl','wb'))
+
+
+# Create the dependency graph describing what does what
+train_model.act(on = train_data.batch(), when = always())
+validate_model.act(on = train_model.done(), when = every(n=10000))
+early_stopper.act(on = validate_model.error(), when = always())
+print_error.act( on = train_model.error(), when = always() )
+print_error.act( on = train_data.eod(), when = always() )
+save_model.act( on = early_stopper.new_best_error(), when = always() )
+
+# Run the entire thing
+sched.run()
+
+
+'''
+Notes
+=====
+
+ * I think we should have a FnPlugin decorator (exactly like Olivier's),
+   except that it also attaches the newly created plugin to the scheduler;
+   a sketch follows this list. This way you can create plugins on the fly
+   (as long as they are simple functions that print stuff, or compute
+   simple statistics).
+ * I added a method act to a Plugin. You use that to create the dependency
+   graph (it could also be named listen, for a more plugin-like interface).
+ * Plugins are obtained in 3 ways :
+     - by wrapping a dataset / model or something similar
+     - by a function that constructs it from nothing
+     - by decorating a function
+   In all cases I would suggest that when creating them you should provide
+   the scheduler as well, and the constructor also registers the plugin.
+
+ * The plugin concept works well as long as the plugins lean towards
+   heavy-duty computation, disregarding printing plugins and such. If you have
+   many small plugins this system might only introduce an overhead. I would
+   argue that Theano usage should be restricted to the inside of each plugin.
+   Therefore I would strongly suggest that the architecture be built outside
+   the scheduler, with a different approach.
+
+ * I would suggest that the framework be used only for the training loop
+   (after you get the adapt function and the compute-error function), so it
+   is more about the meta-learner / hyper-learner level.
+
+ * A general remark that I guess everyone will agree on. We should make
+   sure that implementing a new plugin is as easy/simple as possible. We
+   have to hide all the complexity in the scheduler (it is the part of the
+   code we will rarely need to work on).
+
+ * I have not gone into how to implement the different components, but
+   following Olivier's code I think that part would be more or less
+   straightforward.
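+
+A sketch of that decorator (my assumption of how it could look, reusing
+Olivier's FnPlugin and Scheduler classes from plugin.py in this directory):
+
+.. code-block:: python
+
+    from plugin import FnPlugin as BaseFnPlugin, all_events
+
+    def FnPlugin(scheduler, template = all_events):
+        # Returns a decorator that wraps `function` into a plugin and
+        # registers it with `scheduler` right away (on all events by default).
+        def decorator(function):
+            plugin = BaseFnPlugin(function)
+            scheduler.schedule_plugin(template, plugin)
+            return plugin
+        return decorator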
+
+'''
diff -r 9686c0d9689d -r aea510b71386 doc/v2_planning/plugin_architecture_GD.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/v2_planning/plugin_architecture_GD.txt	Fri Sep 17 12:01:32 2010 -0400
@@ -0,0 +1,267 @@
+Overview
+========
+
+The "central authority" (CA) is the glue which takes care of interfacing plugins
+with one another. It has 3 basic roles:
+* it maintains a list of "registered" or "active" plugins
+* it receives and queues the various messages sent by the plugins
+* it dispatches the messages to the recipients, based on various "events"
+
+Events can take different forms:
+* the CA can trigger various events based on running time
+* events can be linked to messages emitted by the various plugins, and can be
+  triggered based on the frequency of such messages
+* once an event is triggered, it is relayed to the appropriate "recipient
+  plugin(s)"
+
+It is the responsibility of each plugin to inform the CA of which "events" it
+cares about.
+
+
+Generic Pseudo-code
+===================
+
+I'll try to write this in pseudo-python as best I can. I'll do this in
+traditional OOP, as this is what I'm more comfortable with. I'll leave it up to
+James and OB to python-ize this :)
+
+
+class MessageX(Message):
+    """
+    A message is basically a data container. This could very well be replaced by
+    a generic Python object.
+    """
+
+class Plugin(object):
+    """
+    The base plugin object doesn't do much. It contains a reference to the CA
+    (upon the plugin being registered with the CA), provides boilerplate code
+    for storing which "events" this plugin is susceptible to, as well as code
+    for registering callback functions for the various messages.
+    """
+
+    ca = None       # reference to the CA, initialized upon plugin registration
+    active_msg = {} # dictionary of messages this plugin is susceptible to
+    callbacks = {}  # mapping of message class names --> callback function
+
+    def listen(self, msg_class, interval):
+        """
+        :param msg_class: reference to the "message" class we are interested in.
+                          These messages will be forwarded to this plugin, when
+                          the trigger condition is met.
+        :param interval: integer. Forward the message to this plugin every
+                         'interval' such messages.
+        """
+        self.active_msg[msg_class] = interval
+
+
+    def check_trigger(self, msg_class, time):
+        """
+        Checks whether the "trigger" condition associated with messages of
+        class 'msg_class' is satisfied. This could be the default
+        behavior, and be overridden by the various plugins.
+        """
+        return time % self.active_msg[msg_class] == 0
+
+
+    def handler(msg_class, callback):
+        """
+        Decorator which registers a callback function for the given message
+        type.
+
+        NOTE: I don't think what I wrote would work as a Python decorator. I am
+        not sure how to handle decorators with multiple parameters (one
+        explicit, and the other as the reference to the function). I'm pretty
+        sure James or OB could figure it out though! (One possible approach is
+        sketched right after this class.)
+
+        :params msg_class: reference to the message class for which we are
+                           registering a callback function
+        :params callback : reference to which function to call for a given message
+        """
+
+        self.callbacks[msg_class] = callback
+
+
+    def execute(self, message):
+        """
+        Boiler-plate code which executes the right callback function, for the
+        given message type.
+        """
+        for (msg_class, callback) in self.callbacks.iteritems():
+            if message.__class__ == msg_class:
+                callback(message)
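+
+One way the `handler` decorator could actually work (my sketch, not part of
+GD's proposal): a decorator factory that merely tags the method, leaving it
+to the plugin registration code to collect tagged methods into `callbacks`.
+
+def handler(msg_class):
+    """
+    Return a decorator that tags `func` with the message class it handles.
+    Plugin.__init__ (or CentralAuthority.register) would then scan the
+    plugin's methods for a `handles` attribute and fill `callbacks`.
+    """
+    def decorator(func):
+        func.handles = msg_class
+        return func
+    return decorator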
+ """ + for (msg_class, callback) in self.callbacks.iteritems(): + if message.__class__ == msg_class: + callback(message) + + +class ProducerPlugin(Plugin): + + def dostuff(): + """ + A typical "producer" plugin. It basically performs an arbitrary action + and asks the CA to forward the results (in the form of a message) to + other plugins. + """ + + # iteratively do stuff and relay messages to other plugins + while(condition): + + msga = # do something + ca.send(msga) # ask CA to forward to other plugins + + +class ConsumerPlugin(Plugin): + + @handler(MessageA) + def func(msga): + """ + A consumer or "passive plugin" (eg. logger, etc). This function is + register as being the callback function for Message A objects. + """ + # do something with message A + + +class ConsumerProducerPlugin(Plugin): + + @handler(MessageA) + def func(msga): + """ + Example of a consumer / producer plugin. It receives MessageA messages, + processes the data, then asks the CA to send a new message (MessageB) as + the result of its computation. The CA will automatically forward to all + interested parties. + + :param msga: MessageA instance + """ + + data = dostuff(msga) # process message + msgb = MessageB(data) # generate new message for other plugins + ca.send(msgb) # ask CA to forward to other plugins + + + +class CentralAuthority(object): + + active_plugins = [] # contains a list of registered plugins + + mailmain = {} # dictionary which contains, for each message class, a + # list of plugins interested in this message + + event_count = {} # dictionary of "event" counts for various messages + + def register(plugin): + """ + Registers the plugin and adds it as a listener for the various messages + it is interested in. + :param plugin: plugin instance which we want to "activate" + """ + + # each plugin must have a reference to the CA + plugin.ca = self + + # maintain list of active plugins + active_plugins.append(plugin) + + # remember which messages this plugin cares about + for msg in plugin.active_msg.keys(): + self.mailman[msg].append(plugin) + self.event_count[msg] = 0 + + def send(msg): + """ + This function relays the message to the appropriate plugins, based on + their "trigger" condition. It also keeps track of the number of times + this event was raised. + + :param msg: message instance + """ + + event_count[msg.__class__] += 1 + + # for all plugins interested in this message ... + for plugin in self.mailman[msg.__class__]: + + # check if trigger condition is met + if plugin.check_trigger(msg, self.event_count[msg.__class__]): + + # have the plugin execute the message + plugin.execute(msg) + + + def run(self): + """ + This would be the main loop of the program. I won't go into details + because its still somewhat blurry in my head :) But basically, the CA + could be configured to send out its own messages, independently from all + other plugins. + + These could be "synchronous" messages such as: "5 seconds have passed", + or others such as "save state we are about to get killed". + + NOTE: seems like this would almost have to live in its own thread ... 
+ """ + + # the following would be parametrized obviously + while(True): + msg = ElapsedTimeMessage(5) + self.send(msg) + sleep(5) + + + +Putting it all-together +======================= + + +def main(): + + ca = CentralAuthority() + + producer = ProducerPlugin() + ca.register(producer) + + consumer = ConsumerPlugin() + consumer.listen(MessageB, 1) + ca.register(consumer)) + + other = ConsumerProducerPlugin() + other.listen(MessageB, 10) + ca.register(other) + + # this is the function call which gets the ball rolling + producer.dostuff() + + +DISCUSSION: blocking vs. non-blocking +===================================== + +In the above example, I used "blocking" sends. However it is not-clear that this +is the best option. + +In the example, the producer basically acts as the main loop. It relinquishes +control of the main loop when the CA decides to forward the message to other +plugins. Control will only be returned once the cascade of send/receives +initiated with MessageA is complete (all subplugins have processed MessageA and +any messages sent as a side-effect have also been processed). + +This definitely imposes constraints on what the plugins can do, and how they do +it. For the type of single-processor / linear jobs we tend to run, this might be +enough (??). + +The good news is that going forward, the above plugin architecture can also +scale to distributed systems, by changing the sends to be non-blocking. Plugins +could then live on different machines and process data as they see fit. +Synchronization would be enforced by way of messages. In the above, the "main +producer" would thus become a consumer/producer who listens for "done processing +MessageA" messages and produces a new MessageA as a result. + +On single-processor systems, the synchronization overhead might be too costly +however. That is something we would have to investigate. On the plus side +however, our plugins would be "future proof" and lend themselves well to the +type of "massively parallel jobs" we wish to run (i.e. meta-learners, etc.) + + + +Logistic Regression +=================== + + +TO COME SOON (?) diff -r 9686c0d9689d -r aea510b71386 doc/v2_planning/requirements.txt --- a/doc/v2_planning/requirements.txt Fri Sep 17 12:01:12 2010 -0400 +++ b/doc/v2_planning/requirements.txt Fri Sep 17 12:01:32 2010 -0400 @@ -77,3 +77,12 @@ hyper-parameters, and want to easily be able to recover the full "processing pipeline" that performs best, and use it on real/test data later. +OD comments: Note that R9 and R13 may conflict with each other. Some +optimizations performed by R9 may modify the input "symbolic graph" in such a +way that extracting the required components for "production purpose" (R13) +could be made more difficult (or even impossible). Imagine for instance that +the graph is modified to take advantage of the fact that k-fold validation can +be performed efficiently internally by some specific algorithm. Then it may +not be obvious anymore how to remove the k-fold split in the saved model you +want to use in production. + diff -r 9686c0d9689d -r aea510b71386 doc/v2_planning/shared_dataset.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/v2_planning/shared_dataset.py Fri Sep 17 12:01:32 2010 -0400 @@ -0,0 +1,47 @@ +import theano + +# This is not final and may not even run for now. It is just to give +# a feeling of what the interface could look like. 
+
+def shared_dataset(dataset, mem_size):
+    if dataset.total_size > mem_size:
+        return OnlineDataset(dataset)
+    else:
+        return MemoryDataset(dataset)
+
+class MemoryDataset(theano.Op):
+    def __init__(self, dataset):
+        self.input = theano.shared(dataset.input)
+        self.output = theano.shared(dataset.output)
+        self.batch_size = dataset.batch_size
+
+    def make_node(self, idx):
+        idx_ = theano.tensor.as_tensor_variable(idx)
+        return theano.Apply(self,
+                            inputs = [idx_],
+                            outputs = [self.input.type(),
+                                       self.output.type()])
+
+    def perform(self, node, inputs, output_storage):
+        idx, = inputs
+        output_storage[0][0] = self.input[idx*self.batch_size:(idx+1)*self.batch_size]
+        output_storage[1][0] = self.output[idx*self.batch_size:(idx+1)*self.batch_size]
+
+class OnlineDataset(theano.Op):
+    def __init__(self, dataset):
+        self.dataset = dataset
+
+    def make_node(self, idx):
+        idx_ = theano.tensor.as_tensor_variable(idx)
+        return theano.Apply(self,
+                            inputs = [idx_],
+                            outputs = [theano.tensor.fmatrix(),
+                                       theano.tensor.fmatrix()])
+                            # fix this so it's not fmatrix(),
+                            # but whatever the dataset outputs
+
+    def perform(self, node, inputs, output_storage):
+        idx, = inputs
+        b = self.dataset.get_batch(int(idx))
+        output_storage[0][0] = b.input
+        output_storage[1][0] = b.output
diff -r 9686c0d9689d -r aea510b71386 doc/v2_planning/use_cases.txt
--- a/doc/v2_planning/use_cases.txt	Fri Sep 17 12:01:12 2010 -0400
+++ b/doc/v2_planning/use_cases.txt	Fri Sep 17 12:01:32 2010 -0400
@@ -66,6 +66,7 @@
                 classification_accuracy(
                     examples=MNIST.validation_dataset,
                     function=as_classifier('learner_obj'))),
+            step_fn = vm_lambda(('learner_obj',),
                 sgd_step_fn(
                     parameters = vm_getattr('learner_obj', 'params'),
@@ -96,6 +97,29 @@
 - there are no APIs for things which are not passed as arguments (i.e. the
   logic of the whole program is not exposed via some uber-API).
 
+OD comments: I didn't have time to look closely at the details, but overall I
+like the general feel of it. At least I'd expect us to need something like
+that to be able to handle the multiple use cases we want to support. I must
+say I'm a bit worried though that it could become scary pretty fast to the
+newcomer, with 'lambda functions' and 'virtual machines'.
+Anyway, one point I would like to comment on is the line that creates the
+linear classifier. I hope that, as much as possible, we can avoid the need to
+specify dataset dimensions / number of classes in algorithm constructors. I
+regularly had issues in PLearn with the fact that we had, for instance, to give
+the number of inputs when creating a neural network. I much prefer when this
+kind of thing can be figured out at runtime (a small sketch follows this list):
+   - Any parameter you can get rid of is a significant gain in
+     user-friendliness.
+   - It's not always easy to know in advance e.g. the dimension of your input
+     dataset. Imagine for instance this dataset is obtained in a first step
+     by going through a PCA whose number of output dimensions is set so as to
+     keep 90% of the variance.
+   - It seems to me it fits the idea of a symbolic graph better: my intuition
+     (that may be very different from what you actually have in mind) is to
+     see an experiment as a symbolic graph, which you instantiate when you
+     provide the input data. One advantage of this point of view is it makes
+     it natural to re-use the same block components on various datasets /
+     splits, something we often want to do.
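+
+A small sketch of this "figured out at runtime" idea (all names below are
+hypothetical, just to illustrate the point): the classifier allocates its
+parameters the first time it sees data, instead of taking dimensions in its
+constructor.
+
+    import numpy
+
+    class LazyLinearClassifier(object):
+        def __init__(self):
+            self.W = None
+        def update(self, x, targets):
+            if self.W is None:
+                # Infer input dimension and number of classes from the data.
+                self.W = numpy.zeros((x.shape[1], targets.max() + 1))
+            # ... gradient update on self.W would go here ...
+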
 K-fold cross validation of a classifier
 ---------------------------------------
 
@@ -113,7 +137,7 @@
             initial_model=alloc_model('param1', 'param2'),
             burnin=100,
             score_fn = vm_lambda(('learner_obj',),
-                graph=classification_error(
+                classification_error(
                     function=as_classifier('learner_obj'),
                     dataset=MNIST.subset(validation_set))),
             step_fn = vm_lambda(('learner_obj',),
@@ -145,7 +169,7 @@
    extending the symbolic program, and calling the extended function.
 
    vm.call(
-        [pylearn.min(model.weights) for model in trained_models],
+        [pylearn.min(pylearn_getattr(model, 'weights')) for model in trained_models],
        param1=1, param2=2)
 
 If this is run after the previous calls:
diff -r 9686c0d9689d -r aea510b71386 pylearn/formulas/__init__.py
diff -r 9686c0d9689d -r aea510b71386 pylearn/formulas/costs.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/formulas/costs.py	Fri Sep 17 12:01:32 2010 -0400
@@ -0,0 +1,22 @@
+"""
+This script defines a few often used cost functions.
+"""
+import theano
+import theano.tensor as T
+from tags import tags
+
+@tags('cost','binary','cross-entropy')
+def binary_crossentropy(output, target):
+    """ Compute the crossentropy of binary output wrt binary target.
+
+    .. math::
+        L_{CE} = -\\left( t \\log(o) + (1-t) \\log(1-o) \\right)
+
+    :type output: Theano variable
+    :param output: Binary output or prediction :math:`\\in[0,1]`
+    :type target: Theano variable
+    :param target: Binary target usually :math:`\\in\\{0,1\\}`
+    """
+    return -(target * T.log(output) + (1.0 - target) * T.log(1.0 - output))
+
+
diff -r 9686c0d9689d -r aea510b71386 pylearn/formulas/noise.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pylearn/formulas/noise.py	Fri Sep 17 12:01:32 2010 -0400
@@ -0,0 +1,56 @@
+"""
+
+This script defines the different symbolic noise functions.
+The noise contract is simple: noise_lvl is a symbolic variable going from 0 to 1.
+0: no change.
+1: maximum noise.
+"""
+import theano
+import theano.tensor as T
+from tags import tags
+s="""
+* A latex mathematical description of the formulas (for picture representation in generated documentation)
+* Tags (for searching):
+    * a list of lower level functions used
+    * category (name of the submodule itself)
+* Tell if we did some work to make it more numerically stable. Does Theano do the optimizations needed?
+* Tell if the gradient is numerically stable. Does Theano do the optimizations needed?
+* Tell if it works on gpu / doesn't / unknown
+* Tell alternate names
+* Tell the domain and range of the input/output (ranges should use the English notation for inclusive/exclusive bounds)
+"""
+
+@tags('noise','binomial','salt')
+def binomial_noise(theano_rng,inp,noise_lvl):
+    """ This adds binomial noise to inp (only the salt part of pepper and salt).
+
+    :type inp: Theano Variable
+    :param inp: The input to which we want to add noise
+    :type noise_lvl: float
+    :param noise_lvl: The fraction of noise, between 0 (no noise) and 1.
+    """
+    return theano_rng.binomial( size = inp.shape, n = 1, p = 1 - noise_lvl, dtype=theano.config.floatX) * inp
+
+
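+# Hypothetical usage of binomial_noise (RandomStreams is Theano's symbolic
+# random generator, as assumed by the noise contract above; this zeroes out
+# about 25% of the input entries):
+#
+#     from theano.tensor.shared_randomstreams import RandomStreams
+#     rng = RandomStreams(seed = 42)
+#     x = T.matrix('x')
+#     corrupt = theano.function([x], binomial_noise(rng, x, 0.25))
+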
+ """ + return theano_rng.binomial( size = inp.shape, n = 1, p = 1 - noise_lvl[0], dtype=theano.config.floatX) * inp \ + + (inp==0) * theano_rng.binomial( size = inp.shape, n = 1, p = noise_lvl[1], dtype=theano.config.floatX) + +@tags('noise','gauss','gaussian') +def gaussian_noise(theano_rng,inp,noise_lvl): + """ This add gaussian NLP noise to inp + + :type inp: Theano Variable + :param inp: The input that we want to add noise + :type noise_lvl: float + :param noise_lvl: The standard deviation of the gaussian. + """ + return theano_rng.normal( size = inp.shape, std = noise_lvl, dtype=theano.config.floatX) + inp diff -r 9686c0d9689d -r aea510b71386 pylearn/formulas/tags.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pylearn/formulas/tags.py Fri Sep 17 12:01:32 2010 -0400 @@ -0,0 +1,53 @@ + +from collections import defaultdict + +tags_db = defaultdict(set) + +def tags(*_tags): + tags = set() + def add_tag(tag): + if isinstance(tag, (list, tuple)): + map(add_tag, tag) + elif isinstance(tag, (str, unicode)): + for word in tag.split(" "): + tags.add(word) + tags.add(tag) + else: + raise TypeError("Tags should be strings or lists/tuples of strings. Got: %s, of type %s" % (tag, type(tag))) + map(add_tag, _tags) + tags = tuple(sorted(tags)) + def decorator(function): + function.tags = tags + function.__doc__ += "\n\nTags: %s" % ", ".join(tags) + for tag in tags: + tags_db[tag].add(function) + return function + return decorator + +def search(*tags): + return reduce(set.__and__, [tags_db[tag] for tag in tags]) + + +if __name__ == '__main__': + common_tags = ['c', 'd'] + + @tags(common_tags, 'a', 'b', 'long tag') + def f(a,b): + ''' function f returns a+b ''' + return a+b + + @tags(common_tags, 'x') + def g(a,b): + ''' function f returns a-b ''' + return a-b + + @tags('c', 'x', 'y', 'z') + def h(a,b): + ''' function f returns a*b ''' + return a*b + + + + print f.__doc__ + print [x.__name__ for x in search('c', 'd')] + print [x.__name__ for x in search('x')]