changeset 1118:8cc324f388ba

proposal for a plugin system
author Olivier Breuleux <breuleuo@iro.umontreal.ca>
date Tue, 14 Sep 2010 16:01:32 -0400
parents c1943feada10
children 81ea57c6716d
files doc/v2_planning/plugin.py doc/v2_planning/plugin.txt
diffstat 2 files changed, 395 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/v2_planning/plugin.py	Tue Sep 14 16:01:32 2010 -0400
@@ -0,0 +1,327 @@
+
+import time
+from collections import defaultdict
+
+inf = float('inf')
+
+################
+### SCHEDULE ###
+################
+
+class Schedule(object):
+    def __add__(self, i):
+        return OffsetSchedule(self, i)
+    def __or__(self, s):
+        return UnionSchedule(self, to_schedule(s))
+    def __and__(self, s):
+        return IntersectionSchedule(self, to_schedule(s))
+    def __sub__(self, i):
+        return OffsetSchedule(self, -i)
+    def __ror__(self, s):
+        return UnionSchedule(to_schedule(s), self)
+    def __rand__(self, s):
+        return IntersectionSchedule(to_schedule(s), self)
+    def __invert__(self):
+        return NegatedSchedule(self)
+
+def to_schedule(x):
+    if x in (None, False):
+        return never
+    if x is True:
+        return always
+    elif isinstance(x, (list, tuple)):
+        return reduce(UnionSchedule, x)
+    else:
+        return x
+
+
+class ScheduleMix(Schedule):
+    __n__ = None
+    def __init__(self, *subschedules):
+        assert (not self.__n__) or len(subschedules) == self.__n__
+        self.subschedules = map(to_schedule, subschedules)
+
+class UnionSchedule(ScheduleMix):
+    def __call__(self, t1, t2):
+        return any(s(t1, t2) for s in self.subschedules)
+
+class IntersectionSchedule(ScheduleMix):
+    def __call__(self, t1, t2):
+        return all(s(t1, t2) for s in self.subschedules)
+
+class DifferenceSchedule(ScheduleMix):
+    __n__ = 2
+    def __call__(self, t1, t2):
+        return self.subschedules[0](t1, t2) and not self.subschedules[1](t1, t2)
+
+class NegatedSchedule(ScheduleMix):
+    __n__ = 1
+    def __call__(self, t1, t2):
+        return not self.subschedules[0](t1, t2)
+
+class OffsetSchedule(Schedule):
+    def __init__(self, schedule, offset):
+        self.schedule = schedule
+        self.offset = offset
+    def __call__(self, t1, t2):
+        return self.schedule(t1 - self.offset, t2 - self.offset)
+
+
+class AlwaysSchedule(Schedule):
+    def __call__(self, t1, t2):
+        return True
+
+always = AlwaysSchedule()
+never = ~always
+
+class IntervalSchedule(Schedule):
+    def __init__(self, step, repeat = inf):
+        self.step = step
+        self.upper_bound = step * (repeat - 1)
+    def __call__(self, t1, t2):
+        if t2 < 0 or t1 > self.upper_bound:
+            return False
+        diff = t2 - t1
+        t1m = t1 % self.step
+        t2m = t2 % self.step
+        return (diff >= self.step
+                or t1m == 0
+                or t2m == 0
+                or t1m > t2m)
+
+each = lambda step, repeat = inf: each0(step, repeat) + step
+each0 = IntervalSchedule
+
+
+class RangeSchedule(Schedule):
+    def __init__(self, low = None, high = None):
+        self.low = low or -inf
+        self.high = high or inf
+    def __call__(self, t1, t2):
+        return self.low <= t1 <= self.high \
+            or self.low <= t2 <= self.high
+
+inrange = RangeSchedule    
+
+
+class ListSchedule(Schedule):
+    def __init__(self, *schedules):
+        self.schedules = schedules
+    def __call__(self, t1, t2):
+        for t in self.schedules:
+            if t1 <= t <= t2:
+                return True
+        return False
+
+at = ListSchedule
+at_start = at(-inf)
+at_end = at(inf)
+
+
+##############
+### RUNNER ###
+##############
+
+class scratchpad:
+    pass
+
+# # ORIGINAL RUNNER, NO TIMELINES
+# def runner(master, plugins):
+#     """
+#     master is a function which is in charge of the "this" object.  It
+#         is in charge of updating the t1, t2 and done fields, It must
+#         take a single argument, this.
+
+#     plugins is a list of (schedule, function) pairs. In-between each
+#         execution of the master function, as well as at the very
+#         beginning and at the very end, the schedule will be consulted
+#         for the time range [t1, t2], and if there is a match, the
+#         function will be called with this as the argument. The order
+#         in which the functions are provided is respected.
+
+#     Note: the reason why we use t1 and t2 instead of just t is that it
+#     gives the master function the ability to run several iterations at
+#     once without consulting any plugins. In that situation, t1 and t2
+#     represent a range, and the schedule must determine if there would
+#     have been an event in that range (we do not distinguish between a
+#     single event and multiple events).
+
+#     For instance, if one is training using minibatches, one could set
+#     t1 and t2 to the index of the lower and higher examples, and the
+#     plugins' schedules would be given according to how many examples
+#     were seen rather than how many minibatches were processed.
+
+#     Another possibility is to use real time - t1 would be the time
+#     before the execution of the master function, t2 the time after
+#     (in, say, milliseconds). Then you can define plugins that run
+#     every second or every minute, but only in-between two training
+#     iterations.
+#     """
+
+#     this = scratchpad()
+#     this.t1 = -inf
+#     this.t2 = -inf
+#     this.started = False
+#     this.done = False
+#     while True:
+#         for schedule, function in plugins:
+#             if schedule(this.t1, this.t2):
+#                 function(this)
+#                 if this.done:
+#                     break
+#         master(this)
+#         this.started = True
+#         if this.done:
+#             break
+#     this.t1 = inf
+#     this.t2 = inf
+#     for schedule, function in plugins:
+#         if schedule(this.t1, this.t2):
+#             function(this)
+
+
+
+
+def runner(main, plugins):
+    """
+    :param main: A function which must take a single argument,
+        ``this``. The ``this`` argument contains a settable ``done``
+        flag indicating whether the iterations should keep going or
+        not, as well as a flag indicating whether this is the first
+        time runner() is calling main(). main() may store whatever it
+        wants in ``this``. It may also add one or more timelines in
+        ``this.timelines[timeline_name]``, which plugins can exploit.
+
+    :param plugins: A list of (schedule, timeline, function)
+        tuples. In-between each execution of the main function, as
+        well as at the very beginning and at the very end, the
+        schedule will be consulted for the time range [t1, t2] from
+        the appropriate timeline, and if there is a match, the
+        function will be called with ``this`` as the argument. The
+        order in which the functions are provided is respected.
+
+        For any plugin, the timeline can be
+        * 'iterations', where t1 == t2 == the iteration number
+        * 'real_time', where t1 and t2 mark the start of the last
+          loop and the start of the current loop, in seconds since
+          the beginning of training (includes time spent in plugins)
+        * 'algorithm_time', where t1 and t2 mark the start and end
+          of the last iteration of the main function (does not
+          include time spent in plugins)
+        * A main function specific timeline.
+
+        At the very beginning, the time for all timelines is
+        -infinity, at the very end it is +infinity.
+    """
+    start_time = time.time()
+
+    this = scratchpad()
+
+    this.timelines = defaultdict(lambda: [-inf, -inf])
+    realt = this.timelines['real_time']
+    algot = this.timelines['algorithm_time']
+    itert = this.timelines['iterations']
+
+    this.started = False
+    this.done = False
+
+    while True:
+
+        for schedule, timeline, function in plugins:
+            if schedule(*this.timelines[timeline]):
+                function(this)
+                if this.done:
+                    break
+
+        t1 = time.time()
+        main(this)
+        t2 = time.time()
+
+        if not this.started:
+            realt[:] = [0, 0]
+            algot[:] = [0, 0]
+            itert[:] = [-1, -1]
+        realt[:] = [realt[1], t2 - start_time]
+        algot[:] = [algot[1], algot[1] + (t2 - t1)]
+        itert[:] = [itert[0] + 1, itert[1] + 1]
+
+        this.started = True
+        if this.done:
+            break
+
+    this.timelines = defaultdict(lambda: [inf, inf])
+
+    for schedule, timeline, function in plugins:
+        if schedule(*this.timelines[timeline]):
+            function(this)
+
+
+
+
+
+################
+### SHOWCASE ###
+################
+
+def main(this):
+    if not this.started:
+        this.error = 1.0
+        # note: runner will automatically set this.started to true
+    else:
+        this.error /= 1.1
+
+
+def welcome(this):
+    print "Let's start!"
+
+def print_iter(this):
+    print "Now running iteration #%i" % this.timelines['iterations'][0]
+
+def print_error(this):
+    print "The error rate is %s" % this.error
+
+def maybe_stop(this):
+    thr = 0.01
+    if this.error < thr:
+        print "Error is below the threshold: %s <= %s" % (this.error, thr)
+        this.done = True
+
+def wait_a_bit(this):
+    time.sleep(1./37)
+
+def printer(txt):
+    def f(this):
+        print txt
+    return f
+
+def stop_this_madness(this):
+    this.done = True
+
+def byebye(this):
+    print "Bye bye!"
+
+runner(main = main,
+       plugins = [# At the very beginning, print a welcome message
+                  (at_start, 'iterations', welcome),
+                  # Each iteration from 1 to 10 inclusive, OR each multiple of 10
+                  # (except 0 - each() excludes 0, each0() includes it)
+                  # print the error
+                  (inrange(1, 10) | each(10), 'iterations',  print_error),
+                  # Each multiple of 10, check for stopping condition
+                  (each(10), 'iterations',  maybe_stop),
+                  # At iteration 1000, if we ever get that far, just stop
+                  (at(1000), 'iterations',  stop_this_madness),
+                  # Wait a bit
+                  (each(1), 'iterations',  wait_a_bit),
+                  # Print bonk each second of real time
+                  (each(1), 'real_time',  printer('BONK')),
+                  # Print thunk each second of time in main() (main()
+                  # is too fast, so this does not happen for many
+                  # iterations)
+                  (each(1), 'algorithm_time',  printer('THUNK')),
+                  # Announce the next iteration
+                  (each0(1), 'iterations',  print_iter),
+                  # At the very end, display a message
+                  (at_end, 'iterations',  byebye)])
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/v2_planning/plugin.txt	Tue Sep 14 16:01:32 2010 -0400
@@ -0,0 +1,68 @@
+
+======================================
+Plugin system for iterative algorithms
+======================================
+
+I would like to propose a plugin system for iterative algorithms in
+Pylearn. Basically, it would be useful to be able to sandwich
+arbitrary behavior in-between two training iterations of an algorithm
+(whenever applicable). I believe many mechanisms are best implemented
+this way: early stopping, saving checkpoints, tracking statistics,
+real time visualization, remote control of the process, or even
+interlacing the training of several models and making them interact
+with each other.
+
+So here is the proposal: essentially, a plugin would be a (schedule,
+timeline, function) tuple.
+
+Schedule
+========
+
+The schedule is some function that takes two "times", t1 and t2, and
+returns True if the plugin should be run in-between these times. The
+reason why we check a time range [t1, t2] rather than some discrete
+time t is that we do not necessarily want to schedule plugins on
+iteration numbers. For instance, we could want to run a plugin every
+second, or every minute, and then [t1, t2] would be the start time and
+end time of the last iteration - and then we run the plugin whenever a
+new second started in that range (but still on training iteration
+boundaries). Alternatively, we could want to run a plugin every n
+examples seen - but if we use mini-batches, the nth example might be
+square in the middle of a batch.
+
+I've implemented a somewhat elaborate schedule system. `each(10)`
+produces a schedule that returns true whenever a multiple of 10 is in
+the time range. `at(17, 153)` produces one that returns true when 17
+or 143 is in the time range. Schedules can be combined and negated,
+e.g. `each(10) & ~at(20, 30)` (execute at each 10, except at 20 and
+30). So that gives a lot of flexibility as to when you want to do
+things.
+
+Timeline
+========
+
+This would be a string indicating on what "timeline" the schedule is
+supposed to operate. For instance, there could be a "real time"
+timeline, an "algorithm time" timeline, an "iterations" timeline, a
+"number of examples" timeline, and so on. This means you can schedule
+some action to be executed every actual second, or every second of
+training time (ignoring time spent executing plugins), or every
+discrete iteration, or every n examples processed. This might be a
+bloat feature (it was an afterthought to my original design, anyway),
+but I think that there are circumstances where each of these options
+is the best one.
+
+Function
+========
+
+The plugin function would receive some object containing the time
+range, a flag indicating whether the training has started, a flag
+indicating whether the training is done (which they can set in order
+to stop training), as well as anything pertinent about the model.
+
+Implementation
+==============
+
+I have implemented the feature in plugin.py, in this directory. Simply
+run python plugin.py to test it.
+