diff doc/v2_planning/arch_src/plugin_JB.py @ 1212:478bb1f8215c

plugin_JB - added SPAWN control element and demo program
author James Bergstra <bergstrj@iro.umontreal.ca>
date Wed, 22 Sep 2010 01:37:55 -0400
parents
children 9fac28d80fb7
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/v2_planning/arch_src/plugin_JB.py	Wed Sep 22 01:37:55 2010 -0400
@@ -0,0 +1,366 @@
+"""plugin_JB - draft of potential library architecture using iterators
+
+This strategy makes use of a simple imperative language whose statements are python function
+calls to create learning algorithms that can be manipulated and executed in several desirable
+ways.  
+
+The training procedure for a PCA module is easy to express:
+
+    # allocate the relevant modules
+    dataset = Dataset(numpy.random.RandomState(123).randn(13,1))
+    pca = PCA_Analysis()
+    pca_batchsize=1000
+
+    # define the control-flow of the algorithm
+    train_pca = SEQ([
+        BUFFER_REPEAT(pca_batchsize, CALL(dataset.next)), 
+        FILT(pca.analyze)])
+
+    # run the program
+    train_pca.run()
+
+The CALL, SEQ, FILT, and BUFFER_REPEAT are control-flow elements. The control-flow elements I
+defined so far are:
+
+- CALL - a basic statement, just calls a python function
+- FILT - like CALL, but passes the return value of the last CALL or FILT to the python function
+- SEQ - a sequence of elements to run in order
+- REPEAT - do something N times (and return None or maybe the last CALL?)
+- BUFFER_REPEAT - do something N times and accumulate the return value from each iter
+- LOOP - do something an infinite number of times
+- CHOOSE - like a switch statement (should rename to SWITCH)
+- WEAVE - interleave execution of multiple control-flow elements
+- POPEN - launch a process and return its status when it's complete
+- PRINT - a shortcut for CALL(print_obj, obj)
+- SPAWN - run a program in a child python process and merge back its updated data dict
+
+
+We don't have many requirements per-se for the architecture, but I think this design respects
+and realizes all of them.
+The advantages of this approach are:
+
+    - algorithms (including partially run ones) are COPYABLE, and SERIALIZABLE
+
+    - algorithms can be executed without seizing control of the python process (the run()
+      method does this, but if you look inside it you'll see it's a simple for loop)
+
+      - it is easy to execute an algorithm step by step in a main loop that also checks for
+        network or filesystem events related to e.g. job management.
+
+    - the library can provide learning algorithms via control-flow templates, and the user can
+      edit them (with search/replace calls) to include HOOKS, and DIAGNOSTIC plug-in
+      functionality
+
+      e.g. prog.find(CALL(cd1_update, layer=layer1)).replace_with(
+          SEQ([CALL(cd1_update, layer=layer1), CALL(my_debugfn)]))
+
+    - user can print the 'program code' of an algorithm built from library pieces
+
+    - program can be optimized automatically.
+      
+      - e.g. BUFFER_REPEAT(N, CALL(dataset.next))  could be replaced if dataset.next
+        implements the right attribute/protocol for 'bufferable' or something.
+
+      - e.g. SEQ([a,b,c,d])  could be compiled to a single CALL to a Theano-compiled function
+        if a, b, c, and d are calls to callable objects that export something like a
+        'theano_SEQ' interface
+
+
+"""
+
+__license__ = 'TODO'
+__copyright__ = 'TODO'
+
+import cPickle, copy, os, subprocess, sys, time
+import numpy
+
+####################################################
+# CONTROL-FLOW CONSTRUCTS
+
+class INCOMPLETE: 
+    """
+    Sentinel return value for ELEMENT.step.
+
+    The class object itself (never an instance) is returned by `step` to signal that the
+    element has more work to do; drivers test for it by identity (`r is INCOMPLETE`).  Any
+    other return value, including None, means the element has completed.
+    """
+
+class ELEMENT(object):
+    """
+    Base class for control flow elements (e.g. CALL, REPEAT, etc.)
+
+    The design is that every element has a driver, that is another element, or the iterator
+    implementation in the ELEMENT class.
+
+    the driver calls start when entering a new control element
+       - this would be called once per e.g. outer loop iteration
+
+    the driver calls step to advance the control element
+       - which returns INCOMPLETE
+       - which returns any other object to indicate completion
+    """
+
+    # subclasses should override these methods:
+    def start(self, arg):
+        pass
+    def step(self):
+        pass
+
+    # subclasses should typically not override these:
+    def run(self, arg=None, n_steps=float('inf')):
+        self.start(arg)
+        i = 0
+        r = self.step()
+        while r is INCOMPLETE:
+            i += 1
+            #TODO make sure there is not an off-by-one error
+            if i > n_steps:
+                break
+            r = self.step()
+        return r
+
+class BUFFER_REPEAT(ELEMENT):
+    """
+    Accumulate a number of return values into one list / array.
+
+    The source of return values `src` is a control element that will be restarted repeatedly in
+    order to fulfil the requiement of gathering N samples.
+
+    TODO: support accumulating of tuples of arrays
+    """
+    def __init__(self, N, src, storage=None):
+        """
+        TODO: use preallocated `storage`
+        """
+        self.N = N
+        self.n = 0
+        self.src = src
+        self.storage = storage
+        self.src.start(None)
+        if self.storage != None:
+            raise NotImplementedError()
+    def start(self, arg):
+        self.buf = [None] * self.N
+        self.n = 0
+        self.finished = False
+    def step(self):
+        assert not self.finished
+        r = self.src.step()
+        if r is INCOMPLETE:
+            return r
+        self.src.start(None) # restart our stream
+        self.buf[self.n] = r
+        self.n += 1
+        if self.n == self.N:
+            self.finished = True
+            return self.buf
+        else:
+            return INCOMPLETE
+        assert 0
+
+class CALL(ELEMENT):
+    """
+    Control flow terminal - call a python function or method.
+
+    Returns the return value of the call.
+
+    The element completes in a single step.  If the keyword `use_start_arg=True` is given to
+    the constructor, the value passed to `start` is used as the first and only positional
+    argument of the call; otherwise the positional arguments given to the constructor are used.
+    """
+    def __init__(self, fn, *args, **kwargs):
+        """
+        :param fn: the callable to invoke in `step`.
+        :param args: positional arguments for `fn` (must be empty when use_start_arg is True).
+        :param kwargs: keyword arguments for `fn`, except `use_start_arg` which is consumed here.
+        """
+        self.fn = fn
+        self.args = args
+        self.kwargs=kwargs
+        # self.kwargs and kwargs are the same dict, so this pop also removes 'use_start_arg'
+        # from the keyword arguments that will be forwarded to fn.
+        self.use_start_arg = kwargs.pop('use_start_arg', False)
+    def start(self, arg):
+        """Remember `arg` for the upcoming call and re-arm the element.  Returns self."""
+        self.start_arg = arg
+        self.finished = False
+        return self
+    def step(self):
+        """
+        Perform the call and return its return value.
+
+        Raises TypeError if use_start_arg is set and positional args were also supplied.
+        """
+        assert not self.finished
+        # marked finished before the call, so it stays set even if fn raises
+        self.finished = True
+        if self.use_start_arg:
+            if self.args:
+                raise TypeError('cant get positional args both ways')
+            return self.fn(self.start_arg, **self.kwargs)
+        else:
+            return self.fn(*self.args, **self.kwargs)
+    def __getstate__(self):
+        """
+        Return the instance dict for pickling.
+
+        When `fn` is a bound method it is replaced by its (im_func, im_self, im_class)
+        triple under the key 'i fn' (presumably because Python 2 cannot pickle bound
+        methods directly - confirm).
+        """
+        rval = dict(self.__dict__)
+        if type(self.fn) is type(self.step): #instancemethod
+            fn = rval.pop('fn')
+            rval['i fn'] = fn.im_func, fn.im_self, fn.im_class
+        return rval
+    def __setstate__(self, dct):
+        """Restore state, rebuilding the bound method from the 'i fn' triple if present."""
+        if 'i fn' in dct:
+            dct['fn'] = type(self.step)(*dct.pop('i fn'))
+        self.__dict__.update(dct)
+
+def FILT(fn, **kwargs):
+    """
+    Return a CALL object that uses the return value from the previous CALL as the first and
+    only positional argument.
+
+    This is CALL(fn, use_start_arg=True, ...): the value handed to the element's `start` is
+    what gets passed to `fn`.  Extra keyword arguments are forwarded to `fn`.
+    """
+    return CALL(fn, use_start_arg=True, **kwargs)
+
+def CHOOSE(which, options):
+    """
+    Execute one out of a number of optional control flow paths
+
+    Not implemented yet: always raises NotImplementedError; `which` and `options` are
+    currently unused.
+    """
+    raise NotImplementedError()
+
+def LOOP(elements):
+    #TODO: implement a true infinite loop
+    try:
+        iter(elements)
+        return REPEAT(sys.maxint, elements)
+    except TypeError:
+        return REPEAT(sys.maxint, [elements])
+
+class REPEAT(ELEMENT):
+    """
+    Run a list of elements in order, N times over.
+
+    The completion value of each element is passed to `start` of the next one; at the end
+    of a pass, the value of the last element is passed to `start` of the first element for
+    the next pass.  The completion value of the whole REPEAT is that of the last element
+    of the last pass.
+
+    `elements` is indexed by position and element 0 is started unconditionally, so it must
+    be a non-empty sequence.  The pass counter is compared to N only after a pass has
+    finished, so the elements run once even if N < 1.
+
+    NOTE: `pass_rvals` is stored but not read anywhere in this class.
+    """
+    def __init__(self, N, elements, pass_rvals=False):
+        self.N = N
+        self.elements = elements
+        self.pass_rvals = pass_rvals
+
+    #TODO: check for N being callable
+    def start(self, arg):
+        """Reset the counters and start the first element with `arg`."""
+        self.n = 0   #loop iteration
+        self.idx = 0 #element idx
+        self.finished = False
+        self.elements[0].start(arg)
+    def step(self):
+        """
+        Step the current element once.
+
+        Returns INCOMPLETE until the last element of the last pass completes, then returns
+        that element's completion value.
+        """
+        assert not self.finished
+        r = self.elements[self.idx].step()
+        if r is INCOMPLETE:
+            return INCOMPLETE
+        # current element completed: hand its value to the next element of this pass
+        self.idx += 1
+        if self.idx < len(self.elements):
+            self.elements[self.idx].start(r)
+            return INCOMPLETE
+        # end of a pass: either wrap around to element 0 or finish
+        self.n += 1
+        if self.n < self.N:
+            self.idx = 0
+            self.elements[self.idx].start(r)
+            return INCOMPLETE
+        else:
+            self.finished = True
+            return r
+
+def SEQ(elements):
+    """Return an element that runs `elements` in order, once (a REPEAT with N=1)."""
+    return REPEAT(1, elements)
+
+class WEAVE(ELEMENT):
+    """
+    Interleave execution of a number of elements.
+
+    TODO: allow a schedule (at least relative frequency) of elements from each program
+    """
+    def __init__(self, n_required, elements):
+        self.elements = elements
+        if n_required == -1:
+            self.n_required = len(elements)
+        else:
+            self.n_required = n_required
+    def start(self, arg):
+        for el in self.elements:
+            el.start(arg)
+        self.elem_finished = [0] * len(self.elements)
+        self.idx = 0
+        self.finished= False 
+    def step(self):
+        assert not self.finished # if this is triggered, we have a broken driver
+
+        #start with this check in case there were no elements
+        # it's possible for the number of finished elements to exceed the threshold
+        if sum(self.elem_finished) >= self.n_required:
+            self.finished = True
+            return None
+
+        # step the active element
+        r = self.elements[self.idx].step()
+
+        if r is not INCOMPLETE:
+            self.elem_finished[self.idx] = True
+
+            # check for completion
+            if sum(self.elem_finished) >= self.n_required:
+                self.finished = True
+                return None
+
+        # advance to the next un-finished element
+        self.idx = (self.idx+1) % len(self.elements)
+        while self.elem_finished[self.idx]:
+            self.idx = (self.idx+1) % len(self.elements)
+
+        return INCOMPLETE
+
+class POPEN(ELEMENT):
+    def __init__(self, args):
+        self.args = args
+    def start(self, arg):
+        self.p = subprocess.Popen(self.args)
+    def step(self):
+        r = self.p.poll() 
+        if r is None:
+            return INCOMPLETE
+        return r
+
+def PRINT(obj):
+    """Return a CALL element that prints `obj` (via print_obj) when it is stepped."""
+    return CALL(print_obj, obj)
+
+class SPAWN(ELEMENT):
+    SUCCESS = 0
+    def __init__(self, data, prog):
+        self.data = data
+        self.prog = prog
+    def start(self, arg):
+        # pickle the (data, prog) pair
+        s = cPickle.dumps((self.data, self.prog))
+
+        # call python with a stub function that
+        # unpickles the data, prog pair and starts running the prog
+        self.rpipe, wpipe = os.pipe()
+        code = 'import sys, plugin_JB; sys.exit(plugin_JB.SPAWN._main(%i))'%wpipe
+        self.p = subprocess.Popen(
+                ['python', '-c', code], 
+                stdin=subprocess.PIPE)
+        # send the data and prog to the other process
+        self.p.stdin.write(s)
+        self.finished= False
+
+        #TODO: send over tgz of the modules this code needs
+
+        #TODO: When the client process is on a different machine, negotiate with the client
+        # process to determine which modules it needs, and send over the code for pure python
+        # ones.  Make sure versions match for non-pure python ones.
+
+    def step(self):
+        assert not self.finished
+        r = self.p.poll() 
+        if r is None:
+            return INCOMPLETE    # typical exit case
+        self.finished = True
+        if r != self.SUCCESS:
+            print "UH OH", r # TODO - ???
+        rfile = os.fdopen(self.rpipe)
+        # recv the revised of the data dictionary
+        data = cPickle.load(rfile)
+        # modify the data dict in-place
+        # for new values to be visible to other components
+        self.data.update(data)
+        rfile.close()
+        #TODO: return something meaningful? like r?
+        return None
+
+    @staticmethod
+    def _main(wpipe):
+        #TODO: unpack and install tgz of the modules this code needs
+        data, prog = cPickle.load(sys.stdin)
+        rval = prog.run()
+        os.write(wpipe, cPickle.dumps(data))
+        return SPAWN.SUCCESS
+        #os.close(wpipe)
+
+
+def print_obj(obj):
+    """Print `obj` to stdout (module-level function, usable as a CALL target)."""
+    print obj
+def print_obj_attr(obj, attr):
+    """Print the attribute named `attr` of `obj`, looked up at call time."""
+    print getattr(obj, attr)
+def no_op(*args, **kwargs):
+    """Accept any arguments, do nothing, and return None."""
+    pass
+
+def importable_fn(d):
+    """Set d['new key'] to the number of entries `d` had before the assignment."""
+    d['new key'] = len(d)
+