Mercurial > pylearn

"""plugin_JB - draft of potential library architecture using iterators

This strategy makes use of a simple imperative language whose statements are python function
calls to create learning algorithms that can be manipulated and executed in several desirable
ways.

The training procedure for a PCA module is easy to express:

    # allocate the relevant modules
    dataset = Dataset(numpy.random.RandomState(123).randn(13,1))
    pca = PCA_Analysis()
    pca_batchsize=1000

    # define the control-flow of the algorithm
    train_pca = SEQ([
        BUFFER_REPEAT(pca_batchsize, CALL(dataset.next)),
        FILT(pca.analyze)])

    # run the program
    train_pca.run()

The CALL, SEQ, FILT, and BUFFER_REPEAT are control-flow elements. The control-flow elements I
defined so far are:

- CALL - a basic statement, just calls a python function
- FILT - like call, but passes the return value of the last CALL or FILT to the python function
- SEQ - a sequence of elements to run in order
- REPEAT - do something N times (and return None or maybe the last CALL?)
- BUFFER_REPEAT - do something N times and accumulate the return value from each iter
- LOOP - do something an infinite number of times
- CHOOSE - like a switch statement (should rename to SWITCH)
- WEAVE - interleave execution of multiple control-flow elements
- POPEN - launch a process and return its status when it's complete
- PRINT - a shortcut for CALL(print_obj)


We don't have many requirements per-se for the architecture, but I think this design respects
and realizes all of them.
The advantages of this approach are:

    - algorithms (including partially run ones) are COPYABLE, and SERIALIZABLE

    - algorithms can be executed without seizing control of the python process (the run()
      method does this, but if you look inside it you'll see it's a simple for loop)

      - it is easy to execute an algorithm step by step in a main loop that also checks for
        network or filesystem events related to e.g. job management.

    - the library can provide learning algorithms via control-flow templates, and the user can
      edit them (with search/replace calls) to include HOOKS, and DIAGNOSTIC plug-in
      functionality

      e.g. prog.find(CALL(cd1_update, layer=layer1)).replace_with(
          SEQ([CALL(cd1_update, layer=layer1), CALL(my_debugfn)]))

    - user can print the 'program code' of an algorithm built from library pieces

    - program can be optimized automatically.

      - e.g. BUFFER(N, CALL(dataset.next))  could be replaced if dataset.next implements the
        right attribute/protocol for 'bufferable' or something.

      - e.g. SEQ([a,b,c,d])  could be compiled to a single CALL to a Theano-compiled function
        if a, b, c, and d are calls to callable objects that export something like a
        'theano_SEQ' interface


"""

__license__ = 'TODO'
__copyright__ = 'TODO'

import cPickle, copy, os, subprocess, sys, time
import numpy

####################################################
# CONTROL-FLOW CONSTRUCTS

class INCOMPLETE:
    """Return value for Element.step"""

class ELEMENT(object):
    """
    Base class for control flow elements (e.g. CALL, REPEAT, etc.)

    The design is that every element has a driver, that is another element, or the iterator
    implementation in the ELEMENT class.

    the driver calls start when entering a new control element
       - this would be called once per e.g. outer loop iteration

    the driver calls step to advance the control element
       - which returns INCOMPLETE
       - which returns any other object to indicate completion
    """

    # subclasses should override these methods:
    def start(self, arg):
        pass
    def step(self):
        pass

    # subclasses should typically not override these:
    def run(self, arg=None, n_steps=float('inf')):
        self.start(arg)
        i = 0
        r = self.step()
        while r is INCOMPLETE:
            i += 1
            #TODO make sure there is not an off-by-one error
            if i > n_steps:
                break
            r = self.step()
        return r

class BUFFER_REPEAT(ELEMENT):
    """
    Accumulate a number of return values into one list / array.

    The source of return values `src` is a control element that will be restarted repeatedly in
    order to fulfil the requiement of gathering N samples.

    TODO: support accumulating of tuples of arrays
    """
    def __init__(self, N, src, storage=None):
        """
        TODO: use preallocated `storage`
        """
        self.N = N
        self.n = 0
        self.src = src
        self.storage = storage
        self.src.start(None)
        if self.storage != None:
            raise NotImplementedError()
    def start(self, arg):
        self.buf = [None] * self.N
        self.n = 0
        self.finished = False
    def step(self):
        assert not self.finished
        r = self.src.step()
        if r is INCOMPLETE:
            return r
        self.src.start(None) # restart our stream
        self.buf[self.n] = r
        self.n += 1
        if self.n == self.N:
            self.finished = True
            return self.buf
        else:
            return INCOMPLETE
        assert 0

class CALL(ELEMENT):
    """
    Control flow terminal - call a python function or method.

    Returns the return value of the call.
    """
    def __init__(self, fn, *args, **kwargs):
        self.fn = fn
        self.args = args
        self.kwargs=kwargs
        self.use_start_arg = kwargs.pop('use_start_arg', False)
    def start(self, arg):
        self.start_arg = arg
        self.finished = False
        return self
    def step(self):
        assert not self.finished
        self.finished = True
        if self.use_start_arg:
            if self.args:
                raise TypeError('cant get positional args both ways')
            return self.fn(self.start_arg, **self.kwargs)
        else:
            return self.fn(*self.args, **self.kwargs)
    def __getstate__(self):
        rval = dict(self.__dict__)
        if type(self.fn) is type(self.step): #instancemethod
            fn = rval.pop('fn')
            rval['i fn'] = fn.im_func, fn.im_self, fn.im_class
        return rval
    def __setstate__(self, dct):
        if 'i fn' in dct:
            dct['fn'] = type(self.step)(*dct.pop('i fn'))
        self.__dict__.update(dct)

def FILT(fn, **kwargs):
    """
    Return a CALL object that uses the return value from the previous CALL as the first and
    only positional argument.
    """
    return CALL(fn, use_start_arg=True, **kwargs)

def CHOOSE(which, options):
    """
    Execute one out of a number of optional control flow paths
    """
    raise NotImplementedError()

def LOOP(elements):
    #TODO: implement a true infinite loop
    try:
        iter(elements)
        return REPEAT(sys.maxint, elements)
    except TypeError:
        return REPEAT(sys.maxint, [elements])

class REPEAT(ELEMENT):
    def __init__(self, N, elements, pass_rvals=False):
        self.N = N
        self.elements = elements
        self.pass_rvals = pass_rvals

    #TODO: check for N being callable
    def start(self, arg):
        self.n = 0   #loop iteration
        self.idx = 0 #element idx
        self.finished = False
        self.elements[0].start(arg)
    def step(self):
        assert not self.finished
        r = self.elements[self.idx].step()
        if r is INCOMPLETE:
            return INCOMPLETE
        self.idx += 1
        if self.idx < len(self.elements):
            self.elements[self.idx].start(r)
            return INCOMPLETE
        self.n += 1
        if self.n < self.N:
            self.idx = 0
            self.elements[self.idx].start(r)
            return INCOMPLETE
        else:
            self.finished = True
            return r

def SEQ(elements):
    return REPEAT(1, elements)

class WEAVE(ELEMENT):
    """
    Interleave execution of a number of elements.

    TODO: allow a schedule (at least relative frequency) of elements from each program
    """
    def __init__(self, n_required, elements):
        self.elements = elements
        if n_required == -1:
            self.n_required = len(elements)
        else:
            self.n_required = n_required
    def start(self, arg):
        for el in self.elements:
            el.start(arg)
        self.elem_finished = [0] * len(self.elements)
        self.idx = 0
        self.finished= False
    def step(self):
        assert not self.finished # if this is triggered, we have a broken driver

        #start with this check in case there were no elements
        # it's possible for the number of finished elements to exceed the threshold
        if sum(self.elem_finished) >= self.n_required:
            self.finished = True
            return None

        # step the active element
        r = self.elements[self.idx].step()

        if r is not INCOMPLETE:
            self.elem_finished[self.idx] = True

            # check for completion
            if sum(self.elem_finished) >= self.n_required:
                self.finished = True
                return None

        # advance to the next un-finished element
        self.idx = (self.idx+1) % len(self.elements)
        while self.elem_finished[self.idx]:
            self.idx = (self.idx+1) % len(self.elements)

        return INCOMPLETE

class POPEN(ELEMENT):
    def __init__(self, args):
        self.args = args
    def start(self, arg):
        self.p = subprocess.Popen(self.args)
    def step(self):
        r = self.p.poll()
        if r is None:
            return INCOMPLETE
        return r

def PRINT(obj):
    return CALL(print_obj, obj)

class SPAWN(ELEMENT):
    SUCCESS = 0
    def __init__(self, data, prog):
        self.data = data
        self.prog = prog
    def start(self, arg):
        # pickle the (data, prog) pair
        s = cPickle.dumps((self.data, self.prog))

        # call python with a stub function that
        # unpickles the data, prog pair and starts running the prog
        self.rpipe, wpipe = os.pipe()
        code = 'import sys, plugin_JB; sys.exit(plugin_JB.SPAWN._main(%i))'%wpipe
        self.p = subprocess.Popen(
                ['python', '-c', code],
                stdin=subprocess.PIPE)
        # send the data and prog to the other process
        self.p.stdin.write(s)
        self.finished= False

        #TODO: send over tgz of the modules this code needs

        #TODO: When the client process is on a different machine, negotiate with the client
        # process to determine which modules it needs, and send over the code for pure python
        # ones.  Make sure versions match for non-pure python ones.

    def step(self):
        assert not self.finished
        r = self.p.poll()
        if r is None:
            return INCOMPLETE    # typical exit case
        self.finished = True
        if r != self.SUCCESS:
            print "UH OH", r # TODO - ???
        rfile = os.fdopen(self.rpipe)
        # recv the revised of the data dictionary
        data = cPickle.load(rfile)
        # modify the data dict in-place
        # for new values to be visible to other components
        self.data.update(data)
        rfile.close()
        #TODO: return something meaningful? like r?
        return None

    @staticmethod
    def _main(wpipe):
        #TODO: unpack and install tgz of the modules this code needs
        data, prog = cPickle.load(sys.stdin)
        rval = prog.run()
        os.write(wpipe, cPickle.dumps(data))
        return SPAWN.SUCCESS
        #os.close(wpipe)


def print_obj(obj):
    print obj
def print_obj_attr(obj, attr):
    print getattr(obj, attr)
def no_op(*args, **kwargs):
    pass

def importable_fn(d):
    d['new key'] = len(d)
author	boulanni <nicolas_boulanger@hotmail.com>
date	Wed, 22 Sep 2010 12:49:41 -0400
parents	478bb1f8215c
children	9fac28d80fb7