# HG changeset patch
# User James Bergstra
# Date 1284996503 14400
# Node ID acfd5e747a7521215aa5e30c279016c12734a175
# Parent 98954d8cb92df88c43ec537896ec304afb88cc98
v2planning - a few changes to plugin proposals

diff -r 98954d8cb92d -r acfd5e747a75 doc/v2_planning/plugin_JB.py
--- a/doc/v2_planning/plugin_JB.py	Mon Sep 20 02:56:11 2010 -0400
+++ b/doc/v2_planning/plugin_JB.py	Mon Sep 20 11:28:23 2010 -0400
@@ -1,46 +1,74 @@
-"""plugin_JB - draft of library architecture using iterators"""
+"""plugin_JB - draft of potential library architecture using iterators
+
+This strategy makes use of a simple imperative language, whose statements are python function
+calls, to create learning algorithms that can be manipulated and executed in several desirable
+ways.
+
+The training procedure for a PCA module is easy to express:
+
+    # allocate the relevant modules
+    dataset = Dataset(numpy.random.RandomState(123).randn(13,1))
+    pca = PCA_Analysis()
+    pca_batchsize = 1000
+
+    # define the control-flow of the algorithm
+    train_pca = SEQ([
+        BUFFER_REPEAT(pca_batchsize, CALL(dataset.next)),
+        FILT(pca.analyze)])
+
+    # run the program
+    VirtualMachine(train_pca).run()
+
+CALL, SEQ, FILT, and BUFFER_REPEAT are control-flow elements.  The control-flow elements I
+have defined so far are:
+
+- CALL - a basic statement that just calls a python function
+- FILT - like CALL, but passes the return value of the last CALL or FILT to the python function
+- SEQ - a sequence of elements to run in order
+- REPEAT - do something N times (and return None, or maybe the last CALL?)
+- BUFFER_REPEAT - do something N times and accumulate the return value from each iteration
+- LOOP - do something an infinite number of times
+- CHOOSE - like a switch statement (should rename to SWITCH)
+- WEAVE - interleave execution of multiple control-flow elements
+
+
+We don't have many requirements per se for the architecture, but I think this design respects
+and realizes all of them.
+The advantages of this approach are:
+
+    - algorithms (including partially run ones) are COPYABLE and SERIALIZABLE
+
+    - algorithms can be executed without seizing control of the python process (the VM is an
+      iterator), so your main loop (aka alternate VM implementation) can be checking for network
+      or filesystem events related to job management
+
+    - the library can provide learning algorithms via control-flow templates, and the user can
+      edit them (with search/replace calls) to include HOOKS and DIAGNOSTIC plug-in
+      functionality
+
+      e.g. prog.find(CALL(cd1_update, layer=layer1)).replace_with(
+          SEQ([CALL(cd1_update, layer=layer1), CALL(my_debugfn)]))
+
+    - the user can print the 'program code' of an algorithm built from library pieces
+
+    - the program can be optimized automatically
+
+      - e.g. BUFFER(N, CALL(dataset.next)) could be replaced if dataset.next implements the
+        right attribute/protocol for 'bufferable' or something
+
+      - e.g. SEQ([a,b,c,d]) could be compiled to a single CALL to a Theano-compiled function
+        if a, b, c, and d are calls to callable objects that export something like a
+        'theano_SEQ' interface
 
 """
-- PICKLABLE - algorithms are serializable at all points during execution
-
-- ITERATOR walks through algorithms with fine granularity
-
-- COMPONENTS - library provides components on which programs operate
-
-- ALGORITHMS - library provides algorithms in clean (no hooks) form
-
-- HOOKS - user can insert print / debug logic with search/replace type calls
-  e.g. prog.find(CALL(cd1_update)).replace_with(SEQ([CALL(cd1_update), CALL(debugfn)]))
-
-- PRINTING - user can print the 'program code' of an algorithm built from library pieces
-
-- MODULAR EXPERIMENTS - an experiment object with one (or more?) programs and all of the objects referred to by
-  those programs. It is the preferred type of object to be serialized. The main components of
-  the algorithms should be top-level attributes of the package. This object can be serialized
-  and loaded in another process to implement job migration.
-
-- OPTIMIZATION - program can be optimized automatically
-  e.g. BUFFER(N, CALL(dataset.next)) can be replaced if dataset.next implements the right
-  attribute/protocol for 'bufferable' or something.
-
-  e.g. SEQ([a,b,c,d]) can be compiled with Theano if sub-sequence is compatible
-
-- don't need greenlets to get efficiency, the implementations of control flow ops can manage a
-  stack or stack tree in the vm (like greenlets do I think) we don't really need
-  greenlets/stackless I don't think
-
-"""
-
-__license__ = None
-__copyright__ = None
+__license__ = 'TODO'
+__copyright__ = 'TODO'
 
 import copy, sys, cPickle
-
 import numpy
-
 
 ###################################################
 # Virtual Machine for executing programs

diff -r 98954d8cb92d -r acfd5e747a75 doc/v2_planning/plugin_greenlet.py
--- a/doc/v2_planning/plugin_greenlet.py	Mon Sep 20 02:56:11 2010 -0400
+++ b/doc/v2_planning/plugin_greenlet.py	Mon Sep 20 11:28:23 2010 -0400
@@ -1,35 +1,14 @@
-"""plugin_greenlet - draft of library architecture using greenlets"""
-
-
-"""
-
-- PICKLABLE - algorithms are serializable at all points during execution
+"""plugin_greenlet - draft of library architecture using greenlets
 
-- ITERATOR walks through algorithms with fine granularity
-
-- COMPONENTS - library provides components on which programs operate
-
-- ALGORITHMS - library provides algorithms in clean (no hooks) form
-
-- HOOKS - user can insert print / debug logic with search/replace type calls
-  e.g. prog.find(CALL(cd1_update)).replace_with(SEQ([CALL(cd1_update), CALL(debugfn)]))
+HISTORICAL - NOT ACTUALLY A PROPOSAL
+====================================
 
-- PRINTING - user can print the 'program code' of an algorithm built from library pieces
-
-- MODULAR EXPERIMENTS - an experiment object with one (or more?) programs and all of the objects referred to by
-  those programs. It is the preferred type of object to be serialized. The main components of
-  the algorithms should be top-level attributes of the package. This object can be serialized
-  and loaded in another process to implement job migration.
+This was the original approach for what I renamed to plugin_JB, until I realized that I could
+get the end result without using greenlets at all.
 
-- OPTIMIZATION - program can be optimized automatically
-  e.g. BUFFER(N, CALL(dataset.next)) can be replaced if dataset.next implements the right
-  attribute/protocol for 'bufferable' or something.
-
-  e.g. SEQ([a,b,c,d]) can be compiled with Theano if sub-sequence is compatible
-
-- don't need greenlets to get efficiency, the implementations of control flow ops can manage a
-  stack or stack tree in the vm (like greenlets do I think) we don't really need
-  greenlets/stackless I don't think
+Still, greenlets seem like they could be neat, and writing this program stretched my mind, so I
+kept it.  There's something wrong when you run it with the kfold validation, but until that
+point I think it works.
 """
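
To make the control-flow vocabulary above concrete, here is a minimal sketch of how CALL, FILT,
SEQ, BUFFER_REPEAT and a VirtualMachine could be written as python iterators.  The names follow
the docstring in the patch, but the signatures and bodies below are illustrative assumptions,
not the actual code of plugin_JB.py (which this patch does not include); python 2 syntax is
used to match the rest of the file:

    # Illustrative sketch only: every element exposes run(), a generator, so a
    # VirtualMachine can step through a program with fine granularity.  The real
    # plugin_JB.py implementation is not shown in this patch.

    class CALL(object):
        """Basic statement: call a python function with fixed keyword arguments."""
        def __init__(self, fn, **kwargs):
            self.fn, self.kwargs = fn, kwargs
        def run(self, piped=None):
            yield self.fn(**self.kwargs)

    class FILT(CALL):
        """Like CALL, but the previous statement's return value is the first argument."""
        def run(self, piped=None):
            yield self.fn(piped, **self.kwargs)

    class SEQ(object):
        """Run elements in order, piping each return value into the next element."""
        def __init__(self, elements):
            self.elements = elements
        def run(self, piped=None):
            rval = piped
            for element in self.elements:
                for rval in element.run(rval):
                    yield rval

    class BUFFER_REPEAT(object):
        """Run an element N times and accumulate the return value from each iteration."""
        def __init__(self, N, element):
            self.N, self.element = N, element
        def run(self, piped=None):
            buf = []
            for i in xrange(self.N):
                for rval in self.element.run(piped):
                    buf.append(rval)
            yield buf

    class VirtualMachine(object):
        """Drive a program to completion; as an iterator it never seizes the process."""
        def __init__(self, prog):
            self.prog = prog
        def __iter__(self):
            return self.prog.run()
        def run(self):
            rval = None
            for rval in self:   # a fancier VM could poll job-control events between steps
                pass
            return rval

    # tiny usage check: sum a 3-element "batch" drawn from a plain iterator
    data = iter([1.0, 2.0, 3.0])
    prog = SEQ([BUFFER_REPEAT(3, CALL(data.next)), FILT(sum)])
    print VirtualMachine(prog).run()    # prints 6.0

Because run() is a generator at every level, a caller's main loop can interleave job-management
work between statements instead of blocking inside VirtualMachine.run(), which is the property
the patched docstring emphasizes.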