# HG changeset patch # User Razvan Pascanu # Date 1284672870 14400 # Node ID ae5ba6206fd3e51de4929bdc593b5dc8dce25675 # Parent 0904dd74894db3fed5f557c614c65ce408b117f8 a first draft of pseudo-code for logreg .. using version B (?) approach diff -r 0904dd74894d -r ae5ba6206fd3 doc/v2_planning/architecture.txt --- a/doc/v2_planning/architecture.txt Thu Sep 16 17:11:10 2010 -0400 +++ b/doc/v2_planning/architecture.txt Thu Sep 16 17:34:30 2010 -0400 @@ -116,6 +116,12 @@ convert it into the pipeline given in the linear version. It's still possible though, but this is probably not the place to get into the details. +RP comment : The way I see it, you could always have everything using the +encapsulation paradigm ( which as you pointed out is a bit more powerful) and +then have linear shortcuts ( functions that take a list of functions and some +inputs and apply them in some order). You will not be able to have a one case +cover all pipeline function, but I think it is sufficient to offer such +options (linear functions) for a few widely used cases .. Jobman Compatibility Approach diff -r 0904dd74894d -r ae5ba6206fd3 doc/v2_planning/plugin_RP.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/doc/v2_planning/plugin_RP.py Thu Sep 16 17:34:30 2010 -0400 @@ -0,0 +1,165 @@ +''' +================================================= +Plugin system for iterative algorithm Version B +================================================= + +After the meeting (September 16) we sort of stumbled on +two possible versions of the plug-in system. This represents +the second version. It underwent a few changes after seeing +Olivier's code and talking to him. + +Concept +======= + +The basic idea behind this version is not to have a list of all +possible events, but rather have plugins register to events. By +specifying which plugin listens to which event produced by which +plugin, you define a sort of dependency graph.
Structuring things +in such a graph might make the script more intuitive when reading. + +I will first go through pseudo-code for two examples and then enumerate +my insights and concepts on the matter. + + +Example : Producer - Consumer that Guillaume described +====================================================== + + +.. code-block:: +''' + sch = Schedular() + p = ProducerFactory() + p = sched.schedule_plugin(event = every(p.outputStuffs()), p ) + p = sched.schedule_plugin(event = Event("begin"), p) + c = sched.schedule_plugin(event = every(p.outputStuffs()), ConsumerFactory ) + pc= sched.schedule_plugin(event = every(p.outputStuffs()), ProducerConsumerFactory ) + + sched.run() + + + +''' +Example : Logistic regression +============================= + +Task description +---------------- + +Apply a logistic regression network to some dataset. Use early stopping. +Save the weights every time a new best score is obtained. Print training score +after each epoch. + + +Possible script +--------------- + + Sorry for long variable names, I wanted to make it clear what things are .. + +.. code-block:: +''' + sched = Schedular() + # This is a shortcut .. I've been to the dataset committee and they have + # something else in mind, a bit more fancy; I totally agree with their + # ideas, I just wrote it like this for brevity; + train_data, valid_data, test_data = load_mnist() + + # This part was not actually discussed in detail ; I have my own + # opinions of how this part should be done .. but for now I decomposed it + # in two functions for convenience + logreg = generate_logreg_model() + + + + # Note that this is not meant to replace the string idea of Olivier.
I + # actually think that is a cool idea; when writing things down I realized + # it might be a bit more intuitive if you would get that object by calling + # a method of the instance of the plugin with a meaningful name. + # I added a wrapping function that sort of tells which kinds of such events + # you can have, similar to what Olivier wrote { every, at .. } + doOneTrainingStepPlugin =ModelPluginFactory( model = logreg ) + trainDataPlugin = sched.schedule_plugin( + event = every(doOneTrainingStepPlugin.new_train_error), + DatasetsPluginFactory( data = train_data) ) + + trainDataPlugin = sched.schedule_plugin( + event = Event('begin'), trainDataPlugin ) + + clock = sched.schedule_plugin( event = all_events, ClockFactory()) + + doOneTrainingStepPlugin = sched.schedule_plugin( + event = every(trainDataPlugin.new_batch()), + ModelFactory( model = logreg)) + + + + + # Arguably we wouldn't need such a plugin. I added it just to show how to + # deal with multiple events from the same plugin; the plugin is supposed to + # reset the index of the dataset to 0, so that you start a new epoch + resetDataset = sched.schedule_plugin( + event = every(trainDataPlugin.end_of_dataset()), + ResetDatasetFactory( data = train_data) ) + + + checkValidationPlugin = sched.schedule_plugin( + event =every_nth(doOneTrainingStepPlugin.done(), n=1000), + ValidationFactory( model = logreg data = valid_data)) + + # You also have the option to do : + # + # checkValidationPlugin = sched.schedule_plugin( + # event =every(trainDataPlugin.end_of_dataset()), + # ValidationFactory( model = logreg, data = valid_data)) + # checkValidationPlugin = sched.schedule_plugin( + # event =every(clock.hour()), + # ValidationFactory( model = logreg, data = valid_data)) + + # This plugin would be responsible for sending the Event("terminate") when the + # patience expires.
+ earlyStopperPlugin = sched.schedule_plugin( + event = every(checkValidationPlugin.new_validation_error()), + earlyStopperFactory(initial_patience = 10) ) + + # Printing & Saving plugins + + printTrainingError = sched.schedule_plugin( + event = every(doOneTrainingStepPlugin.new_train_error()), + AggregateAndPrintFactory()) + + printTrainingError = sched.schedule_plugin( + event = every(trainDataPlugin.end_of_dataset()), + printTrainingError) + saveWeightsPlugin = sched.schedule_plugin( + event = every(earlyStopperPlugin.new_best_valid_error()), + saveWeightsFactory( model = logreg) ) + + sched.run() + +''' +Notes +===== + + In my code schedule_plugin returns the plugin that it registers. I think that + writing something like + x = f( .. ) + y = f(x) + + is more readable than writing f( .., event_belongs_to = x), or even worse, + you only see text, and you would have to go to the plugins to see what events + they actually produce. + + At this point I am more concerned with how the scripts will look ( the cognitive + load to understand them) and how easy it is to hack into them. From this point + of view I would have the following suggestions : + * dataset and model creation should happen outside the scheduler, with possibly + other mechanisms + * there are two types of plugins, those that do not affect the experiment, + they just compute statistics and print them, or save different data and those + plugins that change the state of the model, like train, or influence the life + of the experiment. There should be a minimum of plugins of the second category, + to keep the code readable. ( When understanding a script, you only need + to understand that part, the rest you assume is just printing stuff). + The different categories should also be grouped. + + +'''