# HG changeset patch
# User Frederic Bastien
# Date 1210698599 14400
# Node ID bc72a0fa6d0199fb4e70fbf7c9c9a8d775bd25b1
# Parent  4803cb76e26b0ad349ac5aaa265c531cd05295d2
# Parent  051e07807554b7e6d6a3dfade42d89c3a490fc5d
Automated merge with ssh://p-omega1@lgcm.iro.umontreal.ca/tlearn

diff -r 051e07807554 -r bc72a0fa6d01 dataset.py
--- a/dataset.py	Tue May 13 13:05:45 2008 -0400
+++ b/dataset.py	Tue May 13 13:09:59 2008 -0400
@@ -6,8 +6,7 @@
 from sys import maxint
 import numpy
 
-class AbstractFunction (Exception): """Derived class must override this function"""
-class NotImplementedYet (NotImplementedError): """Work in progress, this should eventually be implemented"""
+from exceptions import *
 
 class AttributesHolder(object):
     def __init__(self): pass
@@ -531,7 +530,7 @@
 
 class FieldsSubsetDataSet(DataSet):
     """
-    A sub-class of DataSet that selects a subset of the fields.
+    A sub-class of L{DataSet} that selects a subset of the fields.
     """
     def __init__(self,src,fieldnames):
         self.src=src
@@ -572,7 +571,7 @@
 
 class DataSetFields(LookupList):
     """
-    Although a DataSet iterates over examples (like rows of a matrix), an associated
+    Although a L{DataSet} iterates over examples (like rows of a matrix), an associated
     DataSetFields iterates over fields (like columns of a matrix), and can be understood
     as a transpose of the associated dataset.
@@ -639,7 +638,7 @@
 
 class MinibatchDataSet(DataSet):
     """
-    Turn a LookupList of same-length (iterable) fields into an example-iterable dataset.
+    Turn a L{LookupList} of same-length (iterable) fields into an example-iterable dataset.
     Each element of the lookup-list should be an iterable and sliceable, all of the same length.
     """
     def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack,
@@ -709,14 +708,14 @@
 
 class HStackedDataSet(DataSet):
     """
-    A DataSet that wraps several datasets and shows a view that includes all their fields,
+    A L{DataSet} that wraps several datasets and shows a view that includes all their fields,
     i.e. whose list of fields is the concatenation of their lists of fields.
     If a field name is found in more than one of the datasets, then either an error is
     raised or the fields are renamed (either by prefixing the __name__ attribute of the
     dataset + ".", if it exists, or by suffixing the dataset index in the argument list).
 
-    TODO: automatically detect a chain of stacked datasets due to A | B | C | D ...
+    @todo: automatically detect a chain of stacked datasets due to A | B | C | D ...
     """
     def __init__(self,datasets,accept_nonunique_names=False,description=None,field_types=None):
         DataSet.__init__(self,description,field_types)
@@ -808,11 +807,11 @@
 
 class VStackedDataSet(DataSet):
     """
-    A DataSet that wraps several datasets and shows a view that includes all their examples,
+    A L{DataSet} that wraps several datasets and shows a view that includes all their examples,
     in the order provided. This clearly assumes that they all have the same field names and
     all (except possibly the last one) are of finite length.
 
-    TODO: automatically detect a chain of stacked datasets due to A + B + C + D ...
+    @todo: automatically detect a chain of stacked datasets due to A + B + C + D ...
     """
     def __init__(self,datasets):
         self.datasets=datasets
@@ -1026,7 +1025,7 @@
 
 class CachedDataSet(DataSet):
     """
-    Wrap a dataset whose values are computationally expensive to obtain
+    Wrap a L{DataSet} whose values are computationally expensive to obtain
     (e.g. because they involve some computation, or disk access),
     so that repeated accesses to the same example are done cheaply,
     by caching every example value that has been accessed at least once.
@@ -1035,10 +1034,10 @@
     (and cached) upon construction of the CachedDataSet, rather at the
     first access.
 
-    @todo when cache_all_upon_construction create mini-batches that are as
+    @todo: when cache_all_upon_construction create mini-batches that are as
     large as possible but not so large as to fill up memory.
-    @todo add disk-buffering capability, so that when the cache becomes too
+    @todo: add disk-buffering capability, so that when the cache becomes too
     big for memory, we cache things on disk, trying to keep in memory only
     the record most likely to be accessed next.
     """
@@ -1093,18 +1092,18 @@
 
 class ApplyFunctionDataSet(DataSet):
     """
-    A dataset that contains as fields the results of applying a given function
-    example-wise or minibatch-wise to all the fields of an input dataset.
-    The output of the function should be an iterable (e.g. a list or a LookupList)
-    over the resulting values.
+    A L{DataSet} that contains as fields the results of applying a
+    given function example-wise or minibatch-wise to all the fields of
+    an input dataset. The output of the function should be an iterable
+    (e.g. a list or a LookupList) over the resulting values.
 
-    In minibatch mode, the function is expected to work on minibatches (takes
-    a minibatch in input and returns a minibatch in output). More precisely,
-    it means that each element of the input or output list should be iterable
-    and indexable over the individual example values (typically these
-    elements will be numpy arrays). All of the elements in the input and
-    output lists should have the same length, which is the length of the
-    minibatch.
+    In minibatch mode, the function is expected to work on minibatches
+    (takes a minibatch in input and returns a minibatch in output). More
+    precisely, it means that each element of the input or output list
+    should be iterable and indexable over the individual example values
+    (typically these elements will be numpy arrays). All of the elements
+    in the input and output lists should have the same length, which is
+    the length of the minibatch.
 
     The function is applied each time an example or a minibatch is accessed.
     To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
@@ -1187,9 +1186,10 @@
 
 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
     """
-    Wraps an arbitrary DataSet into one for supervised learning tasks by forcing the
-    user to define a set of fields as the 'input' field and a set of fields
-    as the 'target' field. Optionally, a single weight_field can also be defined.
+    Wraps an arbitrary L{DataSet} into one for supervised learning tasks
+    by forcing the user to define a set of fields as the 'input' field
+    and a set of fields as the 'target' field. Optionally, a single
+    weight_field can also be defined.
     """
     args = ((input_fields,'input'),(output_fields,'target'))
     if weight_field: args+=(([weight_field],'weight'))
diff -r 051e07807554 -r bc72a0fa6d01 exceptions.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/exceptions.py	Tue May 13 13:09:59 2008 -0400
@@ -0,0 +1,7 @@
+"""
+Common exceptions.
+@todo: This file should be part of a common/ python package.
+""" + +class AbstractFunction (Exception): """Derived class must override this function""" +class NotImplementedYet (NotImplementedError): """Work in progress, this should eventually be implemented""" diff -r 051e07807554 -r bc72a0fa6d01 learner.py --- a/learner.py Tue May 13 13:05:45 2008 -0400 +++ b/learner.py Tue May 13 13:09:59 2008 -0400 @@ -13,7 +13,6 @@ A L{Learner} can be seen as a learning algorithm, a function that when applied to training data returns a learned function (which is an object that can be applied to other data and return some output data). - """ def __init__(self): @@ -169,34 +168,42 @@ class TLearner(Learner): """ - TLearner is a virtual class of Learners that attempts to factor out of the definition - of a learner the steps that are common to many implementations of learning algorithms, - so as to leave only 'the equations' to define in particular sub-classes, using Theano. + TLearner is a virtual class of L{Learner}s that attempts to factor + out of the definition of a learner the steps that are common to + many implementations of learning algorithms, so as to leave only + 'the equations' to define in particular sub-classes, using Theano. - In the default implementations of use and update, it is assumed that the 'use' and 'update' methods - visit examples in the input dataset sequentially. In the 'use' method only one pass through the dataset is done, - whereas the sub-learner may wish to iterate over the examples multiple times. Subclasses where this - basic model is not appropriate can simply redefine update or use. - + In the default implementations of use and update, it is assumed + that the 'use' and 'update' methods visit examples in the input + dataset sequentially. In the 'use' method only one pass through the + dataset is done, whereas the sub-learner may wish to iterate over + the examples multiple times. Subclasses where this basic model is + not appropriate can simply redefine update or use. + Sub-classes must provide the following functions and functionalities: - - attributeNames(): defines all the names of attributes which can be used as fields or - attributes in input/output datasets or in stats collectors. - All these attributes are expected to be theano.Result objects - (with a .data property and recognized by theano.Function for compilation). - The sub-class constructor defines the relations between - the Theano variables that may be used by 'use' and 'update' - or by a stats collector. - - defaultOutputFields(input_fields): return a list of default dataset output fields when + - attributeNames(): defines all the names of attributes which can + be used as fields or + attributes in input/output datasets or in + stats collectors. All these attributes + are expected to be theano.Result objects + (with a .data property and recognized by + theano.Function for compilation). The sub-class + constructor defines the relations between the + Theano variables that may be used by 'use' + and 'update' or by a stats collector. + - defaultOutputFields(input_fields): return a list of default + dataset output fields when None are provided by the caller of use. - The following naming convention is assumed and important. - Attributes whose names are listed in attributeNames() can be of any type, - but those that can be referenced as input/output dataset fields or as - output attributes in 'use' or as input attributes in the stats collector - should be associated with a Theano Result variable. 
-    should be associated with a Theano Result variable. If the exported attribute
-    name is <name>, the corresponding Result name (an internal attribute of
-    the TLearner, created in the sub-class constructor) should be <name>_.
-    Typically <name> will be numpy ndarray and <name>_ will be the corresponding
-    Theano Tensor (for symbolic manipulation).
+    The following naming convention is assumed and important. Attributes
+    whose names are listed in attributeNames() can be of any type,
+    but those that can be referenced as input/output dataset fields or
+    as output attributes in 'use' or as input attributes in the stats
+    collector should be associated with a Theano Result variable. If the
+    exported attribute name is <name>, the corresponding Result name
+    (an internal attribute of the TLearner, created in the sub-class
+    constructor) should be <name>_. Typically <name> will be numpy
+    ndarray and <name>_ will be the corresponding Theano Tensor (for
+    symbolic manipulation).
 
     @todo pousser dans Learner toute la poutine qui peut l'etre sans etre
     dependant de Theano
@@ -252,19 +259,20 @@
 
 class MinibatchUpdatesTLearner(TLearner):
     """
-    This adds to L{TLearner} a
+    This adds the following functions to a L{TLearner}:
      - updateStart(), updateEnd(), updateMinibatch(minibatch), isLastEpoch():
-      functions executed at the beginning, the end, in the middle
-      (for each minibatch) of the update method, and at the end
-      of each epoch. This model only
-      works for 'online' or one-shot learning that requires
-      going only once through the training data. For more complicated
-      models, more specialized subclasses of TLearner should be used
-      or a learning-algorithm specific update method should be defined.
+      functions executed at the beginning, the end, in the middle (for
+      each minibatch) of the update method, and at the end of each
+      epoch. This model only works for 'online' or one-shot learning
+      that requires going only once through the training data. For more
+      complicated models, more specialized subclasses of TLearner should
+      be used or a learning-algorithm specific update method should
+      be defined.
 
-     - a 'parameters' attribute which is a list of parameters (whose names are
-      specified by the user's subclass with the parameterAttributes() method)
-
+     - a 'parameters' attribute which is a list of parameters
+      (whose names are specified by the user's subclass with the
+      parameterAttributes() method)
+
     """
 
     def __init__(self):
diff -r 051e07807554 -r bc72a0fa6d01 lookup_list.py
--- a/lookup_list.py	Tue May 13 13:05:45 2008 -0400
+++ b/lookup_list.py	Tue May 13 13:09:59 2008 -0400
@@ -6,7 +6,7 @@
     A LookupList is a sequence whose elements can be named (and unlike
     a dictionary the order of the elements depends not on their key but
     on the order given by the user through construction) so that
-    following syntactic constructions work as one would expect:
+    following syntactic constructions work as one would expect::
      example = LookupList(['x','y','z'],[1,2,3])
      example['x'] = [1, 2, 3] # set or change a field
      print example('z','y') # prints [3,2]
@@ -21,7 +21,10 @@
      example2 = LookupList(['v', 'w'], ['a','b'])
      print example+example2 # addition is like for lists, a concatenation of the items.
      example + example # throw an error as we can't have duplicate name.
-    Note that the element names should be unique.
+    @note: The element names should be unique.
+    @todo: Convert this documentation into doctest
+    format, and actually perform doctest'ing:
+    U{http://epydoc.sourceforge.net/manual-epytext.html#doctest-blocks}
     """
     def __init__(self,names=[],values=[]):
         assert len(values)==len(names)
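
The @todo added in the last lookup_list.py hunk proposes converting the LookupList usage examples into doctest blocks. A rough sketch of what such a block could look like is given below; the constructions and the expected output are transcribed from the docstring comments visible in the hunks above, the import path is assumed from the file name, and the exact output formatting has not been verified against the implementation:

    >>> from lookup_list import LookupList
    >>> example = LookupList(['x','y','z'], [1,2,3])
    >>> example['x'] = [1, 2, 3]        # set or change a field
    >>> print example('z','y')          # select several fields at once
    [3,2]
    >>> example2 = LookupList(['v', 'w'], ['a','b'])
    >>> combined = example + example2   # addition concatenates the items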