changeset 167:4803cb76e26b

Updated documentation
author Joseph Turian <turian@gmail.com>
date Mon, 12 May 2008 18:51:42 -0400
parents ee11ed427ba8
children bc72a0fa6d01
files dataset.py learner.py lookup_list.py
diffstat 3 files changed, 75 insertions(+), 63 deletions(-)
--- a/dataset.py	Mon May 12 18:40:17 2008 -0400
+++ b/dataset.py	Mon May 12 18:51:42 2008 -0400
@@ -530,7 +530,7 @@
 
 class FieldsSubsetDataSet(DataSet):
     """
-    A sub-class of DataSet that selects a subset of the fields.
+    A sub-class of L{DataSet} that selects a subset of the fields.
     """
     def __init__(self,src,fieldnames):
         self.src=src
@@ -571,7 +571,7 @@
         
 class DataSetFields(LookupList):
     """
-    Although a DataSet iterates over examples (like rows of a matrix), an associated
+    Although a L{DataSet} iterates over examples (like rows of a matrix), an associated
     DataSetFields iterates over fields (like columns of a matrix), and can be understood
     as a transpose of the associated dataset.
 
@@ -638,7 +638,7 @@
     
 class MinibatchDataSet(DataSet):
     """
-    Turn a LookupList of same-length (iterable) fields into an example-iterable dataset.
+    Turn a L{LookupList} of same-length (iterable) fields into an example-iterable dataset.
     Each element of the lookup-list should be iterable and sliceable, and all should have the same length.
     """
     def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack,
@@ -708,14 +708,14 @@
     
 class HStackedDataSet(DataSet):
     """
-    A DataSet that wraps several datasets and shows a view that includes all their fields,
+    A L{DataSet} that wraps several datasets and shows a view that includes all their fields,
     i.e. whose list of fields is the concatenation of their lists of fields.
 
     If a field name is found in more than one of the datasets, then either an error is
     raised or the fields are renamed (either by prefixing with the dataset's 
     __name__ attribute + ".", if it exists, or by suffixing with its index in the argument list).
 
-    TODO: automatically detect a chain of stacked datasets due to A | B | C | D ...
+    @todo: automatically detect a chain of stacked datasets due to A | B | C | D ...
     """
     def __init__(self,datasets,accept_nonunique_names=False,description=None,field_types=None):
         DataSet.__init__(self,description,field_types)
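
The field-concatenation view can be pictured with plain dicts of numpy
arrays standing in for datasets (a conceptual sketch only, not this
module's API)::

    import numpy

    # Two "datasets" over the same 3 examples, with disjoint field names.
    d1 = {'x': numpy.arange(6).reshape(3, 2)}
    d2 = {'y': numpy.arange(3)}

    # The H-stacked view exposes the union of the fields, example-wise.
    hstacked = dict(d1)
    hstacked.update(d2)
    assert sorted(hstacked.keys()) == ['x', 'y']
    assert len(hstacked['x']) == len(hstacked['y'])  # same example count
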
@@ -807,11 +807,11 @@
 
 class VStackedDataSet(DataSet):
     """
-    A DataSet that wraps several datasets and shows a view that includes all their examples,
+    A L{DataSet} that wraps several datasets and shows a view that includes all their examples,
     in the order provided. This clearly assumes that they all have the same field names
     and all (except possibly the last one) are of finite length.
 
-    TODO: automatically detect a chain of stacked datasets due to A + B + C + D ...
+    @todo: automatically detect a chain of stacked datasets due to A + B + C + D ...
     """
     def __init__(self,datasets):
         self.datasets=datasets
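
Likewise, the example-concatenation view corresponds to stacking rows,
as in this plain numpy sketch::

    import numpy

    a = numpy.zeros((3, 2))          # 3 examples from dataset A
    b = numpy.ones((2, 2))           # 2 examples from dataset B
    vstacked = numpy.vstack([a, b])  # A's examples first, then B's
    assert vstacked.shape == (5, 2)
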
@@ -1025,7 +1025,7 @@
 
 class CachedDataSet(DataSet):
   """
-  Wrap a dataset whose values are computationally expensive to obtain
+  Wrap a L{DataSet} whose values are computationally expensive to obtain
   (e.g. because they involve some computation, or disk access),
   so that repeated accesses to the same example are done cheaply,
   by caching every example value that has been accessed at least once.
@@ -1034,10 +1034,10 @@
   (and cached) upon construction of the CachedDataSet, rather than at the
   first access.
 
-  @todo when cache_all_upon_construction create mini-batches that are as 
+  @todo: when cache_all_upon_construction, create mini-batches that are as
   large as possible but not so large as to fill up memory.
   
-  @todo add disk-buffering capability, so that when the cache becomes too
+  @todo: add disk-buffering capability, so that when the cache becomes too
   big for memory, we cache things on disk, trying to keep in memory only
   the record most likely to be accessed next.
   """
@@ -1092,18 +1092,18 @@
                       
 class ApplyFunctionDataSet(DataSet):
   """
-  A dataset that contains as fields the results of applying a given function
-  example-wise or minibatch-wise to all the fields of an input dataset.
-  The output of the function should be an iterable (e.g. a list or a LookupList)
-  over the resulting values.
+  A L{DataSet} that contains as fields the results of applying a
+  given function example-wise or minibatch-wise to all the fields of
+  an input dataset.  The output of the function should be an iterable
+  (e.g. a list or a L{LookupList}) over the resulting values.
 
-  In minibatch mode, the function is expected to work on minibatches (takes
-  a minibatch in input and returns a minibatch in output). More precisely,
-  it means that each element of the input or output list should be iterable
-  and indexable over the individual example values (typically these
-  elements will be numpy arrays). All of the elements in the input and
-  output lists should have the same length, which is the length of the
-  minibatch.
+  In minibatch mode, the function is expected to work on minibatches
+  (it takes a minibatch as input and returns a minibatch as output).
+  More precisely, each element of the input or output list
+  should be iterable and indexable over the individual example values
+  (typically these elements will be numpy arrays). All of the elements
+  in the input and output lists should have the same length, which is
+  the length of the minibatch.
 
   The function is applied each time an example or a minibatch is accessed.
   To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
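
For instance, a minibatch-wise function maps a list of per-field arrays
to a list of per-field arrays of the same minibatch length (numpy
sketch; normalize_minibatch is a hypothetical example function)::

    import numpy

    def normalize_minibatch(fields):
        # 'fields' is a list of arrays, one per field; each array's
        # first dimension indexes the examples of the minibatch.
        return [f - f.mean(axis=0) for f in fields]

    minibatch = [numpy.ones((4, 3)), numpy.arange(4.)]
    out = normalize_minibatch(minibatch)
    assert len(out) == len(minibatch)        # same number of fields
    assert len(out[0]) == len(minibatch[0])  # same minibatch length
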
@@ -1186,9 +1186,10 @@
 
 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
     """
-    Wraps an arbitrary DataSet into one for supervised learning tasks by forcing the
-    user to define a set of fields as the 'input' field and a set of fields
-    as the 'target' field. Optionally, a single weight_field can also be defined.
+    Wraps an arbitrary L{DataSet} into one for supervised learning tasks
+    by forcing the user to define a set of fields as the 'input' field
+    and a set of fields as the 'target' field. Optionally, a single
+    weight_field can also be defined.
     """
     args = ((input_fields,'input'),(target_fields,'target'))
     if weight_field: args+=(([weight_field],'weight'),)
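
Note the trailing comma in the weight_field line above: in Python, (x)
is just x while (x,) is a one-element tuple, so only the comma form
appends the pair as a single new element::

    pair = (['w'], 'weight')
    args = (([1], 'input'),)
    args += (pair,)           # appends pair as one element
    spliced = ([1],) + pair   # concatenation splices pair's elements
    assert len(args) == 2 and len(spliced) == 3
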
--- a/learner.py	Mon May 12 18:40:17 2008 -0400
+++ b/learner.py	Mon May 12 18:51:42 2008 -0400
@@ -13,7 +13,6 @@
     A L{Learner} can be seen as a learning algorithm, a function that when
     applied to training data returns a learned function (which is an object that
     can be applied to other data and return some output data).
-    
     """
     
     def __init__(self):
@@ -169,34 +168,42 @@
     
 class TLearner(Learner):
     """
-    TLearner is a virtual class of Learners that attempts to factor out of the definition
-    of a learner the steps that are common to many implementations of learning algorithms,
-    so as to leave only 'the equations' to define in particular sub-classes, using Theano.
+    TLearner is a virtual class of L{Learner}s that attempts to factor
+    out of the definition of a learner the steps that are common to
+    many implementations of learning algorithms, so as to leave only
+    'the equations' to define in particular sub-classes, using Theano.
 
-    In the default implementations of use and update, it is assumed that the 'use' and 'update' methods
-    visit examples in the input dataset sequentially. In the 'use' method only one pass through the dataset is done,
-    whereas the sub-learner may wish to iterate over the examples multiple times. Subclasses where this
-    basic model is not appropriate can simply redefine update or use.
-    
+    In the default implementations of use and update, it is assumed
+    that the 'use' and 'update' methods visit examples in the input
+    dataset sequentially. In the 'use' method only one pass through the
+    dataset is done, whereas the sub-learner may wish to iterate over
+    the examples multiple times. Subclasses where this basic model is
+    not appropriate can simply redefine update or use.
+
     Sub-classes must provide the following functions and functionalities:
-      - attributeNames(): defines all the names of attributes which can be used as fields or
-                          attributes in input/output datasets or in stats collectors.
-                          All these attributes are expected to be theano.Result objects
-                          (with a .data property and recognized by theano.Function for compilation).
-                          The sub-class constructor defines the relations between
-                          the Theano variables that may be used by 'use' and 'update'
-                          or by a stats collector.
-      - defaultOutputFields(input_fields): return a list of default dataset output fields when
+      - attributeNames(): defines all the names of attributes which can
+                          be used as fields or attributes in
+                          input/output datasets or in stats collectors.
+                          All these attributes are expected to be
+                          theano.Result objects (with a .data property
+                          and recognized by theano.Function for
+                          compilation).  The sub-class constructor
+                          defines the relations between the Theano
+                          variables that may be used by 'use' and
+                          'update' or by a stats collector.
+      - defaultOutputFields(input_fields): return a list of default
+                          dataset output fields when
                           None are provided by the caller of use.
-    The following naming convention is assumed and important.
-    Attributes whose names are listed in attributeNames() can be of any type,
-    but those that can be referenced as input/output dataset fields or as
-    output attributes in 'use' or as input attributes in the stats collector
-    should be associated with a Theano Result variable. If the exported attribute
-    name is <name>, the corresponding Result name (an internal attribute of
-    the TLearner, created in the sub-class constructor) should be _<name>.
-    Typically <name> will be numpy ndarray and _<name> will be the corresponding
-    Theano Tensor (for symbolic manipulation).
+    The following naming convention is assumed and important.  Attributes
+    whose names are listed in attributeNames() can be of any type,
+    but those that can be referenced as input/output dataset fields or
+    as output attributes in 'use' or as input attributes in the stats
+    collector should be associated with a Theano Result variable. If the
+    exported attribute name is <name>, the corresponding Result name
+    (an internal attribute of the TLearner, created in the sub-class
+    constructor) should be _<name>.  Typically <name> will be a numpy
+    ndarray and _<name> will be the corresponding Theano Tensor (for
+    symbolic manipulation).
 
    @todo: push into Learner all the plumbing that can go there
    without depending on Theano
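
The <name> / _<name> convention can be illustrated without Theano; here
a plain string stands in for the symbolic Result variable (ToyTLearner
and its attribute b are hypothetical)::

    import numpy

    class ToyTLearner(object):
        def __init__(self):
            self._b = 'symbolic b'   # would be a Theano Result
            self.b = numpy.zeros(3)  # the exported ndarray value
        def attributeNames(self):
            return ['b']

    learner = ToyTLearner()
    for name in learner.attributeNames():
        assert hasattr(learner, name) and hasattr(learner, '_' + name)
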
@@ -252,19 +259,20 @@
 
 class MinibatchUpdatesTLearner(TLearner):
     """
-    This adds to L{TLearner} a 
+    This adds the following methods and attributes to a L{TLearner}:
       - updateStart(), updateEnd(), updateMinibatch(minibatch), isLastEpoch():
-                          functions executed at the beginning, the end, in the middle
-                          (for each minibatch) of the update method, and at the end
-                          of each epoch. This model only
-                          works for 'online' or one-shot learning that requires
-                          going only once through the training data. For more complicated
-                          models, more specialized subclasses of TLearner should be used
-                          or a learning-algorithm specific update method should be defined.
+      functions executed at the beginning and end of the update
+      method, once per minibatch in between, and at the end of each
+      epoch. This model only works for 'online' or one-shot learning
+      that requires going only once through the training data. For more
+      complicated models, more specialized subclasses of TLearner should
+      be used or a learning-algorithm specific update method should
+      be defined.
 
-      - a 'parameters' attribute which is a list of parameters (whose names are
-      specified by the user's subclass with the parameterAttributes() method)
-      
+      - a 'parameters' attribute which is a list of parameters
+      (whose names are specified by the user's subclass with the
+      parameterAttributes() method)
+
     """
 
     def __init__(self):
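
The hooks above suggest an update loop of roughly this shape (a sketch
of the calling convention only; the minibatches call is schematic, not
the exact DataSet signature)::

    def update(self, training_set, minibatch_size=32):
        self.updateStart()
        while True:
            for minibatch in training_set.minibatches(minibatch_size):
                self.updateMinibatch(minibatch)
            if self.isLastEpoch():
                break
        self.updateEnd()
        return self.use  # the learned function
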
--- a/lookup_list.py	Mon May 12 18:40:17 2008 -0400
+++ b/lookup_list.py	Mon May 12 18:51:42 2008 -0400
@@ -6,7 +6,7 @@
     A LookupList is a sequence whose elements can be named (and unlike
     a dictionary the order of the elements depends not on their key but
     on the order given by the user through construction) so that
-    following syntactic constructions work as one would expect:
+    the following syntactic constructions work as one would expect::
        example = LookupList(['x','y','z'],[1,2,3])
        example['x'] = [1, 2, 3] # set or change a field
        print example('z','y') # prints [3,2]
@@ -21,7 +21,10 @@
        example2 = LookupList(['v', 'w'], ['a','b'])
       print example+example2 # addition, as for lists, concatenates the items.
       example + example # raises an error, as duplicate names are not allowed.
-    Note that the element names should be unique.
+    @note: The element names should be unique.
+    @todo: Convert this documentation into doctest
+    format, and actually perform doctest'ing:
+    U{http://epydoc.sourceforge.net/manual-epytext.html#doctest-blocks}
     """
     def __init__(self,names=[],values=[]):
         assert len(values)==len(names)
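
In doctest form, the constructions above might read as follows (a
sketch assuming the documented behaviour; outputs shown as standard
list reprs)::

    >>> example = LookupList(['x', 'y', 'z'], [1, 2, 3])
    >>> example['x'] = [1, 2, 3]    # set or change a field
    >>> example('z', 'y')
    [3, 2]
    >>> example2 = LookupList(['v', 'w'], ['a', 'b'])
    >>> len(example + example2)     # concatenation, as for lists
    5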