changeset 20:266c68cb6136

Minor edits, plus adding an untested ApplyFunctionDataSet for the GradientLearner that is in the works.
author bengioy@bengiomac.local
date Mon, 07 Apr 2008 09:48:39 -0400
parents 57f4015e2e09
children fdf0abc490f7
files dataset.py gradient_learner.py learner.py lookup_list.py
diffstat 4 files changed, 157 insertions(+), 42 deletions(-) [+]
line wrap: on
line diff
--- a/dataset.py	Thu Mar 27 01:59:44 2008 -0400
+++ b/dataset.py	Mon Apr 07 09:48:39 2008 -0400
@@ -10,12 +10,13 @@
     A DataSet is a generator of iterators; these iterators can run through the
     examples in a variety of ways.  A DataSet need not necessarily have a finite
     or known length, so this class can be used to interface to a 'stream' which
-    feed on-line learning.
+    feeds on-line learning.
 
     To iterate over examples, there are several possibilities:
-    - for i in dataset.zip(field1, field2,field3, ...)
-    - for i in dataset.minibatches(N, field1, field2, ...)
-    - for i in dataset
+    - for example in dataset.zip([field1, field2,field3, ...])
+    - for val1,val2,val3 in dataset.zip([field1, field2,field3])
+    - for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N)
+    - for example in dataset
     Each of these is documented below.
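+
+    For example (a hypothetical sketch, assuming the dataset has fields 'x' and 'y'):
+    - for x,y in dataset.zip(['x','y']): use the values x and y of one example at a time
+    - for batch in dataset.minibatches(['x','y'],minibatch_size=10): batch['x'] holds
+      the 'x' values of 10 consecutive examples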
 
     Note: For a dataset of fixed and known length, which can implement item
@@ -36,19 +37,19 @@
 
         Using this syntax, "i" will be an Example instance (or equivalent) with
         all the fields of DataSet self.  Every field of "i" will give access to
-        a the field of a single example.  Fields should be accessible via
-        i[identifier], but the derived class is free to accept any type of
-        identifier, and add extra functionality to the iterator.
+        a field of a single example.  Fields should be accessible via
+        i["fielname"] or i[3] (in the fieldNames() order), but the derived class is free
+        to accept any type of identifier, and add extra functionality to the iterator.
         """
-        raise AbstractFunction()
+        return self.zip(*self.fieldNames())
 
     def zip(self, *fieldnames):
         """
         Supports two forms of syntax:
 
-            for i in dataset.zip(f1, f2, f3): ...
+            for i in dataset.zip([f1, f2, f3]): ...
 
-            for i1, i2, i3 in dataset.zip(f1, f2, f3): ...
+            for i1, i2, i3 in dataset.zip([f1, f2, f3]): ...
 
         Using the first syntax, "i" will be an indexable object, such as a list,
         tuple, or Example instance, such that on every iteration, i[0] is the f1
@@ -120,7 +121,27 @@
         # This list may not be finite; what would make sense in the use you have
         # in mind?
         # -JB
-        """Return the list of field names in the examples of this dataset."""
+        #James-
+        # You are right. I had put this to be able to iterate over the fields
+        # but maybe an iterator mechanism (over fields rather than examples)
+        # would be more appropriate. Fieldnames are needed in general
+        # by the iterators over examples or minibatches, to construct
+        # examples or minibatches with the corresponding names as attributes.
+        # -YB
+        """
+        Return an iterator (an object with an __iter__ method) that
+        iterates over the names of the fields. As a special case,
+        a list or a tuple of field names can be returned.
+
+        Note that some datasets
+        may have virtual fields and support a virtually infinite number
+        of possible field names. In that case, fieldNames() should
+        either raise an error or iterate over a particular set of
+        names as appropriate. Another option would be to iterate
+        over the sub-datasets comprising a single field at a time.
+        I am not sure yet what is most appropriate.
+          -YB
+        """
         raise AbstractFunction()
 
     def rename(*new_field_specifications):
@@ -129,6 +150,9 @@
         # Wouldn't this functionality be easier to provide via a
         # RenamingDataSet, such as the one I've written below?
         # -JB
+        # You are right. Whichever implementation, however, we need a generic way to
+        # 'concatenate' fields, to handle the ([old_field1, old_field2, ...], new_field) semantics.
+        # -YB
         """
         Return a new dataset that maps old fields (of self) to new fields (of the returned 
         dataset). The minimal syntax that should be supported is the following:
@@ -140,6 +164,24 @@
         """
         raise AbstractFunction()
 
+
+    def apply_function(self, function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True):
+        """
+        Return a dataset that contains as fields the results of applying
+        the given function (example-wise) to the specified input_fields. The
+        function should return a sequence whose elements will be stored in
+        fields whose names are given in the output_fields list. If copy_inputs
+        is True then the resulting dataset will also contain the fields of self.
+        If accept_minibatches, then the function may be called
+        with minibatches as arguments (what is returned by the minibatches
+        iterator). In any case, the computations may be delayed until the examples
+        of the resulting dataset are requested. If cache is True, then
+        once the output fields for some examples have been computed, they
+        are cached (to avoid recomputation if the same examples are
+        requested again).
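+
+        For example (a hypothetical sketch, assuming self has a field 'x' and using a
+        function that squares it):
+
+            dataset_with_x2 = dataset.apply_function(lambda x: (x*x,), ['x'], ['x2'])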
+        """
+        return ApplyFunctionDataSet(self, function, input_fields, output_fields, copy_inputs, accept_minibatches, cache)
+
 class RenamingDataSet(DataSet):
     """A DataSet that wraps another one, and makes it look like the field names
     are different
@@ -287,9 +329,9 @@
 class ArrayDataSet(FiniteDataSet):
     """
     An ArrayDataSet behaves like a numpy array but adds the notion of named fields
-    from DataSet (and the ability to view multiple field values as an 'Example').
+    from DataSet (and the ability to view the values of multiple fields as an 'Example').
     It is a  fixed-length and fixed-width dataset 
-    in which each element is a numpy array or a number, hence the whole 
+    in which each element is a fixed-dimension numpy array or a number, hence the whole 
     dataset corresponds to a numpy array. Fields
     must correspond to a slice of array columns. If the dataset has fields,
     each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array.
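+
+    For example (a hypothetical construction sketch, assuming 'data' is a 2-D numpy array
+    and that 'fields' maps field names to column slices):
+
+        dataset = ArrayDataSet(data, fields={'input':slice(0,2),'target':slice(2,3)})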
@@ -382,9 +424,6 @@
                 # and coherent with the data array
                 assert fieldslice.start >= 0 and fieldslice.stop <= cols
 
-    def __iter__(self):
-        return self.zip(*self.fieldNames())
-
     def minibatches(self,
             fieldnames = DataSet.minibatches_fieldnames,
             minibatch_size = DataSet.minibatches_minibatch_size,
@@ -446,7 +485,8 @@
         return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields)
 
     def __array__(self):
-        """Return an view of this dataset which is an numpy.ndarray
+        """Return a view of this dataset which is an numpy.ndarray (i.e. losing
+        the identity and name of fields within the dataset).
 
         Numpy uses this special function name to retrieve an ndarray view for
         function such as numpy.sum, numpy.dot, numpy.asarray, etc.
@@ -454,6 +494,9 @@
         If this dataset has no fields, then we simply return self.data,
         otherwise things are complicated. 
         - why do we want this behaviour when there are fields? (JB)
+        - for convenience and completeness (but maybe it would make
+          more sense to implement this through a 'field-merging'
+          dataset). (YB)
         """
         if not self.fields:
             return self.data
@@ -497,4 +540,75 @@
             c+=slice_width
         return result
 
+class ApplyFunctionDataSet(DataSet):
+    """
+    A dataset that contains as fields the results of applying
+    a given function (example-wise) to specified input_fields of a source
+    dataset. The function should return a sequence whose elements will be stored in
+    fields whose names are given in the output_fields list. If copy_inputs
+    is True then the resulting dataset will also contain the fields of the source
+    dataset. If accept_minibatches, then the function expects
+    minibatches as arguments (what is returned by the minibatches
+    iterator). In any case, the computations may be delayed until the examples
+    of self are requested. If cache is True, then
+    once the output fields for some examples have been computed, they
+    are cached (to avoid recomputation if the same examples are requested again).
+    """
+    def __init__(self, src, function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True):
+        DataSet.__init__(self)
+        self.src=src
+        self.function=function
+        self.input_fields=input_fields
+        self.output_fields=output_fields
+        self.copy_inputs=copy_inputs
+        self.accept_minibatches=accept_minibatches
+        src_fieldnames = list(src.fieldNames())  # fieldNames() may return any iterable; keep a list for membership tests and concatenation
+        if copy_inputs:
+            for src_field in src_fieldnames:
+                assert src_field not in output_fields
+            self.fieldnames=src_fieldnames+output_fields
+        else:
+            self.fieldnames=output_fields
+        for input_field in input_fields:
+            assert input_field in src_fieldnames
+        self.cache=cache
+        if cache:
+            # maybe a fixed-size array kind of structure would be more efficient than a list
+            # in the case where src is FiniteDataSet. -YB
+            self.cached_examples = [] 
 
+    def fieldNames(self): return self.fieldnames
+    
+    def minibatches(self,
+                    fieldnames = DataSet.minibatches_fieldnames,
+                    minibatch_size = DataSet.minibatches_minibatch_size,
+                    n_batches = DataSet.minibatches_n_batches):
+
+        class Iterator(LookupList):
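+            """A minibatch iterator over the resulting dataset: it draws minibatches
+            from the source dataset and applies the dataset's function to them on the fly."""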
+
+            def __init__(self,dataset):
+                LookupList.__init__(self, fieldnames, [0]*len(fieldnames))
+                self.dataset=dataset
+                if dataset.copy_inputs:
+                    src_fields=dataset.src.fieldNames()
+                else:
+                    src_fields=dataset.input_fields
+                self.src_iterator=dataset.src.minibatches(src_fields,minibatch_size,n_batches)
+                                                       
+            def __iter__(self):
+                return self
+
+            def next(self):
+                src_examples = self.src_iterator.next()
+                if self.dataset.copy_inputs:
+                    function_inputs = src_examples
+                else:
+                    function_inputs = [src_examples[field_name]
+                                       for field_name in self.dataset.input_fields]
+                return self.dataset.function(*function_inputs)
+
+        for fieldname in fieldnames:
+            assert fieldname in self.fieldnames
+        return Iterator(self)
+
+    
--- a/gradient_learner.py	Thu Mar 27 01:59:44 2008 -0400
+++ b/gradient_learner.py	Mon Apr 07 09:48:39 2008 -0400
@@ -3,7 +3,6 @@
 from tensor import *
 import gradient
 from compile import Function
-from gradient_based_optimizer import *
 
 class GradientLearner(Learner):
     """
@@ -13,38 +12,49 @@
     The user provides a Theano formula that maps the fields of a training example
     and parameters to output fields (for the use function), one of which must be a cost
     that is the training criterion to be minimized. Subclasses implement
-    a training strategy that uses the function to compute gradients and
+    a training strategy that uses the Theano formula to compute gradients and
     to compute outputs in the update method.
     The inputs, parameters, and outputs are lists of Theano tensors,
     while the example_wise_cost and regularization_term are Theano tensors.
     The user can specify a regularization coefficient that multiplies the regularization term.
     The training algorithm looks for parameters that minimize
-       regularization_coefficienet * regularization_term(parameters) +
+       regularization_coefficient * regularization_term(parameters) +
        sum_{inputs in training_set} example_wise_cost(inputs,parameters)
     i.e. the regularization_term should not depend on the inputs, only on the parameters.
     The learned function can map a subset of inputs to a subset of outputs (as long as the inputs subset
     includes all the inputs required in the Theano expression for the selected outputs).
-    It is assumed that all the inputs are provided in the training set, but
-    not necessarily when using the learned function.
+    It is assumed that all the inputs are provided in the training set (as dataset fields
+    with the corresponding name), but not necessarily when using the learned function.
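+
+    For example (a hypothetical sketch; x, y and w are assumed to be Theano tensors built
+    elsewhere, for a scalar linear model with squared error):
+
+        prediction = x*w
+        learner = GradientLearner(inputs=[x,y], parameters=[w], outputs=[prediction],
+                                  example_wise_cost=(prediction-y)**2,
+                                  regularization_term=w**2)
+        predict_fn = learner(training_set)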
     """
     def __init__(self, inputs, parameters, outputs, example_wise_cost, regularization_term,
-                 gradient_based_optimizer=StochasticGradientDescent(), regularization_coefficient = astensor(1.0)):
+                 regularization_coefficient = astensor(1.0)):
         self.inputs = inputs
         self.outputs = outputs
         self.parameters = parameters
         self.example_wise_cost = example_wise_cost
         self.regularization_term = regularization_term
-        self.gradient_based_optimizer = gradient_based_optimizer
         self.regularization_coefficient = regularization_coefficient
         self.parameters_example_wise_gradient = gradient.grad(example_wise_cost, parameters)
-        self.parameters_regularization_gradient = gradient.grad(self.regularization_coefficient * regularization, parameters)
+        self.parameters_regularization_gradient = gradient.grad(self.regularization_coefficient * regularization_term, parameters)
         if example_wise_cost not in outputs:
             outputs.append(example_wise_cost)
         if regularization_term not in outputs:
             outputs.append(regularization_term)
         self.example_wise_gradient_fn = Function(inputs + parameters, 
                                        [self.parameters_example_wise_gradient + self.parameters_regularization_gradient])
-        self.use_functions = {frozenset([input.name for input in inputs]) : Function(inputs, outputs)}
+        self.use_functions = {frozenset([input.name for input in inputs]+[output.name for output in outputs])
+                                        : Function(inputs, outputs)}
 
-    def update(self,training_set):
-
+    def use(self,input_dataset,output_fields=None,copy_inputs=True):
+        # obtain the function that maps the desired inputs to desired outputs
+        input_fields = input_dataset.fieldNames()
+        if output_fields is None: output_fields = [output.name for output in self.outputs]
+        # handle special case of inputs that are directly copied into outputs
+        
+        use_function_key = frozenset(list(input_fields)+list(output_fields))
+        if not self.use_functions.has_key(use_function_key):
+            self.use_functions[use_function_key]=Function(input_fields,output_fields)
+        use_function = self.use_functions[use_function_key]
+        # return a virtual dataset that computes the outputs on demand
+        return input_dataset.apply_function(use_function,input_fields,output_fields,copy_inputs,accept_minibatches=???)
+    
--- a/learner.py	Thu Mar 27 01:59:44 2008 -0400
+++ b/learner.py	Mon Apr 07 09:48:39 2008 -0400
@@ -36,20 +36,21 @@
         return self.use # default behavior is 'non-adaptive', i.e. update does not do anything
     
     
-    def __call__(self,training_set):
+    def __call__(self,training_set,train_stats_collector=None):
         """
         Train a learner from scratch using the provided training set,
         and return the learned function.
         """
         self.forget()
-        return self.update(learning_task)
+        return self.update(training_set,train_stats_collector)
 
-
-    def use(self,input_dataset,output_fields=None):
+    def use(self,input_dataset,output_fields=None,copy_inputs=True):
         """Once a Learner has been trained by one or more call to 'update', it can
         be used with one or more calls to 'use'. The argument is a DataSet (possibly
         containing a single example) and the result is a DataSet of the same length.
         If output_fields is specified, it may be use to indicate which fields should
         be constructed in the output DataSet (for example ['output','classification_error']).
+        Optionally, if copy_inputs, the input fields (of the input_dataset) can be made
+        visible in the output DataSet returned by this function.
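+        For example (hypothetical usage), learner.use(test_set,['output','classification_error'])
+        would return a DataSet with those two fields computed for each example of test_set.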
         """
         raise NotImplementedError
--- a/lookup_list.py	Thu Mar 27 01:59:44 2008 -0400
+++ b/lookup_list.py	Mon Apr 07 09:48:39 2008 -0400
@@ -55,16 +55,6 @@
         except KeyError, e:
             raise AttributeError(name)
 
-    if 0:
-        # This makes subclassing horrible, just call append_keyval if it's
-        # really what you want to do.
-        # -JB
-        def __setattr__(self,name,value):
-            if name in self._name2index:
-                self._values[self._name2index[name]]=value
-            else:
-                raise AttributeError(name)
-
     def append_keyval(self, key, value):
         self._name2index[key]=len(self)
         self._values.append(value)