changeset 31:2d6c49ec5749

merged
author bergstrj@iro.umontreal.ca
date Fri, 11 Apr 2008 21:42:07 -0400
parents bf0145fa73e8 (current diff) 46c5c90019c2 (diff)
children 039c0f249859
files
diffstat 3 files changed, 110 insertions(+), 98 deletions(-)
--- a/_test_dataset.py	Fri Apr 11 21:41:09 2008 -0400
+++ b/_test_dataset.py	Fri Apr 11 21:42:07 2008 -0400
@@ -48,16 +48,6 @@
         a_y = a.y
         self.failUnless(numpy.all( a_y == arr[:,1:4]))
 
-    def test_asarray(self):
-        arr = numpy.random.rand(3,4)
-        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(2,4)})
-        a_arr = numpy.asarray(a)
-        self.failUnless(a_arr.shape[1] == 2 + 2)
-        self.failUnless(numpy.sum(numpy.square(a_arr-a.data))==0)
-        a=ArrayDataSet(data=arr,fields={"x":slice(2),"y":slice(1,4)})
-        a_arr = numpy.asarray(a)
-        self.failUnless(a_arr.shape[1] == 2 + 3)
-
     def test_minibatch_wraparound_even(self):
         arr = numpy.random.rand(10,4)
         arr2 = ArrayDataSet.Iterator.matcat(arr,arr)
@@ -90,6 +80,17 @@
         b=a.rename({'xx':'x','zz':'z'})
         self.failUnless(b.hasFields('xx','zz') and not b.hasFields('x') and not b.hasFields('y'))
 
+class T_applyfunctiondataset(unittest.TestCase):
+    def setUp(self):
+        numpy.random.seed(123456)
+
+    def test_function(self):
+        n = numpy.random.rand(3,8)
+        a=ArrayDataSet(data=n,fields={"x":slice(2),"y":slice(1,4),"z":slice(4,6)})
+        b=a.apply_function(lambda x,y: (x+y,x+1), ['x','y'], ['x+y','x+1'], False,False,False)
+        print b.fieldNames()
+        print b('x+y')
+        
 
 if __name__ == '__main__':
     unittest.main()
--- a/dataset.py	Fri Apr 11 21:41:09 2008 -0400
+++ b/dataset.py	Fri Apr 11 21:42:07 2008 -0400
@@ -37,8 +37,8 @@
 
     Datasets of finite length should be sub-classes of FiniteLengthDataSet.
 
-    Datasets whose elements can be indexed and sub-datasets of consecutive
-    examples (i.e. slices) can be extracted from should be sub-classes of
+    Datasets whose elements can be indexed and from which sub-datasets (containing
+    a subset of the examples) can be extracted should be sub-classes of
     SliceableDataSet.
 
     Datasets with a finite number of fields should be sub-classes of
@@ -150,6 +150,7 @@
         of the iterators).
         """
         raise AbstractFunction()
+
         
     def merge_fields(self,*specifications):
         """
@@ -182,7 +183,7 @@
     
     def rename(self,rename_dict):
         """
-        Return a new dataset that renames fields, using a dictionnary that maps old field
+        Changes a dataset into one that renames fields, using a dictionary that maps new field
         names to old field names. The only fields visible in the returned dataset are those
         whose names are keys of the rename_dict.
         """
@@ -194,9 +195,9 @@
         SelfRenamingDataSet.__init__(self,self,rename_dict)
         return self
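+    # A minimal usage sketch for rename (hypothetical field names, not part of
+    # this changeset), mirroring the test in _test_dataset.py:
+    #   a = ArrayDataSet(data=numpy.random.rand(3,4),
+    #                    fields={"x":slice(2),"y":slice(2,4)})
+    #   b = a.rename({'inputs':'x','targets':'y'})
+    #   # b now exposes fields 'inputs' and 'targets' in place of 'x' and 'y'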
         
-    def applyFunction(self,function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True):
+    def apply_function(self,function, input_fields, output_fields, copy_inputs=True, accept_minibatches=True, cache=True):
         """
-        Return a dataset that contains as fields the results of applying
+        Changes a dataset into one that contains as fields the results of applying
         the given function (example-wise) to the specified input_fields. The
         function should return a sequence whose elements will be stored in
         fields whose names are given in the output_fields list. If copy_inputs
@@ -209,7 +210,13 @@
         are cached (to avoid recomputation if the same examples are again
         requested).
         """
-        return ApplyFunctionDataSet(function, input_fields, output_fields, copy_inputs, accept_minibatches, cache)
+        self_class = self.__class__
+        class SelfApplyFunctionDataSet(ApplyFunctionDataSet,self_class):
+            pass
+        self.__class__ = SelfApplyFunctionDataSet
+        # set the required additional fields
+        ApplyFunctionDataSet.__init__(self,self,function, input_fields, output_fields, copy_inputs, accept_minibatches, cache)
+        return self
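+    # A minimal usage sketch for apply_function (hypothetical field names, not
+    # part of this changeset), in the spirit of the new test above:
+    #   a = ArrayDataSet(data=numpy.random.rand(3,4),
+    #                    fields={"x":slice(2),"y":slice(2,4)})
+    #   a.apply_function(lambda x,y: (x+y,x-y), ['x','y'], ['x+y','x-y'],
+    #                    copy_inputs=False, accept_minibatches=False, cache=False)
+    #   print a.fieldNames()   # fieldNames() now lists 'x+y' and 'x-y'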
 
 
 class FiniteLengthDataSet(DataSet):
@@ -223,15 +230,31 @@
     def __len__(self):
         """len(dataset) returns the number of examples in the dataset."""
         raise AbstractFunction()
-    
+
+    def __call__(self,fieldname_or_fieldnames):
+        """
+        Extract one or more fields. This may be an expensive operation when the
+        dataset is large. It is not the recommended way to access individual values
+        (use the iterators instead). If the argument is a string fieldname, then the result
+        is a sequence (iterable object) of values for that field, for the whole dataset. If the
+        argument is a list of field names, then the result is a 'batch', i.e., an Example with keys
+        corresponding to the given field names and values being iterable objects over the
+        individual example values.
+        """
+        if isinstance(fieldname_or_fieldnames,str):
+            minibatch = self.minibatches([fieldname_or_fieldnames],len(self)).next()
+            return minibatch[fieldname_or_fieldnames]
+        return self.minibatches(fieldname_or_fieldnames,len(self)).next()
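+    # Usage sketch (hypothetical field names, not part of this changeset):
+    #   dataset('x')        # iterable over all 'x' values of the dataset
+    #   dataset(['x','y'])  # one whole-dataset batch: an Example keyed by 'x','y'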
                  
 class SliceableDataSet(DataSet):
     """
     Virtual interface, a subclass of DataSet for datasets which are sliceable
     and whose individual elements can be accessed, generally respecting the
     python semantics for [spec], where spec is either a non-negative integer
-    (for selecting one example), or a python slice (for selecting a sub-dataset
-    comprising the specified examples). This is useful for obtaining
+    (for selecting one example), a python slice(start,stop,step) for selecting a regular
+    sub-dataset comprising examples start,start+step,start+2*step,... (up to but excluding stop), or a
+    sequence (e.g. a list) of integers [i1,i2,...,in] for selecting
+    an arbitrary subset of examples. This is useful for obtaining
     sub-datasets, e.g. for splitting a dataset into training and test sets.
     """
     def __init__(self):
@@ -250,11 +273,19 @@
         return DataSet.Iterator(self, fieldnames, minibatch_size, n_batches)
 
     def __getitem__(self,i):
-        """dataset[i] returns the (i+1)-th example of the dataset."""
+        """
+        dataset[i] returns the (i+1)-th example of the dataset.
+        dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.
+        dataset[i:j:s] returns the subdataset with examples i,i+s,i+2*s,... (up to but excluding j).
+        dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in.
+        """
         raise AbstractFunction()
 
     def __getslice__(self,*slice_args):
-        """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
+        """
+        dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.
+        dataset[i:j:s] returns the subdataset with examples i,i+s,i+2*s,... (up to but excluding j).
+        """
         raise AbstractFunction()
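+    # Indexing sketch for SliceableDataSet subclasses (hypothetical dataset d
+    # with 10 examples, not part of this changeset):
+    #   d[3]        # the 4th example
+    #   d[2:8]      # sub-dataset with examples 2,3,...,7
+    #   d[2:8:2]    # sub-dataset with examples 2,4,6
+    #   d[[1,5,9]]  # sub-dataset with examples 1, 5 and 9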
 
 
@@ -348,7 +379,8 @@
     It is a  fixed-length and fixed-width dataset 
     in which each element is a fixed dimension numpy array or a number, hence the whole 
     dataset corresponds to a numpy array. Fields
-    must correspond to a slice of array columns. If the dataset has fields,
+    must correspond to a slice of array columns or to a list of column numbers.
+    If the dataset has fields,
     each 'example' is just a one-row ArrayDataSet, otherwise it is a numpy array.
     Any dataset can also be converted to a numpy array (losing the notion of fields)
     by the numpy.array(dataset) call.
@@ -396,7 +428,7 @@
             if self.next_count == self.next_max:
                 raise StopIteration
 
-            #determine the first and last elements of the slice we'll return
+            #determine the first and last elements of the minibatch slice we'll return
             n_rows = self.dataset.data.shape[0]
             self.current = self.next_index()
             upper = self.current + self.minibatch_size
@@ -423,7 +455,7 @@
         There are two ways to construct an ArrayDataSet: (1) from an
         existing dataset (which may result in a copy of the data in a numpy array),
         or (2) from a numpy.array (the data argument), along with an optional description
-        of the fields (a LookupList of column slices indexed by field names).
+        of the fields (a LookupList of column slices (or column lists) indexed by field names).
         """
         self.data=data
         self.fields=fields
@@ -431,17 +463,22 @@
 
         if fields:
             for fieldname,fieldslice in fields.items():
-                # make sure fieldslice.start and fieldslice.step are defined
-                start=fieldslice.start
-                step=fieldslice.step
-                if not start:
-                    start=0
-                if not step:
-                    step=1
-                if not fieldslice.start or not fieldslice.step:
-                    fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
-                # and coherent with the data array
-                assert fieldslice.start >= 0 and fieldslice.stop <= cols
+                assert type(fieldslice) is int or isinstance(fieldslice,slice) or hasattr(fieldslice,"__iter__")
+                if hasattr(fieldslice,"__iter__"): # is a sequence
+                    for i in fieldslice:
+                        assert type(i) is int
+                elif isinstance(fieldslice,slice):
+                    # make sure fieldslice.start and fieldslice.step are defined
+                    start=fieldslice.start
+                    step=fieldslice.step
+                    if not start:
+                        start=0
+                    if not step:
+                        step=1
+                    if not fieldslice.start or not fieldslice.step:
+                        fields[fieldname] = fieldslice = slice(start,fieldslice.stop,step)
+                    # and coherent with the data array
+                    assert fieldslice.start >= 0 and fieldslice.stop <= cols
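+        # Field specification sketch (hypothetical, not part of this changeset):
+        # with a 6-column data array one may write
+        #   fields={"x":slice(2),     # columns 0,1
+        #           "y":slice(2,5),   # columns 2,3,4
+        #           "z":[0,5]}        # columns 0 and 5, given as a column list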
 
     def minibatches(self,
             fieldnames = DataSet.minibatches_fieldnames,
@@ -457,28 +494,6 @@
         if n_batches is None: n_batches = len(self) / minibatch_size
         return ArrayDataSet.Iterator(self, fieldnames, minibatch_size, n_batches)
 
-    def __getattr__(self,fieldname):
-        """
-        Return a numpy array with the content associated with the given field name.
-        If this is a one-example dataset, then a row, i.e., numpy array (of one less dimension
-        than the dataset itself) is returned.
-        """
-        if len(self.data)==1:
-            return self.data[0,self.fields[fieldname]]
-        return self.data[:,self.fields[fieldname]]
-
-    def __call__(self,*fieldnames):
-        """Return a sub-dataset containing only the given fieldnames as fields."""
-        min_col=self.data.shape[1]
-        max_col=0
-        for field_slice in self.fields.values():
-            min_col=min(min_col,field_slice.start)
-            max_col=max(max_col,field_slice.stop)
-        new_fields=LookupList()
-        for fieldname,fieldslice in self.fields.items():
-            new_fields[fieldname]=slice(fieldslice.start-min_col,fieldslice.stop-min_col,fieldslice.step)
-        return ArrayDataSet(self.data[:,min_col:max_col],fields=new_fields)
-
     def fieldNames(self):
         """Return the list of field names that are supported by getattr and hasField."""
         return self.fields.keys()
@@ -489,8 +504,11 @@
     
     def __getitem__(self,i):
         """
-        dataset[i] returns the (i+1)-th Example of the dataset. If there are no fields
-        the result is just a numpy array (for the i-th row of the dataset data matrix).
+        dataset[i] returns the (i+1)-th Example of the dataset.
+        If there are no fields the result is just a numpy array (for the i-th row of the dataset data matrix).
+        dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.
+        dataset[i:j:s] returns the subdataset with examples i,i+s,i+2*s,... (up to but excluding j).
+        dataset[[i1,i2,..,in]] returns the subdataset with examples i1,i2,...,in.
         """
         if self.fields:
             fieldnames,fieldslices=zip(*self.fields.items())
@@ -499,36 +517,34 @@
             return self.data[i]
 
     def __getslice__(self,*args):
-        """dataset[i:j] returns the subdataset with examples i,i+1,...,j-1."""
+        """
+        dataset[i:j] returns the subdataset with examples i,i+1,...,j-1.
+        dataset[i:j:s] returns the subdataset with examples i,i+s,i+2*s,... (up to but excluding j).
+        """
         return ArrayDataSet(self.data.__getslice__(*args), fields=self.fields)
 
-    def __array__(self):
-        """Return a view of this dataset which is an numpy.ndarray (i.e. losing
-        the identity and name of fields within the dataset).
-
-        Numpy uses this special function name to retrieve an ndarray view for
-        function such as numpy.sum, numpy.dot, numpy.asarray, etc.
-
-        If this dataset has no fields, then we simply return self.data,
-        otherwise things are complicated. 
-        - why do we want this behaviour when there are fields? (JB)
-        - for convenience and completeness (but maybe it would make
-          more sense to implement this through a 'field-merging'
-          dataset). (YB)
+    def indices_of_unique_columns_used(self):
+        """
+        Return the indices of the columns actually used by the fields, along with a
+        boolean that is True when some fields overlap (share columns). Overlapping
+        columns appear only once in the returned list of indices.
         """
-        if not self.fields:
-            return self.data
-        # else, select subsets of columns mapped by the fields
         columns_used = numpy.zeros((self.data.shape[1]),dtype=bool)
-        overlapping_fields = False
-        n_columns = 0
+        overlapping_columns = False
         for field_slice in self.fields.values():
-            for c in xrange(field_slice.start,field_slice.stop,field_slice.step):
-                n_columns += 1
-                if columns_used[c]: overlapping_fields=True
-                columns_used[c]=True
-        # try to figure out if we can map all the slices into one slice:
-        mappable_to_one_slice = not overlapping_fields
+            if sum(columns_used[field_slice])>0: overlapping_columns=True
+            columns_used[field_slice]=True
+        return [i for i,used in enumerate(columns_used) if used],overlapping_columns
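+    # Sketch (hypothetical, not part of this changeset): with 4 data columns and
+    #   fields={"x":slice(0,2),"y":slice(1,3)}
+    # column 1 is shared by 'x' and 'y', so this returns ([0,1,2], True).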
+
+    def slice_of_unique_columns_used(self):
+        """
+        Return None if the indices of the unique columns used do not form a slice. If they do,
+        return that slice. It means that the columns used can be extracted
+        from the data array without making a copy. If the fields overlap
+        but their unique columns used form a slice, still return that slice.
+        """
+        columns_used,overlapping_columns = self.indices_of_unique_columns_used()
+        mappable_to_one_slice = True
         if not overlapping_columns:
             start=0
             while start<len(columns_used) and not columns_used[start]:
@@ -549,19 +565,9 @@
                 else:
                     step = j-i
                 i=j
-        if mappable_to_one_slice:
-            return self.data[:,slice(start,stop,step)]
-        # else make contiguous copy (copying the overlapping columns)
-        result = numpy.zeros((len(self.data),n_columns)+self.data.shape[2:],self.data.dtype)
-        c=0
-        for field_slice in self.fields.values():
-            slice_width=(field_slice.stop-field_slice.start)/field_slice.step
-            # copy the field here
-            result[:,slice(c,c+slice_width)]=self.data[:,field_slice]
-            c+=slice_width
-        return result
-
-class ApplyFunctionDataSet(DataSet):
+        return slice(start,stop,step)
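+    # Sketch (hypothetical, not part of this changeset): with
+    #   fields={"x":slice(0,2),"y":slice(2,4)}
+    # the unique columns [0,1,2,3] form slice(0,4,1), so all fields can be read
+    # from self.data[:,0:4] without copying.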
+    
+class ApplyFunctionDataSet(FiniteWidthDataSet):
     """
     A dataset that contains as fields the results of applying
     a given function (example-wise) to specified input_fields of a source
@@ -604,6 +610,11 @@
             # in the case where src is FiniteDataSet. -YB
             self.cached_examples = []
 
+    def fieldNames(self):
+        if self.copy_inputs:
+            return self.output_fields + self.src.fieldNames()
+        return self.output_fields
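+    # Sketch (hypothetical, not part of this changeset): with
+    # output_fields=['x+y'] and copy_inputs=True on a source whose fields are
+    # ['x','y'], fieldNames() returns ['x+y','x','y'].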
+    
     def minibatches(self,
                     fieldnames = DataSet.minibatches_fieldnames,
                     minibatch_size = DataSet.minibatches_minibatch_size,
--- a/gradient_learner.py	Fri Apr 11 21:41:09 2008 -0400
+++ b/gradient_learner.py	Fri Apr 11 21:42:07 2008 -0400
@@ -59,7 +59,7 @@
             self.use_functions[use_function_key]=Function(input_variables,output_variables)
         use_function = self.use_functions[use_function_key]
         # return a dataset that computes the outputs
-        return input_dataset.applyFunction(use_function,input_fields,output_fields,copy_inputs,compute_now=True)
+        return input_dataset.apply_function(use_function,input_fields,output_fields,copy_inputs,compute_now=True)
     
 
 class StochasticGradientDescent(object):